# Device Failure

## Goal

Build a predictive model to predict whether a device will fail or not based on given data.

In [1]:
import pandas as pd
import numpy as np
import pandas_profiling

from scipy.stats import skew, norm
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax

In [2]:
df = pd.read_csv('failures.csv')

In [3]:
df.head()

Unnamed: 0,date,device,failure,attribute1,attribute2,attribute3,attribute4,attribute5,attribute6,attribute7,attribute8,attribute9
0,2015-01-01,S1F01085,0,215630672,56,0,52,6,407438,0,0,7
1,2015-01-01,S1F0166B,0,61370680,0,3,0,6,403174,0,0,0
2,2015-01-01,S1F01E6Y,0,173295968,0,0,0,12,237394,0,0,0
3,2015-01-01,S1F01JE0,0,79694024,0,0,0,6,410186,0,0,0
4,2015-01-01,S1F01R2B,0,135970480,0,0,0,15,313173,0,0,3


In [4]:
#check current correlations 
df.corr()

Unnamed: 0,failure,attribute1,attribute2,attribute3,attribute4,attribute5,attribute6,attribute7,attribute8,attribute9
failure,1.0,0.001984,0.052902,-0.000948,0.067398,0.00227,-0.00055,0.119055,0.119055,0.001622
attribute1,0.001984,1.0,-0.004248,0.003702,0.001837,-0.00337,-0.001516,0.000151,0.000151,0.001122
attribute2,0.052902,-0.004248,1.0,-0.002617,0.146593,-0.013999,-0.02635,0.141367,0.141367,-0.002736
attribute3,-0.000948,0.003702,-0.002617,1.0,0.097452,-0.006696,0.009027,-0.001884,-0.001884,0.532366
attribute4,0.067398,0.001837,0.146593,0.097452,1.0,-0.009773,0.02487,0.045631,0.045631,0.036069
attribute5,0.00227,-0.00337,-0.013999,-0.006696,-0.009773,1.0,-0.017051,-0.009384,-0.009384,0.005949
attribute6,-0.00055,-0.001516,-0.02635,0.009027,0.02487,-0.017051,1.0,-0.012207,-0.012207,0.021152
attribute7,0.119055,0.000151,0.141367,-0.001884,0.045631,-0.009384,-0.012207,1.0,1.0,0.006861
attribute8,0.119055,0.000151,0.141367,-0.001884,0.045631,-0.009384,-0.012207,1.0,1.0,0.006861
attribute9,0.001622,0.001122,-0.002736,0.532366,0.036069,0.005949,0.021152,0.006861,0.006861,1.0


In [5]:
df['failure'].value_counts()

0    124388
1       106
Name: failure, dtype: int64

In [6]:
len(df['device'].unique())

1168

In [7]:
#pandas profiling
#df.profile_report(style={'full_width':True})

### Group by Device

In [8]:
#sort by device, reordered by latest date
df_sorted = df.sort_values(by = ['device', 'date'], ascending = False )

In [9]:
#keep first obvservation by device name, drop everything else, and reset index
df_sorted.drop_duplicates(subset = 'device', keep='first', inplace = True)

In [10]:
#reset index 
df_sorted.reset_index(drop = True)
df_sorted.head(10)

Unnamed: 0,date,device,failure,attribute1,attribute2,attribute3,attribute4,attribute5,attribute6,attribute7,attribute8,attribute9
67274,2015-04-06,Z1F2PBHX,0,130522432,0,0,0,5,164020,0,0,0
61309,2015-03-25,Z1F282ZV,0,72717600,0,1,0,1,192919,0,0,0
61308,2015-03-25,Z1F26YZB,0,229065488,0,1,0,1,292052,0,0,0
98175,2015-06-30,Z1F1VQFY,1,178051496,0,0,0,7,265316,0,0,0
124003,2015-10-19,Z1F1VMZB,0,96949360,0,0,0,5,263507,0,0,0
77811,2015-05-04,Z1F1RJFA,1,123190616,62296,1,9,4,336709,0,0,0
6859,2015-01-06,Z1F1RE71,0,77380208,0,1,0,3,185854,0,0,0
117380,2015-09-02,Z1F1R76A,0,65052704,0,0,0,8,369270,0,0,12
60035,2015-03-23,Z1F1Q9BD,0,154629376,0,0,0,7,255446,0,0,0
6856,2015-01-06,Z1F1HSWK,0,120187320,0,0,0,5,362008,0,0,6


In [11]:
df_sorted.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1168 entries, 67274 to 5812
Data columns (total 12 columns):
date          1168 non-null object
device        1168 non-null object
failure       1168 non-null int64
attribute1    1168 non-null int64
attribute2    1168 non-null int64
attribute3    1168 non-null int64
attribute4    1168 non-null int64
attribute5    1168 non-null int64
attribute6    1168 non-null int64
attribute7    1168 non-null int64
attribute8    1168 non-null int64
attribute9    1168 non-null int64
dtypes: int64(10), object(2)
memory usage: 118.6+ KB


In [12]:
df_sorted['failure'].value_counts()

0    1067
1     101
Name: failure, dtype: int64

In [13]:
df_sorted.corr()

Unnamed: 0,failure,attribute1,attribute2,attribute3,attribute4,attribute5,attribute6,attribute7,attribute8,attribute9
failure,1.0,0.018729,0.171187,-0.011361,0.160769,0.084073,-0.042033,0.194289,0.194289,-0.011368
attribute1,0.018729,1.0,-0.030013,0.056346,0.009994,0.004548,-0.0558,0.059128,0.059128,-0.017018
attribute2,0.171187,-0.030013,1.0,-0.003276,0.34973,-0.005972,-0.026274,0.086757,0.086757,-0.00589
attribute3,-0.011361,0.056346,-0.003276,1.0,0.189068,-0.023516,0.025655,-0.003929,-0.003929,0.447703
attribute4,0.160769,0.009994,0.34973,0.189068,1.0,-0.00676,0.001997,0.059203,0.059203,0.078266
attribute5,0.084073,0.004548,-0.005972,-0.023516,-0.00676,1.0,-0.013275,-0.001246,-0.001246,-0.028121
attribute6,-0.042033,-0.0558,-0.026274,0.025655,0.001997,-0.013275,1.0,-0.067541,-0.067541,0.01983
attribute7,0.194289,0.059128,0.086757,-0.003929,0.059203,-0.001246,-0.067541,1.0,1.0,0.016103
attribute8,0.194289,0.059128,0.086757,-0.003929,0.059203,-0.001246,-0.067541,1.0,1.0,0.016103
attribute9,-0.011368,-0.017018,-0.00589,0.447703,0.078266,-0.028121,0.01983,0.016103,0.016103,1.0


## Preprocessing Structured Data

In [14]:
df_sorted['attribute8'].equals(df_sorted['attribute7'])

True

In [15]:
#drop attribute8 column because it is a duplicate of attribute7
df_sorted =df_sorted.drop(columns = ['attribute8'])

In [16]:
#make classification and continous variables
cat_columns = ['attribute3','attribute5','attribute7','attribute9']
df_sorted[cat_columns]= df_sorted[cat_columns].astype(str)


In [17]:
df.corr()

Unnamed: 0,failure,attribute1,attribute2,attribute3,attribute4,attribute5,attribute6,attribute7,attribute8,attribute9
failure,1.0,0.001984,0.052902,-0.000948,0.067398,0.00227,-0.00055,0.119055,0.119055,0.001622
attribute1,0.001984,1.0,-0.004248,0.003702,0.001837,-0.00337,-0.001516,0.000151,0.000151,0.001122
attribute2,0.052902,-0.004248,1.0,-0.002617,0.146593,-0.013999,-0.02635,0.141367,0.141367,-0.002736
attribute3,-0.000948,0.003702,-0.002617,1.0,0.097452,-0.006696,0.009027,-0.001884,-0.001884,0.532366
attribute4,0.067398,0.001837,0.146593,0.097452,1.0,-0.009773,0.02487,0.045631,0.045631,0.036069
attribute5,0.00227,-0.00337,-0.013999,-0.006696,-0.009773,1.0,-0.017051,-0.009384,-0.009384,0.005949
attribute6,-0.00055,-0.001516,-0.02635,0.009027,0.02487,-0.017051,1.0,-0.012207,-0.012207,0.021152
attribute7,0.119055,0.000151,0.141367,-0.001884,0.045631,-0.009384,-0.012207,1.0,1.0,0.006861
attribute8,0.119055,0.000151,0.141367,-0.001884,0.045631,-0.009384,-0.012207,1.0,1.0,0.006861
attribute9,0.001622,0.001122,-0.002736,0.532366,0.036069,0.005949,0.021152,0.006861,0.006861,1.0


Numerical
1, 2, 4, 6 
Categorical
3, 5, 7 ,9
Deleted
8

1 is probably in milliseconds of runtime. 
2 number of 0's is significantly lower 94.9% - 54.7%, there is a skewness 
3 attribute 3's max value is significantly lower, from 24929 to 300, could mean hierarchy/label
4 number of 0's significantly lower 92.5% - 41.5%
5 nothing really changed 
6 this may be some kind of average lifespan of a machine, normally distributed
7 number of zeros are much less from 99.8% - 63.2%
9 really nothing changed

scale time to seconds?

Things to do
1 ,2 ,6 maybe do a scale
2 maybe unskew this
3 label encode

In [18]:
#Encode classification by importance 
from sklearn.preprocessing import OrdinalEncoder
encoder = OrdinalEncoder()
df_sorted['attribute3'] = encoder.fit_transform(df_sorted['attribute3'].values.reshape(-1, 1))

In [19]:
df_sorted['attribute2']

67274         0
61309         0
61308         0
98175         0
124003        0
77811     62296
6859          0
117380        0
60035         0
6856          0
6855          0
37123         0
16475         0
80595         0
117379        0
124002        0
6849          0
5794          0
5793          0
5792          0
79379        32
6847          0
61300       112
6845          0
60030         0
80592         0
6842        168
124210        0
124209        0
124208        0
          ...  
5836          0
115442        0
109371      240
4673          0
5833          0
5832          0
5831          0
80199         0
5830          0
5829          0
5828          0
80198         0
5827       5160
80197         0
5826        168
5825       1664
5824          0
61870         0
5822          0
5821          0
116298        0
15773         0
70943         0
5817          0
115440        0
5815          0
36421         0
5813          0
80194         0
5812         56
Name: attribute2, Length

In [20]:
from sklearn import preprocessing
#scaling
# Create scaler
minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))

# Scale feature
df_sorted['attribute1'] = minmax_scale.fit_transform(df_sorted['attribute1'].values.reshape(-1,1))
df_sorted['attribute2'] = minmax_scale.fit_transform(df_sorted['attribute2'].values.reshape(-1,1))
df_sorted['attribute6'] = minmax_scale.fit_transform(df_sorted['attribute6'].values.reshape(-1,1))


In [21]:
df_sorted.head(50)

Unnamed: 0,date,device,failure,attribute1,attribute2,attribute3,attribute4,attribute5,attribute6,attribute7,attribute9
67274,2015-04-06,Z1F2PBHX,0,0.536402,0.0,0.0,0,5,0.237986,0,0
61309,2015-03-25,Z1F282ZV,0,0.298844,0.0,1.0,0,1,0.279921,0,0
61308,2015-03-25,Z1F26YZB,0,0.94138,0.0,1.0,0,1,0.423769,0,0
98175,2015-06-30,Z1F1VQFY,1,0.73173,0.0,0.0,0,7,0.384973,0,0
124003,2015-10-19,Z1F1VMZB,0,0.398428,0.0,0.0,0,5,0.382348,0,0
77811,2015-05-04,Z1F1RJFA,1,0.506271,0.961477,1.0,9,4,0.488569,0,0
6859,2015-01-06,Z1F1RE71,0,0.318006,0.0,1.0,0,3,0.269669,0,0
117380,2015-09-02,Z1F1R76A,0,0.267344,0.0,0.0,0,8,0.535817,0,12
60035,2015-03-23,Z1F1Q9BD,0,0.635473,0.0,0.0,0,7,0.370651,0,0
6856,2015-01-06,Z1F1HSWK,0,0.493928,0.0,0.0,0,5,0.52528,0,6


## Feature Engineering

In [22]:
import datetime as dt

In [23]:
#convert string to datetime
df_sorted['date'] = pd.to_datetime(df_sorted['date'])
#sort by months, counting how many fails there are for each month
df_sorted.groupby(df_sorted['date'].dt.strftime('%m'))['failure'].sum()

date
01    24
02    14
03     9
04     9
05    18
06     6
07    14
08     4
09     0
10     3
11     0
Name: failure, dtype: int64

In [24]:
seasons_map = {
    'January': 3,
    'February': 3,
    'March': 3,
    'April': 2,
    'May': 2,
    'June': 2,
    'July': 1,
    'August':1,
    'September': 1,
    'October': 0,
    'November': 0,
    'December': 0 
}

In [25]:
df_sorted['seasons']=df_sorted['date'].dt.strftime('%B').map(seasons_map)

In [26]:
df_sorted['date']=df_sorted['date'].dt.strftime('%Y-%m-%d')

In [27]:
df_sorted.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1168 entries, 67274 to 5812
Data columns (total 12 columns):
date          1168 non-null object
device        1168 non-null object
failure       1168 non-null int64
attribute1    1168 non-null float64
attribute2    1168 non-null float64
attribute3    1168 non-null float64
attribute4    1168 non-null int64
attribute5    1168 non-null object
attribute6    1168 non-null float64
attribute7    1168 non-null object
attribute9    1168 non-null object
seasons       1168 non-null int64
dtypes: float64(4), int64(3), object(5)
memory usage: 118.6+ KB


In [28]:
df_sorted.head()

Unnamed: 0,date,device,failure,attribute1,attribute2,attribute3,attribute4,attribute5,attribute6,attribute7,attribute9,seasons
67274,2015-04-06,Z1F2PBHX,0,0.536402,0.0,0.0,0,5,0.237986,0,0,2
61309,2015-03-25,Z1F282ZV,0,0.298844,0.0,1.0,0,1,0.279921,0,0,3
61308,2015-03-25,Z1F26YZB,0,0.94138,0.0,1.0,0,1,0.423769,0,0,3
98175,2015-06-30,Z1F1VQFY,1,0.73173,0.0,0.0,0,7,0.384973,0,0,2
124003,2015-10-19,Z1F1VMZB,0,0.398428,0.0,0.0,0,5,0.382348,0,0,0


In [29]:
df_sorted.corr()

Unnamed: 0,failure,attribute1,attribute2,attribute3,attribute4,attribute6,seasons
failure,1.0,0.018729,0.171187,0.004743,0.160769,-0.042033,0.031747
attribute1,0.018729,1.0,-0.030013,-0.016392,0.009994,-0.0558,0.024913
attribute2,0.171187,-0.030013,1.0,-0.020197,0.34973,-0.026274,0.021585
attribute3,0.004743,-0.016392,-0.020197,1.0,0.071894,0.060801,0.015964
attribute4,0.160769,0.009994,0.34973,0.071894,1.0,0.001997,0.050672
attribute6,-0.042033,-0.0558,-0.026274,0.060801,0.001997,1.0,-0.250324
seasons,0.031747,0.024913,0.021585,0.015964,0.050672,-0.250324,1.0


## Modeling

In [30]:
#import models
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# from sklearn.cross_validation import KFold, cross_val_score
# from sklearn.metrics import confusion_matrix,precision_recall_curve,auc,roc_auc_score,roc_curve,recall_score,classification_report 

In [31]:
df_sorted = pd.get_dummies(df_sorted, drop_first = True)
y= df_sorted['failure']
result =df_sorted.drop(columns = ['failure'])

In [32]:
X_train, X_test, y_train, y_test = train_test_split(result, y, test_size=0.25, random_state=42)

In [33]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((876, 1415), (292, 1415), (876,), (292,))

In [34]:
# concatenate our training data back together
X = pd.concat([X_train, y_train], axis=1)

In [35]:
#find the values of fail or not fail
not_fail = X[X.failure==0]
fail = X[X.failure==1]

In [36]:
not_fail.shape, fail.shape

((800, 1416), (76, 1416))

In [37]:
no_failure_downsample = resample(not_fail,
                          replace=True, # sample with replacement
                          n_samples=len(fail), # match number in majority class
                          random_state=27) # reproducible results

In [38]:
no_failure_downsample.shape

(76, 1416)

In [39]:
upsampled = pd.concat([fail, no_failure_downsample])

In [40]:
upsampled.shape

(152, 1416)

In [41]:
#check to see if sample sizes are equal
upsampled['failure'].value_counts()

1    76
0    76
Name: failure, dtype: int64

In [42]:
#separate failure column from dataset for new y_train and X_train
y_train = upsampled['failure']
X_train = upsampled.drop(['failure'], axis=1)

In [43]:
#check to see if column and row shapes are the same
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((152, 1415), (292, 1415), (152,), (292,))

In [44]:
#use SelectFromModel to select the most important features
sel = SelectFromModel(RandomForestClassifier(n_estimators = 100, random_state=0))
sel.fit(X_train, y_train)

SelectFromModel(estimator=RandomForestClassifier(bootstrap=True,
                                                 class_weight=None,
                                                 criterion='gini',
                                                 max_depth=None,
                                                 max_features='auto',
                                                 max_leaf_nodes=None,
                                                 min_impurity_decrease=0.0,
                                                 min_impurity_split=None,
                                                 min_samples_leaf=1,
                                                 min_samples_split=2,
                                                 min_weight_fraction_leaf=0.0,
                                                 n_estimators=100, n_jobs=None,
                                                 oob_score=False,
                                                 random_state=0, verbose=0,
     

In [45]:
#if the values are false, then those columns are not included in the selected features
sel.get_support()

array([ True,  True,  True, ..., False, False, False])

In [46]:
#checking the index of the columns that are true
selected_feat= X_train.columns[(sel.get_support())]
#finding how many features are selected
len(selected_feat)

246

In [47]:
#the selected feature names
print(selected_feat)

Index(['attribute1', 'attribute2', 'attribute3', 'attribute4', 'attribute6',
       'seasons', 'date_2015-01-05', 'date_2015-01-06', 'date_2015-01-07',
       'date_2015-01-13',
       ...
       'attribute9_1080', 'attribute9_11', 'attribute9_19', 'attribute9_2',
       'attribute9_3', 'attribute9_4', 'attribute9_41', 'attribute9_5',
       'attribute9_51', 'attribute9_7'],
      dtype='object', length=246)


In [48]:
#creating the new data with selected features
X_train = sel.transform(X_train)
X_test = sel.transform(X_test)

In [49]:
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression 
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
#parameters for tDMassess function
g = GaussianNB()
b = BernoulliNB()
k = KNeighborsClassifier()
log = LogisticRegression()
gbc = GradientBoostingClassifier()
d = DecisionTreeClassifier()
r = RandomForestClassifier()
algorithms = [g, b, k, log, gbc, d, r]
names = ['GaussianNB', 'BernoulliNB', 'K Nearest', 'Logistic', 'Gradient Boosting', 'Single Tree', 'Random Forest']

In [50]:
from sklearn.model_selection import GridSearchCV

In [51]:
# Create regularization penalty space
penalty = ['l1', 'l2']

# Create regularization hyperparameter space
C = np.logspace(0, 4, 10)

# Create hyperparameter options
hyperparameters = dict(C=C, penalty=penalty)

In [52]:
# Create grid search using 5-fold cross validation
clf = GridSearchCV(log, hyperparameters, cv=5, verbose=0)

In [53]:
# Fit grid search
best_model = clf.fit(X_train, y_train)





In [54]:
# Predict target vector
best_model.predict(X_test)

array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       1, 1, 0, 0, 0, 0])

In [55]:
accuracy_score(y_test,best_model.predict(X_test)),precision_score(y_test,best_model.predict(X_test)),recall_score(y_test,best_model.predict(X_test)),f1_score(y_test,best_model.predict(X_test))

(0.9006849315068494, 0.46153846153846156, 0.96, 0.6233766233766234)

In [56]:
#predicts models' accuracy, precision, recall, and f1
def tDMassess(X_train = X_train, X_test = X_test, y_train = y_train, y_test = y_test, algorithms=algorithms , names=names ):
    #fit the data
    for i in range(len(algorithms)):
        algorithms[i]= algorithms[i].fit(X_train,y_train)
    #print metrics
    accuracy = []
    precision= []
    recall= []
    f1 = []
    for i in range(len(algorithms)):
        if accuracy_score(y_test,algorithms[i].predict(X_test)) == 1:
            accuracy.append(0)
        else:
            accuracy.append(accuracy_score(y_test,algorithms[i].predict(X_test)))
        if precision_score(y_test,algorithms[i].predict(X_test)) == 1:
            precision.append(0)
        else:
            precision.append(precision_score(y_test,algorithms[i].predict(X_test)))
        if recall_score(y_test,algorithms[i].predict(X_test)) == 1:
            recall.append(0)
        else:
            recall.append(recall_score(y_test,algorithms[i].predict(X_test)))
        if f1_score(y_test,algorithms[i].predict(X_test)) == 1:
            f1.append(0)
        else:
            f1.append(f1_score(y_test,algorithms[i].predict(X_test)))

    metrics = pd.DataFrame(columns = ['Accuracy', 'Precision', 'Recall', 'F1'], index = names)
    metrics['Accuracy'] =accuracy
    metrics['Precision'] = precision
    metrics['Recall']=recall
    metrics['F1']=f1
    return metrics.sort_values('F1',ascending = False)

In [57]:
tDMassess()



Unnamed: 0,Accuracy,Precision,Recall,F1
BernoulliNB,0.952055,0.703704,0.76,0.730769
Random Forest,0.914384,0.5,0.88,0.637681
Gradient Boosting,0.880137,0.413793,0.96,0.578313
Logistic,0.863014,0.363636,0.8,0.5
GaussianNB,0.934932,0.75,0.36,0.486486
K Nearest,0.835616,0.298246,0.68,0.414634
Single Tree,0.739726,0.213483,0.76,0.333333


In [58]:
## Without breaking numeric down to categorical
# Random Forest	0.914384	0.500000	0.76	0.603175
# Gradient Boosting	0.869863	0.381818	0.84	0.525000
# Logistic	0.712329	0.160920	0.56	0.250000

In [59]:
# #Breaking down with out att4
# Random Forest	0.886986	0.416667	0.80	0.547945
# Gradient Boosting	0.866438	0.383333	0.92	0.541176
# Logistic	0.712329	0.160920	0.56	0.250000

In [60]:
# ##Break down with attr4
# Random Forest	0.866438	0.360000	0.72	0.480000
# Gradient Boosting	0.797945	0.292683	0.96	0.448598
# Logistic	0.712329	0.160920	0.56	0.250000

In [61]:
# # ##With ordinal encoding
# Gradient Boosting	0.869863	0.393443	0.96	0.558140
# Random Forest	0.904110	0.459459	0.68	0.548387
# Logistic	0.712329	0.160920	0.56	0.250000

In [62]:
# #scaled attribute 2
# Random Forest	0.893836	0.440000	0.88	0.586667
# Gradient Boosting	0.869863	0.389831	0.92	0.547619
# Logistic	0.551370	0.104478	0.56	0.176101

In [63]:
# #scaled attribute 1 
# Gradient Boosting	0.869863	0.393443	0.96	0.558140
# Random Forest	0.845890	0.333333	0.80	0.470588
# Logistic	0.883562	0.384615	0.60	0.468750

In [64]:
# #scaled attribute 6
# Random Forest	0.890411	0.428571	0.84	0.567568
# Gradient Boosting	0.869863	0.389831	0.92	0.547619
# Logistic	0.085616	0.085616	0.00	0.157729


In [65]:
# # scaled attribute 1,2,6 
# Logistic	0.873288	0.384615	0.80	0.519481
# Gradient Boosting	0.849315	0.353846	0.92	0.511111
# Random Forest	0.849315	0.344262	0.84	0.488372
#with hypertuning
# (0.9006849315068494, 0.4523809523809524, 0.76, 0.5671641791044777)

In [66]:
# #added seasons feature engineer
# Gradient Boosting	0.886986	0.428571	0.96	0.592593
# Random Forest	0.863014	0.363636	0.80	0.500000
# Logistic	0.863014	0.363636	0.80	0.500000

In [67]:
# df[df['failure'] ==1 ].groupby(['device','date']).first()

In [68]:
selected_feat[100:200]

Index(['device_S1F0R1GK', 'device_S1F0R3N9', 'device_S1F0R4JP',
       'device_S1F0RR35', 'device_S1F0RSZP', 'device_S1F0S2WJ',
       'device_S1F0S57T', 'device_S1F0S5LP', 'device_S1F0S65X',
       'device_S1F0S6AB', 'device_S1F0SAC1', 'device_S1F0T2LA',
       'device_S1F0TMLM', 'device_S1F0TNW4', 'device_S1F0W45F',
       'device_S1F10E6M', 'device_S1F10NDB', 'device_S1F10QL8',
       'device_S1F10RKC', 'device_S1F11MB0', 'device_S1F12ZQB',
       'device_S1F130GX', 'device_S1F13589', 'device_S1F135TN',
       'device_S1F136BS', 'device_S1F13H80', 'device_S1F13HPP',
       'device_S1F13JW4', 'device_S1F13KG3', 'device_S1F13M9A',
       'device_W1F08EDA', 'device_W1F0BJ6E', 'device_W1F0FW0S',
       'device_W1F0KCR5', 'device_W1F0NNAH', 'device_W1F0NZZZ',
       'device_W1F0P24E', 'device_W1F0PAXH', 'device_W1F0PNA5',
       'device_W1F0SGHR', 'device_W1F0T074', 'device_W1F0T0B1',
       'device_W1F0TA59', 'device_W1F0VC83', 'device_W1F0VL6W',
       'device_W1F0WBTM', 'device_W1F0X4

In [69]:
pd.set_option('display.max_rows', 200)