# GOAL
#### You are tasked with building a predictive model using machine learning to predict the probability of a device failure. When building this model, be sure to minimize false positives and false negatives. The column you are trying to predict is called failure with binary value 0 for non-failure and 1 for failure.

In [1]:
import time
import pandas as pd 
import numpy as np 
import plotly.express as px 

import warnings
warnings.filterwarnings("ignore")


In [2]:
df = pd.read_csv("predictive_maintenance.csv")
df.head()

Unnamed: 0,date,device,failure,metric1,metric2,metric3,metric4,metric5,metric6,metric7,metric8,metric9
0,1/1/15,S1F01085,0,215630672,56,0,52,6,407438,0,0,7
1,1/1/15,S1F0166B,0,61370680,0,3,0,6,403174,0,0,0
2,1/1/15,S1F01E6Y,0,173295968,0,0,0,12,237394,0,0,0
3,1/1/15,S1F01JE0,0,79694024,0,0,0,6,410186,0,0,0
4,1/1/15,S1F01R2B,0,135970480,0,0,0,15,313173,0,0,3


In [3]:
df.dtypes

date       object
device     object
failure     int64
metric1     int64
metric2     int64
metric3     int64
metric4     int64
metric5     int64
metric6     int64
metric7     int64
metric8     int64
metric9     int64
dtype: object

In [4]:
df.shape

(124494, 12)

In [5]:
df.describe()

Unnamed: 0,failure,metric1,metric2,metric3,metric4,metric5,metric6,metric7,metric8,metric9
count,124494.0,124494.0,124494.0,124494.0,124494.0,124494.0,124494.0,124494.0,124494.0,124494.0
mean,0.000851,122388100.0,159.484762,9.940455,1.74112,14.222669,260172.657726,0.292528,0.292528,12.451524
std,0.029167,70459330.0,2179.65773,185.747321,22.908507,15.943028,99151.078547,7.436924,7.436924,191.425623
min,0.0,0.0,0.0,0.0,0.0,1.0,8.0,0.0,0.0,0.0
25%,0.0,61284760.0,0.0,0.0,0.0,8.0,221452.0,0.0,0.0,0.0
50%,0.0,122797400.0,0.0,0.0,0.0,10.0,249799.5,0.0,0.0,0.0
75%,0.0,183309600.0,0.0,0.0,0.0,12.0,310266.0,0.0,0.0,0.0
max,1.0,244140500.0,64968.0,24929.0,1666.0,98.0,689161.0,832.0,832.0,18701.0


In [6]:
df.isnull().sum()

date       0
device     0
failure    0
metric1    0
metric2    0
metric3    0
metric4    0
metric5    0
metric6    0
metric7    0
metric8    0
metric9    0
dtype: int64

In [7]:
for col in df.columns:
    x = df[col].nunique()
    print(f'{col} : {x}')

date : 304
device : 1169
failure : 2
metric1 : 123877
metric2 : 558
metric3 : 47
metric4 : 115
metric5 : 60
metric6 : 44838
metric7 : 28
metric8 : 28
metric9 : 65


## Data Preprocessing, Statistical Analysis


In [8]:
# Checking Balance
px.histogram(df,x='failure')

** Highly imbalanced dataset

1. Resampling

In [9]:
# MAking classes
# Class count
count_0, count_1 = df['failure'].value_counts()

# Divide by class
df_0 = df[df['failure'] == 0]
df_1 = df[df['failure'] == 1]

In [13]:
# Under-SAmpling 
df_0_under = df_0.sample(count_1)
df_under = pd.concat([df_0_under, df_1], axis=0)

print('Random under-sampling:')
print(df_under.failure.value_counts())

Random under-sampling:
0    106
1    106
Name: failure, dtype: int64


In [15]:
# Over-Sampling 
df_1_over = df_1.sample(count_0,replace=True) #replace is imp in over.s
df_over = pd.concat([df_1_over, df_0], axis=0)

print('Random under-sampling:')
print(df_over.failure.value_counts())

Random under-sampling:
0    124388
1    124388
Name: failure, dtype: int64


In [16]:
df_over.head()

Unnamed: 0,date,device,failure,metric1,metric2,metric3,metric4,metric5,metric6,metric7,metric8,metric9
124329,10/26/15,W1F0T0B1,1,95073232,0,0,7,9,354861,22,22,0
19059,1/23/15,W1F0X4FC,1,64408168,0,0,0,7,245849,48,48,0
80984,5/13/15,S1F0GKL6,1,160459104,0,0,2,90,249366,0,0,0
15859,1/19/15,S1F0QY11,1,159635352,0,0,9,7,231336,16,16,0
47859,3/5/15,Z1F130LH,1,171736696,2464,0,0,24,254957,0,0,0


In [17]:
# Proceeding with Over-sampling

from sklearn.preprocessing import LabelEncoder

label = LabelEncoder()
df_over['device'] = label.fit_transform(df_over['device'])


In [23]:
df_over.iloc[:,3:]

Unnamed: 0,metric1,metric2,metric3,metric4,metric5,metric6,metric7,metric8,metric9
124329,95073232,0,0,7,9,354861,22,22,0
19059,64408168,0,0,0,7,245849,48,48,0
80984,160459104,0,0,2,90,249366,0,0,0
15859,159635352,0,0,9,7,231336,16,16,0
47859,171736696,2464,0,0,24,254957,0,0,0
...,...,...,...,...,...,...,...,...,...
124489,18310224,0,0,0,10,353705,8,8,0
124490,172556680,96,107,4,11,332792,0,0,13
124491,19029120,4832,0,0,11,350410,0,0,0
124492,226953408,0,0,0,12,358980,0,0,0


VIF starts at 1 and has no upper limit
VIF = 1, no correlation between the independent variable and the other variables
VIF exceeding 5 or 10 indicates high multicollinearity between this independent variable and the others

In [25]:
# checking multicolinearity

from statsmodels.stats.outliers_influence import variance_inflation_factor
def calc_vif(X):
    
    # Calculating VIF
    vif = pd.DataFrame()
    vif["variables"] = X.columns
    vif["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

    return(vif)

X = df_over.iloc[:,3:]
calc_vif(X)

Unnamed: 0,variables,VIF
0,metric1,0.001365
1,metric2,1.233926
2,metric3,1.246955
3,metric4,1.225166
4,metric5,1.798083
5,metric6,3.145725
6,metric7,inf
7,metric8,inf
8,metric9,1.333091


* Not any multicollinerity found

In [20]:
df_over.corr()

Unnamed: 0,device,failure,metric1,metric2,metric3,metric4,metric5,metric6,metric7,metric8,metric9
device,1.0,-0.043712,-0.054741,-0.04594,-0.009609,-0.01728,-0.118871,-0.220741,-0.003334,-0.003334,0.021941
failure,-0.043712,1.0,0.036052,0.211654,-0.023055,0.189378,0.040747,-0.010545,0.180867,0.180867,0.030061
metric1,-0.054741,0.036052,1.0,-0.083436,0.007453,-0.039223,0.06832,-0.063765,0.150615,0.150615,0.051891
metric2,-0.04594,0.211654,-0.083436,1.0,-0.011269,0.390773,-0.010881,-0.059148,0.015449,0.015449,-0.021481
metric3,-0.009609,-0.023055,0.007453,-0.011269,1.0,0.002692,-0.013172,0.014826,-0.007055,-0.007055,0.428703
metric4,-0.01728,0.189378,-0.039223,0.390773,0.002692,1.0,-0.008099,-0.044345,0.014767,0.014767,-0.009761
metric5,-0.118871,0.040747,0.06832,-0.010881,-0.013172,-0.008099,1.0,0.002043,-0.007068,-0.007068,-0.003148
metric6,-0.220741,-0.010545,-0.063765,-0.059148,0.014826,-0.044345,0.002043,1.0,-0.10746,-0.10746,0.041764
metric7,-0.003334,0.180867,0.150615,0.015449,-0.007055,0.014767,-0.007068,-0.10746,1.0,1.0,0.233013
metric8,-0.003334,0.180867,0.150615,0.015449,-0.007055,0.014767,-0.007068,-0.10746,1.0,1.0,0.233013


In [18]:
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.model_selection import train_test_split

x= df_over.drop(columns=['date','failure'])
y = df_over['failure']

x_train,x_test,y_train,y_test= train_test_split(x,y,test_size=0.25,random_state=44)

# Feature Scaling
scale = StandardScaler()
x_train = scale.fit_transform(x_train)
x_test = scale.fit_transform(x_test)

# Model Building

In [26]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

model = LogisticRegression()
model.fit(x_train,y_train)

model2 = RandomForestClassifier()
model2.fit(x_train,y_train)

model3 = DecisionTreeClassifier()
model3.fit(x_train,y_train)

model4 = XGBClassifier()
model4.fit(x_train,y_train)

model5 = LGBMClassifier()
model5.fit(x_train,y_train)



LGBMClassifier()

In [28]:
# Evaluation of models

y_t = model.predict(x_train)

y_pred1 = model.predict(x_test)
y_pred2 = model2.predict(x_test)
y_pred3 = model3.predict(x_test)
y_pred4 = model4.predict(x_test)
y_pred5 = model5.predict(x_test)


In [30]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score,classification_report,confusion_matrix


In [34]:
## Checking Overfitting/ Underfitting
print(f1_score(y_t,y_train))
print(f1_score(y_pred1,y_test))

0.691412802337536
0.6979467371416954


In [35]:
print(f1_score(y_pred1,y_test))
print(f1_score(y_pred2,y_test))
print(f1_score(y_pred3,y_test))
print(f1_score(y_pred4,y_test))
print(f1_score(y_pred5,y_test))


0.6979467371416954
0.7126010500875073
0.7298024950161334
0.8509981783709432
0.922360140467995


In [36]:
# Checking COnfusion Matrix for all

print(confusion_matrix(y_pred4,y_test))
print(confusion_matrix(y_pred3,y_test))
print(confusion_matrix(y_pred2,y_test))
print(confusion_matrix(y_pred1,y_test))
print(confusion_matrix(y_pred5,y_test))

[[31287  7996]
 [   20 22891]]
[[31292 13132]
 [   15 17755]]
[[31299 13786]
 [    8 17101]]
[[30170 13721]
 [ 1137 17166]]
[[31200  4359]
 [  107 26528]]


In [37]:
print(classification_report(y_pred5,y_test))

              precision    recall  f1-score   support

           0       1.00      0.88      0.93     35559
           1       0.86      1.00      0.92     26635

    accuracy                           0.93     62194
   macro avg       0.93      0.94      0.93     62194
weighted avg       0.94      0.93      0.93     62194



In [50]:
# Cross-validations

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold,KFold

skfold=StratifiedKFold(n_splits=5)
scores=cross_val_score(model5,x_test,y_test,cv=skfold)
print(np.mean(scores))

0.9970415085336655


# Thank you

Final model will be LightGBMClassifier with 99.7% training Accuracy and 93% test accuracy