In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("failure.csv")

In [3]:
df.nunique()

date             304
device          1168
failure            2
attribute1    123878
attribute2       558
attribute3        47
attribute4       115
attribute5        60
attribute6     44838
attribute7        28
attribute8        28
attribute9        65
dtype: int64

In [4]:
df.columns

Index(['date', 'device', 'failure', 'attribute1', 'attribute2', 'attribute3',
       'attribute4', 'attribute5', 'attribute6', 'attribute7', 'attribute8',
       'attribute9'],
      dtype='object')

In [5]:
df=df.astype({'attribute2':"category", 'attribute3':"category",'attribute4':"category",
 'attribute5':"category",'attribute7':"category", 'attribute8':"category",
 'attribute9':"category"})

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 124494 entries, 0 to 124493
Data columns (total 12 columns):
 #   Column      Non-Null Count   Dtype   
---  ------      --------------   -----   
 0   date        124494 non-null  object  
 1   device      124494 non-null  object  
 2   failure     124494 non-null  int64   
 3   attribute1  124494 non-null  int64   
 4   attribute2  124494 non-null  category
 5   attribute3  124494 non-null  category
 6   attribute4  124494 non-null  category
 7   attribute5  124494 non-null  category
 8   attribute6  124494 non-null  int64   
 9   attribute7  124494 non-null  category
 10  attribute8  124494 non-null  category
 11  attribute9  124494 non-null  category
dtypes: category(7), int64(3), object(2)
memory usage: 5.7+ MB


In [7]:
df= df.drop(columns=["device", "date"])
df_dummied= pd.get_dummies(df)

In [8]:
from sklearn.decomposition import PCA

pca = PCA(n_components=100)
pca.fit(df_dummied)
PCA(n_components=1)
pca.explained_variance_ratio_.sum()

0.9999999999999987

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 124494 entries, 0 to 124493
Data columns (total 10 columns):
 #   Column      Non-Null Count   Dtype   
---  ------      --------------   -----   
 0   failure     124494 non-null  int64   
 1   attribute1  124494 non-null  int64   
 2   attribute2  124494 non-null  category
 3   attribute3  124494 non-null  category
 4   attribute4  124494 non-null  category
 5   attribute5  124494 non-null  category
 6   attribute6  124494 non-null  int64   
 7   attribute7  124494 non-null  category
 8   attribute8  124494 non-null  category
 9   attribute9  124494 non-null  category
dtypes: category(7), int64(3)
memory usage: 3.8 MB


In [10]:
df.columns

Index(['failure', 'attribute1', 'attribute2', 'attribute3', 'attribute4',
       'attribute5', 'attribute6', 'attribute7', 'attribute8', 'attribute9'],
      dtype='object')

In [11]:
from sklearn.preprocessing import normalize, scale, MinMaxScaler
df_scaled= df_dummied.copy()
dftoscale= df_scaled[["attribute1", "attribute6"]]
df_scaled[['attribute1','attribute6']]=MinMaxScaler().fit_transform(dftoscale)

In [12]:
df_scaled

Unnamed: 0,failure,attribute1,attribute6,attribute2_0,attribute2_8,attribute2_16,attribute2_24,attribute2_32,attribute2_40,attribute2_48,...,attribute9_1165,attribute9_1864,attribute9_2269,attribute9_2270,attribute9_2522,attribute9_2637,attribute9_2794,attribute9_7226,attribute9_10137,attribute9_18701
0,0,0.883224,0.591204,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0.251374,0.585017,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0.709821,0.344461,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0.326427,0.595191,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0.556935,0.454420,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
124489,0,0.074999,0.513234,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
124490,0,0.706793,0.482888,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
124491,0,0.077943,0.508453,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
124492,0,0.929602,0.520889,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
data= df_scaled

In [14]:
data["failure"].value_counts()

0    124388
1       106
Name: failure, dtype: int64

In [15]:
from sklearn.utils import resample
#create two different dataframe of majority and minority class 
data_majority = data[(data['failure']==0)] 
data_minority = data[(data['failure']==1)] 
# upsample minority class
data_minority_oversampled = resample(data_minority, 
                                 replace=True,    # sample with replacement
                                 n_samples= 31000, # to match majority class
                                 random_state=42)  # reproducible results
# Combine majority class with upsampled minority class
data_oversampled = pd.concat([data_minority_oversampled, data_majority])

In [16]:
dfinal=data_oversampled.sort_index().reset_index()

In [17]:
dfinal.to_csv("dfinal.csv")

In [18]:
y, X= dfinal["failure"], dfinal.drop(["failure"], axis=1)

In [19]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25,random_state=101)

In [20]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(class_weight={0:1,1:1000})

In [21]:
from sklearn.linear_model import RidgeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [22]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score

## BALANCED BAG

# ---------------------------------------------------------------------

In [23]:
R= RidgeClassifier()
GNB= GaussianNB()
BNB= BernoulliNB()
KNC= KNeighborsClassifier(weights="distance")
DTC= DecisionTreeClassifier()
RFC= RandomForestClassifier() 
SVC= SVC()
mlist= [R,KNC,DTC,RFC]

In [24]:
 def classify():
        for classifier in mlist:
            classifier.fit(X_train,y_train)
            y_pred= classifier.predict(X_test)
            print(accuracy_score(y_test, y_pred), precision_score(y_test, y_pred), recall_score(y_test, y_pred),
                  f1_score(y_test, y_pred))

In [25]:
classify()

0.952068370787963 0.9187294319645156 0.8322747893713545 0.8733677910772578
0.998455479187582 0.992282958199357 1.0 0.9961265332472563
0.9989188354313074 0.9945855356452237 1.0 0.9972854188210962
0.9995881277833553 0.9979304100375114 1.0 0.9989641331088955
