# importing libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scikitplot as skplt
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC
from sklearn.linear_model import Perceptron
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import roc_curve, auc
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.feature_selection import SelectKBest, chi2 ,f_classif
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [None]:
# Load the training CSV into `df` and preview it.
# NOTE(review): absolute, machine-specific Windows path — adjust before running elsewhere.
df=pd.read_csv(r'C:\Users\sanil\Desktop\aps\aps_failure_training_set_SMALLER.csv')
df.head(10)  # first ten rows

In [None]:
# Load the test CSV into `df_test` and count the rows per class label.
# NOTE(review): absolute, machine-specific Windows path — adjust before running elsewhere.
df_test=pd.read_csv(r'C:\Users\sanil\Desktop\aps\aps_failure_test_set.csv')
df_test.head(10)  # not the last expression of the cell, so a notebook does not display it
df_test['class'].value_counts()

In [None]:
# Feature matrices: every column except the 'class' label.
x_train, x_test = (frame.drop(columns='class') for frame in (df, df_test))
# Label vectors, selected exactly as before.
y_train = df.loc[:,'class']
y_test = df_test['class']

In [None]:
# Encode the string labels as integers: 'neg' -> 0, 'pos' -> 1.
# The labels are rebound to new Series instead of calling
# replace(..., inplace=True) on a column selected from df / df_test: the
# in-place form is chained assignment, whose effect on the parent frame depends
# on the pandas version. Rebinding leaves df / df_test untouched.
# astype(int) guarantees an integer dtype for the estimators and fails loudly
# if a label other than 'neg'/'pos' is present. Re-running the cell is harmless
# because values that are already 0/1 pass through unchanged.
label_codes = {'neg': 0, 'pos': 1}
y_train = y_train.replace(label_codes).astype(int)
y_test = y_test.replace(label_codes).astype(int)

In [None]:
x_train.head(10)  # preview the raw training features (label column already removed)

In [None]:
#drop columns with many na values
def drop_na_val(x_train,x_test,*args,threshold=1000):
    """Drop the columns that are heavily missing in both data sets.

    A column is removed from both frames when the literal string 'na'
    occurs more than ``threshold`` times in that column of ``x_train`` AND
    more than ``threshold`` times in the same column of ``x_test``. The same
    columns are removed from both frames, so their column sets stay aligned.

    Parameters
    ----------
    x_train, x_test : pandas.DataFrame
        Feature frames in which missing cells are marked by the string 'na'.
        Every column of ``x_train`` must also exist in ``x_test``.
    *args
        Ignored; kept for backward compatibility.
    threshold : int, keyword-only
        Count of 'na' cells that must be exceeded in both frames for a column
        to be dropped. Defaults to 1000, the previously hard-coded value.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(x_train, x_test)`` without the dropped columns. The inputs are not
        modified.
    """
    # Count the 'na' markers directly; a column without any marker simply
    # yields 0, so no separate membership test is required.
    sparse_columns = [
        column for column in x_train
        if (x_train[column] == 'na').sum() > threshold
        and (x_test[column] == 'na').sum() > threshold
    ]
    # Drop all selected columns in one call instead of once per column.
    return x_train.drop(columns=sparse_columns), x_test.drop(columns=sparse_columns)

In [None]:
# Remove the columns that drop_na_val selects from both feature frames.
x_train,x_test=drop_na_val(x_train,x_test)

In [None]:
x_train.head(10)  # training features after the column drop

In [None]:
x_test.head(10)  # test features after the column drop

In [None]:
def replace_na(x_train,*args):
    """Impute 'na' markers with the column mean and make every column numeric.

    For each column, cells equal to the string 'na' are replaced by the mean
    of that column's remaining values, and the column is converted with
    ``pd.to_numeric``. Cells that are already NaN are left as NaN and are
    ignored when the mean is computed.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Frame whose columns hold numbers or numeric strings, with the string
        'na' marking missing cells. Despite the parameter name, this file
        also calls the function on the test frame.
    *args
        Ignored; kept for backward compatibility.

    Returns
    -------
    pandas.DataFrame
        The same ``x_train`` object; it is modified in place and returned.

    Notes
    -----
    Prints how many columns contained at least one 'na' marker.
    """
    imputed_columns = 0
    for column in x_train:
        raw = x_train[column]
        missing = raw == 'na'
        # Blank out the markers first so the whole column parses as numbers.
        numeric = pd.to_numeric(raw.mask(missing))
        if missing.any():
            imputed_columns += 1
            numeric = numeric.mask(missing, numeric.mean())
        # Assign the finished column back to the frame. The previous
        # `x_train[column].replace(..., inplace=True)` was chained assignment,
        # which does not update the frame under pandas Copy-on-Write.
        x_train[column] = numeric
    # The counter is the number of imputed columns, not a "new number of
    # columns" as the old message claimed.
    print("number of columns with imputed 'na' values = {0}".format(imputed_columns))
    return x_train

In [None]:
# Impute 'na' cells and convert both feature frames to numeric dtypes.
# NOTE(review): the second call imputes the test frame with means computed from
# the test frame itself, not with the training means — confirm this is intended.
x_train=replace_na(x_train)
x_test=replace_na(x_test)

In [None]:
# Class counts of the training labels.
y_train.value_counts()

# Normalization and Standardization (Run only one of the two blocks)

In [None]:
#standardization
# Fit a StandardScaler on the training features and keep it in `scaler`
# so the same fitted object can be applied again later.
scaler = StandardScaler().fit(x_train)
X = pd.DataFrame(scaler.transform(x_train))
X.head(10)

In [None]:
#Normalization
# Fit a MinMaxScaler on the training features and keep it in `scaler`
# so the same fitted object can be applied again later.
scaler = MinMaxScaler().fit(x_train)
X = pd.DataFrame(scaler.transform(x_train))
X.head(10)

# Class imbalance histogram

In [None]:
# Bar chart of the class frequencies in `df`, most frequent class first.
class_frequencies = df['class'].value_counts()
counts = class_frequencies.tolist()
sns.set(style="darkgrid")
ax = sns.countplot(x="class", data=df, order=class_frequencies.index.tolist())
plt.show()

In [None]:
# Heat map of the pairwise correlations between the scaled training features.
corr = X.corr()
sns.heatmap(corr)

In [None]:
# Keep the 30 features with the highest ANOVA F-score (f_classif) against the labels.
# NOTE(review): the selection runs on the unscaled x_train, and in the visible
# code X_new_train is only used for a heat map and a print — no classifier
# below is trained on it.
X_new_train = SelectKBest(f_classif, k=30).fit_transform(x_train,y_train)

In [None]:
# Wrap the selected features in a DataFrame and plot their pairwise correlations.
X_new_train=pd.DataFrame(X_new_train)
corr = X_new_train.corr()
sns.heatmap(corr)

In [None]:
# Number of features kept by SelectKBest.
# Fixed: once X_new_train has been converted to a DataFrame, X_new_train[0] is
# its first COLUMN, so len(...) reported the number of rows. shape[1] is the
# column count for both a DataFrame and a NumPy array.
print(X_new_train.shape[1])

# Synthetic Minority Over-sampling Technique (SMOTE) to deal with imbalanced data

In [None]:
from imblearn.over_sampling import SMOTE
# Oversample the minority class until it matches the majority class.
# Fixed: `ratio` was deprecated and then removed from imbalanced-learn;
# `sampling_strategy` is its replacement, and 1.0 requests a
# minority/majority ratio of 1 after resampling.
smote = SMOTE(random_state=0, sampling_strategy=1.0)

In [None]:
#Fit SMOTE and BALANCE CLASSES
# Fixed: `fit_sample` was removed from imbalanced-learn; `fit_resample` is the
# supported name and returns the same (features, labels) pair.
smote_fit = smote.fit_resample(X, y_train)
# np.asarray makes the construction independent of whether the sampler hands
# back NumPy arrays or pandas objects (index and column names are rebuilt here).
X_smote = pd.DataFrame(np.asarray(smote_fit[0]))
Y_smote = pd.DataFrame({'class': np.asarray(smote_fit[1])})
# One frame with the label first, followed by the features under their original names.
aps_train_smote = pd.concat([Y_smote, X_smote], axis=1)
aps_train_smote.columns = ['class', *x_train.columns]

In [None]:
aps_train_smote.head()  # preview the resampled training frame (label column first)

In [None]:
# Show the class balance after SMOTE, as numbers and as a bar chart.
# Fixed: the original called the deprecated top-level pd.value_counts() and
# discarded its result, so the counts were never shown.
smote_class_counts = aps_train_smote['class'].value_counts()
print(smote_class_counts)
counts = smote_class_counts.tolist()
sns.set(style="darkgrid")
ax = sns.countplot(x="class", data=aps_train_smote, order=smote_class_counts.index.tolist())
plt.show()

# Preparing training labels and test data for classification (0 = 'neg', 1 = 'pos')

In [None]:
#Training data preparation
# Plain Python list of integer labels for the resampled training set.
y = [int(label) for label in Y_smote['class']]

In [None]:
#Normalizing test data
# Fixed: the scaler is no longer re-fitted on x_test. Re-fitting put the test
# features on a different scale from the training features the models learn
# from, and overwrote the training statistics held by `scaler`. The test set
# is now transformed with the parameters fitted on x_train.
X_test=pd.DataFrame(scaler.transform(x_test))
X_test.head(10)
Y_test=list(y_test)

In [None]:
#use pca data(optional)
# Fit a 30-component PCA on the resampled training features and project both
# the training and the test features into that space. The PCA is fitted on the
# training data only; the test data is only transformed.
pca=PCA(n_components=30)
x_train_pca=pca.fit_transform(X_smote)
x_test_pca=pca.transform(X_test)

# ROC and AUC 

In [None]:
def roc(actual,predictions,*args):
    """Plot the ROC curve of ``predictions`` against ``actual`` with its AUC.

    Parameters
    ----------
    actual : iterable
        True labels; every item is converted with ``int``.
    predictions : iterable
        Predicted labels; every item is converted with ``int``. Because hard
        class labels rather than scores are passed to ``roc_curve``, the curve
        consists of only a few points.
    *args
        Ignored; kept for backward compatibility.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    actual=list(map(int,actual))
    predictions=list(map(int,predictions))
    # Fixed: the original passed the misspelled name `predicions`, which raised
    # NameError on every call.
    # ROC plotting adapted from https://stackoverflow.com/questions/25009284/how-to-plot-roc-curve-in-python
    fpr, tpr, _ = roc_curve(actual, predictions)
    roc_auc =auc(fpr, tpr)
    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
    plt.legend(loc = 'lower right')
    # Diagonal reference line from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1],'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()

# Weighted error measure: Cost_1 = 10 per false positive, Cost_2 = 500 per false negative

In [None]:
def weighted_error(y_true,y_pred,cost_fp=10,cost_fn=500):
    """Print the cost-weighted error and the F1 scores of a binary prediction.

    The weighted error is ``cost_fp * false_positives + cost_fn *
    false_negatives``, where the positive class is label 1 and the negative
    class is label 0.

    Parameters
    ----------
    y_true, y_pred : array-like of int
        True and predicted labels, encoded as 0 (negative) / 1 (positive).
    cost_fp : number, optional
        Cost of one false positive. Defaults to 10, the previously
        hard-coded value.
    cost_fn : number, optional
        Cost of one false negative. Defaults to 500, the previously
        hard-coded value.

    Returns
    -------
    None
        The results are printed.
    """
    # labels=[0, 1] pins the matrix to 2x2 with a fixed row/column order.
    # Without it, inputs that contain a single class yield a 1x1 matrix and the
    # indexing below raises IndexError.
    matrix = confusion_matrix(y_true, y_pred, labels=[0, 1])
    false_positives = matrix[0][1]   # true 0, predicted 1
    false_negatives = matrix[1][0]   # true 1, predicted 0
    error = cost_fp*false_positives + cost_fn*false_negatives
    print("weighted error: {0}".format(error))
    print(" Total F1 score: {0}".format(f1_score(y_true,y_pred,average='micro')))
    print(" Positive class F1 score: {0}".format(f1_score(y_true,y_pred)))

# Confusion matrix

In [None]:
def cnm(y_true,pred):
    """Draw the confusion matrix of ``pred`` against ``y_true`` as an annotated heat map."""
    sns.heatmap(confusion_matrix(y_true, pred), annot=True, fmt="d")

# SVM Hyperparameter Optimization using Stratified K fold Cross validation

In [None]:
#SVM with PCA
# 3-fold grid search over C and gamma for an SVC on the PCA-projected training
# data, scored by micro-averaged F1. np.logspace(-3,3,3) yields the gamma
# candidates 0.001, 1 and 1000.
param_grid ={'C':[1,3,5],'gamma':np.logspace(-3,3,3)}
svm=SVC()
grid = GridSearchCV(svm, param_grid, cv=3,scoring='f1_micro',verbose=30)
grid.fit(x_train_pca,y)
# Fixed: "Done" used to be printed before the search had even started.
print("Done")

In [None]:
# NumPy copies of the resampled features and labels; prints the label array's shape.
# NOTE(review): temp_x / temp_y are not referenced by any later cell in view.
temp_x=np.array(X_smote)
temp_y=np.array(Y_smote)
print(temp_y.shape)

In [None]:
#SVM PCA
# Train an SVC (C=5, gamma=1) in PCA space and report its fit on the training data.
svm = SVC(C=5, gamma=1).fit(x_train_pca, y)
pred_train_svm = svm.predict(x_train_pca)
acc = svm.score(x_train_pca, y)
print("accuracy of training set SVM: {0}".format(acc))
roc(y, pred_train_svm)

weighted_error(y, pred_train_svm)

In [None]:
cnm(y,pred_train_svm)  # confusion-matrix heat map of the SVM's training predictions

In [None]:
#SVM TEST PCA
# Evaluate the PCA-space SVM on the test set.
# Fixed: the accuracy was scored twice, and the second report labelled the
# test accuracy as "training set" accuracy. It is now scored and reported once.
acc_test=svm.score(x_test_pca,Y_test)
acc=acc_test  # kept because `acc` was also assigned by the original cell
pred_test_svm=svm.predict(x_test_pca)
print("accuracy of test set SVM: {0}".format(acc_test))
roc(Y_test,pred_test_svm)
cnm(Y_test,pred_test_svm)
weighted_error(Y_test,pred_test_svm)

In [None]:
#SVM W/O PCA
# Train an SVC (C=5, gamma=1) on the full resampled feature set and report its training fit.
svm = SVC(C=5, gamma=1).fit(X_smote, y)
pred_train_svm = svm.predict(X_smote)
acc = svm.score(X_smote, y)
print("accuracy of training set SVM: {0}".format(acc))
roc(y, pred_train_svm)

weighted_error(y, pred_train_svm)

In [None]:
#SVM TEST W/O PCA
# Evaluate the full-feature SVM on the test set.
# Fixed: the accuracy was scored twice, and the second report labelled the
# test accuracy as "training set" accuracy. It is now scored and reported once.
acc_test=svm.score(X_test,Y_test)
acc=acc_test  # kept because `acc` was also assigned by the original cell
pred_test_svm=svm.predict(X_test)
print("accuracy of test set SVM: {0}".format(acc_test))
roc(Y_test,pred_test_svm)
cnm(Y_test,pred_test_svm)
weighted_error(Y_test,pred_test_svm)

# Gaussian NB classifier

In [None]:
# Gaussian naive Bayes on the full resampled feature set: training-set report.
gnb = GaussianNB().fit(X_smote, y)
predictions = gnb.predict(X_smote)
acc = gnb.score(X_smote, y)
roc(y, predictions)
print("accuracy of Gaussian NB on train set: {0} %".format(acc*100))

cnm_gnb_train = confusion_matrix(y, predictions)
sns.heatmap(cnm_gnb_train, annot=True, fmt="d")
weighted_error(y, predictions)

In [None]:
# Test-set report for the Gaussian NB model fitted on the full feature set:
# accuracy, ROC plot, confusion-matrix heat map and weighted error.
gnb_test_acc=gnb.score(X_test,Y_test)
gnb_pred_test=gnb.predict(X_test)
print("accuracy of Gaussian NB on test set: {0} %".format(gnb_test_acc*100))
roc(Y_test,gnb_pred_test)
cnm_test=confusion_matrix(Y_test,gnb_pred_test) 
sns.heatmap(cnm_test,annot=True,fmt="d")
weighted_error(Y_test,gnb_pred_test)    

In [None]:

#PCA
# Gaussian naive Bayes in PCA space: training-set report.
gnb = GaussianNB().fit(x_train_pca, y)
predictions = gnb.predict(x_train_pca)
acc = gnb.score(x_train_pca, y)
print("accuracy of Gaussian NB on train set with pca: {0} %".format(acc*100))
roc(y, predictions)

cnm_pca_gnb_train = confusion_matrix(y, predictions)
sns.heatmap(cnm_pca_gnb_train, annot=True, fmt="d")

weighted_error(y, predictions)

In [None]:
# Test-set report for the Gaussian NB model fitted in PCA space.
# Overwrites `acc` and `predictions` from the training-set cell.
acc=gnb.score(x_test_pca,Y_test)
predictions=gnb.predict(x_test_pca)
print("accuracy of Gaussian NB on test set with pca: {0} %".format(acc*100))

roc(Y_test,predictions)

cnm_pca_gnb_test=confusion_matrix(Y_test,predictions) 
sns.heatmap(cnm_pca_gnb_test,annot=True,fmt="d")

weighted_error(Y_test,predictions)

# KNN Classifier


In [None]:
#cross validation
# Pick the number of neighbours by mean 5-fold cross-validated accuracy on the
# PCA-projected training data.
k=[1,2,3,4,5]
acc=[]

for i in k:
    # Fixed: pass the current candidate `i`; the original passed the whole list `k`.
    knn=KNeighborsClassifier(n_neighbors=i)
    scores = cross_val_score(knn, x_train_pca,y, cv=5, scoring='accuracy')
    acc.append(scores.mean())
# Fixed: the best score's position must be looked up in `acc`. The original
# searched `k` for an accuracy value, which raises ValueError.
print("best k:{0}".format(k[acc.index(max(acc))]))
print("accuracy:{0}".format(max(acc)))

In [None]:

# Manual stratified k-fold evaluation of KNN on the PCA-projected training
# data: prints the per-fold accuracies for every candidate k and collects the
# mean accuracy per k in `acc`.
k=[1,2,3,4,5]
acc=[]

# Fixed: `skf` was used without ever being created in the visible code.
# NOTE(review): 5 splits is an assumption, chosen to match cv=5 in the
# cross_val_score cell — adjust if a different fold count was intended.
skf=StratifiedKFold(n_splits=5)
y1=np.array(y)
for i in k:
    # Fixed: pass the current candidate `i`, not the whole list `k`.
    knn=KNeighborsClassifier(n_neighbors=i)
    # Fixed: a list is used because NumPy arrays have no .append method.
    temp=[]
    for train_index, test_index in skf.split(x_train_pca, y1):
        X_train_skf, X_test_skf = x_train_pca[train_index], x_train_pca[test_index]
        y_train_skf, y_test_skf = y1[train_index], y1[test_index]
        knn.fit(X_train_skf,y_train_skf)
        temp.append(knn.score(X_test_skf,y_test_skf))
    print("for k={0} accuracies are {1}".format(i,temp))
    # Fixed: np.append() with a single argument raises TypeError; the fold mean
    # is computed directly and recorded in `acc`.
    mean_acc=np.average(temp)
    acc.append(mean_acc)
    

In [None]:
#finding best k for PCA
# Fit KNN for k = 1..10 on the PCA-projected training data and report the
# test-set accuracy of each.
# NOTE(review): k is selected by accuracy on the test set, so the reported
# test accuracy of the chosen k is optimistically biased — confirm intent.
k=[1,2,3,4,5,6,7,8,9,10]
acc=[]

for i in k:
    knn=KNeighborsClassifier(n_neighbors=i)
    knn.fit(x_train_pca,y)
    # Score once and reuse the value; the original scored the whole test set
    # twice per k (once for the list, once for the message).
    test_accuracy=knn.score(x_test_pca,Y_test)
    acc.append(test_accuracy)
    print("for k={0} accuracy is {1} %".format(i,test_accuracy*100))
print("best k: {0}".format(k[acc.index(max(acc))]))

In [None]:
#TRAIN SET PCA ANALYSIS
# 2-nearest-neighbour classifier in PCA space: training-set report.
knn = KNeighborsClassifier(n_neighbors=2).fit(x_train_pca, y)
acc = knn.score(x_train_pca, y)
print("accuracy of knn on the train set with smote: {0} %".format(acc*100))
knn_pred = knn.predict(x_train_pca)
roc(y, knn_pred)

cnm_knn_train = confusion_matrix(y, knn_pred)
sns.heatmap(cnm_knn_train, annot=True, fmt="d")
print(cnm_knn_train)

weighted_error(y, knn_pred)

In [None]:
#KNN PCA TEST SET
# Test-set report for the PCA-space KNN model: ROC plot, confusion matrix
# (heat map and printed), weighted error and accuracy.
knn_pred_test=knn.predict(x_test_pca)
roc(Y_test,knn_pred_test)

cnm_test_knn=confusion_matrix(Y_test,knn_pred_test) 
sns.heatmap(cnm_test_knn,annot=True,fmt="d")
print(cnm_test_knn)

weighted_error(Y_test,knn_pred_test)
print("accuracy of knn on the test set with PCA: {0} %".format(knn.score(x_test_pca,Y_test)*100))

In [None]:
#KNN WITHOUT PCA
# Refit the 2-nearest-neighbour classifier on the full resampled feature set
# (replacing the PCA-space model held in `knn`) and print its training accuracy.
knn=KNeighborsClassifier(n_neighbors=2)
knn.fit(X_smote,y)
acc=knn.score(X_smote,y)
print("accuracy of knn on the train set with smote: {0} %".format(acc*100))


In [None]:
#TRAIN ANALYSIS WITHOUT PCA
# Training-set diagnostics for the full-feature KNN model.
knn_pred = knn.predict(X_smote)
roc(y, knn_pred)

cnm_knn_train = confusion_matrix(y, knn_pred)
sns.heatmap(data=cnm_knn_train, annot=True, fmt="d")
print(cnm_knn_train)

weighted_error(y, knn_pred)

In [None]:
#KNN TEST WITHOUT PCA
# Test-set report for the full-feature KNN model: ROC plot, confusion matrix
# (heat map and printed), accuracy and weighted error.
knn_pred_test=knn.predict(X_test)
roc(Y_test,knn_pred_test)

cnm_test_knn=confusion_matrix(Y_test,knn_pred_test) 
sns.heatmap(cnm_test_knn,annot=True,fmt="d")
print(cnm_test_knn)
print("accuracy: {0}".format(knn.score(X_test,Y_test)))
weighted_error(Y_test,knn_pred_test)

In [None]:
# Positive-class F1 of the KNN test predictions, after casting them to plain ints.
temp = [int(label) for label in knn_pred_test]
print("F1 score for the test set is: {0}".format(f1_score(Y_test, temp)))

# Random forest classifier

In [None]:
# 5-fold grid search over the number of trees and the tree depth of a random
# forest on the PCA-projected training data, scored by micro-averaged F1.
param_grid ={'n_estimators':[20,40,60,80,100],'max_depth':[1,2,3,4,5]}
rf=RandomForestClassifier()
grid = GridSearchCV(rf, param_grid, cv=5,scoring='f1_micro',verbose=10)
grid.fit(x_train_pca, y)
# Fixed: "Done" used to be printed before the search had even started.
print("Done")

In [None]:
grid.best_params_  # best parameter combination found by the most recently fitted grid search

In [None]:
# Random forest (40 trees, depth 5, fixed seed) on the full resampled feature set.
rf = RandomForestClassifier(n_estimators=40, max_depth=5, random_state=0).fit(X_smote, y)
acc_rf_train = rf.score(X_smote, y)
print("accuracy of Random Forest on the train set with smote: {0} %".format(acc_rf_train*100))

In [None]:
# Training-set diagnostics for the full-feature random forest: ROC plot and
# confusion matrix (heat map and printed).
rf_pred_train=rf.predict(X_smote)
roc(y,rf_pred_train)

cnm_train_rf=confusion_matrix(y,rf_pred_train) 
sns.heatmap(cnm_train_rf,annot=True,fmt="d")
print(cnm_train_rf)

In [None]:
# Test-set report for the full-feature random forest.
pred_test_rf = rf.predict(X_test)
roc(Y_test, pred_test_rf)

cnm_test_rf = confusion_matrix(Y_test, pred_test_rf)
sns.heatmap(data=cnm_test_rf, annot=True, fmt="d")
rf_test_accuracy_pct = rf.score(X_test, Y_test)*100
print("accuracy of Random Forest test set : {0} %".format(rf_test_accuracy_pct))
weighted_error(Y_test, pred_test_rf)
print(cnm_test_rf)

In [None]:
#WITH PCA
# Refit the random forest (same hyper-parameters) on the PCA-projected training data.
# NOTE(review): the message below is identical to the one printed for the
# non-PCA model, so the two outputs cannot be told apart.
rf=RandomForestClassifier(n_estimators=40, max_depth=5,random_state=0)
rf.fit(x_train_pca,y)
acc_rf_train=rf.score(x_train_pca,y)
print("accuracy of Random Forest on the train set with smote: {0} %".format(acc_rf_train*100))

In [None]:
#PCA
# Training-set ROC plot and confusion-matrix heat map for the PCA-space random forest.
rf_pred_train=rf.predict(x_train_pca)
roc(y,rf_pred_train)
cnm(y,rf_pred_train)

In [None]:
# Test-set report for the PCA-space random forest.
pred_test_rf = rf.predict(x_test_pca)
roc(Y_test, pred_test_rf)
rf_pca_test_accuracy_pct = rf.score(x_test_pca, Y_test)*100
print("accuracy of Random Forest on the test set pca: {0} %".format(rf_pca_test_accuracy_pct))
cnm_test_rf = confusion_matrix(Y_test, pred_test_rf)
sns.heatmap(data=cnm_test_rf, annot=True, fmt="d")
weighted_error(Y_test, pred_test_rf)

# Perceptron

In [None]:
# 3-fold grid search over alpha and eta0 for a Perceptron on the PCA-projected
# training data, scored by micro-averaged F1.
param_grid ={'alpha':[0.0001,0.001,0.01,0.1,1],'eta0':[0.001,0.01,0.1,1,10]}
clf=Perceptron(shuffle=True)
grid = GridSearchCV(clf, param_grid, cv=3,scoring='f1_micro',verbose=30)
grid.fit(x_train_pca,y)
# Fixed: "Done" used to be printed before the search had even started.
print("Done")

In [None]:
grid.best_params_  # best parameter combination found by the most recently fitted grid search

In [None]:
#PERCEPTRON WITH PCA
# Perceptron (alpha=0.0001, eta0=0.001) in PCA space: training-set report.
clf = Perceptron(alpha=0.0001, eta0=0.001, shuffle=True).fit(x_train_pca, y)
acc = clf.score(x_train_pca, y)
print("accuracy of perceptron on the train set with pca: {0} %".format(acc*100))
pred = clf.predict(x_train_pca)
for report in (roc, cnm, weighted_error):
    report(y, pred)

In [None]:
#Test PCA
# Test-set report for the PCA-space Perceptron: accuracy, ROC plot,
# confusion-matrix heat map and weighted error.
pred_test=clf.predict(x_test_pca)
acc=clf.score(x_test_pca,Y_test)
print("accuracy of perceptron on the test set with pca: {0} %".format(acc*100))
roc(Y_test,pred_test)
cnm(Y_test,pred_test)
weighted_error(Y_test,pred_test)                      

In [None]:
#PERCEPTRON WITHOUT PCA
# Perceptron (alpha=0.0001, eta0=0.001) on the full resampled feature set: training-set report.
clf = Perceptron(alpha=0.0001, eta0=0.001, shuffle=True).fit(X_smote, y)
acc = clf.score(X_smote, y)
print("accuracy of perceptron on the train set without pca: {0} %".format(acc*100))
pred = clf.predict(X_smote)
for report in (roc, cnm, weighted_error):
    report(y, pred)

In [None]:
#Test WITHOUT PCA
# Test-set report for the full-feature Perceptron: accuracy, ROC plot,
# confusion-matrix heat map and weighted error.
pred_test=clf.predict(X_test)
acc=clf.score(X_test,Y_test)
print("accuracy of perceptron on the test set without pca: {0} %".format(acc*100))
roc(Y_test,pred_test)
cnm(Y_test,pred_test)
weighted_error(Y_test,pred_test)  