In [None]:
import os
import wget
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.experimental import enable_iterative_imputer  # unused, but needed
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
import sklearn.metrics as metrics
from sklearn.metrics import roc_curve, auc, roc_auc_score
from imblearn.over_sampling import SMOTE


In [None]:
#download data
#each file is fetched only when it is not on disk yet: re-running the cell then
#neither hits the network again nor leaves renamed duplicate copies next to
#the file that is actually read below
training_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00421/aps_failure_training_set.csv'
training_file = 'aps_failure_training_set.csv'
if not os.path.exists(training_file):
    wget.download(training_url, training_file)

test_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00421/aps_failure_test_set.csv'
test_file = 'aps_failure_test_set.csv'
if not os.path.exists(test_file):
    wget.download(test_url, test_file)

#save as data frame
#the first 20 lines of each file are skipped and the next line is the header;
#the literal 'na' is treated as missing while parsing, so columns that contain
#it can be read as numbers instead of strings
train_aps_failure = pd.read_csv(training_file, skiprows = 20, header = 0, na_values = 'na')
test_aps_failure = pd.read_csv(test_file, skiprows = 20, header = 0, na_values = 'na')

In [None]:
#data imputation function
def impute_data(data, class_col):
    """Fill missing feature values and return the result with the label column.

    Every column except ``class_col`` is passed through a ``SimpleImputer``
    built with its default settings; the label column is carried over
    unchanged.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the label column and the feature columns.
    class_col : str
        Name of the label column, which is excluded from imputation.

    Returns
    -------
    pandas.DataFrame
        ``class_col`` first, followed by the imputed feature columns in the
        (sorted) order produced by ``Index.difference``. The row index of
        ``data`` is preserved.
    """
    feature_cols = data.columns.difference([class_col])

    imputer = SimpleImputer()
    imputed_values = imputer.fit_transform(data.loc[:, feature_cols])

    # Reuse the caller's index: a fresh 0..n-1 index would only line up with
    # the labels when ``data`` itself has a default RangeIndex.
    imputed_features = pd.DataFrame(imputed_values, columns = feature_cols, index = data.index)
    imputed_df = pd.concat([data[[class_col]], imputed_features], axis = 1)

    return imputed_df

#imputing train, test data
#both frames go through the same helper, each one imputed independently
train_aps_failure_imputed, test_aps_failure_imputed = (
    impute_data(raw_frame, 'class')
    for raw_frame in (train_aps_failure, test_aps_failure)
)


In [None]:
#changing class pos = 1, neg = 0
#the codes are derived from the raw (string) labels, so the cell can be re-run
#safely, and the column is cast to int: labels left in an object-dtype column
#are rejected by scikit-learn classifiers as an "unknown" label type.
#a label other than 'neg'/'pos' maps to NaN and makes astype(int) raise
#instead of slipping through silently.
class_codes = {'neg': 0, 'pos': 1}

train_aps_failure_imputed['class'] = train_aps_failure['class'].map(class_codes).astype(int)
test_aps_failure_imputed['class'] = test_aps_failure['class'].map(class_codes).astype(int)

#save class / variables
train_aps_class = train_aps_failure_imputed.loc[:,'class']
train_aps_variables = train_aps_failure_imputed.loc[:, train_aps_failure_imputed.columns.difference(['class'])]

test_aps_class = test_aps_failure_imputed.loc[:,'class']
test_aps_variables = test_aps_failure_imputed.loc[:, test_aps_failure_imputed.columns.difference(['class'])]


In [None]:
#correlation matrix of features
#pairwise correlations of the imputed training frame, drawn as a heatmap
#on a 12x12 figure (colour scale starts at -1, no cell annotations)
plt.figure(figsize = (12, 12))
heatmap_options = dict(vmin = -1, cmap = 'PiYG', annot = False)
plot_corr_mat = sns.heatmap(train_aps_failure_imputed.corr(), **heatmap_options);

In [None]:
#calculate CV (= std/mean) of each feature
#the class label is dropped first: it is not a feature, and if it ranked among
#the highest CVs the join below would fail on a duplicated 'class' column
train_aps_feature_frame = train_aps_failure_imputed.drop(columns = 'class')
train_aps_std = pd.DataFrame(train_aps_feature_frame.std())
train_aps_mean = pd.DataFrame(train_aps_feature_frame.mean())
train_aps_cv = train_aps_std / train_aps_mean
train_aps_cv = train_aps_cv.sort_values(by = 0, ascending = False)

#pick highest CV: keep floor(sqrt(number of features)) columns
num_aps = int(np.floor(np.sqrt(len(train_aps_feature_frame.columns))))
features_aps = train_aps_cv.index[:num_aps]
cv_highest_aps = pd.DataFrame(train_aps_class).join(train_aps_failure_imputed.loc[ : , features_aps])


In [None]:
#scatterplot of highest CV
#pairplot builds and returns its own figure-level grid, so no plt.figure() is
#opened beforehand (it would only leave a blank, unused figure behind); the
#image is saved through the returned grid so the right figure is written
plot_scatter_cv_aps = sns.pairplot(cv_highest_aps, hue = 'class', palette = 'husl')
plot_scatter_cv_aps.savefig('plot_scatter_cv_aps')

In [None]:
#boxplot of highest CV

def draw_boxplot(dataframe, name, figure = None) :
    """Draw one class-wise boxplot per column and save the figure.

    The first column of ``dataframe`` is skipped; every remaining column gets
    its own panel in a grid that is three panels wide, with "class" on the
    x axis and as hue. A single shared legend is placed at the left edge of
    the figure.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a "class" column; columns from position 1 on are plotted.
    name : str
        File name handed to ``Figure.savefig``.
    figure : matplotlib.figure.Figure, optional
        Figure to draw on. When omitted, the module-level ``fig`` is used, so
        existing two-argument calls behave as before.
    """
    target_fig = fig if figure is None else figure
    plot_columns = dataframe.columns[1:]
    num_plots = len(plot_columns)
    # add_subplot expects integer grid dimensions; np.ceil returns a float
    num_rows = int(np.ceil(num_plots / 3))

    handles, labels = [], []
    for i, column in enumerate(plot_columns, start = 1):
        ax = target_fig.add_subplot(num_rows, 3, i)
        # draw on this panel explicitly rather than on pyplot's current axes
        sns.boxplot(data = dataframe, x = "class", y = column,
                    hue = "class", palette = "husl", dodge = False, ax = ax)
        ax.set_xlabel('')
        ax.set_ylabel('')
        # title each panel with the column it actually shows
        ax.set_title(column, fontsize = 15)
        handles, labels = ax.get_legend_handles_labels()
        # seaborn may not attach a per-panel legend; only remove one that exists
        panel_legend = ax.get_legend()
        if panel_legend is not None:
            panel_legend.remove()

    # skip the shared legend when there is nothing to list (no panels drawn,
    # or no labelled artists on the last panel)
    if handles:
        target_fig.legend(title = "class", handles = handles, labels = labels,
                          bbox_to_anchor = [.015, .5], loc = 'center left')
    target_fig.savefig(name)

# The canvas is created under the module-level name `fig`, which
# draw_boxplot looks up when it adds its panels.
fig = plt.figure(figsize = (15, 15))
fig.subplots_adjust(wspace = .4, hspace = .4)
draw_boxplot(dataframe = cv_highest_aps, name = 'plot_box_cv_aps')


In [None]:
#determining class imbalance
num_neg = int((train_aps_class == 0).sum())
num_pos = int((train_aps_class == 1).sum())

#random forest without compensating for class imbalance
#random_state is fixed so that re-running the notebook rebuilds the same
#forest and every metric derived from it is reproducible
rfc = RandomForestClassifier(oob_score = True, random_state = 42)
rfc.fit(train_aps_variables, train_aps_class)
train_aps_predict = rfc.predict(train_aps_variables)


In [None]:
#Confusion matrix_train (from the hard 0/1 predictions)
train_aps_confusion = metrics.confusion_matrix(train_aps_class, train_aps_predict)

#ROC_train
#the curve is traced from the predicted probability of class 1; feeding the
#hard 0/1 predictions would collapse it to a single operating point
train_aps_proba = rfc.predict_proba(train_aps_variables)[:, list(rfc.classes_).index(1)]
train_aps_fpr, train_aps_tpr, train_aps_thresholds = roc_curve(train_aps_class, train_aps_proba)

#AUC_train
train_aps_auc = auc(train_aps_fpr, train_aps_tpr)

In [None]:
#plot ROC-AUC curve_train
#the area in the legend is computed from the very points being plotted, and the
#plot gets its own figure so it never lands on top of a figure left open earlier
train_aps_roc_auc = auc(train_aps_fpr, train_aps_tpr)
roc_fig, roc_ax = plt.subplots()
roc_ax.plot([0, 1], [0, 1],'y--')
roc_ax.plot(train_aps_fpr, train_aps_tpr, 'm', label = 'ROC curve (area = %0.3f)' % train_aps_roc_auc)
roc_ax.set_title('aps_failure_train ROC')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.legend(loc = 'lower right')
roc_fig.savefig('train_aps_roc_auc.png')

In [None]:
#misclassification (training set): 1 - accuracy
train_aps_misclass = 1 - rfc.score(train_aps_variables, train_aps_class)

#predict test data
test_aps_predict = rfc.predict(test_aps_variables)

#confusion matrix_test (from the hard 0/1 predictions)
test_aps_confusion = metrics.confusion_matrix(test_aps_class, test_aps_predict)

#ROC_test
#the curve is traced from the predicted probability of class 1; feeding the
#hard 0/1 predictions would collapse it to a single operating point
test_aps_proba = rfc.predict_proba(test_aps_variables)[:, list(rfc.classes_).index(1)]
test_aps_fpr, test_aps_tpr, test_aps_thresholds = roc_curve(test_aps_class, test_aps_proba)

#AUC_test
test_aps_auc = auc(test_aps_fpr, test_aps_tpr)

In [None]:
#plot ROC-AUC curve_test
#the area in the legend is computed from the very points being plotted, and the
#plot gets its own figure so it never lands on top of a figure left open earlier
test_aps_roc_auc = auc(test_aps_fpr, test_aps_tpr)
roc_fig, roc_ax = plt.subplots()
roc_ax.plot([0, 1], [0, 1],'y--')
roc_ax.plot(test_aps_fpr, test_aps_tpr, 'g', label = 'ROC curve (area = %0.3f)' % test_aps_roc_auc)
roc_ax.set_title('aps_failure_test ROC')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.legend(loc = 'lower right')
roc_fig.savefig('test_aps_roc_auc.png')

In [None]:
#misclassification_test: share of test rows the unweighted forest gets wrong
test_aps_accuracy = rfc.score(test_aps_variables, test_aps_class)
test_aps_misclass = 1 - test_aps_accuracy

#out-of-bag score of the unweighted forest (the fitted model's oob_score_ attribute)
oob_score_imbalanced = rfc.oob_score_

In [None]:
# random forest after compensating for class imbalance
# class_weight = 'balanced' reweights the classes; random_state is fixed so
# that re-running the notebook rebuilds the same forest
rfc_balanced = RandomForestClassifier(oob_score = True, class_weight = 'balanced', random_state = 42)
rfc_balanced.fit(train_aps_variables, train_aps_class)
train_aps_predict_balanced = rfc_balanced.predict(train_aps_variables)

In [None]:
#Confusion matrix_balanced_train (from the hard 0/1 predictions)
train_aps_confusion_balanced = metrics.confusion_matrix(train_aps_class, train_aps_predict_balanced)

#ROC_balanced_train
#the curve is traced from the predicted probability of class 1; feeding the
#hard 0/1 predictions would collapse it to a single operating point
train_aps_proba_balanced = rfc_balanced.predict_proba(train_aps_variables)[:, list(rfc_balanced.classes_).index(1)]
train_aps_fpr_balanced, train_aps_tpr_balanced, train_aps_thresholds_balanced = roc_curve(train_aps_class, train_aps_proba_balanced)

#AUC_balanced_train
train_aps_auc_balanced = auc(train_aps_fpr_balanced, train_aps_tpr_balanced)


In [None]:
#plot ROC-AUC curve_balanced_train
#the area in the legend is computed from the very points being plotted, and the
#plot gets its own figure so it never lands on top of a figure left open earlier
train_aps_roc_auc_balanced = auc(train_aps_fpr_balanced, train_aps_tpr_balanced)
roc_fig, roc_ax = plt.subplots()
roc_ax.plot([0, 1], [0, 1],'y--')
roc_ax.plot(train_aps_fpr_balanced, train_aps_tpr_balanced, 'm', label = 'ROC curve (area = %0.3f)' % train_aps_roc_auc_balanced)
roc_ax.set_title('aps_failure_train_balanced ROC')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.legend(loc = 'lower right')
roc_fig.savefig('train_aps_roc_auc_balanced.png')


In [None]:
#misclassification_balanced_train: 1 - accuracy
train_aps_misclass_balanced = 1 - rfc_balanced.score(train_aps_variables, train_aps_class)

#predict test data_balanced
test_aps_predict_balanced = rfc_balanced.predict(test_aps_variables)

#confusion matrix_balanced_test (from the hard 0/1 predictions)
test_aps_confusion_balanced = metrics.confusion_matrix(test_aps_class, test_aps_predict_balanced)

#ROC_balanced_test
#the curve is traced from the predicted probability of class 1; feeding the
#hard 0/1 predictions would collapse it to a single operating point
test_aps_proba_balanced = rfc_balanced.predict_proba(test_aps_variables)[:, list(rfc_balanced.classes_).index(1)]
test_aps_fpr_balanced, test_aps_tpr_balanced, test_aps_thresholds_balanced = roc_curve(test_aps_class, test_aps_proba_balanced)

#AUC_balanced_test
test_aps_auc_balanced = auc(test_aps_fpr_balanced, test_aps_tpr_balanced)

In [None]:
#plot ROC-AUC curve_balanced_test
#the area in the legend is computed from the very points being plotted, and the
#plot gets its own figure so it never lands on top of a figure left open earlier
test_aps_roc_auc_balanced = auc(test_aps_fpr_balanced, test_aps_tpr_balanced)
roc_fig, roc_ax = plt.subplots()
roc_ax.plot([0, 1], [0, 1],'y--')
roc_ax.plot(test_aps_fpr_balanced, test_aps_tpr_balanced, 'g', label = 'ROC curve (area = %0.3f)' % test_aps_roc_auc_balanced)
roc_ax.set_title('aps_failure_test_balanced ROC')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.legend(loc = 'lower right')
roc_fig.savefig('test_aps_roc_auc_balanced.png')

In [None]:
#misclassification_balanced_test: 1 - accuracy of the class-weighted forest
test_aps_misclass_balanced = 1 - rfc_balanced.score(test_aps_variables, test_aps_class)

#SMOTE
#random_state is fixed so the synthetic minority samples are the same on every run
smt = SMOTE(random_state = 42)
train_aps_smote_x, train_aps_smote_y = smt.fit_resample(train_aps_variables, train_aps_class)

