In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
import matplotlib
matplotlib.use("TkAgg")
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the training set from a comma-separated file whose first row is the
# header; the literal string "na" in any cell is parsed as a missing value.
df_train_data = pd.read_csv(
    'ida_2016_training_set_update.csv',
    sep=',',
    header=0,
    na_values="na",
)

In [3]:
# --- Quick look at the raw training frame ---------------------------------
# total_num_data stays a notebook-level name: the missing-value percentages
# computed further down in this cell divide by it.
total_num_data, num_attributes = df_train_data.shape
print("Number of attributes = ", num_attributes)
print("Number of data = ", total_num_data)
print("*******************")

# All column names
print("----- Column Headers -----")
print(df_train_data.columns.values)

# First five rows
print("----- Display top rows -----")
print(df_train_data.head(5))

# Summary statistics over every column, including the non-numeric label
print("----- Data-Set Statistics -----")
print(df_train_data.describe(include="all"))

# Class balance of the training labels
class_labels = df_train_data['class']
print("Number of positive classes = ", (class_labels == 'pos').sum())
print("Number of negative classes = ", (class_labels == 'neg').sum())
print("*******************")

# Encode the class labels as integers (neg = 0, pos = 1).
# The column is reassigned rather than calling
# df_train_data['class'].replace(..., inplace=True): an in-place method on a
# column selection is chained assignment, which pandas warns about and which
# leaves the frame untouched when copy-on-write is active.
class_codes = {'neg': 0, 'pos': 1}
# Values absent from class_codes pass through unchanged, so re-running this
# cell on already-encoded labels is harmless; astype(int) then raises on
# anything that is not integer-like instead of letting it reach the models.
df_train_data['class'] = (
    df_train_data['class']
    .map(lambda label: class_codes.get(label, label))
    .astype(int)
)


# Per-column missing-value count and fraction, most-missing columns first.
missing_percent_threshold = 0.70
nulls_per_column = df_train_data.isna().sum().sort_values(ascending=False)
missing_data_count = nulls_per_column.to_frame(name='Number')
missing_data_percent = (nulls_per_column / total_num_data).to_frame(name='Percent')
missing_data = missing_data_count.join(missing_data_percent)

# Columns whose missing fraction exceeds the threshold
exceeds_threshold = missing_data['Percent'] > missing_percent_threshold
missing_column_headers = missing_data.loc[exceeds_threshold].index
print(missing_column_headers)

# Remove those columns from the training frame and report what is left
df_train_data = df_train_data.drop(columns=missing_column_headers)
print("Training data-set shape after dropping features is ", df_train_data.shape)
print(df_train_data.describe())

# Separate the target column from the feature columns.
y_train = df_train_data['class']
x_train = df_train_data.drop(columns=['class'])

# Heatmap of the pairwise correlations between the remaining features;
# the colour scale is capped at 0.8.
corrmat = x_train.corr()
sns.heatmap(data=corrmat, square=True, vmax=0.8)
plt.show()

# Replace each remaining missing value with its column median.
# From here on x_train is a plain NumPy array, not a DataFrame.
imputer_median = SimpleImputer(strategy='median')
x_train = imputer_median.fit_transform(x_train.values)

# Standardize the imputed features with StandardScaler.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)


# Synthetic Minority Oversampling Technique to balance the training data-set.
# fit_resample is the supported entry point; the older fit_sample alias was
# removed from imbalanced-learn. The sampler is seeded so the synthetic rows
# are the same on every Restart & Run All.
# NOTE(review): oversampling happens before the train/test split performed in
# a later cell, so synthetic points interpolated from rows that end up in the
# test split can land in the training split (and vice versa). Consider
# splitting first and resampling only the training part.
sm = SMOTE(random_state=42)
x_train, y_train = sm.fit_resample(x_train, y_train)

# Principal Component Analysis: a fractional n_components asks scikit-learn
# to keep as many components as needed to explain that share (here 75%) of
# the variance.
pca = PCA(n_components=0.75).fit(x_train)
x_train = pca.transform(x_train)

# Correlation heatmap of the projected components, colour scale capped at 0.8.
corrmat_pca = pd.DataFrame(x_train).corr()
sns.heatmap(data=corrmat_pca, square=True, vmax=0.8)
plt.show()

Number of attributes =  171
Number of data =  60000
*******************
----- Column Headers -----
['class' 'aa_000' 'ab_000' 'ac_000' 'ad_000' 'ae_000' 'af_000' 'ag_000'
 'ag_001' 'ag_002' 'ag_003' 'ag_004' 'ag_005' 'ag_006' 'ag_007' 'ag_008'
 'ag_009' 'ah_000' 'ai_000' 'aj_000' 'ak_000' 'al_000' 'am_0' 'an_000'
 'ao_000' 'ap_000' 'aq_000' 'ar_000' 'as_000' 'at_000' 'au_000' 'av_000'
 'ax_000' 'ay_000' 'ay_001' 'ay_002' 'ay_003' 'ay_004' 'ay_005' 'ay_006'
 'ay_007' 'ay_008' 'ay_009' 'az_000' 'az_001' 'az_002' 'az_003' 'az_004'
 'az_005' 'az_006' 'az_007' 'az_008' 'az_009' 'ba_000' 'ba_001' 'ba_002'
 'ba_003' 'ba_004' 'ba_005' 'ba_006' 'ba_007' 'ba_008' 'ba_009' 'bb_000'
 'bc_000' 'bd_000' 'be_000' 'bf_000' 'bg_000' 'bh_000' 'bi_000' 'bj_000'
 'bk_000' 'bl_000' 'bm_000' 'bn_000' 'bo_000' 'bp_000' 'bq_000' 'br_000'
 'bs_000' 'bt_000' 'bu_000' 'bv_000' 'bx_000' 'by_000' 'bz_000' 'ca_000'
 'cb_000' 'cc_000' 'cd_000' 'ce_000' 'cf_000' 'cg_000' 'ch_000' 'ci_000'
 'cj_000' 'ck_000' 'cl_000' 

In [12]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the preprocessed rows for evaluation; the fixed seed keeps
# the partition identical between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    x_train,
    y_train,
    test_size=0.33,
    random_state=42,
)

In [13]:
from sklearn.svm import SVC

# Support-vector classifier with a linear kernel, trained on the training
# split. fit() returns the estimator itself, so the last line still echoes
# the fitted model as the cell's output.
svclassifier = SVC(kernel='linear')
fitted_linear_svc = svclassifier.fit(X_train, Y_train)
fitted_linear_svc

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [14]:
# Predict labels for the held-out split with the currently fitted SVC.
y_pred = svclassifier.predict(X_test)

In [15]:
from sklearn.metrics import classification_report, confusion_matrix

# Print the confusion matrix first, then the per-class
# precision / recall / F1 report, for the current predictions.
for summarize in (confusion_matrix, classification_report):
    print(summarize(Y_test, y_pred))

[[18928   672]
 [ 1571 17769]]
              precision    recall  f1-score   support

           0       0.92      0.97      0.94     19600
           1       0.96      0.92      0.94     19340

    accuracy                           0.94     38940
   macro avg       0.94      0.94      0.94     38940
weighted avg       0.94      0.94      0.94     38940



In [16]:
from sklearn.metrics import classification_report, confusion_matrix

# Degree-3 polynomial-kernel SVC: fit on the training split, then predict the
# held-out split. svclassifier and y_pred are rebound to this model's values.
svclassifier = SVC(kernel='poly', degree=3).fit(X_train, Y_train)
y_pred = svclassifier.predict(X_test)

# Confusion matrix followed by the per-class report
poly_confusion = confusion_matrix(Y_test, y_pred)
poly_report = classification_report(Y_test, y_pred)
print(poly_confusion)
print(poly_report)


[[18302  1298]
 [  924 18416]]
              precision    recall  f1-score   support

           0       0.95      0.93      0.94     19600
           1       0.93      0.95      0.94     19340

    accuracy                           0.94     38940
   macro avg       0.94      0.94      0.94     38940
weighted avg       0.94      0.94      0.94     38940



In [17]:
from sklearn.metrics import classification_report, confusion_matrix

# RBF-kernel SVC: train and predict in one chain. svclassifier keeps pointing
# at the fitted estimator because fit() returns the estimator itself.
svclassifier = SVC(kernel='rbf')
y_pred = svclassifier.fit(X_train, Y_train).predict(X_test)

# Confusion matrix followed by the per-class report
for summarize in (confusion_matrix, classification_report):
    print(summarize(Y_test, y_pred))

[[18841   759]
 [ 1184 18156]]
              precision    recall  f1-score   support

           0       0.94      0.96      0.95     19600
           1       0.96      0.94      0.95     19340

    accuracy                           0.95     38940
   macro avg       0.95      0.95      0.95     38940
weighted avg       0.95      0.95      0.95     38940



In [18]:
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier

# plot_roc_curve was removed in scikit-learn 1.2, where a bare import raises
# ImportError and aborts this cell. Keep the name importable on every version
# by aliasing its drop-in replacement, so code that calls plot_roc_curve
# keeps working.
try:
    from sklearn.metrics import plot_roc_curve
except ImportError:
    from sklearn.metrics import RocCurveDisplay
    plot_roc_curve = RocCurveDisplay.from_estimator

# Random forest with 100 trees; seeded so the reported accuracy is the same
# on every run.
clf = RandomForestClassifier(n_estimators=100, random_state=42)

# Train on the training split
clf.fit(X_train, Y_train)

# Predict the held-out split
y_pred = clf.predict(X_test)

# Accuracy: fraction of held-out rows whose predicted label is correct
print("Accuracy:", metrics.accuracy_score(Y_test, y_pred))

Accuracy: 0.9871597329224447


In [None]:
# plot_roc_curve was removed in scikit-learn 1.2; RocCurveDisplay.from_estimator
# takes the same arguments. Fall back to the legacy helper on installations
# that predate from_estimator.
try:
    from sklearn.metrics import RocCurveDisplay
    roc_from_estimator = RocCurveDisplay.from_estimator
except (ImportError, AttributeError):
    from sklearn.metrics import plot_roc_curve as roc_from_estimator

# Figure 1: ROC curve of the SVC currently bound to svclassifier (the RBF
# model when the cells are run in order), evaluated on the held-out split.
svc_disp = roc_from_estimator(svclassifier, X_test, Y_test)
plt.show()

# Figure 2: random forest and SVC on shared axes. A new figure is created
# explicitly instead of calling plt.gca() after plt.show(), which only yields
# empty axes when the backend has already closed the first figure.
fig, ax = plt.subplots()
rfc_disp = roc_from_estimator(clf, X_test, Y_test, ax=ax, alpha=0.8)
# Re-draw the already computed SVC curve on the shared axes
svc_disp.plot(ax=ax, alpha=0.8)
plt.show()
