In [22]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from statsmodels.stats.outliers_influence import variance_inflation_factor

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold

from sklearn.svm import SVC
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, \
f1_score, roc_auc_score, roc_curve, precision_score, recall_score


In [2]:
hold_outdata = pd.read_csv("SCC Dataset.csv")

In [3]:
hold_outdata = hold_outdata.sample(frac=0.4)

In [8]:
hold_outdata.target.value_counts()
target = 'target'
labels = ['Claimed','Not Claimed']
features = [i for i in hold_outdata.columns.values if i not in [target,'id']]

In [13]:
cat=[]
non_cat = []
for feature in features:
    if 'cat' in feature:
        cat.append(feature)
    else:
        non_cat.append(feature)
print('\n\033[1mInference:\033[0m The Datset has {} numerical & {} categorical features.'.format(len(cat),len(non_cat)))


[1mInference:[0m The Datset has 13 numerical & 6 categorical features.


In [15]:
for cat_v in cat:
    hold_outdata.loc[hold_outdata[cat_v] == -1, cat_v] = hold_outdata[cat_v].mode()

In [16]:
for num in non_cat:
    hold_outdata.loc[hold_outdata[num] == -1, num] = hold_outdata[num].mean()

In [18]:
null_value_check = pd.DataFrame(hold_outdata.isnull().sum().sort_values(), columns=['Total Null Values'])
null_value_check['Percentage'] = round(null_value_check['Total Null Values']/hold_outdata.shape[0],3)*100

In [19]:
ecc = null_value_check[null_value_check['Percentage']!=0].index.values
dcc = [i for i in hold_outdata.columns if i not in ecc]

df3 = hold_outdata[dcc]
fcc = [i for i in cat if i not in ecc]

#One-Hot Binay Encoding
oh=True
dm=True
for i in fcc:
    #print(i)
    if df3[i].nunique()==2:
        if oh==True: print("\033[1m\nOne-Hot Encoding on features:\033[0m")
        print(i);oh=False
        df3[i]=pd.get_dummies(df3[i], drop_first=True, prefix=str(i))
    if (df3[i].nunique()>2 and df3[i].nunique()<17):
        if dm==True: print("\n\033[1mDummy Encoding on features:\033[0m")
        print(i);dm=False
        df3 = pd.concat([df3.drop([i], axis=1), pd.DataFrame(pd.get_dummies(df3[i], drop_first=True, prefix=str(i)))],axis=1)
        
df3.shape


[1mDummy Encoding on features:[0m
x_01_cat
[1m
One-Hot Encoding on features:[0m
x_02_cat
x_04_cat
x_05_cat
x_06_cat
x_07_cat
x_12_cat
x_13_cat
x_17_cat


(32000, 36)

In [21]:
df4 = df3.copy()

for i in [i for i in df4.columns]:
    if df4[i].nunique()>=12:
        Q1 = df4[i].quantile(0.25)
        Q3 = df4[i].quantile(0.75)
        IQR = Q3 - Q1
        df4 = df4[df4[i] <= (Q3+(1.5*IQR))]
        df4 = df4[df4[i] >= (Q1-(1.5*IQR))]
df4 = df4.reset_index(drop=True)
display(df4.head())
print('\n\033[1mInference:\033[0m Before removal of outliers, The dataset had {} samples.'.format(hold_outdata.shape[0]))
print('\033[1mInference:\033[0m After removal of outliers, The dataset now has {} samples.'.format(df4.shape[0]))

Unnamed: 0,id,target,x_02_cat,x_04_cat,x_06_cat,x_07_cat,x_08,x_09,x_10,x_11,...,x_05_cat_4,x_05_cat_5,x_05_cat_6,x_05_cat_7,x_05_cat_8,x_05_cat_9,x_05_cat_10,x_05_cat_11,x_05_cat_12,x_05_cat_13
0,151932,0,0,0,0,1,0.7,0.4,0.701338,0.2,...,0,0,1,0,0,0,0,0,0,0
1,24462,0,0,0,0,0,0.9,0.4,0.795692,0.5,...,0,0,0,0,0,0,0,0,0,0
2,580,0,0,0,1,0,0.1,0.2,0.548199,0.4,...,1,0,0,0,0,0,0,0,0,0
3,196335,0,0,0,1,0,0.1,0.3,0.548199,0.5,...,0,0,0,0,0,0,0,0,0,1
4,138989,0,0,0,1,0,0.8,0.5,1.059481,0.2,...,0,0,0,0,0,0,0,0,0,1



[1mInference:[0m Before removal of outliers, The dataset had 32000 samples.
[1mInference:[0m After removal of outliers, The dataset now has 27681 samples.


In [23]:
X = df4.drop([target,'id'],axis=1)
Y = df4[target]

In [35]:


print('\033[1mStandardardization on Training set'.center(100))
Train_X_std = scaler.transform(X)
Train_X_std = pd.DataFrame(Train_X_std, columns=X.columns)
display(Train_X_std.describe())

                               [1mStandardardization on Training set                               


Unnamed: 0,x_02_cat,x_04_cat,x_06_cat,x_07_cat,x_08,x_09,x_10,x_11,x_12_cat,x_13_cat,...,x_05_cat_4,x_05_cat_5,x_05_cat_6,x_05_cat_7,x_05_cat_8,x_05_cat_9,x_05_cat_10,x_05_cat_11,x_05_cat_12,x_05_cat_13
count,27681.0,27681.0,27681.0,27681.0,27681.0,27681.0,27681.0,27681.0,27681.0,27681.0,...,27681.0,27681.0,27681.0,27681.0,27681.0,27681.0,27681.0,27681.0,27681.0,27681.0
mean,0.250441,0.325256,0.280324,0.16892,-0.07925,-0.072776,-0.086586,0.002028,0.217013,0.152213,...,0.163918,0.173968,0.204197,0.169355,0.171991,0.17958,0.17892,0.189096,0.17897,0.147529
std,1.16843,1.09898,0.94781,1.233924,1.100717,1.059511,1.083422,1.08903,0.809911,1.353699,...,1.316251,1.341915,1.327252,1.257468,1.278851,1.337726,1.30138,1.321938,1.334603,1.328205
min,-0.460933,-0.628137,-1.042065,-0.28214,-2.343604,-1.356837,-3.063093,-1.872631,-1.89591,-0.172602,...,-0.206654,-0.200913,-0.239623,-0.259062,-0.243326,-0.209002,-0.233193,-0.228797,-0.210361,-0.181549
25%,-0.460933,-0.628137,-1.042065,-0.28214,-0.818386,-0.63657,-1.005537,-0.74958,0.527451,-0.172602,...,-0.206654,-0.200913,-0.239623,-0.259062,-0.243326,-0.209002,-0.233193,-0.228797,-0.210361,-0.181549
50%,-0.460933,-0.628137,0.959633,-0.28214,0.325527,-0.276437,-0.40626,-0.00088,0.527451,-0.172602,...,-0.206654,-0.200913,-0.239623,-0.259062,-0.243326,-0.209002,-0.233193,-0.228797,-0.210361,-0.181549
75%,2.169515,1.592008,0.959633,-0.28214,1.088135,0.443831,0.536219,1.122171,0.527451,-0.172602,...,-0.206654,-0.200913,-0.239623,-0.259062,-0.243326,-0.209002,-0.233193,-0.228797,-0.210361,-0.181549
max,2.169515,1.592008,0.959633,3.544335,1.088135,2.964766,3.171907,1.870871,0.527451,5.793688,...,4.838995,4.977277,4.173227,3.860081,4.109713,4.784653,4.288294,4.370683,4.753742,5.508155


In [25]:
   def loadPickle(path):
        with open(path, "rb") as f:
            return pickle.load(f)


In [39]:
rf = loadPickle("RandomForest.pkl")

In [34]:
scaler= loadPickle('scaler.pkl')

In [28]:
import pickle

In [40]:
pred=rf.predict(Train_X_std)

In [41]:
df4['pred'] = pred

In [42]:
print('Accuracy = {}%'.format(round(accuracy_score(Y.values, pred),3)*100))
print('F1 Score = {}%'.format(round(f1_score(Y.values, pred, average='weighted'),3)*100)) #
print('\n \033[1mConfusiton Matrix:\033[0m\n',confusion_matrix(Y.values, pred))
print('\n\033[1mClassification Report:\033[0m\n',classification_report(Y.values, pred))

Accuracy = 97.8%
F1 Score = 97.8%

 [1mConfusiton Matrix:[0m
 [[26387   381]
 [  238   675]]

[1mClassification Report:[0m
               precision    recall  f1-score   support

           0       0.99      0.99      0.99     26768
           1       0.64      0.74      0.69       913

    accuracy                           0.98     27681
   macro avg       0.82      0.86      0.84     27681
weighted avg       0.98      0.98      0.98     27681



In [43]:
hold_outdata.to_csv("testdata.csv")