In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score

In [2]:
def show_frequency(frame,featrue):
    """Tabulate how often each distinct value occurs in one column.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table holding the column to summarise.
    featrue : label
        Name of the column in ``frame`` (spelling kept for existing callers).

    Returns
    -------
    pandas.DataFrame
        Indexed by the distinct values of the column, with a ``Frequency``
        column of occurrence counts and a ``%`` column giving each count as
        a percentage of ``len(frame)``, rounded to two decimals.
    """
    counts = frame[featrue].value_counts()
    # The denominator is the full row count of the frame, not the number of
    # counted values.
    share = (counts / len(frame) * 100).round(2)

    summary = pd.DataFrame()
    summary['Frequency'] = counts
    summary['%'] = share
    return summary

In [3]:
# The training rows are read from a CSV file and the rows to be predicted
# from an Excel workbook; both paths are relative to the working directory.
df_train = pd.read_csv('data.csv')
# df_train = df_train.sample(frac=1)   # optional shuffle, currently disabled
df_test = pd.read_excel('testdata_10%.xlsx')

# Both frames side by side in one list; the concat that would use it is
# currently disabled.
frames = [df_train, df_test]
# df = pd.concat(frames)


In [4]:
# '?' is treated as the missing-value marker: replace it with a real NaN so
# that isnull()/fillna() recognise those cells.
# np.nan is spelled in lower case because the np.NaN alias was removed in
# NumPy 2.0 and raises AttributeError there.
# inplace=True is kept so that the `frames` list built earlier keeps pointing
# at the cleaned frames.
qmark='?'
df_train.replace(qmark,np.nan,inplace=True)
df_test.replace(qmark,np.nan,inplace=True)
#df_train=df_train.sample(frac=1)

In [5]:
def null_values(DataFrame_Name):
    """Summarise the missing values of each column of a DataFrame.

    Parameters
    ----------
    DataFrame_Name : pandas.DataFrame
        Table to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of the input, sorted by ``Null_Count`` in
        descending order, with the columns ``Total_values`` (number of
        rows), ``Null_Count`` (number of missing cells) and ``Percent``
        (missing cells as a percentage of the rows).
    """
    missing_mask = DataFrame_Name.isnull()
    # The boolean mask itself never holds NaN, so count() gives the row
    # count for every column.
    rows_per_column = missing_mask.count()
    missing_per_column = missing_mask.sum()

    report = pd.DataFrame({
        'Total_values': rows_per_column,
        'Null_Count': missing_per_column,
        'Percent': missing_per_column / rows_per_column * 100,
    })
    return report.sort_values(by='Null_Count', ascending=False)

In [6]:
#--------------------------------Checking for NULL values ------------------------------------------------------

In [7]:
#null_values(df_train)

In [8]:
#null_values(df_test)

In [9]:
# ---------------------------- Checking column data types before filling null values ------------------------------

In [10]:
#df_train.dtypes

In [11]:
#df_test.dtypes

In [12]:
# A2 and A14 are cast to float in both frames. Each cell is first turned
# into text and that text is then parsed as a float.
for numeric_col in ('A2', 'A14'):
    df_train[numeric_col] = df_train[numeric_col].astype(str).astype(float)
    df_test[numeric_col] = df_test[numeric_col].astype(str).astype(float)

In [13]:
#-------------------------- filling Null values ----------------------------------------------------------

In [14]:
# Imputation values are learned from the training set only and reused for
# the test set, so that both frames are filled with the same values.
# Numeric columns: training mean.
A2_mean=df_train['A2'].mean()
A14_mean=df_train['A14'].mean()

# Categorical columns: most frequent training label. value_counts() sorts by
# descending frequency, so index[0] is the most common value.
A1_max_occurrence=df_train['A1'].value_counts().index[0]
A6_max_occurrence=df_train['A6'].value_counts().index[0]
A9_max_occurrence=df_train['A9'].value_counts().index[0]
A3_max_occurrence=df_train['A3'].value_counts().index[0]
A4_max_occurrence=df_train['A4'].value_counts().index[0]

fill_values = {
    'A2': A2_mean,
    'A14': A14_mean,
    'A1': A1_max_occurrence,
    'A6': A6_max_occurrence,
    'A9': A9_max_occurrence,
    'A3': A3_max_occurrence,
    'A4': A4_max_occurrence,
}

df_train=df_train.fillna(fill_values)
# The same table is applied to the test set. Columns without missing values
# are left untouched, so this only matters when a test column other than A1
# contains gaps.
df_test=df_test.fillna(fill_values)


In [15]:
#------------------------- preprocess 1 --------------------------------------------------------------------------

# print(show_frequency(df_test,'A3'))
#    Frequency      %
# u         11  78.57
# y          3  21.43

# print(show_frequency(df_train,'A3'))
#    Frequency      %
# u        420  76.09
# y        130  23.55
# l          2   0.36

# 'l' occurs only twice in the training A3 column (see the frequencies
# listed above), so it is folded into the majority label 'u'.
df_train['A3'] = df_train['A3'].replace({'l': 'u'})



In [16]:
#------------------------- preprocess 2 --------------------------------------------------------------------------

# print(show_frequency(df_test,'A4'))
#    Frequency      %
# g         11  78.57
# p          3  21.43

# print(show_frequency(df_train,'A4'))
#     Frequency      %
# g         420  76.09
# p         130  23.55
# gg          2   0.36


# 'gg' occurs only twice in the training A4 column (see the frequencies
# listed above), so it is folded into the majority label 'g'.
# The replacement has to read from A4 itself: reading from A3 would overwrite
# A4 with a copy of A3 and discard the real A4 values.
df_train['A4'] = df_train['A4'].replace('gg','g')

In [17]:
# ----------------------- Encode categorical data -----------------------------------------------------------

In [18]:
# Label-encode A16, A1, A8, A11 and A13 as integer codes (A3 and A4 are
# handled in a later cell).
# np.unique returns the labels in sorted order, so code 0 goes to the label
# that sorts first, code 1 to the next one, and so on. The original notes
# expect: A16 failure=0 / success=1, A1 a=0 / b=1, A8/A11/A13 false=0 / true=1
# -- this holds as long as the labels sort in that order.
#
# The code table of every feature is built once from the training data and
# applied to BOTH frames. Building a separate table from the test data gives
# a label a different code whenever the test set lacks one of the labels.

# Target column: only encoded in the training data.
A16_codes = {label: idx for idx, label in enumerate(np.unique(df_train['A16']))}
df_train['A16'] = df_train['A16'].map(A16_codes)

for binary_col in ['A1', 'A8', 'A11', 'A13']:
    codes = {label: idx for idx, label in enumerate(np.unique(df_train[binary_col]))}

    # A label never seen in training has no code; fail loudly instead of
    # letting map() turn it into NaN silently.
    unseen = set(df_test[binary_col].dropna().unique()) - set(codes)
    if unseen:
        raise ValueError(
            "column %s: test labels %s do not occur in the training data"
            % (binary_col, sorted(unseen, key=str))
        )

    df_train[binary_col] = df_train[binary_col].map(codes)
    df_test[binary_col] = df_test[binary_col].map(codes)

In [19]:
#df_test.head()

In [20]:
# A3 and A4 become one integer column each; the codes follow the sorted
# order of the labels found in the training data.
a3_labels = np.unique(df_train['A3'])
a4_labels = np.unique(df_train['A4'])
x_A3 = df_train['A3'].map(dict(zip(a3_labels, range(len(a3_labels)))))
x_A4 = df_train['A4'].map(dict(zip(a4_labels, range(len(a4_labels)))))

# A6, A9 and A15 are expanded into one indicator column per value.
x_A6 = pd.get_dummies(df_train[['A6']])
x_A9 = pd.get_dummies(df_train[['A9']])
x_A15 = pd.get_dummies(df_train[['A15']])

# Rebuild the training frame in the original column order, with the encoded
# pieces in place of the raw columns and the A16 target kept last.
df_train = pd.concat(
    [
        df_train[['A1', 'A2']],
        x_A3,
        x_A4,
        df_train['A5'],
        x_A6,
        df_train[['A7', 'A8']],
        x_A9,
        df_train[['A10', 'A11', 'A12', 'A13', 'A14']],
        x_A15,
        df_train['A16'],
    ],
    axis=1,
)
#df_train.head()

In [21]:
# for col in df_train.columns: 
#     print(col) 

In [22]:
# Same encoding steps as for the training frame, applied to the test frame.
# NOTE(review): the A3/A4 codes are derived from the labels present in the
# test data itself, so they only agree with the training codes when both
# frames contain the same set of labels -- confirm, or reuse the training
# code tables.
a3_labels = np.unique(df_test['A3'])
a4_labels = np.unique(df_test['A4'])
x_A3 = df_test['A3'].map(dict(zip(a3_labels, range(len(a3_labels)))))
x_A4 = df_test['A4'].map(dict(zip(a4_labels, range(len(a4_labels)))))

# Indicator columns are created only for values that occur in the test data.
x_A6 = pd.get_dummies(df_test[['A6']])
x_A9 = pd.get_dummies(df_test[['A9']])
x_A15 = pd.get_dummies(df_test[['A15']])

# Rebuild the test frame in the original column order (no A16 column here).
df_test = pd.concat(
    [
        df_test[['A1', 'A2']],
        x_A3,
        x_A4,
        df_test['A5'],
        x_A6,
        df_test[['A7', 'A8']],
        x_A9,
        df_test[['A10', 'A11', 'A12', 'A13', 'A14']],
        x_A15,
    ],
    axis=1,
)

#df_test.head()

In [23]:
# for col in df_train.columns: 
#     print(col,end=' ,') 
# print('-------------------')
# for col in df_test.columns: 
#     print(col,end=' ,') 

In [24]:
def fill_missing_columns(df_train,df_test):
    """Add to ``df_test`` every column of ``df_train`` that it lacks.

    Each missing column is inserted, filled with 0, at the position the
    column has in ``df_train``. ``df_test`` is modified in place and nothing
    is returned. Columns that exist only in ``df_test`` are left alone.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Frame whose columns define what must be present.
    df_test : pandas.DataFrame
        Frame that receives the missing columns.
    """
    # Snapshot taken before any insertion: membership is checked against the
    # columns df_test had on entry.
    already_present = list(df_test)

    for position, name in enumerate(list(df_train)):
        if name in already_present:
            continue
        df_test.insert(position, name, 0)
    
            
# Give df_test every column that df_train has; missing ones are added in
# place as all-zero columns.
fill_missing_columns(df_train,df_test)

In [25]:
# for col in df_train.columns: 
#     print(col,end=' ,') 
# print('-------------------')
# for col in df_test.columns: 
#     print(col,end=' ,') 

In [26]:
# Remove the A16 column from the test frame so that only feature columns
# remain.
df_test = df_test.drop(columns=['A16'])
 

In [27]:
# Every column except the last one is used as a feature.
feature_cols = df_train.columns[:-1]

# Separate the feature matrix from the A16 target.
X = df_train[feature_cols]
y = df_train['A16']

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

# X_train = df_train[feature_cols] # Features
# y_train = df_train.A16 # Target variable

In [28]:
#-------------- Feature scaling --------------------------------------------
# Random forests and decision trees do not need feature scaling; the scaled
# arrays are nevertheless what this notebook feeds to every model.
# The scaler is fitted on the training split only and then applied unchanged
# to the held-out split and to the test set.
stdsc = StandardScaler().fit(X_train)
X_train = stdsc.transform(X_train)
X_test = stdsc.transform(X_test)
df_test = stdsc.transform(df_test)


In [29]:
#-------------------apply different models -----------------------

In [30]:
# Decision tree classifier.
# random_state is fixed (same seed as the train/test split) because the tree
# chooses among candidate splits with a random component; without a seed the
# scores below change from run to run.
dt = DecisionTreeClassifier(random_state=1)
dt = dt.fit(X_train,y_train)

# Predict the held-out split and score the predictions.
y_pred = dt.predict(X_test)
dt_accuracy=metrics.accuracy_score(y_test, y_pred)
dt_tree_precision=metrics.precision_score(y_test, y_pred)
dt_tree_recall=metrics.recall_score(y_test, y_pred)
# Print the stored scores instead of computing each metric a second time.
print('Accuracy= ',dt_accuracy)
print('Precision= ',dt_tree_precision)
print('Recall= ',dt_tree_recall)

Accuracy=  0.8072289156626506
Precision=  0.7875
Recall=  0.8076923076923077


In [31]:
# Gaussian naive Bayes classifier, fitted on the training split.
gnb = GaussianNB()
gnb.fit(X_train, y_train)

# Score the predictions for the held-out split.
y_pred = gnb.predict(X_test)
gnb_accuracy = metrics.accuracy_score(y_test, y_pred)
gnb_precision = metrics.precision_score(y_test, y_pred)
gnb_recall = metrics.recall_score(y_test, y_pred)

for caption, score in (
    ('Accuracy= ', gnb_accuracy),
    ('Precision= ', gnb_precision),
    ('Recall= ', gnb_recall),
):
    print(caption, score)

Accuracy=  0.6807228915662651
Precision=  0.7777777777777778
Recall=  0.44871794871794873


In [37]:
# k-nearest-neighbours classifier with k = 3, fitted on the training split.
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)

# Score the predictions for the held-out split.
y_pred = knn.predict(X_test)
knn_accuracy = metrics.accuracy_score(y_test, y_pred)
knn_precision = metrics.precision_score(y_test, y_pred)
knn_recall = metrics.recall_score(y_test, y_pred)

for caption, score in (
    ('Accuracy= ', knn_accuracy),
    ('Precision= ', knn_precision),
    ('Recall= ', knn_recall),
):
    print(caption, score)

Accuracy=  0.8012048192771084
Precision=  0.8082191780821918
Recall=  0.7564102564102564


In [33]:
# Support vector classifier with a linear kernel.
# The classifier gets its own name: rebinding `svm` would shadow the imported
# sklearn.svm module, and running this cell a second time would then fail
# with AttributeError because the fitted SVC object has no attribute `SVC`.
svm_clf = svm.SVC(kernel='linear') # Linear Kernel
svm_clf.fit(X_train, y_train)

# Predict the held-out split and score the predictions.
y_pred = svm_clf.predict(X_test)
svm_accuracy=metrics.accuracy_score(y_test, y_pred)
svm_precision=metrics.precision_score(y_test, y_pred)
svm_recall=metrics.recall_score(y_test, y_pred)
# Print the stored scores instead of computing each metric a second time.
print('Accuracy= ',svm_accuracy)
print('Precision= ',svm_precision)
print('Recall= ',svm_recall)

Accuracy=  0.8554216867469879
Precision=  0.8375
Recall=  0.8589743589743589


In [34]:
# Logistic regression with default settings, fitted on the training split.
lr = LogisticRegression()
lr.fit(X_train, y_train)

# Score the predictions for the held-out split.
y_pred = lr.predict(X_test)
lr_accuracy = metrics.accuracy_score(y_test, y_pred)
lr_precision = metrics.precision_score(y_test, y_pred)
lr_recall = metrics.recall_score(y_test, y_pred)

for caption, score in (
    ('Accuracy= ', lr_accuracy),
    ('Precision= ', lr_precision),
    ('Recall= ', lr_recall),
):
    print(caption, score)

Accuracy=  0.8734939759036144
Precision=  0.8518518518518519
Recall=  0.8846153846153846


In [39]:
# Random forest classifier.
# random_state is fixed (same seed as the train/test split) because the
# forest draws bootstrap samples and feature subsets at random; without a
# seed both the scores below and the final predictions change between runs.
rf=RandomForestClassifier(random_state=1)
rf.fit(X_train,y_train)

# Predict the held-out split and score the predictions.
y_pred = rf.predict(X_test)
rf_accuracy=metrics.accuracy_score(y_test, y_pred)
rf_precision=metrics.precision_score(y_test, y_pred)
rf_recall=metrics.recall_score(y_test, y_pred)
# Print the stored scores instead of computing each metric a second time.
print('Accuracy= ',rf_accuracy)
print('Precision= ',rf_precision)
print('Recall= ',rf_recall)

Accuracy=  0.9036144578313253
Precision=  0.9305555555555556
Recall=  0.8589743589743589


In [40]:
#--------------------------------------------results------------------------------------------

In [42]:
# Use the fitted random forest to predict the encoded A16 class for every
# row of the scaled test set, then show the predictions.
output = rf.predict(df_test)
print(output)

[0 0 0 0 0 0 0 0 0 0 1 1 1 1]
