In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the training and test splits from the competition input directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/preassesment/train.csv',
                     '../input/preassesment/test.csv')
)

Information of data

In [3]:
# Print a concise summary (columns, dtypes, non-null counts) of the training frame.
train.info()

In [4]:
# Print a concise summary (columns, dtypes, non-null counts) of the test frame.
test.info()

Statistical Info of data

In [5]:
# Descriptive statistics of the training frame's numeric columns.
train.describe()

In [6]:
# Descriptive statistics of the test frame's numeric columns.
test.describe()

's53', 's56', 's57' and 's59' contain no values, so they are dropped.

In [7]:
# Remove the same four columns from both frames.
empty_cols = ['s56', 's57', 's59', 's53']
train = train.drop(columns=empty_cols)
test = test.drop(columns=empty_cols)

90.7% of the values in the 's54' column are missing, so a Chi-Square test is performed to check the relationship between 's54' and the output 'label'.

In [8]:
# Contingency table: rows are 's54' values, columns are 'label' values
CrosstabResult = pd.crosstab(train['s54'], train['label'])
print(CrosstabResult)

In [9]:
# importing the required function
from scipy.stats import chi2_contingency

# Chi-square test of independence on the 's54' x 'label' contingency table
ChiSqResult = chi2_contingency(CrosstabResult)

# H0: the two variables are independent.
# The p-value is the probability of a test statistic at least this extreme if H0 were true;
# it is NOT the probability that H0 itself is true.
# A p-value greater than 0.05 means we fail to reject H0 at the 5% level.

print('The P-Value of the ChiSq Test is:', ChiSqResult[1])

 **H0: 's54' and 'label' are independent.**
 **Ha: 's54' and 'label' are related.**
 **P-value: 0.040859, which is below the 0.05 significance level, so H0 is rejected:**
 **'s54' and 'label' are related.**
 **That's why 's54' can't be dropped.**

In [10]:
# Impute missing 's54' values with the most frequent value seen in the training data.
# - The training-set mode is reused for the test set so both frames are imputed with the
#   same value (imputation statistics should come from the training data only).
# - Plain assignment replaces the chained `fillna(..., inplace=True)` on a column selection,
#   which does not reliably write back to the parent frame under pandas copy-on-write.
s54_mode = train['s54'].mode()[0]
train['s54'] = train['s54'].fillna(s54_mode)
test['s54'] = test['s54'].fillna(s54_mode)

In [11]:
# Missing-value count per training column.
train.isna().sum()

In [12]:
# Missing-value count per test column.
test.isna().sum()

In [13]:
# Distinct values present in the training 's55' column.
train['s55'].unique()

88.7% of the values in the 's55' column are missing, so a Chi-Square test is performed to check the relationship between 's55' and the output 'label'.

In [14]:
# Contingency table: rows are 's55' values, columns are 'label' values
CrosstabResult_2 = pd.crosstab(train['s55'], train['label'])
print(CrosstabResult_2)

In [15]:
# importing the required function
from scipy.stats import chi2_contingency

# Chi-square test of independence on the 's55' x 'label' contingency table
ChiSqResult = chi2_contingency(CrosstabResult_2)

# H0: the two variables are independent.
# The p-value is the probability of a test statistic at least this extreme if H0 were true;
# it is NOT the probability that H0 itself is true.
# If the p-value > 0.05 we fail to reject H0 at the 5% level.

print('The P-Value of the ChiSq Test is:', ChiSqResult[1])

The test gives no evidence at the 0.05 level that 's55' and 'label' are related, so 's55' is dropped.

In [16]:
# Remove 's55' from both frames.
train, test = (frame.drop(columns='s55') for frame in (train, test))

In [17]:
# Re-check the missing-value count per training column.
train.isna().sum()

In [18]:
# Re-check the missing-value count per test column.
test.isna().sum()

In [19]:
# Map the letters 'l' and 'o' in 's52' to 1 and 0
# (presumably typos for the digits -- confirm against the raw data).
# Assigning the result back avoids chained `inplace=True` on a column selection, which does
# not reliably update the parent frame under pandas copy-on-write.
train['s52'] = train['s52'].replace(['l', 'o'], [1, 0])

In [20]:
# Map the letters 'l' and 'o' in 's52' to 1 and 0
# (presumably typos for the digits -- confirm against the raw data).
# Assigning the result back avoids chained `inplace=True` on a column selection, which does
# not reliably update the parent frame under pandas copy-on-write.
test['s52'] = test['s52'].replace(['l', 'o'], [1, 0])

In [21]:
# Distinct 's52' values in train after the replacement.
train['s52'].unique()

In [22]:
# Distinct 's52' values in test after the replacement.
test['s52'].unique()

In [23]:
# Convert 's52' in train to a numeric dtype.
train['s52'] = pd.to_numeric(train['s52'])

In [24]:
# Convert 's52' in test to a numeric dtype.
test['s52'] = pd.to_numeric(test['s52'])

In [25]:
# Distinct 's52' values in train after the numeric conversion.
train['s52'].unique()

In [26]:
# Distinct 's52' values in test after the numeric conversion.
test['s52'].unique()

**Correlation**

In [27]:
import seaborn as sns
import matplotlib.pyplot as plt

In [28]:
# Annotated correlation heatmap of the training features.
# Only numeric/bool columns are passed to corr(): the frame may still hold non-numeric
# columns here (they are one-hot encoded later), and pandas >= 2.0 raises on those instead
# of silently skipping them as older versions did.
plt.figure(figsize=(15, 12))
numeric_train = train.select_dtypes(include=['number', 'bool'])
sns.heatmap(numeric_train.corr(), annot=True)

**n1 and n2 are highly correlated, so one of them should be dropped.**
**Correlation of n1 and label: 0.14.**
**Correlation of n2 and label: 0.12.**
**So n2, which is less correlated with the label, is dropped.**

In [29]:
# Remove 'n2' from both frames.
train = train.drop(columns='n2')
test = test.drop(columns='n2')

In [30]:
# Ten randomly chosen training rows.
train.sample(n=10)

In [31]:
# Ten randomly chosen test rows.
test.sample(n=10)

**n7 and n10 are highly correlated, so one of them should be dropped.**
**Correlation of n7 and label: -0.034.**
**Correlation of n10 and label: -0.047.**
**So n7, which is less correlated with the label, is dropped.**

In [32]:
# Remove 'n7' from the training frame.
train = train.drop(columns='n7')

In [33]:
# Remove 'n7' from the test frame.
test = test.drop(columns='n7')

'id' carries no predictive information for the model, so it is dropped from the training data.

In [34]:
# Remove the 'id' column from the training frame.
train = train.drop(columns='id')

In [35]:
# Ten randomly chosen training rows.
train.sample(n=10)

In [36]:
# Partition the training columns by dtype: numeric columns go to `num_list`,
# everything else to `cat_list`, both in the frame's column order.
# `is_numeric_dtype` is imported from the public `pandas.api.types` namespace rather than
# the private `pandas.core` module path, which is not a stable import location.
from pandas.api.types import is_numeric_dtype

num_list = [col for col in train.columns if is_numeric_dtype(train[col])]
cat_list = [col for col in train.columns if not is_numeric_dtype(train[col])]

In [37]:
# Names of the non-numeric (categorical) columns.
cat_list

In [38]:
# Names of the numeric columns.
num_list

In [39]:
# Number of categorical columns.
len(cat_list)

In [40]:
# Number of numeric columns.
len(num_list)

Converting categorical data to numeric

In [41]:
# One-hot encode every categorical training column in a single call. The first level of
# each column is dropped to avoid the dummy-variable trap (perfect multicollinearity).
# Dummy columns are named "<column>_<level>" and appended after the untouched columns.
train = pd.get_dummies(train, columns=cat_list, drop_first=True)

In [42]:
# First rows of the encoded training frame.
train.head()

In [43]:
# One-hot encode the test set and align it with the training feature layout.
# Encoding test on its own with drop_first=True can drop a different baseline level, or
# yield a different set/order of dummy columns, whenever the categories observed in test
# differ from those in train. Instead, encode every observed level and then keep exactly
# the encoded columns that train ended up with (train must already be encoded):
#   - levels that train never produced a column for are discarded,
#   - levels missing from test become all-zero columns,
#   - train's dropped baseline level stays dropped.
# 'id' is kept in front because it is still needed to build the submission.
test = pd.get_dummies(test, columns=cat_list)
feature_cols = [c for c in train.columns if c != 'label']
test = test.reindex(columns=['id'] + feature_cols, fill_value=0)

In [44]:
# First rows of the encoded test frame.
test.head()

Scaling the continuous features to a common scale

In [45]:
from sklearn.preprocessing import StandardScaler
# Scaler instance used by the standardisation cells that follow.
std = StandardScaler()

In [46]:
# Columns to standardise.
# NOTE(review): the list is hard-coded and has no 'n13' -- confirm that is intentional.
num_to_std = ['n1','n3','n4','n5','n6','n10','n8','n9','n11','n12','n14','n15']

In [47]:
for col in num_to_std:
    train[col] = std.fit_transform(train[[col]])

In [48]:
train.head()

In [49]:
for col in num_to_std:
    test[col] = std.fit_transform(test[[col]])

In [50]:
from sklearn.model_selection import train_test_split

Separating Features and target

In [51]:
# The target is the 'label' column; every other column is a model input.
y = train['label']
x = train.drop(columns='label')

In [52]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.30,
    random_state=42,
)

In [53]:
from sklearn.metrics import classification_report, confusion_matrix

# `plot_roc_curve` was deprecated in scikit-learn 1.0 and removed in 1.2. Fall back to
# `RocCurveDisplay.from_estimator`, which takes the same (estimator, X, y) arguments, so
# cells that call `plot_roc_curve(...)` work unchanged on old and new versions.
try:
    from sklearn.metrics import plot_roc_curve
except ImportError:
    from sklearn.metrics import RocCurveDisplay
    plot_roc_curve = RocCurveDisplay.from_estimator

In [54]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default settings: fit on the training split, then report
# precision/recall/F1 and the ROC curve on the hold-out split.
lr = LogisticRegression().fit(x_train, y_train)
y_pred_lr = lr.predict(x_test)
print(classification_report(y_test, y_pred_lr))
plot_roc_curve(lr, x_test, y_test)

In [55]:
from sklearn.svm import SVC

# Support-vector classifier with a linear kernel, evaluated on the hold-out split.
svm_li = SVC(kernel='linear').fit(x_train, y_train)
y_pred_svm_li = svm_li.predict(x_test)
print(classification_report(y_test, y_pred_svm_li))
plot_roc_curve(svm_li, x_test, y_test)

In [56]:
# Support-vector classifier with an RBF kernel, evaluated on the hold-out split.
svm_rbf = SVC(kernel='rbf').fit(x_train, y_train)
y_pred_rbf = svm_rbf.predict(x_test)
print(classification_report(y_test, y_pred_rbf))
plot_roc_curve(svm_rbf, x_test, y_test)

In [57]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree evaluated on the hold-out split.
# random_state is fixed (matching the seed used for the split and the other seeded models)
# so the reported metrics are reproducible across runs.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(x_train, y_train)
y_pred_dt = dt.predict(x_test)
print(classification_report(y_test, y_pred_dt))
plot_roc_curve(dt, x_test, y_test)

In [58]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours with default settings, evaluated on the hold-out split.
knn = KNeighborsClassifier().fit(x_train, y_train)
y_pred_knn = knn.predict(x_test)
print(classification_report(y_test, y_pred_knn))
plot_roc_curve(knn, x_test, y_test)

In [59]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 200 trees, evaluated on the hold-out split.
# random_state is fixed (matching the seed used for the split and the other seeded models)
# so the reported metrics are reproducible across runs.
rfc = RandomForestClassifier(n_estimators=200, random_state=42)
rfc.fit(x_train, y_train)
y_pred_rfc = rfc.predict(x_test)
print(classification_report(y_test, y_pred_rfc))
plot_roc_curve(rfc, x_test, y_test)

In [60]:
import xgboost as xgb

# Gradient-boosted trees (DART booster) with a fixed seed, evaluated on the hold-out split.
xgb_model = xgb.XGBClassifier(
    objective="binary:logistic",
    random_state=42,
    base_score=0.2,
    booster='dart',
    colsample_bytree=0.7,
    learning_rate=0.1,
    n_estimators=200,
)
xgb_model.fit(x_train, y_train)
y_pred_xgb = xgb_model.predict(x_test)
print(classification_report(y_test, y_pred_xgb))
plot_roc_curve(xgb_model, x_test, y_test)

In [61]:
from sklearn.ensemble import ExtraTreesClassifier

# Extra-trees ensemble of 200 trees with a fixed seed, evaluated on the hold-out split.
ext = ExtraTreesClassifier(n_estimators=200, random_state=42).fit(x_train, y_train)
y_pred_ext = ext.predict(x_test)
print(classification_report(y_test, y_pred_ext))
plot_roc_curve(ext, x_test, y_test)

In [62]:
from catboost import CatBoostClassifier

# CatBoost classifier, evaluated on the hold-out split.
# NOTE(review): subsample=0.08 looks unusually small -- confirm it was not meant to be 0.8.
cat_boost_model = CatBoostClassifier(
    learning_rate=0.1,
    n_estimators=200,
    subsample=0.08,
    max_depth=5,
    scale_pos_weight=1.8,
)
cat_boost_model.fit(x_train, y_train)
y_pred_cat = cat_boost_model.predict(x_test)
print(classification_report(y_test, y_pred_cat))
plot_roc_curve(cat_boost_model, x_test, y_test)

In [65]:
# Work on a copy so `test` (and its 'id' column) stays available for the submission.
df = test.copy()

In [66]:
# First rows of the prediction frame.
df.head()

In [67]:
# Remove the 'id' column from the prediction frame.
df = df.drop(columns='id')

# CatBoost is the most robust model: its AUC score and its F1 score are higher than those of the other models.

In [72]:
# Predict labels for the test rows with the fitted CatBoost model.
# NOTE(review): assumes `df` has the same feature columns, in the same order, as the
# frame the model was fitted on -- TODO confirm.
test_pred  = cat_boost_model.predict(df)

In [73]:
# Start the submission frame from the test ids (same index as `test`).
submission = test[['id']].copy()

In [74]:
# Attach the predictions and write the submission file.
submission['label'] = test_pred
submission.to_csv('submission_Anonymous Five_2u27pl.csv', index=False)

# Preview a few rows. This must be the cell's last expression to be displayed;
# placed mid-cell its result was silently discarded.
submission.sample(10)