In [None]:
pip install mljar-supervised

In [None]:
import numpy as np
import pandas as pd
import missingno as msno

import matplotlib.pyplot as plt
plt.style.use('ggplot')
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.ensemble import ExtraTreesClassifier, VotingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score, accuracy_score
import graphviz
from sklearn.tree import DecisionTreeClassifier, export_graphviz

In [None]:
# All three competition files live in the same Kaggle input directory.
DATA_DIR = '/kaggle/input/tabular-playground-series-apr-2021'

train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')
# `final` starts as the sample submission and is reused as the output frame for every model.
final = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')
train.head()

# Understanding Data

In [None]:
# (rows, columns) of the training frame, then of the test frame, one per line.
print(train.shape, test.shape, sep="\n")

In [None]:
# Column dtypes and non-null counts for the training frame.
train.info()

In [None]:
# Summary statistics of the training frame's columns.
train.describe()

In [None]:
# Number of missing values per column in the training frame.
train.isnull().sum()

In [None]:
# Number of missing values per column in the test frame.
test.isnull().sum()

In [None]:
# Share of missing values per training column, in percent
# (the mean of the boolean null mask is the missing fraction).
missing_percentages = train.isna().mean() * 100
missing_percentages

In [None]:
# Share of missing values per test column, in percent
# (the mean of the boolean null mask is the missing fraction).
missing_percentages_test = test.isna().mean() * 100
missing_percentages_test

### The columns with missing values are:

* Age
* Ticket
* Fare
* Cabin
* Embarked

# EDA

In [None]:
# Cardinality of every training column; NaN is counted as its own value.
for column in train.columns:
    distinct_count = train[column].nunique(dropna=False)
    print(f"The number of unique values in {column} is {distinct_count}")

### The categorical features are
* Pclass
* Sex
* Embarked
* Parch
* SibSp

### The continuous features are
* Age
* Fare

### The ones which will be dealt manually are
* Ticket
* Cabin
* Name

In [None]:
# Column groups used for the exploratory analysis.
categorical_features = [
    "Pclass",
    "Sex",
    "Embarked",
    "Parch",
    "SibSp",
]
continuous_features = [
    "Age",
    "Fare",
]

## Checking the count & distribution of Survived


In [None]:
# Left panel: count of each Survived value.
# Right panel: density-scaled histogram (red) with a KDE curve (blue).
# sns.distplot is deprecated and slated for removal from seaborn, so the
# right-hand panel is drawn with histplot + kdeplot on an explicit axes.
fig, (count_ax, dist_ax) = plt.subplots(1, 2, figsize=(12, 4))
ax = sns.countplot(x="Survived", data=train, ax=count_ax)
sns.histplot(data=train, x="Survived", stat="density", color="r", alpha=0.4, ax=dist_ax)
sns.kdeplot(data=train, x="Survived", color="b", linewidth=2, ax=dist_ax)
plt.show()

## Pclass

In [None]:
# Left panel: count of each Pclass value.
# Right panel: density-scaled histogram (red) with a KDE curve (blue).
# sns.distplot is deprecated and slated for removal from seaborn, so the
# right-hand panel is drawn with histplot + kdeplot on an explicit axes.
fig, (count_ax, dist_ax) = plt.subplots(1, 2, figsize=(12, 4))
ax = sns.countplot(x="Pclass", data=train, ax=count_ax)
sns.histplot(data=train, x="Pclass", stat="density", color="r", alpha=0.4, ax=dist_ax)
sns.kdeplot(data=train, x="Pclass", color="b", linewidth=2, ax=dist_ax)
plt.show()

## Parch

In [None]:
# Left panel: count of each Parch value.
# Right panel: density-scaled histogram (red) with a KDE curve (blue).
# sns.distplot is deprecated and slated for removal from seaborn, so the
# right-hand panel is drawn with histplot + kdeplot on an explicit axes.
fig, (count_ax, dist_ax) = plt.subplots(1, 2, figsize=(12, 4))
ax = sns.countplot(x="Parch", data=train, ax=count_ax)
sns.histplot(data=train, x="Parch", stat="density", color="r", alpha=0.4, ax=dist_ax)
sns.kdeplot(data=train, x="Parch", color="b", linewidth=2, ax=dist_ax)
plt.show()

## SibSp

In [None]:
# Left panel: count of each SibSp value.
# Right panel: density-scaled histogram (red) with a KDE curve (blue).
# sns.distplot is deprecated and slated for removal from seaborn, so the
# right-hand panel is drawn with histplot + kdeplot on an explicit axes.
fig, (count_ax, dist_ax) = plt.subplots(1, 2, figsize=(12, 4))
ax = sns.countplot(x="SibSp", data=train, ax=count_ax)
sns.histplot(data=train, x="SibSp", stat="density", color="r", alpha=0.4, ax=dist_ax)
sns.kdeplot(data=train, x="SibSp", color="b", linewidth=2, ax=dist_ax)
plt.show()

## Sex

In [None]:
# Two-colour palette for the two Sex categories.
# Note: set_palette changes the seaborn default for every later plot as well.
sns.set_palette(["#8072fa","orange"])
fig, ax = plt.subplots(figsize=(6, 6))
sns.countplot(x="Sex", data=train, ax=ax)
plt.show()

## Embarked

In [None]:
# Three-colour palette, one per Embarked category.
# Note: set_palette changes the seaborn default for every later plot as well.
sns.set_palette(["#8072fa","orange","Red"])
fig, ax = plt.subplots(figsize=(6, 6))
sns.countplot(x="Embarked", data=train, ax=ax)
plt.show()

# Data Preprocessing

Handling missing data

In [None]:
# Impute missing Age / Fare with the training mean and Embarked with the
# training mode. The statistics come from the training set only and are
# applied unchanged to the test set.
age_mean = train['Age'].mean()
fare_mean = train['Fare'].mean()
embarked_mode = train['Embarked'].mode()[0]

# Assign the result back instead of calling fillna(inplace=True) on a column
# selection: under pandas Copy-on-Write the chained in-place form modifies a
# temporary and leaves the frame untouched (and warnings are silenced above).
train['Age'] = train['Age'].fillna(age_mean)
test['Age'] = test['Age'].fillna(age_mean)

train['Fare'] = train['Fare'].fillna(fare_mean)
test['Fare'] = test['Fare'].fillna(fare_mean)

train['Embarked'] = train['Embarked'].fillna(embarked_mode)
test['Embarked'] = test['Embarked'].fillna(embarked_mode)

# Fail loudly if the imputation did not take effect.
imputed_columns = ['Age', 'Fare', 'Embarked']
assert not train[imputed_columns].isna().any().any(), "train still has missing values"
assert not test[imputed_columns].isna().any().any(), "test still has missing values"

The ticket number, cabin and name don't seem to influence survival, so these columns are dropped (together with `PassengerId`).

In [None]:
# Remove the columns that are not used as model inputs.
# Re-assigning (rather than inplace=True) together with errors='ignore' keeps
# the cell idempotent: running it a second time no longer raises KeyError.
unused_columns = ['Name', 'Ticket', 'Cabin', 'PassengerId']
train = train.drop(columns=unused_columns, errors='ignore')
test = test.drop(columns=unused_columns, errors='ignore')

Applying a log function to reduce the influence of outliers in the Fare column. Since there are a large number of outliers, removing them would lose a large number of points.

In [None]:
# Natural log of Fare for strictly positive values; anything else maps to 0.
# NOTE: this cell is not idempotent - re-running it applies the log again.
for frame in (train, test):
    frame['Fare'] = frame['Fare'].map(lambda fare: np.log(fare) if fare > 0 else 0)

Combining Parch and SibSp into one `relatives` feature (their sum plus one)

In [None]:
# relatives = Parch + SibSp + 1, computed the same way for both frames.
for frame in (train, test):
    frame["relatives"] = frame["Parch"].add(frame["SibSp"]).add(1)

# Modelling

Label Encoding the categorical features

In [None]:
# Integer-encode the string-valued columns. Each encoder is fitted on the
# training column only and then applied to both frames, so a value that
# appears only in the test set would raise in transform().
object_cols = ['Sex','Embarked']
for col in object_cols:
    label_encoder = LabelEncoder().fit(train[col])
    for frame in (train, test):
        frame[col] = label_encoder.transform(frame[col])

In [None]:
# Model input columns (shared by every model below) and the label array.
features = [
    'Pclass', 'Sex', 'Age', 'SibSp',
    'Parch', 'Fare', 'Embarked', 'relatives',
]
target = train['Survived'].to_numpy()

## Logistic Regression

In [None]:
lr = LogisticRegression()

# Out-of-fold estimate first: scoring a model on the rows it was fitted on
# is optimistic and says little about performance on unseen data.
# cross_val_score fits clones, so `lr` itself is still unfitted afterwards.
lr_cv_auc = cross_val_score(lr, train[features], target, cv=5, scoring="roc_auc")
print("Logistic Regression 5-fold CV ROC AUC: {:.4f} +/- {:.4f}".format(lr_cv_auc.mean(), lr_cv_auc.std()))

# Final fit on the full training set; this is the model used for the submission.
lr.fit(train[features], target)
# The two numbers below are in-sample (training-set) scores, labelled as such.
print("Logistic Regression in-sample ROC AUC score:", roc_auc_score(target, lr.predict_proba(train[features])[:,1]))
print('Logistic Regression in-sample Accuracy score:', accuracy_score(target, lr.predict(train[features])))

In [None]:
# Predict the test set with the logistic regression and write the submission file.
lr_test_pred = lr.predict(test[features])
final['Survived'] = lr_test_pred
final.to_csv('LR.csv', index=False)

## XGBoost

In [None]:
xgb = XGBClassifier()

# Out-of-fold estimate first: scoring a model on the rows it was fitted on
# is optimistic and says little about performance on unseen data.
# cross_val_score fits clones, so `xgb` itself is still unfitted afterwards.
xgb_cv_auc = cross_val_score(xgb, train[features], target, cv=5, scoring="roc_auc")
print("XGB 5-fold CV ROC AUC: {:.4f} +/- {:.4f}".format(xgb_cv_auc.mean(), xgb_cv_auc.std()))

# Final fit on the full training set; this is the model used for the submission.
xgb.fit(train[features], target)
# The two numbers below are in-sample (training-set) scores, labelled as such.
print("XGB in-sample ROC AUC score:", roc_auc_score(target, xgb.predict_proba(train[features])[:,1]))
print('XGB in-sample Accuracy score:', accuracy_score(target, xgb.predict(train[features])))

In [None]:
# Predict the test set with XGBoost and write the submission file.
xgb_test_pred = xgb.predict(test[features])
final['Survived'] = xgb_test_pred
final.to_csv('XGB.csv', index=False)

# AutoML

In [None]:
# AutoML comes from the mljar-supervised package installed in the notebook's first cell.
from supervised.automl import AutoML
# Configure AutoML with accuracy as its evaluation metric.
automl = AutoML(eval_metric="accuracy")
automl.fit(train[features], target)
# Last expression of the cell, so whatever report() returns is displayed as the cell output.
automl.report()

In [None]:
# Predict the test set with the fitted AutoML object and write the submission file.
automl_test_pred = automl.predict(test[features])
final['Survived'] = automl_test_pred
final.to_csv('AutoML.csv', index=False)

# Decision Tree

In [None]:
# Shallow tree (depth 4, at least 2 samples per leaf) so it stays readable when plotted.
# random_state is fixed because scikit-learn permutes features at every split:
# equally good splits would otherwise be tie-broken differently between runs.
# The seed 42 matches the one used by the LGBM and ExtraTrees models below.
dt = DecisionTreeClassifier(
    max_depth=4,
    min_samples_leaf=2,
    random_state=42,
)
dt.fit(train[features], target)

In [None]:
# Render the fitted decision tree as a Graphviz diagram.
# out_file=None makes export_graphviz return the DOT source as a string.
export_options = dict(
    out_file=None,
    feature_names=train[features].columns,
    class_names=['0', '1'],
    filled=True,
    rounded=False,
    special_characters=True,
    precision=3,
)
dot_data = export_graphviz(dt, **export_options)
graph = graphviz.Source(dot_data)
# Last expression of the cell: the notebook displays the rendered graph.
graph

In [None]:
# Predict the test set with the decision tree and write the submission file.
y_pred = dt.predict(test[features])
y_pred = y_pred.astype(int)
final['Survived'] = y_pred
final.to_csv("DT.csv", index=False)

# LGBM

In [None]:
# LightGBM configuration, one hyper-parameter per line for readability.
lgbm_params = {
    'boosting_type': 'dart',
    'num_leaves': 32,
    'max_depth': 10,
    'colsample_bytree': 0.8,
    'extra_trees': True,
    'n_jobs': -1,
    'random_state': 42,
}
lgbm = LGBMClassifier(**lgbm_params)
lgbm.fit(train[features], target)

In [None]:
# Predict the test set with LightGBM and write the submission file.
y_pred = lgbm.predict(test[features])
y_pred = y_pred.astype(int)
final['Survived'] = y_pred
final.to_csv("LGBM.csv", index=False)

# Extra Trees Classifier

In [None]:
# Extra-Trees configuration, one hyper-parameter per line for readability.
ext_params = {
    'n_estimators': 1000,
    'max_depth': 17,
    'min_samples_split': 25,
    'min_samples_leaf': 18,
    'n_jobs': -1,
    'random_state': 42,
}
ext = ExtraTreesClassifier(**ext_params)
ext.fit(train[features], target)

In [None]:
# Predict the test set with the Extra-Trees model and write the submission file.
y_pred = ext.predict(test[features])
y_pred = y_pred.astype(int)
final['Survived'] = y_pred
final.to_csv("EXT.csv", index=False)

# Gradient Boosting

In [None]:
# 400 boosting stages of depth-2 trees.
# random_state is fixed (same seed as the LGBM and ExtraTrees models) so the
# fitted model, and the submission derived from it, is reproducible.
gb = GradientBoostingClassifier(
    max_depth=2,
    n_estimators=400,
    random_state=42,
)
gb.fit(train[features], target)

In [None]:
# Predict the test set with gradient boosting and write the submission file.
y_pred = gb.predict(test[features])
y_pred = y_pred.astype(int)
final['Survived'] = y_pred
final.to_csv("GB.csv", index=False)

# Voting Classifier

In [None]:
# Soft-voting ensemble over the four estimators configured in the earlier cells.
voting_members = [
    ('DT', dt),
    ('EXT', ext),
    ('LGBM', lgbm),
    ('GB', gb),
]
clf = VotingClassifier(estimators=voting_members, voting='soft')
clf.fit(train[features], target)

In [None]:
# Predict the test set with the voting ensemble and write the submission file.
y_pred = clf.predict(test[features])
y_pred = y_pred.astype(int)
final['Survived'] = y_pred
final.to_csv("VC.csv", index=False)