In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib as ml
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
import plotly.figure_factory as ff
import matplotlib.pyplot as plt
%matplotlib inline
ml.style.use('ggplot')

from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, roc_curve, roc_auc_score, confusion_matrix
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# Any results you write to the current directory are saved as output.

In [None]:
# Load the training split from the Kaggle input directory.
td = pd.read_csv('/kaggle/input/titanic/train.csv')
print(td.shape)  # (rows, columns) of the raw frame
td.head(10)      # last expression: rendered as the cell output

# Data Exploration
*  Describing the data
*  Finding missing values
*  Quality, completeness and tidiness issues
    -  Search if any data type is falsely assigned.
*  Concluding relations between features. (heatmap)
    -  Distplot for numerical data
    -  Countplot for categorical data
*  Outlier detection

In [None]:
# Summary statistics of the training frame.
td.describe()

Let us see what columns we have and assert their data types 
*  Check whether any datatype doesn't match your criteria. If that is the case, try modifying it to the datatype it should be in.

In [None]:
# Column labels of the training frame.
td.columns

#### There are 4 categorical columns :
*  Survived (NOM)
*  PClass (ORD)
*  Sex (NOM)
*  Embarked (NOM)

#### The remaining 8 columns (numerical, text or identifier) :
*  PassengerId (identifier)
*  Name (text)
*  Age
*  SibSp
*  Parch
*  Ticket (text)
*  Fare
*  Cabin (text)

Let's check for null values and data types.

In [None]:
# Per-column dtype and non-null count; used here to spot missing values.
td.info()

##### QUALITY AND COMPLETENESS ISSUES
1. Handle missing values
2. De-label Pclass to make the data easier to interpret.
3. Sib(SIBLING) and Sp(SPOUSE) in one column.
4. Parch in one column (Par = PARENTS ; ch = CHILDREN)
5. Decide whether or not to keep Cabin.
6. Merging Sibsp and Parch into one column called 'Fam' meaning family.
7. Separate out title from name
8. Encode the 'Sex' and 'Embarked' columns
9. Drop the 'Ticket' column
10. Drop Name after feature engineering
11. If not significant, drop PassengerId

##### TIDINESS ISSUES
1. Sib(SIBLING) and Sp(SPOUSE) in one column.
2. Parch in one column (Par = PARENTS ; ch = CHILDREN)
3. Pclass in un-interpretable format
4. Merging Sibsp and Parch into one column called 'Fam' meaning family.
5. Separate out title from name and then drop it
6. Drop the 'Ticket' column
7. Drop Name after feature engineering
8. If not significant, drop PassengerId

##### VISUALIZING THE DATA BEFORE PROCESSING THE DATA

In [None]:
# One histogram per column that DataFrame.hist can plot, on a 20x10 canvas.
hist_options = dict(figsize=(20, 10), color='maroon', bins=25)
td.hist(**hist_options)
plt.show()

In [None]:
# Overlay the Age distributions of survivors and non-survivors.
# Each distribution carries its own label, so the legend does not depend on the
# order in which matplotlib happens to enumerate the drawn artists.
plt.figure(figsize=(20,10))
sns.distplot(td.loc[td.Survived==1, 'Age'], label='SURVIVED')
sns.distplot(td.loc[td.Survived==0, 'Age'], label='DID NOT SURVIVE')
plt.legend()
plt.xlabel('Age')
plt.title('AGE DISTRIBUTION BY SURVIVAL')
plt.show()

In [None]:
# Survival counts per sex.
# One crosstab keeps every count attached to its own category label. Calling
# value_counts() separately for the x labels and for the bar heights is unsafe:
# each call sorts by its own frequencies, so heights can land under the wrong label.
sex_counts = pd.crosstab(td['Sex'], td['Survived']).reindex(columns=[0, 1], fill_value=0)

fig = go.Figure(data=[
    go.Bar(name='SURVIVED', x=list(sex_counts.index), y=sex_counts[1].values),
    go.Bar(name='DID NOT SURVIVE', x=list(sex_counts.index), y=sex_counts[0].values)
])
fig.update_layout(barmode='group',title="SEX")
fig.show()

# Pie of survivors only: each slice is the number of survivors of that sex.
fig = go.Figure(data=[go.Pie(labels=['MALES','FEMALES'],
                             values=[td[(td.Sex=='male') & (td.Survived==1)].shape[0],td[(td.Sex=='female') & (td.Survived==1)].shape[0]])])
fig.update_traces(hoverinfo='label+percent', textinfo='value', textfont_size=20,
                  marker=dict(colors=['yellow','purple'],line=dict(color='#000000', width=2)))
fig.show()

In [None]:
# Survival counts per passenger class.
# One crosstab keeps every count attached to its own class label. Calling
# value_counts() separately for the x labels and for the bar heights is unsafe:
# each call sorts by its own frequencies, so heights can land under the wrong class.
pclass_counts = pd.crosstab(td['Pclass'], td['Survived']).reindex(columns=[0, 1], fill_value=0)

fig = go.Figure(data=[
    go.Bar(name='SURVIVED', x=list(pclass_counts.index), y=pclass_counts[1].values),
    go.Bar(name='DID NOT SURVIVE', x=list(pclass_counts.index), y=pclass_counts[0].values)
])
fig.update_layout(barmode='group',title="PCLASS")
fig.show()

# Pie of survivors only: each slice is the number of survivors in that class.
fig = go.Figure(data=[go.Pie(labels=['1st CLASS','2nd CLASS','3rd CLASS'],
                             values=[td[(td.Pclass==1) & (td.Survived==1)].shape[0],td[(td.Pclass==2) & (td.Survived==1)].shape[0],td[(td.Pclass==3) & (td.Survived==1)].shape[0]])])
fig.update_traces(hoverinfo='label+percent', textinfo='value', textfont_size=20,
                  marker=dict(colors=['yellow','lightgreen','darkorange'],line=dict(color='#000000', width=2)))
fig.show()

In [None]:
# Survival counts per port of embarkation (rows with a missing port are left out
# by crosstab).
# One crosstab keeps every count attached to its own port label instead of relying
# on three independently sorted value_counts() calls happening to share an order.
embarked_counts = pd.crosstab(td['Embarked'], td['Survived']).reindex(columns=[0, 1], fill_value=0)

fig = go.Figure(data=[
    go.Bar(name='SURVIVED', x=list(embarked_counts.index), y=embarked_counts[1].values),
    go.Bar(name='DID NOT SURVIVE', x=list(embarked_counts.index), y=embarked_counts[0].values)
])
fig.update_layout(barmode='group',title="EMBARKED")
fig.show()

# Pie of survivors only: each slice is the number of survivors from that port.
fig = go.Figure(data=[go.Pie(labels=['S','C','Q'],
                             values=[td[(td.Embarked=='S') & (td.Survived==1)].shape[0],
                                     td[(td.Embarked=='C') & (td.Survived==1)].shape[0],
                                     td[(td.Embarked=='Q') & (td.Survived==1)].shape[0]])])
fig.update_traces(hoverinfo='label+percent', textinfo='value', textfont_size=20,
                  marker=dict(colors=['maroon','pink','darkturquoise'],line=dict(color='#000000', width=2)))
fig.show()

##### PRIMARY CONCLUSIONS DERIVED
-  Most of the people were of the age range 20-40
-  Maximum number of people bought tickets of fare < 100.
-  Most of the people were travelling alone, without parents, children, siblings or spouse.
-  The highest number of passengers were from Pclass 3.
    -  Most number of people from Pclass 3 did not survive. This is evident as they were passengers of the inferior class.
    -  Since facilities were more easily available for 1st class passengers, their survival:deceased ratio is higher; they account for 39.8% of all survivors.
-  Survival:Deceased ratio is higher in females than males. Maybe due to the 'women and children' first policy.
    -  More females survived(233) than males(109)
-  The largest share of survivors embarked at 'S' (63.8% of all survivors). Note that this is a share of survivors, not a survival rate.

##### FEATURE CORRELATION BEFORE PROCESSING

In [None]:
# Correlation heatmap of the raw numeric features.
# The frame still holds text columns at this point; restricting to numeric dtypes
# first keeps corr() working on pandas versions that no longer drop
# non-numeric columns silently.
plt.figure(figsize=(20,10))
sns.heatmap(td.select_dtypes(include='number').corr(),annot=True,linewidth=1,linecolor='white')
plt.show()

##### CORRELATION ANALYSIS
1. Pclass and Fare show high anti-correlation. This is expected. A lower Pclass value (1) actually indicates a higher social/economic stratum, like the Royals, the Aristocrats etc. So as Fare increases, the numeric value of Pclass drops and concentrates more towards '1', meaning a higher-class person.
2. Pclass and Survived show high anti-correlation. This is expected as first class passengers' safety was given more importance than other classes.
3. Survived and Fare have a faintly moderate correlation. First class passengers' safety was given more importance than other classes, and obviously high class passengers paid more Fare than lower class passengers.
4. Parch and SibSp have moderate correlation, maybe because they both describe 'family'.

##### DATA WRANGLING
We see that Fare and Pclass are more strongly (negatively) correlated with each other than any other pair of features (magnitude > 0.50). A low Pclass (meaning a higher class) will automatically mean a high Fare. We remove Fare.
Pclass can be further processed.

In [None]:
# Remove Fare; Pclass is kept as the class/price signal.
td = td.drop(columns=['Fare'])
td.head()

##### COMPLETENESS ISSUES
1. Handle missing data

In [None]:
# Missing-value count for every column.
null_counts = td.isnull().sum()
print(null_counts)
# The mean of a boolean mask is the fraction of True entries, i.e. the share of
# rows without a Cabin value.
print("\nFraction of values that are missing in the 'Cabin' feature : ", td.Cabin.isnull().mean())

###### More than 77% of the values in Cabin are missing. Since it is impossible to replace so many missing values without introducing errors, we remove the feature named 'Cabin'.

In [None]:
# Cabin is mostly empty, so the column is removed instead of imputed.
td = td.drop(columns=['Cabin'])
td.head()

##### Replace missing Age values with mean.
1. We can directly replace the missing values by the mean of the ages value.
2. Another method could be to fill the missing values based on 'Pclass' based grouping. For example, fill in the missing value of a particular 'Age' entry, which has a Pclass say '1', with a value that is the mean of the all ages corresponding to that particular Pclass only. Idea source : https://www.kaggle.com/thomaswoolley/rf-and-k-nn-titanic-0-79-score

In [None]:
# Two independent copies so both Age-imputation strategies can be tried without
# touching td itself.
td1, td2 = td.copy(), td.copy()

##### 1st METHOD : THE ONE I GENERALLY DO

In [None]:
# Method 1: replace every missing Age with the overall mean Age.
# The filled column is assigned back explicitly. An in-place fillna on `td1.Age`
# works on an intermediate Series and is not guaranteed to update td1 on recent
# pandas versions.
td1['Age'] = td1['Age'].fillna(td['Age'].mean())
td1.Age.isnull().sum()

##### 2nd METHOD : BASED ON PCLASS
1. Group the dataset on the basis of Pclass and for every Pclass, find the mean of the ages. Store them orderwise in a list.
2. Now, loop through the Pclass values ->
    -  For every Pclass, say i:
        -  Pick all rows under the 'Age' column for that Pclass and replace the NaN values with the corresponding mean from the previously created list.
        (For example, for Pclass 1, we'll replace NaN values with the 1st value of the list.)
       end loop.
   Done.

###### FINALLY, PLOT A DISTPLOT TO CHECK NEW DISTRIBUTION OF AGES. DOES OUR PRIMARY OBSERVATION DEVIATE AFTER PROCESSING ?

In [None]:
# Mean Age within each passenger class, shown as a Series and kept as a plain
# list in group order.
age_mean_by_class = td2.groupby('Pclass')['Age'].mean()
print(age_mean_by_class)
mean_list = list(age_mean_by_class.values)
print("\nList of means of Ages grouped according to Pclass",mean_list)

In [None]:
# Method 2: fill each missing Age with the mean Age of that passenger's own Pclass.
# transform('mean') returns a Series aligned row by row with td2, so this neither
# hard-codes the number of classes nor relies on the position of a value in a
# separately built list.
td2['Age'] = td2['Age'].fillna(td2.groupby('Pclass')['Age'].transform('mean'))
print(td2.Age.isnull().sum())
# From here on the class-wise imputed frame is the working training frame.
td = td2

In [None]:
# Remaining missing values per column after the Age imputation.
td.isnull().sum()

In [None]:
# Age distribution after imputation, for comparison with the earlier plot.
fig, ax = plt.subplots(figsize=(20,10))
sns.distplot(td['Age'], ax=ax)
plt.show()

##### Replacing the two missing values in 'Embarked' with the most common value under this feature(handling missing categorical data)
Let's find out the most popular 'Embarked' type

In [None]:
# Rank the ports by passenger count (most frequent first) and keep the top label
# as the fill value for the missing Embarked entries.
embarked_ranked = td.Embarked.value_counts().sort_values(ascending=False)
to_replace = embarked_ranked.index[0]
print("Most popular type : ", to_replace)
sns.countplot(x='Embarked',data=td)
plt.show()

In [None]:
# Fill the missing ports with the most frequent one.
# The filled column is assigned back explicitly. An in-place fillna on
# `td.Embarked` works on an intermediate Series and is not guaranteed to update
# td on recent pandas versions.
td['Embarked'] = td['Embarked'].fillna(to_replace)
td.isnull().sum()

##### ALL MISSING VALUES HAVE BEEN HANDLED.
##### QUALITY ISSUES
##### 1. Merge Sibsp and Parch into one column and drop the other two

In [None]:
# Family size on board = siblings/spouses + parents/children; the two source
# columns are removed once combined.
family_parts = ['SibSp', 'Parch']
td['Fam'] = td[family_parts[0]].add(td[family_parts[1]])
td.drop(columns=family_parts, inplace=True)
td.head(20)

##### Plot to check distribution

In [None]:
# Distribution of the combined family-size feature.
fig, ax = plt.subplots(figsize=(20,10))
sns.distplot(td['Fam'], ax=ax)
ax.set_title("DISTRIBUTION OF FAMILY")
plt.show()

# Family size broken down by passenger class.
fig, ax = plt.subplots(figsize=(20,10))
sns.countplot(x='Fam', hue='Pclass', data=td, ax=ax)
ax.set_title("NO. OF FAMILY MEMBERS VS PCLASS")
plt.show()

##### CONCLUSIONS :
1. Most people were travelling alone.
2. The highest number of family members one was travelling with was 10
3. Maximum number of people travelling alone were from 3rd class.
4. No person from 1st or 2nd class travelled with > 5 family members.

##### 3. Remove PassengerId and Ticket

In [None]:
# Remove the PassengerId and Ticket columns from the training frame (in place).
td.drop(columns=['PassengerId','Ticket'],inplace=True)
td.head(10)

##### 2. FEATURE ENGINEERING
-  Separate Title from Name and then drop Name
-  Decode Pclass and OHE it.
-  OHE Sex
-  OHE Embarked


1. Separate Title from Name


-  Create a new column named 'Title'
-  Apply regex to extract title from name
    -  Check for anomalies in the extracted titles and clean the untidy data.
-  Drop Name

In [None]:
# Extract the honorific from Name: the run of letters immediately before a '.'.
# Series.str.extract is already vectorised, so one call handles every row; no
# Python-level loop over the names is needed. The pattern is a raw string so '\.'
# reaches the regex engine unchanged, and expand=False yields a Series that can
# be assigned directly to a column.
td['Title'] = td['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)

# Drop Name
td.drop(columns=['Name'],inplace=True)
td.head()

In [None]:
# Check extracted data for quality: list every distinct title that was found.
td.Title.unique()

##### NOTE : Found unmatching titles like -> 'Don','Rev','Mme','Ms','Major','Lady','Sir','Mlle','Col','Capt','Countess','Jonkheer'

In [None]:
# Collapse the uncommon titles into three buckets. The buckets are written
# group-first and inverted into the raw-title -> bucket lookup that replace() needs.
title_groups = {
    'Rare': ['Don', 'Rev', 'Major', 'Col', 'Capt'],
    'Miss': ['Mme', 'Ms', 'Mlle'],
    'Royal': ['Lady', 'Sir', 'Countess', 'Jonkheer'],
}
title_mapping = {raw: bucket for bucket, members in title_groups.items() for raw in members}

td['Title'] = td['Title'].replace(title_mapping)
td.Title.unique()

2. Decode Pclass

In [None]:
# Decode the numeric class codes 1/2/3 into readable labels.
# map() builds the string column in one step. Pre-creating a float NaN column
# and then writing strings into it makes pandas change the column dtype on
# assignment, which recent versions warn about.
rep_list = ['first','second','third']
td['Pclass_new'] = td['Pclass'].map(dict(enumerate(rep_list, start=1)))

# Drop Pclass
td.drop(columns=['Pclass'],inplace=True)
td.head()

In [None]:
# Confirm that no column has missing values after the feature engineering so far.
td.isnull().sum()

3. Perform OHE on Pclass_new, Sex, Embarked and Title

In [None]:
# One-hot encode the categorical columns; drop_first=True keeps k-1 indicator
# columns for a feature with k classes.
encd_col = ['Pclass_new','Sex','Embarked','Title']
categorical_part = td[encd_col]
ohe_features = pd.get_dummies(categorical_part, drop_first=True)

# Replace the original categorical columns with their encoded counterparts.
td.drop(columns=encd_col, inplace=True)
td = td.join(ohe_features)
td.head(10)

##### FINAL CHECK OF CURRENT CORRELATION

In [None]:
# Correlation heatmap of the fully encoded training frame.
fig, ax = plt.subplots(figsize=(20,10))
sns.heatmap(td.corr(), annot=True, ax=ax)
plt.show()

##### LOOKS PRETTY OKAY.
##### OUR TRAINING DATASET IS READY. LET'S CHECK OUR TESTING DATASET

In [None]:
# Load the test split from the Kaggle input directory.
tstd = pd.read_csv('/kaggle/input/titanic/test.csv')
tstd.head()

### So as we can see, we have to repeat all steps we did on the training data to make it an appropriate testing dataset.

##### We know which columns to drop. We drop them without further analysis

In [None]:
# Remove the columns that were also removed from the training data.
# PassengerId stays because the output file needs it.
tstd = tstd.drop(columns=['Ticket','Fare','Cabin'])
tstd.isnull().sum()

##### Filling in the missing values in Age

In [None]:
# Fill each missing Age with the mean Age of that passenger's own Pclass.
# transform('mean') returns a Series aligned row by row with tstd, so this neither
# hard-codes the number of classes nor relies on list positions.
# NOTE(review): the means are computed from the test rows themselves; confirm
# whether the training-set class means should be reused here instead.
tstd['Age'] = tstd['Age'].fillna(tstd.groupby('Pclass')['Age'].transform('mean'))
tstd.isnull().sum()

##### Extracting and handling Title, simultaneously dropping Name

In [None]:
# Extract the honorific from Name: the run of letters immediately before a '.'.
# One vectorised call handles every row, so no Python-level loop is needed. The
# pattern is a raw string so '\.' reaches the regex engine unchanged, and
# expand=False yields a Series that can be assigned directly to a column.
tstd['Title'] = tstd['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)
# Dropping Name
tstd.drop(columns=['Name'],inplace=True)
# Replacing by mapping. The lookup holds every title grouped for the training
# data plus 'Dona', so re-using the name `title_mapping` does not leave a
# narrower mapping behind for later cells.
title_mapping = {'Don':'Rare','Rev':'Rare','Mme':'Miss','Ms':'Miss','Major':'Rare','Lady':'Royal','Sir':'Royal','Mlle':'Miss','Col':'Rare','Capt':'Rare','Countess':'Royal','Jonkheer':'Royal','Dona':'Royal'}

tstd['Title'] = tstd['Title'].replace(title_mapping)
print(tstd.Title.unique())
tstd.head()

##### Handling Pclass

In [None]:
# Decode the numeric class codes 1/2/3 into readable labels.
# map() builds the string column in one step. Pre-creating a float NaN column
# and then writing strings into it makes pandas change the column dtype on
# assignment, which recent versions warn about.
new_pc = ['first','second','third']
tstd['Pclass_new'] = tstd['Pclass'].map(dict(enumerate(new_pc, start=1)))
tstd.drop(columns=['Pclass'],inplace=True)
tstd.head()

##### Combining SibSp and Parch to Fam

In [None]:
# Family size on board = siblings/spouses + parents/children; the two source
# columns are removed once combined.
tstd['Fam'] = tstd['SibSp'].add(tstd['Parch'])
tstd = tstd.drop(columns=['SibSp','Parch'])
tstd.head()

##### OHE features Sex, Embarked, Title, Pclass_new

In [None]:
# One-hot encode the categorical columns of the test frame; drop_first=True keeps
# k-1 indicator columns for a feature with k classes.
encd_col1 = ['Pclass_new','Sex','Embarked','Title']
categorical_part_test = tstd[encd_col1]
ohe_features2 = pd.get_dummies(categorical_part_test, drop_first=True)

# Replace the original categorical columns with their encoded counterparts.
tstd = tstd.drop(columns=encd_col1)
tstd = tstd.join(ohe_features2)
tstd.head(10)

##### COMPARING TRAIN AND TEST DATA

In [None]:
# Processed training frame, for a side-by-side column check against the test frame.
td.head()

In [None]:
# Processed test frame, for a side-by-side column check against the training frame.
tstd.head()

##### LOOKS GOOD.
##### NOW WE START BUILDING THE MODELS
We'll focus on these models :
1. Logistic Regression (as it is classification based)
2. KNN
3. Random Forest
4. Adaboost

##### LOGISTIC REGRESSION

In [None]:
# Hold out 20% of the labelled rows for evaluation (conventional 80-20 split).
X = td.drop(columns=['Survived'])
Y = td['Survived'].values
# A fixed seed makes every metric below reproducible across kernel restarts;
# stratifying keeps the survived/deceased ratio the same in both partitions.
trainx, testx, trainy, testy = train_test_split(X, Y, test_size=0.2, random_state=42, stratify=Y)
# Plain-array versions of the same data. They are not referenced again in the
# visible notebook and are kept only so these names stay defined.
# NOTE(review): the iloc slicing assumes 'Survived' is the first column of td.
x,y = np.array(td.iloc[:,1:].values),np.array(td.iloc[:,0].values)
test = np.array(tstd.iloc[:,:].values)
print("Train : ",trainx.shape,trainy.shape)
print("Test : ",testx.shape,testy.shape)

# Creating the model. max_iter is raised so lbfgs has room to converge on the
# unscaled features.
logr = LogisticRegression(penalty='l2',C=1.0,solver='lbfgs',max_iter=1000)
logr.fit(trainx,trainy)

# Predicted class probabilities; the second column is the probability of class 1
# (survived), which is the score the ROC curve is built from.
y_pred1 = logr.predict_proba(testx)
fptp = y_pred1[:,1]

# Getting the ROC-AUC score and plotting the ROC curve
logr_score = roc_auc_score(testy,fptp)
print("ROC AUC score = ",logr_score)
lr_fp,lr_tp,_ = roc_curve(testy,fptp)   # Returns FPR, TPR and thresholds.
plt.figure(figsize=(20,10))
plt.plot(lr_fp,lr_tp,marker='.',label="Logistic Regression ROC Curve")
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()   # without this call the label given to plot() is never displayed
plt.show()

## Confusion matrix and accuracy score

In [None]:
# Hard predictions on the hold-out rows, then accuracy = correct / total taken
# from the confusion matrix.
y_pred2 = logr.predict(testx)
cm_logr = confusion_matrix(testy, y_pred2)
tn, fp, fn, tp = cm_logr.ravel()
acc1 = (tn + tp) / cm_logr.sum()
print(acc1)

## Creating output file

In [None]:
# Build the submission for the logistic-regression model.
# Train and test were one-hot encoded independently, so the test features are
# aligned to the training columns (same set, same order) before predicting; an
# indicator column missing from the test frame is filled with 0.
test_features = tstd.drop(columns=['PassengerId']).reindex(columns=trainx.columns, fill_value=0)
op1 = logr.predict(test_features)
opf_df1 = pd.DataFrame({'PassengerId': tstd.PassengerId, 'Survived': op1})
opf_df1.to_csv('Balaka_LGR.csv', index=False)

## KNN

In [None]:
# Standardise the training features; the grid search below runs on the scaled copy.
scaler = StandardScaler()
trainx_scaled = scaler.fit_transform(trainx)

# Hyperparameter tuning: exhaustive search over the grid with 10-fold CV.
knn = KNeighborsClassifier()
param_grid = dict(
    n_neighbors=[3,5,7,9],
    weights=['uniform','distance'],
    metric=['euclidean','manhattan','minkowski'],
    algorithm=['auto','ball_tree','kd_tree','brute'],
)
knn_gs = GridSearchCV(knn, param_grid, cv=10)
knn_gs.fit(trainx_scaled,trainy)
print(knn_gs.best_score_)
print(knn_gs.best_params_)

In [None]:
# Defining the knn classifier with the tuned hyper-parameters.
# The grid search was run on standardised features, so the final model gets the
# same scaling. Wrapping scaler + classifier in one pipeline means every later
# predict()/predict_proba() call can keep passing raw, unscaled features.
knn_best = make_pipeline(StandardScaler(), KNeighborsClassifier(**knn_gs.best_params_))
knn_best.fit(trainx,trainy)

# Predicted class probabilities; the second column is the probability of class 1
# (survived), which is the score the ROC curve is built from.
y_pred3 = knn_best.predict_proba(testx)
fptp2 = y_pred3[:,1]

# Getting the ROC-AUC score and plotting the ROC curve
knn_score = roc_auc_score(testy,fptp2)
print("ROC AUC score = ",knn_score)
lr_fp2,lr_tp2,_ = roc_curve(testy,fptp2)   # Returns FPR, TPR and thresholds.
plt.figure(figsize=(20,10))
plt.plot(lr_fp2,lr_tp2,marker='.',label="KNN ROC Curve")
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()   # without this call the label given to plot() is never displayed
plt.show()

## Confusion matrix and accuracy score

In [None]:
# Hard predictions on the hold-out rows, then accuracy = correct / total taken
# from the confusion matrix.
y_pred4 = knn_best.predict(testx)
cm_knn = confusion_matrix(testy, y_pred4)
tn, fp, fn, tp = cm_knn.ravel()
acc2 = (tn + tp) / cm_knn.sum()
print(acc2)

## Creating O/P file

In [None]:
# Build the submission for the KNN model.
# Train and test were one-hot encoded independently, so the test features are
# aligned to the training columns (same set, same order) before predicting; an
# indicator column missing from the test frame is filled with 0.
test_features = tstd.drop(columns=['PassengerId']).reindex(columns=trainx.columns, fill_value=0)
op2 = knn_best.predict(test_features)
opf_df2 = pd.DataFrame({'PassengerId': tstd.PassengerId, 'Survived': op2})
opf_df2.to_csv('Balaka_KNN.csv', index=False)

## Random Forest Classifier

In [None]:
# Seeded so the cross-validation scores are reproducible between runs.
rf = RandomForestClassifier(random_state=42)

# Hyperparameter tuning: exhaustive search over the grid with 10-fold CV.
# 'auto' is left out of max_features: for a classifier it meant the same as
# 'sqrt', and current scikit-learn releases no longer accept it.
param_grid = {
    'n_estimators' : [80,90,100],
    'criterion' : ['gini','entropy'],
    'max_depth' : [5,6,7,9],
    'max_features' : ['sqrt','log2']
}
rf_gs = GridSearchCV(estimator=rf,param_grid=param_grid,cv=10)
rf_gs.fit(trainx,trainy)
print(rf_gs.best_score_)
print(rf_gs.best_params_)

In [None]:
# Defining the rf classifier with the tuned hyper-parameters; seeded so the
# fitted forest and its metrics are reproducible between runs.
rf_best = RandomForestClassifier(random_state=42, **rf_gs.best_params_)
rf_best.fit(trainx,trainy)

# Predicted class probabilities; the second column is the probability of class 1
# (survived), which is the score the ROC curve is built from.
y_pred5 = rf_best.predict_proba(testx)
fptp3 = y_pred5[:,1]

# Getting the ROC-AUC score and plotting the ROC curve
rf_score = roc_auc_score(testy,fptp3)
print("ROC AUC score = ",rf_score)
lr_fp3,lr_tp3,_ = roc_curve(testy,fptp3)   # Returns FPR, TPR and thresholds.
plt.figure(figsize=(20,10))
plt.plot(lr_fp3,lr_tp3,marker='.',label="RF ROC Curve")
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()   # without this call the label given to plot() is never displayed
plt.show()

## Confusion matrix and accuracy score.

## Creating O/P file

In [None]:
# Build the submission for the random-forest model.
# Train and test were one-hot encoded independently, so the test features are
# aligned to the training columns (same set, same order) before predicting; an
# indicator column missing from the test frame is filled with 0.
test_features = tstd.drop(columns=['PassengerId']).reindex(columns=trainx.columns, fill_value=0)
op3 = rf_best.predict(test_features)
opf_df3 = pd.DataFrame({'PassengerId': tstd.PassengerId, 'Survived': op3})
opf_df3.to_csv('Balaka_RF2.csv', index=False)

## AdaBoostClassifier

In [None]:
# Hyperparameter tuning: exhaustive search over ensemble size and boosting
# variant with 10-fold CV.
# NOTE(review): recent scikit-learn releases deprecate the 'SAMME.R' option;
# confirm it is still accepted by the installed version.
adb = AdaBoostClassifier()
param_grid = dict(
    n_estimators=[20,30,40,50],
    algorithm=['SAMME', 'SAMME.R'],
)
adb_gs = GridSearchCV(adb, param_grid, cv=10)
adb_gs.fit(trainx,trainy)
print(adb_gs.best_score_)
print(adb_gs.best_params_)

In [None]:
# Defining the adb classifier with the tuned hyper-parameters.
adb_best = AdaBoostClassifier(n_estimators=adb_gs.best_params_.get('n_estimators'),algorithm=adb_gs.best_params_.get('algorithm'))
adb_best.fit(trainx,trainy)

# Predicted class probabilities; the second column is the probability of class 1
# (survived), which is the score the ROC curve is built from.
y_pred7 = adb_best.predict_proba(testx)
fptp4 = y_pred7[:,1]

# Getting the ROC-AUC score and plotting the ROC curve
adb_score = roc_auc_score(testy,fptp4)
print("ROC AUC score = ",adb_score)
lr_fp4,lr_tp4,_ = roc_curve(testy,fptp4)   # Returns FPR, TPR and thresholds.
plt.figure(figsize=(20,10))
plt.plot(lr_fp4,lr_tp4,marker='.',label="AdaBoost ROC Curve")
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()   # without this call the label given to plot() is never displayed
plt.show()

## Confusion matrix and accuracy

In [None]:
# Hard predictions on the hold-out rows, then accuracy = correct / total taken
# from the confusion matrix.
y_pred8 = adb_best.predict(testx)
cm_adb = confusion_matrix(testy, y_pred8)
tn, fp, fn, tp = cm_adb.ravel()
acc4 = (tn + tp) / cm_adb.sum()
print(acc4)

## Creating the O/P file

In [None]:
# Build the submission for the AdaBoost model.
# Train and test were one-hot encoded independently, so the test features are
# aligned to the training columns (same set, same order) before predicting; an
# indicator column missing from the test frame is filled with 0.
test_features = tstd.drop(columns=['PassengerId']).reindex(columns=trainx.columns, fill_value=0)
op4 = adb_best.predict(test_features)
opf_df4 = pd.DataFrame({'PassengerId': tstd.PassengerId, 'Survived': op4})
opf_df4.to_csv('Balaka_ADBoost.csv', index=False)

## Comparison

In [None]:
# Compare the models on one metric: accuracy on the same hold-out rows.
# Mixing a hold-out accuracy with cross-validation best scores would put numbers
# measured on different data into one column.
names = ['Logistic Regression','KNN','Random Forest','Adaboost']
# No hold-out accuracy was computed for the random forest in its own section,
# so it is computed here.
acc3 = accuracy_score(testy, rf_best.predict(testx))
vals = [acc1,acc2,acc3,acc4]
res_df = pd.DataFrame({'Algorithm': names,'Accuracy': vals})
res_df

## XGBoost

In [None]:
from xgboost import XGBClassifier

# Hyperparameter tuning: exhaustive search over learning rate and tree depth
# with 3-fold CV.
xgb = XGBClassifier()
param_grid = dict(
    learning_rate=[0.1, 0.2],
    max_depth=[3, 5, 7],
)

xgb_gs = GridSearchCV(xgb, param_grid, cv=3)
xgb_gs.fit(trainx,trainy)
print(xgb_gs.best_score_)
print(xgb_gs.best_params_)

In [None]:
# Defining the XGBoost classifier with the tuned hyper-parameters.
xgb_best = XGBClassifier(learning_rate=xgb_gs.best_params_.get('learning_rate'),max_depth=xgb_gs.best_params_.get('max_depth'))
xgb_best.fit(trainx,trainy)

# Predicted class probabilities; the second column is the probability of class 1
# (survived), which is the score the ROC curve is built from.
y_pred11 = xgb_best.predict_proba(testx)
fptp6 = y_pred11[:,1]

# Getting the ROC-AUC score and plotting the ROC curve
xgb_score = roc_auc_score(testy,fptp6)
print("ROC AUC score = ",xgb_score)
lr_fp6,lr_tp6,_ = roc_curve(testy,fptp6)   # Returns FPR, TPR and thresholds.
plt.figure(figsize=(20,10))
plt.plot(lr_fp6,lr_tp6,marker='.',label="XGBoost ROC Curve")
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()   # without this call the label given to plot() is never displayed
plt.show()

In [None]:
# Hard predictions on the hold-out rows, then accuracy = correct / total taken
# from the confusion matrix.
y_pred9 = xgb_best.predict(testx)
cm_xgb = confusion_matrix(testy, y_pred9)
tn, fp, fn, tp = cm_xgb.ravel()
acc5 = (tn + tp) / cm_xgb.sum()
print(acc5)

In [None]:
# Build the submission for the XGBoost model.
# Train and test were one-hot encoded independently, so the test features are
# aligned to the training columns (same set, same order) before predicting; an
# indicator column missing from the test frame is filled with 0.
test_features = tstd.drop(columns=['PassengerId']).reindex(columns=trainx.columns, fill_value=0)
op5 = xgb_best.predict(test_features)
opf_df5 = pd.DataFrame({'PassengerId': tstd.PassengerId, 'Survived': op5})
opf_df5.to_csv('Balaka_XGBoost2.csv', index=False)