In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline 

In [None]:
df=pd.read_csv("../input/titanic/train.csv")

In [None]:
df.info()

In [None]:
df_test=pd.read_csv("../input/titanic/test.csv")

In [None]:
df_test.info()

In [None]:
# Stack train and test rows so cleaning/feature engineering is applied once.
# ignore_index=True gives every row a unique label; without it the two frames'
# labels (both starting at 0) collide, which makes label-based lookups ambiguous.
# sort=True orders the columns alphabetically.
data=pd.concat([df,df_test],axis=0,sort=True,ignore_index=True)

In [None]:
data.head(5)

In [None]:
data.info()

In [None]:
data.isnull().sum()

In [None]:
plt.figure(figsize=(10,10))
sns.heatmap(data.isnull(),yticklabels=False,cmap="viridis")

In [None]:
#The Cabin column has too many null values, so it is better to drop it
data.drop("Cabin",axis=1,inplace=True)

In [None]:
# Pairwise correlation between the numeric columns.
# Text columns (e.g. Name, Sex, Ticket) are excluded explicitly: newer pandas
# raises on them instead of silently skipping them.
data.select_dtypes(include="number").corr()

In [None]:
df["Embarked"].unique()

In [None]:
# Fill missing Embarked values in the training frame with the most frequent port.
# NOTE: this only changes `df`; `data` was built by pd.concat before this cell
# and is filled separately in a later cell.
mode_1=df["Embarked"].mode()[0]
# Assign the result back instead of calling fillna(inplace=True) on the column
# selection: the chained in-place form is deprecated and does nothing under
# pandas Copy-on-Write.
df["Embarked"]=df["Embarked"].fillna(mode_1)

In [None]:
#Plotting a histogram of Age (50 bins)
plt.figure(figsize=(12,7))
ax=data["Age"].plot(kind="hist",bins=50,color="grey")
# Set the label after plotting: pandas adjusts axis labels while drawing the
# histogram, so a label set beforehand can be overwritten.
ax.set_xlabel("Age");

In [None]:
#This plot clearly states that most passengers had no siblings
sns.countplot(x="SibSp",data=data)

In [None]:
# Bar chart of each numeric feature's correlation with the Survived label.
plt.figure(figsize=(12,7))
# Keep numeric columns only; text columns cannot be correlated and make newer
# pandas raise instead of being dropped silently.
numeric_features=data.drop("Survived",axis=1).select_dtypes(include="number")
corr=numeric_features.corrwith(data["Survived"])
corr.plot(kind="bar",rot=40)

In [None]:
#finding which gender does the most survivors belong to
sns.countplot(x="Survived",data=data,hue="Sex")
#It clearly indicates most survivors were females

In [None]:
# Survivor / non-survivor counts split by passenger class.
# `x` is passed by keyword (as in the other countplot cells); recent seaborn
# versions no longer accept it positionally.
sns.countplot(x="Survived",data=data,hue="Pclass")
plt.ylabel("PASSENGERS")
#The graph indicates that most survivors belonged to Pclass 1

In [None]:
#This boxplot graph depicts the age groups of passenger class
sns.boxplot(x="Pclass",y="Age",data=data)

In [None]:
#Since the boxplot above shows a distinct age distribution for each passenger class, missing ages are filled with a class-specific value
def impute_age(cols):
    """Return the age for one passenger, substituting a per-class default when missing.

    Parameters
    ----------
    cols : pandas.Series or sequence
        Two values in the order (Pclass, Age), e.g. one row of
        ``data[["Pclass", "Age"]]`` as passed by ``DataFrame.apply(axis=1)``.

    Returns
    -------
    The original age when it is present; otherwise 39 for class 1, 28 for
    class 2 and 22 for any other class.
    """
    # Read the two fields by position. On a label-indexed Series, plain
    # cols[0] relies on a deprecated positional fallback, so use .iloc there
    # and keep ordinary indexing for lists/tuples.
    if hasattr(cols, "iloc"):
        pclass, age = cols.iloc[0], cols.iloc[1]
    else:
        pclass, age = cols[0], cols[1]

    if not pd.isnull(age):
        return age

    # Class-specific fallback ages (presumably read off the Pclass/Age boxplot
    # shown before this cell - confirm if the data changes).
    if pclass == 1:
        return 39
    if pclass == 2:
        return 28
    return 22
        
        

In [None]:
data["Age"]=data[["Pclass","Age"]].apply(impute_age,axis=1)

In [None]:
data["Name"]

In [None]:
# Extract the title from Name: take the text after the first comma, then the
# part before the first period, and strip surrounding whitespace.
data["Title"]=data["Name"].str.split(",").str[1].str.split(".").str[0].str.strip()

In [None]:
data["Title"].unique()

In [None]:
# Coarse passenger category -> the raw titles that are folded into it.
_TITLE_GROUPS = {
    "Mr": ("Mr", "Master", "Sir"),
    "Mrs": ("Mrs",),
    "Miss": ("Miss", "Mme", "Ms", "Lady", "Mlle"),
    "Staff": ("Rev", "Dr", "Col", "Capt"),
    "VIP": ("Don", "Major", "the Countess", "Jonkheer", "Dona"),
}

# Invert the grouping into the raw-title -> category lookup used by Series.map.
title_map = {
    raw_title: category
    for category, raw_titles in _TITLE_GROUPS.items()
    for raw_title in raw_titles
}

In [None]:
data["Title"]=data["Title"].map(title_map)

In [None]:
data.drop("Name",axis=1,inplace=True)

In [None]:
data.head(5)

In [None]:
#Further Feature Engineering of the data


In [None]:
data["mother"]=np.where((data.Title=="Mrs")& (data.Parch>0),1,0)

In [None]:
data["Free"]=np.where(data.Fare==0,1,0)

In [None]:
# Survivor / non-survivor counts split by port of embarkation.
# `x` is passed by keyword; recent seaborn versions reject it positionally.
sns.countplot(x="Survived",data=data,hue="Embarked")
#It can be seen from the graph below that the lowest number of survivors are from Queenstown

In [None]:
#Extracting the numbers from the ticket column 
data["Unique_Numbers"]=data["Ticket"].str.split(" ").str[-1].str.strip()

In [None]:
#Replacing the only string in the Unique_Numbers column with 0

In [None]:
# Swap the non-numeric "LINE" ticket value for "0" so the column can be parsed
# as numbers. The result is assigned back: replace(inplace=True) on a column
# selection is deprecated and does nothing under pandas Copy-on-Write, which
# would leave "LINE" in place.
data["Unique_Numbers"]=data["Unique_Numbers"].replace("LINE","0")

In [None]:
#Converting the string to integer values
data["Unique_Numbers"]=pd.to_numeric(data["Unique_Numbers"])

In [None]:
data["Unique_Numbers"]

In [None]:
data.head(5)

In [None]:
data.drop("Ticket",axis=1,inplace=True)

In [None]:
data.head(5)

In [None]:
data["Sex"]=data["Sex"].map({"male":1,"female":0})

In [None]:
data.head(5)

In [None]:
data["Embarked"].unique()

In [None]:
data.isnull().sum()

In [None]:
#filling the missing value in the embarked column with mode
mode_1=data["Embarked"].mode()[0]
# Assign back rather than fillna(inplace=True) on the column selection, which
# is deprecated and a no-op under pandas Copy-on-Write.
data["Embarked"]=data["Embarked"].fillna(mode_1)

In [None]:
data.isnull().sum()

In [None]:
#Filling the fare column missing value with mean
mean_1=data["Fare"].mean()
# Assign back rather than fillna(inplace=True) on the column selection, which
# is deprecated and a no-op under pandas Copy-on-Write.
data["Fare"]=data["Fare"].fillna(mean_1)

In [None]:
data.isnull().sum()

In [None]:
#data.drop("PassengerId",axis=1,inplace=True)

In [None]:
data.head(5)

In [None]:
#Converting the categorical values of embarked into numeric values
data["Embarked"]=data["Embarked"].map({"S":0,"C":1,"Q":2})

In [None]:
data.head(5)

In [None]:
# Mapping Fare
# Bucket Fare into four ordinal codes (0-3) using fixed cut points.
# The rules run top to bottom on the same column. Codes already written (0-2)
# are all <= 7.91, so the later rules (which require > 7.91) cannot re-match them.
data.loc[ data['Fare'] <= 7.91, 'Fare'] = 0
data.loc[(data['Fare'] > 7.91) & (data['Fare'] <= 14.454), 'Fare'] = 1
data.loc[(data['Fare'] > 14.454) & (data['Fare'] <= 31), 'Fare']   = 2
data.loc[ data['Fare'] > 31, 'Fare'] = 3
# Store the fare codes as integers.
data['Fare'] = data['Fare'].astype(int)

# Mapping Age
# Bucket Age into five ordinal codes (0-4) in 16-year bands. As with Fare, the
# codes already written (0-3) are <= 16, so the later rules (> 16) skip them.
# No dtype cast is applied to Age here.
data.loc[ data['Age'] <= 16, 'Age'] = 0
data.loc[(data['Age'] > 16) & (data['Age'] <= 32), 'Age'] = 1
data.loc[(data['Age'] > 32) & (data['Age'] <= 48), 'Age'] = 2
data.loc[(data['Age'] > 48) & (data['Age'] <= 64), 'Age'] = 3
data.loc[ data['Age'] > 64, 'Age'] = 4 ;

In [None]:
#creating a count plot of title to assess which specific category was most present in the ship
# `x` is passed by keyword; recent seaborn versions reject it positionally.
sns.countplot(x="Title",data=data,palette="Accent")

In [None]:
data["Title"]=data["Title"].map({"Mr":0,"Mrs":1,"Miss":2,"VIP":3,"Staff":4})

In [None]:
data.isnull().sum()

In [None]:
data.head(5)

In [None]:
# Recover the training rows from the combined frame: they are the rows that
# carry a Survived label (test rows got NaN there when the frames were
# concatenated). This avoids hard-coding the training-set size (891).
# .copy() makes the result independent of `data`.
# NOTE(review): assumes the training file has no missing Survived values.
df=data[data["Survived"].notna()].copy()

In [None]:
df.info()

In [None]:
# Recover the test rows from the combined frame: they are the rows without a
# Survived label (NaN after concatenation). This avoids hard-coding the
# training-set size (891) as the split offset. .copy() makes the result
# independent of `data`, so the later in-place column drop is safe.
# NOTE(review): assumes the training file has no missing Survived values.
df_test=data[data["Survived"].isna()].copy()

In [None]:
df_test.info()

In [None]:
df_test.head(5)

In [None]:
df_test.drop("Survived",axis=1,inplace=True)

In [None]:
df_test.head(5)

In [None]:
X=df.drop("Survived",axis=1)

In [None]:
y=df["Survived"]

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
#Using kfold cross validation
from sklearn.model_selection import KFold, cross_val_predict
kf = KFold(shuffle=True, random_state=42, n_splits=3)
# kf.split yields POSITIONAL row indices, so both X and y are sliced with
# .iloc. Plain y[train_index] would look the numbers up as index labels and
# only works while the labels happen to equal the positions.
# NOTE: each iteration overwrites the previous split, so only the last fold's
# train/test partition is left in X_train/X_test/y_train/y_test after the loop
# (this also replaces any earlier train_test_split result).
for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest baseline with hand-picked hyperparameters.
# random_state is fixed so that re-running the notebook reproduces the same
# forest (and the same scores) instead of a new random one each time.
rfc=RandomForestClassifier(
    n_estimators=400,
    min_samples_split=2,
    min_samples_leaf=4,
    max_features='sqrt',
    max_depth=70,
    bootstrap=True,
    random_state=42,
)
rfc.fit(X_train,y_train)

In [None]:
pred_1=rfc.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score,confusion_matrix
print(accuracy_score(y_test,pred_1))
print(confusion_matrix(y_test,pred_1))

In [None]:
#hyperparameters of random forest to be fine tuned
# max_features: 'auto' was an alias of 'sqrt' for classifiers and has been
# removed from scikit-learn, so it is replaced by 'log2' to keep two distinct,
# valid options in the search space.
param_grids={'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
 'max_features': ['sqrt', 'log2'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}

In [None]:
from sklearn.model_selection import RandomizedSearchCV
# Randomized hyperparameter search over param_grids with 3-fold CV.
# The original passed verbose=-1, which is not a valid verbosity level and is
# rejected by scikit-learn's parameter validation; it presumably was meant to
# be n_jobs=-1 (use all CPU cores), which is what is set here.
rsc=RandomizedSearchCV(rfc,param_grids,cv=3,n_jobs=-1,random_state=42)
rsc.fit(X_train,y_train)
# Evaluate the refitted best estimator on the held-out fold.
predict_7=rsc.predict(X_test)
print(confusion_matrix(y_test,predict_7))
rsc.best_params_

In [None]:
#Using xgboost classifier
from xgboost.sklearn import XGBClassifier
# Gradient-boosted trees: a very small learning rate (0.001) paired with a
# large number of boosting rounds (2500); rows and columns are both subsampled
# at 70% per tree, and the seed is fixed at 27.
xgb=XGBClassifier(learning_rate=0.001,n_estimators=2500,
                                max_depth=4, min_child_weight=0,
                                gamma=0, subsample=0.7,
                                colsample_bytree=0.7,
                                scale_pos_weight=1, seed=27,
                                reg_alpha=0.00006)
# Fit on the current training split and report the confusion matrix on the
# current held-out split.
xgb.fit(X_train,y_train)
predict_12=xgb.predict(X_test)
print(confusion_matrix(y_test,predict_12))


In [None]:
#Using decision tree classifier
from sklearn.tree import DecisionTreeClassifier
# random_state is fixed so the tree (and its confusion matrix) is reproducible
# across notebook re-runs.
dtree=DecisionTreeClassifier(random_state=42)
dtree.fit(X_train,y_train)
predict_14=dtree.predict(X_test)
print(confusion_matrix(y_test,predict_14))

In [None]:
#Using the above algorithms, we conclude so far that the random forest classifier gives the best accuracy after hyperparameter tuning

In [None]:
#However, there is another algorithm that may give better results: the AdaBoost classifier

In [None]:
#Using adaboost classifier
from sklearn.ensemble import AdaBoostClassifier
# The base learner is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in newer scikit-learn (the old name was later
# removed), while the first positional slot works with both old and new versions.
ad=AdaBoostClassifier(rfc,n_estimators=200,learning_rate=0.0001)
ad.fit(X_train,y_train)
predict_5=ad.predict(X_test)
print(accuracy_score(y_test,predict_5))

In [None]:
print(confusion_matrix(y_test,predict_5))

In [None]:
predicting=rsc.predict(df_test).astype(int)

In [None]:
submission=pd.DataFrame({"PassengerId":df_test["PassengerId"],"Survived":predicting})

In [None]:
print(submission)

In [None]:
submission=submission.set_index("PassengerId")

In [None]:
print(submission)

In [None]:
submission.to_csv("mashood_titanic.csv")