# Reading the train and test files

In [343]:
import warnings
warnings.filterwarnings('ignore')

In [344]:
import pandas as pd
# Load both competition files; `combine` lets later cells apply the same
# transformation to the train and test frames in one loop.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/titanic/train.csv', '../input/titanic/test.csv')
)
combine = [train, test]

In [345]:
# PassengerId is dropped from train only; test keeps it because the
# submission file needs it.
# The drop stays in place so that combine[0], which is this same object, sees it.
train.drop(columns=['PassengerId'], inplace=True)
train.head()

### Checking for missing values

In [346]:
# DataFrame.info() writes its report itself and returns None, so it is called
# directly instead of being wrapped in print() (which appended a stray "None").
train.info()

test.info()

### There are a considerable number of missing values in Cabin in both the test and train datasets, so that column is dropped.
There are missing values in Age, Embarked and Fare as well, but since the numbers are small, they can be managed.

In [347]:
# Remove the Cabin column from both frames
for frame in combine:
    frame.drop(columns=['Cabin'], inplace=True)
    

In [348]:
# Compare the number of rows with the number of distinct Ticket values in each frame
for frame in combine:
    print('Dataset has ', frame.shape[0], ' entries and ', frame['Ticket'].nunique(), ' values of Tickets')

In [349]:
# Working with the missing values in Embarked:
# list first-class passengers whose fare lies strictly between 78 and 83
in_fare_band = train['Fare'].gt(78) & train['Fare'].lt(83)
train.loc[in_fare_band & train['Pclass'].eq(1)]

In [350]:
# Fill the missing ports with 'C', the mode among passengers of the same class and similar fare
train.loc[train['Embarked'].isna(), 'Embarked'] = 'C'

# Does group and individual booking have an impact on survival rate?

In [351]:
# Show every passenger travelling on one example ticket
train.loc[train['Ticket'].eq('W./C. 6607')]

#### It seems that there are people who booked the ticket together, usually belonging to the same family and have same last name.

In [352]:
# Flag passengers whose ticket number appears more than once in the same frame.
# duplicated(keep=False) marks every occurrence of a repeated ticket, which is
# the same flag the per-row membership test produced, without rescanning the
# list of repeated tickets for each row.
for data in combine:
    data['group_booking'] = data['Ticket'].duplicated(keep=False).astype(int)

In [353]:
# Mean of Survived for each value of the group_booking flag
train.groupby('group_booking')[['Survived']].mean()

#### Group booking has a higher rate of survival. 

In [354]:
# Dropping the Ticket column as we have extracted the group_booking flag from it
for frame in combine:
    frame.drop(columns=['Ticket'], inplace=True)

In [355]:
# Preview train now that Ticket has been replaced by the group_booking flag
train.head()

# Does the gender of a person have an impact on their survival rate?


In [356]:
## Label encoding the Sex column: 'male' becomes 1, every other value becomes 0
for frame in combine:
    frame['Sex'] = (frame['Sex'] == 'male').astype(int)

In [357]:
# Checking the survival rate for each encoded Sex value
train.groupby('Sex')[['Survived']].mean()

#### Females have a higher chance of survival

# Do titles have an impact on the survival rate?


In [358]:
# Pull the title out of Name: the word that follows a space and ends with a period.
# The pattern is a raw string so that \w and \. reach the regex engine without
# Python treating them as (invalid) string escapes, and expand=False returns a
# Series rather than a one-column DataFrame.
for data in combine:
    data['Salutation'] = data['Name'].str.extract(r' (\w+)\.', expand=False)

In [359]:
# Number of passengers per Salutation, most frequent first
train.groupby('Salutation')[['Survived']].count().sort_values('Survived', ascending=False)

In [360]:
# Keep the four frequent salutations and fold every other value into 'Others',
# then label encode the result into Title.
# Folding by exclusion (instead of listing the rare titles by name) means a
# salutation that is not in any hard-coded list still gets code 4 rather than
# a missing Title.
common_salutations = ['Mr', 'Miss', 'Mrs', 'Master']
title_codes = {'Mr': 0, 'Miss': 1, 'Mrs': 2, 'Master': 3, 'Others': 4}
for data in combine:
    is_common = data['Salutation'].isin(common_salutations)
    data['Salutation'] = data['Salutation'].where(is_common, 'Others')
    data['Title'] = data['Salutation'].map(title_codes)
    # Name is no longer needed once the salutation has been extracted from it
    data.drop(['Name'], axis=1, inplace=True)

In [361]:
# Mean of Survived for each grouped Salutation
train.groupby('Salutation')[['Survived']].mean()

#### Passengers with the title Mrs had the highest chance of surviving, followed by Miss and children.


In [362]:
# Salutation is removed because the Title column already holds its label encoding
for frame in combine:
    frame.drop(columns=['Salutation'], inplace=True)
    

In [363]:
# Preview train with the encoded Title column in place of Name and Salutation
train.head()

In [364]:
# Build one 0/1 indicator column per frequent title from the Title codes
# ({'Mr':0,'Miss':1,'Mrs':2,'Master':3,'Others':4}); 'Others' is the row
# where all four indicators are 0.
import numpy as np  # imported here because later cells rely on `np` being defined

# Insertion order fixes the order in which the columns are appended.
indicator_codes = {'Mr': 0, 'Mrs': 2, 'Miss': 1, 'Master': 3}
for data in combine:
    for column, code in indicator_codes.items():
        # A missing Title compares unequal to every code and yields 0 everywhere
        data[column] = (data['Title'] == code).astype(int)



In [365]:
# Mean of Survived for each Title code
train.groupby('Title')[['Survived']].mean()

#### People with the titles Miss and Mrs have a higher survival rate, followed by children. The lowest survival rate is that of Mr.

In [366]:
# Title is dropped from both frames; the indicator columns built from it remain
for frame in combine:
    frame.drop(columns=['Title'], inplace=True)

# Checking the dependence of Embarked Column on Survival Rate

In [367]:
# Label encode the port letters; a value outside this mapping becomes missing
port_codes = {'S': 0, 'C': 1, 'Q': 2}
for frame in combine:
    frame['Embarked'] = frame['Embarked'].map(port_codes)

In [368]:
# Mean of Survived for each encoded Embarked value
train.groupby('Embarked')[['Survived']].mean()

#### People who started their journey from Cherbourg have a higher survival rate than those from the other boarding points

In [369]:
# Preview train after Embarked has been label encoded
train.head()

# Handling missing values in Age

In [370]:
# Rows of train that still have no Age
train.loc[train['Age'].isnull()]

In [371]:
# Mean known Age per title group, computed on train only.
age_known = train['Age'].notna()
mr_mean = train.loc[age_known & (train['Mr'] == 1), 'Age'].mean()
miss_mean = train.loc[age_known & (train['Miss'] == 1), 'Age'].mean()
mrs_mean = train.loc[age_known & (train['Mrs'] == 1), 'Age'].mean()
master_mean = train.loc[age_known & (train['Master'] == 1), 'Age'].mean()
# "Others" are the passengers with none of the four indicators set, i.e. the
# indicator sum is 0. Comparing the sum with 1 would instead select everyone
# who does carry one of the four titles.
has_common_title = (train['Master'] + train['Mr'] + train['Mrs'] + train['Miss']) == 1
others_mean = train.loc[age_known & ~has_common_title, 'Age'].mean()

In [372]:
# Fill each missing Age with the train mean of the passenger's title group.
# np.select takes the first matching condition, which keeps the
# Mr -> Miss -> Mrs -> Master -> others priority of a row-wise if/elif chain
# while filling every missing value in one vectorized step.
for data in combine:
    title_mean = np.select(
        [
            (data['Mr'] == 1).to_numpy(),
            (data['Miss'] == 1).to_numpy(),
            (data['Mrs'] == 1).to_numpy(),
            (data['Master'] == 1).to_numpy(),
        ],
        [mr_mean, miss_mean, mrs_mean, master_mean],
        default=others_mean,
    )
    # Wrapping in a Series with the frame's index makes fillna align row by row
    data['Age'] = data['Age'].fillna(pd.Series(title_mean, index=data.index))

In [373]:
# Re-check train for rows without Age after the imputation
train.loc[train['Age'].isnull()]

# Correcting the missing fare in test dataset

In [374]:
# Rows of test that have no Fare
test.loc[test['Fare'].isnull()]

In [375]:
# Replacing the missing Fare in the test dataset with the mean train Fare of
# comparable passengers: third class, Embarked code 0, title Mr, no group
# booking, male (Sex == 1) and older than 60
comparable = (
    (train['Pclass'] == 3)
    & (train['Embarked'] == 0)
    & (train['Mr'] == 1)
    & (train['group_booking'] == 0)
    & (train['Sex'] == 1)
    & (train['Age'] > 60)
)
test['Fare'] = test['Fare'].fillna(train.loc[comparable, 'Fare'].mean())

In [376]:
# Re-check test for rows without Fare after the fill
test.loc[test['Fare'].isnull()]

# Checking for correlation between parameters

In [377]:
import seaborn as sns
import matplotlib.pyplot as plt
# Annotated heatmap of the pairwise correlations between the train columns
fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(train.corr(), annot=True, ax=ax)

# Trying out different machine learning models

In [378]:
class automl:
    """Cross-validate several classifiers on one train/test pair and export predictions.

    Every model method prints the accuracy of each of 5 K-fold splits, refits
    the estimator on the full training data, stores it on the matching
    attribute and returns it. `predict` turns a fitted model into a
    PassengerId/Survived frame and `save_out` writes that frame to CSV.

    The scikit-learn and xgboost imports are kept inside the methods so the
    class can be defined without those packages being imported up front.
    """

    def __init__(self,combine,target):
        """Split `combine` into features, target and the frame to predict on.

        Parameters
        ----------
        combine : sequence of two DataFrames
            `combine[0]` is the training frame (must contain `target`),
            `combine[1]` is the frame to predict on (must contain 'PassengerId').
        target : str
            Name of the label column in the training frame.
        """
        self.xtrain=combine[0].drop([target],axis=1)
        self.ytrain=combine[0][target]
        self.test=combine[1]

        from sklearn.model_selection import train_test_split
        # 70/30 hold-out split, kept available as attributes for ad-hoc checks
        self.x_train,self.x_test,self.y_train,self.y_test=train_test_split(self.xtrain,self.ytrain,test_size=0.3,random_state=1)
        # Initialising the models to None.
        # The models will be saved to these objects once the functions are called
        self.logitRegression=None
        self.decisionTree=None
        self.NaiveBayes=None
        self.randomForest=None
        self.MLP=None
        self.DNN=None
        self.XGB=None
        self.vClassifier=None
        self.knn=None
        # Latest prediction frame; `out` is what save_out writes, `output` is
        # kept as an alias of the same frame.
        self.out=None
        self.output=None

    def _cross_validate_and_fit(self,model,n_splits=5):
        """Print the accuracy of each K-fold split, then refit `model` on all training rows."""
        from sklearn.model_selection import KFold
        for tr,te in KFold(n_splits=n_splits).split(self.xtrain):
            model.fit(self.xtrain.iloc[tr],self.ytrain.iloc[tr])
            print(model.score(self.xtrain.iloc[te],self.ytrain.iloc[te]))
        # The returned model must have seen every training row, not just the
        # rows of the last fold.
        model.fit(self.xtrain,self.ytrain)
        return model

    def linear_regression(self):
        """Cross-validate and fit a logistic regression (liblinear solver).

        The method name is historical; the estimator is LogisticRegression.
        The fitted model is stored on `self.logitRegression` and returned.
        """
        from sklearn.linear_model import LogisticRegression
        self.logitRegression=self._cross_validate_and_fit(LogisticRegression(solver='liblinear'))
        return self.logitRegression

    def mlp(self):
        """Cross-validate and fit a six-layer MLP; stored on `self.MLP` and returned."""
        from sklearn.neural_network import MLPClassifier
        model=MLPClassifier(hidden_layer_sizes=(256,128,64,32,16,8),random_state=1,max_iter=50,solver='adam')
        self.MLP=self._cross_validate_and_fit(model)
        return self.MLP

    def predict(self,model):
        """Predict the test frame with `model` and keep the result on `self.out`.

        Parameters
        ----------
        model : fitted estimator
            Any object with a `predict(features)` method.

        Returns
        -------
        DataFrame
            Columns 'PassengerId' and 'Survived'; also stored on `self.out`
            and `self.output`.
        """
        passenger=self.test['PassengerId']
        # PassengerId is an identifier, not a feature the models were trained on
        features=self.test.drop(['PassengerId'],axis=1)
        out=pd.DataFrame()
        out['PassengerId']=passenger
        out['Survived']=model.predict(features)
        self.out=out
        self.output=out
        return out

    def decision_tree(self):
        """Cross-validate and fit a decision tree; stored on `self.decisionTree` and returned."""
        from sklearn.tree import DecisionTreeClassifier
        self.decisionTree=self._cross_validate_and_fit(DecisionTreeClassifier(random_state=1))
        return self.decisionTree

    def random_forest(self):
        """Cross-validate and fit a random forest; stored on `self.randomForest` and returned."""
        from sklearn.ensemble import RandomForestClassifier
        self.randomForest=self._cross_validate_and_fit(RandomForestClassifier(random_state=1))
        return self.randomForest

    def xgboost(self):
        """Cross-validate and fit an XGBClassifier; stored on `self.XGB` and returned."""
        from xgboost import XGBClassifier
        self.XGB=self._cross_validate_and_fit(XGBClassifier())
        return self.XGB

    def naive_bayes(self):
        """Cross-validate and fit Gaussian naive Bayes; stored on `self.NaiveBayes` and returned."""
        from sklearn.naive_bayes import GaussianNB
        self.NaiveBayes=self._cross_validate_and_fit(GaussianNB())
        return self.NaiveBayes

    def KNN(self):
        """Cross-validate and fit k-nearest neighbours; stored on `self.knn` and returned."""
        from sklearn.neighbors import KNeighborsClassifier
        self.knn=self._cross_validate_and_fit(KNeighborsClassifier())
        return self.knn

    def save_out(self,filename):
        """Write the latest prediction frame to `filename` as CSV without the index.

        Raises
        ------
        RuntimeError
            If `predict` has not produced a prediction frame yet.
        """
        out=getattr(self,'out',None)
        if out is None:
            raise RuntimeError('No predictions to save: call predict() first')
        out.to_csv(filename,index=False)

    def voting_classifier(self):
        """Cross-validate and fit a hard-voting ensemble of six classifiers.

        The ensemble combines AdaBoost, the MLP, Gaussian naive Bayes, logistic
        regression, XGBoost and a random forest. The fitted ensemble is stored
        on `self.vClassifier` and returned.
        """
        from sklearn.ensemble import VotingClassifier,RandomForestClassifier,AdaBoostClassifier
        from sklearn.naive_bayes import GaussianNB
        from xgboost import XGBClassifier
        from sklearn.neural_network import MLPClassifier
        from sklearn.linear_model import LogisticRegression
        estimators=[
            ('abc',AdaBoostClassifier(random_state=1)),
            ('mlp',MLPClassifier(hidden_layer_sizes=(256,128,64,32,16,8),random_state=1,max_iter=50,solver='adam')),
            ('gnb',GaussianNB()),
            ('lr',LogisticRegression(solver='liblinear',random_state=1)),
            ('xgb',XGBClassifier(random_state=1)),
            ('rf',RandomForestClassifier(random_state=1)),
        ]
        ensemble=VotingClassifier(estimators=estimators,voting='hard')
        self.vClassifier=self._cross_validate_and_fit(ensemble)
        return self.vClassifier


In [379]:
# Build the model helper on the current train/test frames with Survived as the label
auto=automl(combine,'Survived')

In [380]:
# Cross-validate k-nearest neighbours, then predict the test frame with the returned model
model=auto.KNN()
auto.predict(model)
# auto.save_out('knn.csv')

In [381]:
# Cross-validate the MLP, then predict the test frame with the returned model
model=auto.mlp()
auto.predict(model)
# auto.save_out('MLP.csv')

In [382]:
# Despite its name, linear_regression() fits a LogisticRegression classifier
model=auto.linear_regression()
auto.predict(model)
# auto.save_out('LR1.csv')

In [383]:
# Cross-validate a decision tree, then predict the test frame with the returned model
model=auto.decision_tree()
auto.predict(model)
# auto.save_out('DT.csv')

In [384]:
# Cross-validate a random forest, then predict the test frame with the returned model
model=auto.random_forest()
auto.predict(model)
# auto.save_out('RF.csv')

In [385]:
# Cross-validate XGBoost, then predict the test frame with the returned model
model=auto.xgboost()
auto.predict(model)
# auto.save_out('XGB.csv')

In [386]:
# Retrains XGBoost and predicts with the copy stored on auto.XGB instead of the returned one.
# NOTE(review): this looks redundant with the previous cell if fit() returns the
# estimator itself (presumably true for XGBClassifier) - confirm before keeping both.
model=auto.xgboost()
auto.predict(auto.XGB)
# auto.save_out('XGB1.csv')

In [387]:
# Cross-validate Gaussian naive Bayes, then predict the test frame with the returned model
model=auto.naive_bayes()
auto.predict(model)
# NOTE(review): the filename below looks copied from the XGBoost cell; use a
# naive-Bayes-specific name if this line is ever re-enabled.
# auto.save_out('XGB1.csv')

In [388]:
# Cross-validate the hard-voting ensemble, then predict the test frame with it
model=auto.voting_classifier()
auto.predict(model)
# auto.save_out('vclfnew.csv')

In [389]:
# Converting the Age and Fare into bins
from sklearn.preprocessing import LabelEncoder
# Bin Age and Fare into quantile bins encoded as integers 0..num_bins-1.
# The bin edges are learned on the training frame only and reused for the
# test frame, so a given bin label covers the same value range in both frames.
# Running qcut separately per frame would give train and test different edges.
features = ['Age', 'Fare']
num_bins = 5
for feature in features:
    bin_feature = feature + 'Bin'
    # duplicates='drop' merges bins whose quantile edges coincide instead of raising
    _, edges = pd.qcut(combine[0][feature], num_bins, retbins=True, duplicates='drop')
    # Open the outer edges so test values outside the train range still get a bin
    edges[0], edges[-1] = float('-inf'), float('inf')
    for data in combine:
        # labels=False returns the ordinal bin number directly.
        # Assumes Age and Fare have no missing values left at this point.
        data[bin_feature] = pd.cut(data[feature], bins=edges, labels=False).astype(int)


In [390]:
# The raw Age and Fare columns are dropped from both frames, leaving the binned versions
for frame in combine:
    frame.drop(columns=['Age', 'Fare'], inplace=True)

In [391]:
# Preview the training frame with AgeBin and FareBin in place of Age and Fare
combine[0].head()

In [392]:
# Rebuild the helper so it picks up the binned columns, rerun the voting
# ensemble and write its predictions to a submission file
auto=automl(combine,'Survived')
model=auto.voting_classifier()
auto.predict(model)
auto.save_out('vclfwithadaboost.csv')

In [393]:
# Remove the group_booking flag from both frames before the final run
for frame in combine:
    frame.drop(columns=['group_booking'], inplace=True)

In [394]:
# Rebuild the helper without group_booking, rerun the voting ensemble and
# write its predictions to a second submission file
auto=automl(combine,'Survived')
model=auto.voting_classifier()
auto.predict(model)
auto.save_out('vclfafterdrop.csv')