In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Read both Kaggle Titanic splits, using PassengerId as the row index.
train_data, test_data = (
    pd.read_csv(csv_path, index_col='PassengerId')
    for csv_path in ('../input/titanic/train.csv', '../input/titanic/test.csv')
)

train_data.head()

In [None]:
# Column dtypes and non-null counts, used to spot columns with missing values.
train_data.info()

Some values in the Age column are null, as are 2 values in the Embarked column. Most of the values in the Cabin column are null, so it is better to drop that column entirely.

In [None]:
# Summary statistics of the training data.
train_data.describe()

Drop the columns that won't be useful for modeling and prediction.

In [None]:
# Remove the columns that are not used as model inputs in this notebook.
train_data.drop(
    columns=['Cabin','Name','SibSp','Parch','Ticket','Fare'],
    inplace=True,
)

In [None]:
# Preview the frame after the column drop.
train_data.head()

In [None]:
# 3x2 grid of exploratory plots.
# The column to count is passed as `x=` because recent seaborn releases no
# longer accept it positionally, and `histplot` replaces the deprecated
# `distplot` (density histogram with a KDE overlay, as distplot drew).
fig, axes = plt.subplots(3, 2, figsize=(10, 10))

sns.countplot(x='Survived', hue='Sex', data=train_data, ax=axes[0, 0])

sns.countplot(x='Pclass', hue='Sex', data=train_data, ax=axes[0, 1])

sns.countplot(x='Sex', data=train_data, ax=axes[1, 0])

sns.histplot(train_data['Age'], bins=30, kde=True, stat='density', ax=axes[1, 1])

sns.countplot(x='Embarked', hue='Sex', data=train_data, ax=axes[2, 0])

sns.boxplot(x='Pclass', y='Age', data=train_data, ax=axes[2, 1])

# Keep axis labels of neighbouring panels from overlapping.
fig.tight_layout();

Conclusions:
1. Most of the deceased were males.
2. Most 3rd-class passengers were men, so most 3rd-class males died.
3. Most passengers on the ship were males.
4. Most of the passengers were aged around 20 to 40.
5. The average age in Pclass 1 was about 37, in Pclass 2 about 30, and in Pclass 3 about 25.

Fill the missing values in the Age column, using a typical age for the passenger's class.

In [None]:
def get_age(cols):
    """Return the age for one passenger, imputing it from the class if missing.

    Parameters
    ----------
    cols : sequence-like
        The passenger's values in the order (age, pclass). Typically a row of
        ``df[['Age', 'Pclass']]`` supplied by ``DataFrame.apply(..., axis=1)``;
        a plain list or tuple also works.

    Returns
    -------
    The original age when it is present; otherwise 37 for class 1, 29 for
    class 2, and 24 for any other class.
    """
    # Unpack by position instead of using cols[0] / cols[1]: integer keys on a
    # label-indexed Series are deprecated in recent pandas. Only the first two
    # values are used, as before.
    age, pclass = tuple(cols)[:2]

    if not pd.isnull(age):
        return age

    if pclass == 1:
        return 37
    if pclass == 2:
        return 29
    return 24

# Run get_age over each (Age, Pclass) row and store the result back in Age.
train_data['Age'] = train_data[['Age','Pclass']].apply(
    get_age,
    axis=1,
)

In [None]:
# Non-null value counts per column for each port of embarkation.
train_data.groupby('Embarked').count()

In [None]:
# Distinct Embarked values before the rows with missing values are dropped.
train_data['Embarked'].unique()

Drop the rows that still contain NaN values. Age has already been filled, so these are the rows with a missing Embarked value.

In [None]:
# Remove every row that still has a missing value in any column
# (dropna's defaults are axis=0 and how='any').
train_data = train_data.dropna()

In [None]:
# Distinct Embarked values after dropping the rows with missing values.
train_data['Embarked'].unique()

In [None]:
# One-hot encode each categorical column; the column name doubles as the
# prefix of the generated indicator columns.
sex, embarked, pclass = (
    pd.get_dummies(train_data[column], prefix=column)
    for column in ('Sex', 'Embarked', 'Pclass')
)

In [None]:
# Append the indicator columns to the right of the existing columns.
train_data = pd.concat(
    [train_data, sex, embarked, pclass],
    axis='columns',
)

In [None]:
# The raw categorical columns are now redundant with their indicator columns.
train_data.drop(columns=['Sex', 'Embarked', 'Pclass'], inplace=True)

In [None]:
# Preview the encoded training frame.
train_data.head()

Start the training


In [None]:
# Target is the Survived column; features are every remaining column.
y = train_data['Survived']
X = train_data.drop(columns=['Survived'])

In [None]:
# Preview the feature matrix.
X.head()

In [None]:
# Shapes of the feature matrix and the target.
X.shape, y.shape

In [None]:
from sklearn.model_selection import train_test_split

# 70% of the labelled rows go to training and the rest to the hold-out set;
# the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    random_state=0,
)

In [None]:
# Shapes of the four pieces produced by the split.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import classification_report, accuracy_score, f1_score

In [None]:
# Logistic-regression baseline (liblinear solver), fitted on the training split.
model_logistic = LogisticRegression(solver='liblinear')
model_logistic.fit(X=X_train, y=y_train)

In [None]:
def get_report(model, x_train,y_train,x_test,y_test):
    """Print accuracy and macro-averaged F1 for a fitted model.

    ``model`` must already be fitted. Predictions are made for both the
    training and the test features and scored against the matching labels.
    Four lines are printed, each value formatted to two decimals; nothing
    is returned.
    """
    train_pred = model.predict(x_train)
    test_pred = model.predict(x_test)

    # (format string, value) pairs in the order they are printed.
    report_lines = [
        ('Accuracy Train = %0.2f', accuracy_score(y_train, train_pred)),
        ('Accuracy Test = %0.2f', accuracy_score(y_test, test_pred)),
        ('F1 Score Train = %0.2f', f1_score(y_train, train_pred, average='macro')),
        ('F1 Score Test = %0.2f', f1_score(y_test, test_pred, average='macro')),
    ]

    for template, value in report_lines:
        print(template % value)

In [None]:
# Train vs. hold-out scores for the logistic-regression model.
get_report(model_logistic,X_train,y_train,X_test,y_test)

In [None]:
# Random forest with 10 trees. The forest is seeded so that its scores are
# reproducible across runs, matching the seeded train/test split.
model_rf = RandomForestClassifier(n_estimators=10, random_state=0)
model_rf.fit(X_train,y_train)

In [None]:
# Train vs. hold-out scores for the random forest.
get_report(model_rf,X_train,y_train,X_test,y_test)

Use Voting classifier to get the best of all these models

In [None]:
# Soft-voting ensemble: predicted class probabilities are averaged, with the
# logistic regression weighted twice as heavily as the random forest.
# Author's note: these parameters were found separately with grid-search CV.
# The forest is seeded so the ensemble's results are reproducible across runs.
model_voting = VotingClassifier(estimators=[
    ('logistic',LogisticRegression(solver='liblinear')),
    ('rf',RandomForestClassifier(max_depth=5,n_estimators=5,random_state=0))
], voting='soft',weights=[2,1])

In [None]:
# Fit the ensemble's estimators on the training split.
model_voting.fit(X_train,y_train)

In [None]:
# Train vs. hold-out scores for the voting ensemble.
get_report(model_voting,X_train,y_train,X_test,y_test)

Now it's prediction time: apply the same preprocessing to the test set and predict.

In [None]:
# Preview the raw test set.
test_data.head()

In [None]:
# Number of rows in the test set.
len(test_data)

In [None]:
# Remove the same unused columns that were removed from the training data.
test_data.drop(
    columns=['Cabin','Name','SibSp','Parch','Ticket','Fare'],
    inplace=True,
)

In [None]:
# The Embarked column of test_data has no missing values, so the fill below is not needed.
#test_data['Embarked'].fillna('U',inplace=True)

In [None]:
# Impute missing test-set ages with the same per-class rule used for the
# training data. get_age is defined once, in the training-set imputation cell
# earlier in this notebook, and is reused here rather than redefined.
test_data['Age'] = test_data[['Age','Pclass']].apply(get_age,axis=1)

In [None]:
# One-hot encode the test set's categorical columns, using each column name
# as the prefix of its indicator columns.
sex, embarked, pclass = (
    pd.get_dummies(test_data[column], prefix=column)
    for column in ('Sex', 'Embarked', 'Pclass')
)

In [None]:
# Append the indicator columns to the right of the existing test columns.
test_data = pd.concat(
    [test_data, sex, embarked, pclass],
    axis='columns',
)

In [None]:
# The raw categorical columns are now redundant with their indicator columns.
test_data.drop(columns=['Sex', 'Embarked', 'Pclass'], inplace=True)

In [None]:
# Preview the encoded test frame.
test_data.head()

In [None]:
# Train and test were one-hot encoded independently, so align the test columns
# to the training feature columns (same set, same order) before predicting.
# An indicator column that is absent from the test set is filled with 0.
test_features = test_data.reindex(columns=X.columns, fill_value=0)
predictions = model_voting.predict(test_features)
predictions

In [None]:
# The PassengerId index that will label the submission rows.
test_data.index

In [None]:
# Pair each test passenger id with its predicted label and write the file
# without the DataFrame's own row index.
submission = pd.DataFrame(
    data={"PassengerId": test_data.index, "Survived": predictions},
)
submission.to_csv(path_or_buf='submission.csv', index=False)