In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the labelled training split and the unlabelled test split in one pass.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/titanic/train.csv', '../input/titanic/test.csv')
)

# Preview the first rows of the training data.
df_train.head()

In [None]:
# Summarize each column's dtype and non-null count to spot incomplete columns.
df_train.info()

In [None]:
# Count the missing values in every training column.
df_train.isnull().sum()

`Cabin` has a lot of NaN values, so we will drop this column.
We will replace the NaN values in `Age` with the median age, and those in `Embarked` with `'S'`, which is the most frequent value. Missing `Fare` values are filled with the mean fare.

In [None]:
def _clean_passengers(frame):
    """Return a copy of ``frame`` without ``Cabin`` and with missing values imputed.

    ``Age`` is filled with the frame's own median, ``Fare`` with the frame's own
    mean and ``Embarked`` with ``"S"``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Passenger table containing ``Age``, ``Embarked`` and ``Fare`` columns
        (``Cabin`` is optional).

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is left untouched.
    """
    # errors='ignore' keeps the cell re-runnable once Cabin has already been dropped.
    trimmed = frame.drop(columns='Cabin', errors='ignore')
    # Fill through the frame itself: df[col].fillna(..., inplace=True) is chained
    # assignment, which pandas deprecates and which no longer updates the parent
    # frame when copy-on-write is active.
    return trimmed.fillna({
        'Age': trimmed['Age'].median(),
        'Embarked': 'S',
        'Fare': trimmed['Fare'].mean(),
    })


df_train = _clean_passengers(df_train)
df_test = _clean_passengers(df_test)


### Visualization

Convert the categorical columns to integer codes (especially `Sex`).

In [None]:
# Inspect the column dtypes to see which columns still hold text values.
df_train.dtypes

In [None]:
# Encode the text columns as integer codes. The label -> code mapping is learned
# once from the training frame and reused for the test frame, so a given code
# always refers to the same label in both. Encoding each frame independently
# would silently shift the codes if a label were missing from one of them.
# Labels (or NaN) not present in the training data are encoded as -1.
for column in ('Sex', 'Embarked'):
    categories = pd.Categorical(df_train[column]).categories
    df_train[column] = pd.Categorical(df_train[column], categories=categories).codes
    df_test[column] = pd.Categorical(df_test[column], categories=categories).codes

Function to plot survival counts against a passenger attribute.

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

def survival_rate(column,t):
    """Plot survivor counts over total passenger counts for each value of ``column``.

    Reads the module-level ``df_train`` frame, groups it by ``column`` and draws
    two overlaid series on one axes: the number of survivors (opaque) and the
    total number of passengers (semi-transparent).

    Parameters
    ----------
    column : str
        Name of the ``df_train`` column to group by.
    t : str
        Plot kind forwarded to ``Series.plot`` (for example ``"barh"``).

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    # Select 'Survived' before aggregating so the text columns are not summed too.
    survived_by_group = df_train.groupby(column)['Survived']
    df = pd.DataFrame({
        'total': survived_by_group.size(),
        'Survived': survived_by_group.sum(),
    })

    # Draw both series on one explicit axes rather than relying on the current one.
    fig, ax = plt.subplots()
    df['Survived'].plot(kind=t, ax=ax)
    df['total'].plot(kind=t, alpha=0.5, ax=ax, title="Survivors per " + str(column))
    plt.show()

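The darker blue part of each bar is the number of passengers who survived; the lighter blue bar is the total number of passengers, so the lighter remainder corresponds to those who did not survive.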

In [None]:
# Draw one horizontal-bar survival chart per passenger attribute.
for attribute in ("Sex", "Pclass", "Embarked", "Parch", "SibSp"):
    survival_rate(attribute, "barh")

### model training

In [None]:
from sklearn.model_selection import train_test_split

# Feature columns fed to every model below.
predictors = ["Pclass","Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]

# Feature matrix and target vector taken from the cleaned training frame.
X, y = df_train[predictors], df_train['Survived']

# Hold out 25% of the labelled rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)

#### Logistic Regression: 

In [None]:
from sklearn.linear_model import  LogisticRegression
# The default solver is capped at max_iter=100, which is typically too few
# iterations for unscaled features such as Age and Fare and ends in a
# ConvergenceWarning. Allow more iterations so the scores below come from a
# converged fit.
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train,y_train)
print('train accuracy =',clf.score(X_train,y_train))
print('test accuracy =',clf.score(X_test,y_test))

Let's visualize the probabilities

In [None]:
# Column 1 of predict_proba holds the probability of the positive (survived) class.
survival_proba = clf.predict_proba(X_test)[:, 1]

# Put the true label, predicted label and predicted probability side by side.
pred = pd.DataFrame({
    "Survived_original": y_test,
    "Survived_predicted": clf.predict(X_test),
    "Survived_proba": survival_proba,
})
# Flag the rows where the prediction matches the true label.
pred['Comparison'] = pred['Survived_original'].eq(pred['Survived_predicted'])
pred.head()

#### Bagging 

In [None]:
from sklearn.ensemble import BaggingClassifier
# Base learner for the ensemble; max_iter is raised because the default cap of
# 100 iterations is typically not enough for the solver on unscaled features.
clf_log = LogisticRegression(random_state=1, max_iter=1000)
# The base learner is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was later
# removed, whereas the first positional slot works on both old and new releases.
clf_bag = BaggingClassifier(
    clf_log,
    n_estimators=400,
    max_samples=0.75,
    max_features=0.85,
    random_state=1,
)
clf_bag.fit(X_train,y_train)
print('train accuracy =',clf_bag.score(X_train,y_train))
print('test accuracy =',clf_bag.score(X_test,y_test))

#### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fit a seeded decision tree, then score it on the training and held-out splits.
clf_dt = DecisionTreeClassifier(random_state=1)
clf_dt.fit(X_train, y_train)

dt_train_acc = clf_dt.score(X_train, y_train)
dt_test_acc = clf_dt.score(X_test, y_test)
print('train accuracy =', dt_train_acc)
print('test accuracy =', dt_test_acc)

#### Random Forest 

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a seeded 20-tree random forest, then score it on the training and held-out splits.
clf_rf = RandomForestClassifier(n_estimators=20, random_state=1)
clf_rf.fit(X_train, y_train)

rf_train_acc = clf_rf.score(X_train, y_train)
rf_test_acc = clf_rf.score(X_test, y_test)
print('train accuracy =', rf_train_acc)
print('test accuracy =', rf_test_acc)

In [None]:
# Held-out accuracy of random forests with 1 to 50 trees.
# `trees` holds the actual forest sizes so the x-axis matches the n_estimators
# used for each point (a 0-based range would shift every point left by one).
trees = range(1, 51)
accuracy = np.zeros(len(trees))
for index, n_trees in enumerate(trees):
    # A separate name keeps the clf_rf model fitted earlier from being overwritten.
    forest = RandomForestClassifier(random_state=1, n_estimators=n_trees)
    forest.fit(X_train, y_train)
    accuracy[index] = forest.score(X_test, y_test)

plt.plot(trees,accuracy)
plt.ylabel('accuracy')
plt.xlabel('Number of trees')

Let's use grid search to find the best combination of parameters.

#### Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV
# Candidate values for leaf size, split threshold and forest size.
params = {
    'min_samples_leaf': [1, 2, 3, 4],
    'min_samples_split': [2, 4, 6, 8],
    'n_estimators': [10, 20, 30, 40],
}

# Exhaustively score every combination with 5-fold cross-validated accuracy.
clf_rf2 = RandomForestClassifier(random_state=1)
clf_gs = GridSearchCV(clf_rf2, params, scoring='accuracy', cv=5)
clf_gs.fit(X, y)

In [None]:
# Show the best mean cross-validated score found by the grid search.
clf_gs.best_score_

In [None]:
# Show the parameter combination that achieved the best score.
clf_gs.best_params_

Let's use these best parameters and check whether they achieve the above CV accuracy.

In [None]:
from sklearn.model_selection import cross_val_score

# Build the forest from the grid search's selected hyper-parameters instead of
# hand-copied values, so this model always matches the configuration whose
# cross-validated score was reported by clf_gs.
clf_rf3 = RandomForestClassifier(random_state=1, **clf_gs.best_params_)
clf_rf3.fit(X_train,y_train)
print('train accuracy =',clf_rf3.score(X_train,y_train))
print('test accuracy =',clf_rf3.score(X_test,y_test))

# Repeat the same 5-fold CV on the full labelled data for comparison with the grid search score.
scores_rf3 = cross_val_score(clf_rf3,X,y,scoring='accuracy',cv=5)
print("cross validation accuracy =", scores_rf3.mean())

As we can see, grid search allowed us to find the best model parameters to improve our accuracy.
Let's see the most important features of this classifier.

In [None]:
# One row per predictor, holding the importance the fitted forest assigned to it.
feat_imp = pd.DataFrame(
    {'Importance': clf_rf3.feature_importances_},
    index=predictors,
)
# Most important features first.
feat_imp.sort_values(by='Importance', ascending=False)

### Predict on TEST DATA 

In [None]:
# Count the missing values in every test-set column before predicting.
df_test.isna().sum()

In [None]:
# Keep a full copy of the test frame (including PassengerId) for the submission.
df_submit = df_test.copy()

# Select the model features into a separate frame instead of rebinding df_test:
# overwriting df_test would drop PassengerId and make this cell fail when re-run.
X_submit = df_test[predictors]
submit_predictions = clf_rf3.predict(X_submit)

my_submission = pd.DataFrame({'PassengerId': df_submit['PassengerId'], 'Survived': submit_predictions})

### Export data

In [None]:
# Write the submission frame to submission.csv, leaving out the DataFrame index.
my_submission.to_csv('submission.csv', index=False)