In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Algorithms
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 1. Exploring the Training Data

In [None]:
# Read the Titanic training and test CSV files into DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/titanic/train.csv',
        '/kaggle/input/titanic/test.csv',
    )
)
# Preview the first 10 rows of the training set.
train.head(10)

In [None]:
# Show the training set's dimensions as a (rows, columns) tuple.
train.shape

In [None]:
# Summary statistics for the training set (describe() with default arguments).
train.describe()

In [None]:
# Count how many rows carry each distinct value of the Survived column.
train['Survived'].value_counts()


In [None]:
# Visualize how many rows carry each value of Survived.
# The column is passed by keyword (`x=` with `data=`): seaborn >= 0.12 accepts
# only `data` positionally, so a bare positional Series is no longer read as `x`.
sns.countplot(x='Survived', data=train)

In [None]:
# Visualize survivor counts split by 'Sex', 'Pclass', 'SibSp', 'Parch' and 'Embarked'.
# Each column appears once; the grid below has one panel more than there are columns.
cols = ['Sex', 'Pclass', 'SibSp', 'Parch', 'Embarked']

n_rows = 2
n_cols = 3

# Build the subplot grid; every panel is allotted a 3.2 x 3.2 inch area.
# plt.subplots returns the Figure (fig) and a 2-D array of Axes (axs).
fig, axs = plt.subplots(n_rows, n_cols, figsize=(n_cols*3.2, n_rows*3.2))
panels = axs.ravel()  # row-major, flat view of the grid

for ax, col in zip(panels, cols):
    # Keyword arguments are required: seaborn >= 0.12 only accepts `data` positionally.
    sns.countplot(x=col, hue='Survived', data=train, ax=ax)
    ax.set_title(col)
    ax.legend(title="Survived", loc='upper right')

# Hide the panels that have no column assigned to them.
for ax in panels[len(cols):]:
    ax.set_visible(False)

plt.tight_layout()

In [None]:
# Mean of Survived within each Sex group, i.e. the survival rate per sex.
train.groupby('Sex')[['Survived']].mean()

In [None]:
# Survival rate for every (Sex, Pclass) combination: rows are Sex, columns are Pclass.
train.pivot_table(
    values='Survived',
    index='Sex',
    columns='Pclass',
    aggfunc='mean',  # pivot_table's default aggregation, spelled out
)

In [None]:
# Bar plot of Survived (y) against Pclass (x) for the training set.
sns.barplot(x='Pclass', y='Survived', data=train)

# 2. Data Corrections

In [None]:
# Count the missing (NaN) values in each column of the training set.
train.isna().sum()

In [None]:
# Fill missing Age values in the training and test sets, each with its own mean Age.
data = [train, test]
for dataset in data:
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # the dataset['Age'] selection. The in-place form mutates an intermediate
    # object: pandas 2.x warns about it, and with Copy-on-Write enabled it
    # never reaches the DataFrame, leaving the NaNs in place.
    dataset['Age'] = dataset['Age'].fillna(dataset['Age'].mean())


In [None]:
# Derive an integer Deck feature from the first run of letters in Cabin.
import re
deck = {"A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7, "U": 8}
data = [train, test]
cabin_letters = re.compile("([a-zA-Z]+)")

for dataset in data:
    # Missing cabins become the placeholder "U0", whose letter "U" maps to 8.
    dataset['Cabin'] = dataset['Cabin'].fillna("U0")
    dataset['Deck'] = (
        dataset['Cabin']
        .map(lambda cabin: cabin_letters.search(cabin).group())
        .map(deck)
        .fillna(0)   # letters that are not keys of `deck` get code 0
        .astype(int)
    )

# Deck now carries the information, so the Cabin column is dropped.
train = train.drop(columns=['Cabin'])
test = test.drop(columns=['Cabin'])

In [None]:
# Preview the training set after the Cabin -> Deck conversion.
train.head()

In [None]:
# Re-count missing values per column now that Cabin has been replaced by Deck.
train.isna().sum()

In [None]:
# Summarize Embarked to find its most frequent value; the original author notes
# that Embarked has 2 missing values, to be filled with that value.
train['Embarked'].describe()

In [None]:
# Replace missing Embarked entries in both sets with the port code 'S'.
common_value = 'S'
data = [train, test]

for dataset in data:
    embarked = dataset['Embarked']
    # Keep every non-missing entry; substitute common_value where it is missing.
    dataset['Embarked'] = embarked.where(embarked.notna(), common_value)

In [None]:
# Missing-value count per column after filling Embarked.
train.isna().sum()

In [None]:
# Inspect column dtypes to plan the conversions; the goal is for every feature to be an integer.
train.info()

In [None]:
# Convert Fare to an integer column in both sets.
data = [train, test]

for dataset in data:
    # Missing fares become 0 first, then the cast to int discards the fractional part.
    # NOTE(review): the later Fare binning uses fractional edges (7.91, 14.454);
    # confirm that truncating before binning is intended.
    dataset['Fare'] = dataset['Fare'].fillna(0).astype(int)

In [None]:
# Encode Embarked as an integer code: S -> 0, C -> 1, Q -> 2.
ports = {port: code for code, port in enumerate(["S", "C", "Q"])}
data = [train, test]

for dataset in data:
    port_codes = dataset['Embarked'].map(ports)
    dataset['Embarked'] = port_codes
    

In [None]:
# Drop the Ticket column from both sets; the author notes it has many unique values.
train = train.drop(columns=['Ticket'])
test = test.drop(columns=['Ticket'])

In [None]:
# Check the column dtypes again after dropping Ticket.
train.info()

In [None]:
# Encode Sex as an integer code in both sets: male -> 0, female -> 1.
gender = {"male": 0, "female": 1}
data = [train, test]

for dataset in data:
    # Series.map looks each value up in `gender`
    dataset['Sex'] = dataset['Sex'].map(gender)

In [None]:
# Cast Age from float to int in both sets (the cast discards the fractional part).
data = [train, test]

for dataset in data:
    dataset['Age'] = dataset['Age'].astype(int)

In [None]:
# Check the column dtypes after the Sex and Age conversions.
train.info()

In [None]:
# Drop the Name column from both sets; the author treats it as unrelated to survival.
train = train.drop(columns=['Name'])
test = test.drop(columns=['Name'])

In [None]:
# Check the remaining columns and dtypes after dropping Name.
train.info()

# 3 Creating Categories

In [None]:

# Tally how many training passengers fall into each candidate age bracket.
# Brackets are right-closed: (-inf, 11], (11, 18], ..., (40, 66], (66, inf).
age_edges = [-np.inf, 11, 18, 22, 27, 33, 40, 66, np.inf]
age_bracket_labels = [
    "less than 11,  a ",
    "btw 11 and 18, b ",
    "btw 18 and 22, c ",
    "btw 22 and 27, d ",
    "btw 27 and 33, e ",
    "btw 33 and 40, f ",
    "btw 40 and 66, g ",
    "grt than 66,   h ",
]

# pd.cut assigns every age to its bracket in one vectorized pass, replacing a
# per-row Python loop with eight hand-maintained counters. value_counts orders
# by frequency, so sort_index restores bracket order (empty brackets show 0).
age_bracket_counts = pd.cut(train['Age'], bins=age_edges).value_counts().sort_index()

for label, count in zip(age_bracket_labels, age_bracket_counts):
    print(label, count)


In [None]:
# Replace each Age in both sets with the integer code of its age bracket.
age_edges = [-np.inf, 11, 18, 22, 27, 33, 40, 66, np.inf]
data = [train, test]
for dataset in data:
    ages = dataset['Age'].astype(int)
    # Right-closed brackets: (-inf, 11] -> 0, (11, 18] -> 1, ..., (40, 66] -> 6, (66, inf) -> 7
    bracket_code = pd.cut(ages, bins=age_edges, labels=False)
    # Ages above 66 are folded into code 6, the same code as the (40, 66] bracket.
    # NOTE(review): the age tally in the previous cell counts >66 separately;
    # confirm that merging the two oldest brackets is intended.
    dataset['Age'] = bracket_code.clip(upper=6).astype(int)

# The distribution of the resulting codes is shown by train['Age'].value_counts() below.

In [None]:
# Count how many training rows carry each Age bracket code.
train['Age'].value_counts()

In [None]:
# Replace each Fare in both sets with the integer code of its fare bracket.
fare_edges = [-np.inf, 7.91, 14.454, 31, 99, 250, np.inf]
data = [train, test]

for dataset in data:
    # Right-closed brackets: (-inf, 7.91] -> 0, (7.91, 14.454] -> 1, (14.454, 31] -> 2,
    # (31, 99] -> 3, (99, 250] -> 4, (250, inf) -> 5
    fare_code = pd.cut(dataset['Fare'], bins=fare_edges, labels=False)
    dataset['Fare'] = fare_code.astype(int)

In [None]:
# Preview the training set after Age and Fare have been replaced by bracket codes.
train.head()

# 4 Creating new Features ( Optional )

In [None]:
# New feature Age_Class: the Age column multiplied by Pclass.
data = [train, test]
for dataset in data:
    age_times_class = dataset['Age'].mul(dataset['Pclass'])
    dataset['Age_Class'] = age_times_class

In [None]:
# New features: `relatives` (SibSp + Parch) and the flag `not_alone`.
data = [train, test]
for dataset in data:
    dataset['relatives'] = dataset['SibSp'] + dataset['Parch']
    # not_alone is 1 when the passenger has at least one relative aboard and 0
    # otherwise. The flag used to be written the other way round (1 for
    # passengers with no relatives), contradicting its own name.
    dataset['not_alone'] = (dataset['relatives'] > 0).astype(int)
train['not_alone'].value_counts()

In [None]:
# New feature Fare_Per_Person: Fare divided by party size (relatives + 1), cast to int.
# NOTE(review): by this point Fare holds the bracket code written by the Fare
# categorization cell, so this divides the code rather than a currency amount —
# confirm that is intended.
data = [train, test]
for dataset in data:
    party_size = dataset['relatives'] + 1
    dataset['Fare_Per_Person'] = dataset['Fare'].div(party_size).astype(int)

# Last look at the training set before the model is trained.
train.head(10)

# 5 Model building

In [None]:
# Preview the test set after feature engineering.
test.head()

In [None]:
# Build the model inputs.
# PassengerId is a row identifier rather than a passenger attribute, so it is
# excluded from the features of both sets; leaving it in lets the model fit
# to row numbers. `test` itself keeps PassengerId for the submission file.
X_train = train.drop(["Survived", "PassengerId"], axis=1)
Y_train = train["Survived"]
# drop() returns a new frame, so X_test is still independent of `test`.
X_test = test.drop("PassengerId", axis=1)

In [None]:
# Fit a 100-tree random forest and predict survival for the test set.
# random_state pins the forest's randomness so that a Restart & Run All
# reproduces the same predictions and score.
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)
random_forest.fit(X_train, Y_train)

Y_prediction = random_forest.predict(X_test)

# Accuracy, as a percentage rounded to 2 decimals, measured on the same data
# the model was fitted on — not an estimate of performance on unseen data.
acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)

In [None]:
# One-row score table, sorted by Score (highest first) and indexed by Score.
results = pd.DataFrame({'Model': ['Random Forest'], 'Score': [acc_random_forest]})
result_df = results.sort_values(by='Score', ascending=False).set_index('Score')
result_df.head(9)

# 6 Working with Test Data Set

In [None]:
# Summary statistics for the test set.
test.describe()

In [None]:
# Assemble the submission table: each test PassengerId paired with its predicted Survived value.
submission = pd.DataFrame(
    data={"PassengerId": test['PassengerId'], "Survived": Y_prediction},
    columns=["PassengerId", "Survived"],
)

In [None]:
# Write the submission table to CSV in the working directory, without the index column.
submission.to_csv('TitanicSubmission_V05.csv', index=False)