In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the three competition files in one pass: training rows, test rows,
# and the sample submission file.
train, test, gender_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/titanic/train.csv',
        '../input/titanic/test.csv',
        "../input/titanic/gender_submission.csv",
    )
)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import style
# Apply the "fivethirtyeight" matplotlib style sheet to every figure below.
plt.style.use('fivethirtyeight')

In [None]:
# Preview the first five training rows.
train.head(n=5)

In [None]:
# Print the per-column dtype and non-null count summary for the training frame.
train.info()

In [None]:
# Number of missing values in each training column.
train.isna().sum()

# **1.Analysing & Visualizing data**

**Survived**

In [None]:
# Pie chart of the survival outcome split in the training data.
# value_counts() orders rows by frequency, not by class value, so sort by the
# class code and build each slice label from the code itself instead of
# assuming that class 0 happens to come first.
survived_counts = train["Survived"].value_counts().sort_index()
outcome_names = {0: "Not survived (0)", 1: "Survived(1)"}
slice_labels = [outcome_names.get(code, str(code)) for code in survived_counts.index]
colors = ['#f4acb7', '#9d8189']

plt.figure(figsize=(15, 8))
plt.pie(
    survived_counts,
    labels=slice_labels,
    autopct="%1.1f%%",
    startangle=15,
    shadow=True,
    colors=colors,
)
plt.axis("equal")
plt.title("Survival outcome (training set)")
plt.show()

Observation: More than 60% of the passengers in the training set died.

**Pclass**

In [None]:
# Passenger counts per ticket class, split by survival outcome.
# The bars are counts (not percentages) and every training row has a known
# outcome, so the hue is mapped to descriptive names up front and seaborn
# builds the legend itself; relabelling an existing legend by position is
# easy to get out of sync with the bar colours.
outcome_names = {0: 'Not survived', 1: 'Survived'}
outcome_colors = {'Not survived': '#012a4a', 'Survived': '#2c7da0'}
class_labels = ['class1', 'class2', 'class3']
plot_data = train.assign(Outcome=train['Survived'].map(outcome_names))

plt.figure(figsize=(15, 8))
ax = sns.countplot(
    data=plot_data,
    x='Pclass',
    hue='Outcome',
    order=[1, 2, 3],                  # pin bar order so the tick labels below always match
    hue_order=list(outcome_colors),
    palette=outcome_colors,
)
plt.xticks(ticks=[0, 1, 2], labels=class_labels)
ax.set_ylabel('Number of passengers')
plt.show()

Observation: 1. First-class passengers had a higher chance of survival than second- and third-class passengers.

2. Third-class passengers had a higher chance of dying than passengers in the other classes.

**Sex**

In [None]:
# Pie chart of the passenger sex split in the training data.
# Slice labels come from the value_counts() index so each label always names
# its own slice, whichever category turns out to be the most frequent.
sex_counts = train["Sex"].value_counts()
slice_labels = [str(label).capitalize() for label in sex_counts.index]
colors = ['#E63946', '#F1FAEE']

plt.figure(figsize=(15, 8))
plt.pie(
    sex_counts,
    labels=slice_labels,
    autopct="%1.1f%%",
    startangle=15,
    shadow=True,
    colors=colors,
)
plt.axis("equal")
plt.title("Passenger sex (training set)")
plt.show()

Observation: Approximately 65% of the passengers were male, while the remaining 35% were female.

In [None]:
# Passenger counts per sex, split by survival outcome.
# The bars are counts, and the outcome is known for every training row, so the
# hue is mapped to descriptive names and seaborn draws the legend itself.
outcome_names = {0: 'Not survived', 1: 'Survived'}
outcome_colors = {'Not survived': '#8D99AE', 'Survived': '#ef233c'}
sex_labels = ['Male', 'Female']
plot_data = train.assign(Outcome=train['Survived'].map(outcome_names))

plt.figure(figsize=(15, 8))
ax = sns.countplot(
    data=plot_data,
    x='Sex',
    hue='Outcome',
    order=['male', 'female'],         # explicit order: tick labels no longer depend on row order
    hue_order=list(outcome_colors),
    palette=outcome_colors,
)
plt.xticks(ticks=[0, 1], labels=sex_labels)
ax.set_ylabel('Number of passengers')
plt.show()

Observation: More males died than females.

**Age**

In [None]:
# Survival outcome counts broken down by age band.
# pd.cut with an integer splits the observed Age range into that many
# equal-width bins; rows with a missing Age fall into no bin.
age_band = pd.cut(train['Age'], 5)

plt.figure(figsize=(15, 8))
ax = sns.countplot(x=train['Survived'], hue=age_band)
ax.set_ylabel('Number of passengers')
# Finish with plt.show() like the other plotting cells so the cell renders the
# figure only, without the Axes repr.
plt.show()

Observation: 1. Among children under 16, more survived than died.

2. In every other age group, the number of deaths was higher than the number of survivors.

3. Among passengers aged between 16 and 31, more died than survived.

**SibSp**

In [None]:
# Pie chart of how many siblings/spouses each passenger travelled with.
# value_counts() returns rows ordered by frequency, so a hand-written label list
# sorted by value only lines up when frequency order and value order coincide.
# Sort by the SibSp value and take the labels straight from the index so every
# slice is labelled with its own category.
sibsp_counts = train["SibSp"].value_counts().sort_index()
colors = ['#ede0d4', '#e6ccb2', '#ddb892', '#b08968', '#7f5539', '#9c6644', '#dda15e']

plt.figure(figsize=(15, 8))
plt.pie(
    sibsp_counts,
    labels=sibsp_counts.index,
    autopct="%1.1f%%",
    startangle=15,
    shadow=True,
    colors=colors,
)
plt.axis("equal")
plt.title("Siblings/spouses aboard (training set)")
plt.show()

Observation: About 91% of passengers traveled alone or with exactly one sibling or spouse.

In [None]:
# Passenger counts per number of siblings/spouses aboard, split by outcome.
# The bars are counts, and the outcome is known for every training row, so the
# hue is mapped to descriptive names and seaborn draws the legend itself.
outcome_names = {0: 'Not survived', 1: 'Survived'}
outcome_colors = {'Not survived': '#555b6e', 'Survived': '#89b0ae'}
plot_data = train.assign(Outcome=train['Survived'].map(outcome_names))

plt.figure(figsize=(15, 8))
ax = sns.countplot(
    data=plot_data,
    x='SibSp',
    hue='Outcome',
    hue_order=list(outcome_colors),
    palette=outcome_colors,
)
ax.set_ylabel('Number of passengers')
plt.show()

Observation: The chance of survival dropped drastically for passengers traveling with more than 2 siblings or spouses.

**Embarked**

In [None]:
# Passenger counts per port of embarkation, split by survival outcome.
plt.figure(figsize=(15, 8))
ax = sns.countplot(x=train['Embarked'], hue=train['Survived'])
ax.set_ylabel('Number of passengers')
# Finish with plt.show() like the other plotting cells so the cell renders the
# figure only, without the Axes repr.
plt.show()

Observation: People who boarded from Cherbourg had a higher chance of survival than people who boarded from Southampton or Queenstown.

# **2.Data Cleaning**

**Train**

In [None]:
import re
def split_it(data):
    """Extract the honorific from a ``"Surname, Title. Given names"`` string.

    The title is the text between a comma and the first ``". "`` that follows
    it, and is returned with its leading space (for example ``' Mr'``), which
    is the form the rest of the notebook stores in the ``Title`` column.

    Parameters
    ----------
    data : str
        Full passenger name.

    Returns
    -------
    str
        One of ``' Mr'``, ``' Miss'``, ``' Mrs'``, ``' Master'``, or ``' Misc'``
        for any other title, for a name that does not follow the pattern, and
        for a non-string value.
    """
    common_titles = (' Mr', ' Miss', ' Mrs', ' Master')
    if not isinstance(data, str):
        return ' Misc'
    # The capture excludes '.' and ',' so it stops at the first ". " after the
    # comma. A greedy capture would run on to the last ". " in the name, so a
    # later abbreviation such as an initial would drag the whole name into the
    # group and the passenger would be filed under ' Misc'.
    result = re.search(r',([^,.]*)\.\s', data)
    if result is None or result.group(1) not in common_titles:
        return ' Misc'
    return result.group(1)

# Derive the Title column for the training rows from each passenger name.
train['Title'] = train['Name'].map(split_it)

In [None]:
# Display the training frame now that the Title column has been added.
train

In [None]:
# Remove the Cabin and Name columns. Name the axis through `columns=`:
# `axis=True` only selected columns because True compares equal to 1.
train = train.drop(columns=["Cabin", "Name"])

In [None]:
# Drop rows with a missing value in any column except Age. Rows that only lack
# an Age must be kept: a later cell fills Age with the per-Title median, and a
# blanket dropna() here would delete exactly the rows that fill is meant for.
required_columns = [column for column in train.columns if column != "Age"]
train = train.dropna(subset=required_columns)

In [None]:
# Encode the categorical columns as integer codes.
# Assign the result back to the column: `train[col].replace(..., inplace=True)`
# operates on an intermediate Series, which does not update `train` when pandas
# Copy-on-Write is active.
train['Sex'] = train['Sex'].replace({'male': 0, 'female': 1})
train['Embarked'] = train['Embarked'].replace({'S': 1, 'C': 2, 'Q': 3})

In [None]:
# Remove the columns that are not used as model features.
train.drop(columns=["Ticket", "PassengerId", "Fare"], inplace=True)

In [None]:
# Display the training frame after the Ticket, PassengerId and Fare columns were dropped.
train

**Test**

In [None]:
# Encode the categorical columns of the test frame with the same codes used for
# the training frame.
# Assign the result back to the column: `test[col].replace(..., inplace=True)`
# operates on an intermediate Series, which does not update `test` when pandas
# Copy-on-Write is active.
test['Sex'] = test['Sex'].replace({'male': 0, 'female': 1})
test['Embarked'] = test['Embarked'].replace({'S': 1, 'C': 2, 'Q': 3})

In [None]:
# Derive the Title column for the test rows from each passenger name.
test['Title'] = test['Name'].map(split_it)

In [None]:
# Remove the identifier and unused columns from the test frame.
test.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin', 'Fare'], inplace=True)

In [None]:
# Display the test frame after encoding and column removal.
test

In [None]:
# Fill each missing Age with the median Age of passengers sharing the same Title.
# Assign the result back to the column: `frame["Age"].fillna(..., inplace=True)`
# fills an intermediate Series, which leaves the frame unchanged when pandas
# Copy-on-Write is active.
# NOTE(review): the test frame is filled from medians computed on the test rows
# themselves; consider reusing the training medians so both frames share one rule.
train["Age"] = train["Age"].fillna(train.groupby("Title")["Age"].transform("median"))
test["Age"] = test["Age"].fillna(test.groupby("Title")["Age"].transform("median"))

In [None]:
# Title was only a helper for the Age fill; remove it from both frames.
for frame in (test, train):
    frame.drop(columns=["Title"], inplace=True)

In [None]:
# Display the test frame after the Age fill and removal of the helper Title column.
test

# **3.Model Selection**

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Feature matrix: every remaining training column except the target.
X = train.drop(columns=['Survived'])

In [None]:
# Target vector: the Survived column.
y = train.loc[:, 'Survived']

**Split_Data**

In [None]:
# Hold out 20% of the training rows for validation; the fixed seed keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Row counts of the four pieces of the split, in the order they were created.
print(*map(len, (X_train, y_train, X_test, y_test)))

**LogisticRegression**

In [None]:
from sklearn.linear_model import LogisticRegression
# Raise the solver's iteration cap: the features are fed in unscaled, and the
# default limit of 100 iterations can be reached before the optimiser
# converges, which leaves a partially fitted model behind a warning.
model = LogisticRegression(max_iter=1000)

In [None]:
# Fit the logistic regression on the training part of the split.
model.fit(X=X_train, y=y_train)

In [None]:
# Predict survival for the held-out rows.
predictions = model.predict(X=X_test)

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Hold-out accuracy of the logistic regression.
print(accuracy_score(y_true=y_test, y_pred=predictions))

**RandomForestClassifier**

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Seed the forest so the reported accuracy is reproducible across runs; the
# seed matches the one used for train_test_split.
model_2 = RandomForestClassifier(random_state=42)

In [None]:
# Fit the random forest on the training part of the split.
model_2.fit(X=X_train, y=y_train)

In [None]:
# Predict survival for the held-out rows with the random forest.
predictions_2 = model_2.predict(X=X_test)

In [None]:
# Hold-out accuracy of the random forest.
print(accuracy_score(y_true=y_test, y_pred=predictions_2))

**DecisionTreeClassifier**

In [None]:
from sklearn import tree
# Seed the tree so the reported accuracy is reproducible across runs; the seed
# matches the one used for train_test_split.
model_3 = tree.DecisionTreeClassifier(random_state=42)
model_3.fit(X_train, y_train)

In [None]:
# Predict the held-out rows with the decision tree and report its accuracy.
predictions_3 = model_3.predict(X=X_test)
print(accuracy_score(y_true=y_test, y_pred=predictions_3))

# **4.Train Data**

In [None]:
# Refit the logistic regression on all training rows before predicting the test set.
model.fit(X=X, y=y)

In [None]:
# Select the feature columns by name, in the order the model was fitted on, so
# the prediction does not depend on the test frame happening to share the
# training frame's column order.
pred = model.predict(test[X.columns])

In [None]:
# Pair each PassengerId from the sample submission with its predicted outcome.
# NOTE(review): this relies on gender_submission listing passengers in the same
# row order as the test frame - confirm.
submission = gender_submission[["PassengerId"]].assign(Survived=pred)

In [None]:
# Display the submission frame before it is written to disk.
submission

In [None]:
# Write the submission file without the index column.
submission.to_csv(path_or_buf='submission.csv', index=False)