In [None]:
#Packages in use
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Load the Kaggle Titanic training and test splits.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/titanic/train.csv', '../input/titanic/test.csv')
)

In [None]:
# Preview the first rows of the training split, then of the test split,
# with a printed row of asterisks separating the two tables.
display(train.head())
print('**************************************************************************************************')
display(test.head())

In [None]:
#defining functions to concatenate and divide the train and test datasets
#to avoid doing data cleaning and manipulation twice

def conct(train,test):
    """Stack ``train`` on top of ``test`` and renumber the rows 0..n-1.

    Columns are sorted alphabetically (``sort=True``); the old row labels
    of both inputs are discarded.
    """
    stacked = pd.concat([train, test], sort=True)
    return stacked.reset_index(drop=True)

def devide(df, n_train=891):
    """Split a combined frame back into its train and test parts.

    Parameters
    ----------
    df : DataFrame
        Combined frame with the training rows first, followed by the test
        rows (the layout ``conct`` produces).
    n_train : int, default 891
        Number of leading rows that belong to the training part. The
        default matches the row labels ``:890`` / ``891:`` that used to be
        hard-coded here.

    Returns
    -------
    tuple of (DataFrame, DataFrame)
        The first ``n_train`` rows, and the remaining rows with the
        ``Survived`` column dropped. The split is positional, so it does
        not depend on the row labels; both parts are new frames, so later
        column assignments on them do not write through to ``df``.
    """
    train_part = df.iloc[:n_train].copy()
    test_part = df.iloc[n_train:].drop(['Survived'], axis=1)
    return train_part, test_part

In [None]:
# Combine both splits into one frame so cleaning and feature engineering run once.
df=conct(train,test)

## EDA

In [None]:
# Number of rows and columns in the train dataset
train.shape

In [None]:
# Number of rows and columns in the test dataset
test.shape

In [None]:
# Null values present in the train dataset, counted per column
train.isna().sum()

In [None]:
# Null values present in the test dataset, counted per column
test.isna().sum()

# Cleaning data

### Dealing with the null values

### AGE

In [None]:
#Age
#Age varies with Pclass:
#passengers in the higher classes are older than passengers in the lower classes.
#Age also varies with Sex,
#so the missing values are filled with the median Age of each (Pclass, Sex) group.

In [None]:
# Median Age for every (Pclass, Sex) combination in the training split
train.groupby(['Pclass','Sex'])['Age'].median()

In [None]:
# Fill each missing Age with the median Age of the passenger's (Pclass, Sex) group.
# transform() returns a Series aligned to df's own index, so the assignment is
# safe. groupby(...).apply() can prepend the group keys to the result's index
# (the default in pandas >= 2.0), which makes this column assignment fail.
df['Age']=df.groupby(['Pclass','Sex'])['Age'].transform(lambda ages: ages.fillna(ages.median()))

### FARE

In [None]:
#Fare (the missing value is in the test data set)
df[df['Fare'].isna()]

In [None]:
# The passenger with the missing fare: class 3, embarked at S, SibSp == 0.
# Use the median fare of all passengers matching the same criteria as the fill value.
missingFare = df.loc[
    (df['Pclass'] == 3) & (df['Embarked'] == "S") & (df['SibSp'] == 0),
    'Fare',
].median()

In [None]:
# Impute the missing Fare with the group median computed above.
df.loc[df['Fare'].isnull(),'Fare']=missingFare
# Only the last expression of a cell is displayed, so show the fill value here.
missingFare

### CABIN

In [None]:
# The Cabin variable has a lot of missing values,
# but it is an important predictor when considering the
# structure of the Titanic.
# Hence we use Cabin to create a new variable "Deck".
# The missing values are given 'M'.

In [None]:
# Keep only the first letter of each Cabin value as the Deck; rows without a
# cabin get 'M' (missing). The vectorized .str accessor replaces a
# per-element lambda and leaves missing values as NaN for fillna to handle.
df['Deck']=df['Cabin'].str[0].fillna('M')

In [None]:
# Mean of Survived per Deck letter, drawn as a bar chart
df.groupby('Deck')['Survived'].mean().plot.bar(figsize=(15,7))

In [None]:
# The decks are going to be grouped as ABC, DE, FG and M;
# first look at the rows on deck 'T', which belongs to none of those groups.
df.loc[df['Deck'].eq('T')]

In [None]:
# Fold deck 'T' into 'A'. Selecting by value instead of by a hard-coded row
# label keeps this correct even if the row order of df changes.
df.loc[df['Deck']=='T','Deck']='A'

# Collapse the individual deck letters into the groups ABC, DE and FG;
# 'M' (missing cabin) is left unchanged.
deck_groups = {
    'A': 'ABC', 'B': 'ABC', 'C': 'ABC',
    'D': 'DE', 'E': 'DE',
    'F': 'FG', 'G': 'FG',
}
df['Deck']=df['Deck'].replace(deck_groups)

df['Deck'].value_counts()

### Embarked

In [None]:
# The 2 null values in Embarked were found to be 'S'
# by searching the passengers' names on Google.
df['Embarked'] = df['Embarked'].fillna('S')

In [None]:
# Re-check the null counts: Age, Fare and Embarked were filled above.
# NOTE(review): Cabin itself was never filled (Deck stands in for it) and
# Survived is presumably still null for the test rows — confirm against the output.
df.isnull().sum()

# Feature Engineering

In [None]:
# Collect the distinct titles found in the training names; a new Title column
# built from them reflects the socio-economic status of an individual.
titles = {name.split(',')[1].split('.')[0].strip() for name in train['Name']}
print(titles)

In [None]:
# Maps every raw title extracted from a Name to one of six buckets:
# Officer, Royalty, Mr, Mrs, Miss, Master. A title that is not a key here
# becomes NaN when a column is mapped through this dictionary.
Title_Dictionary = {
    "Capt": "Officer",
    "Col": "Officer",
    "Major": "Officer",
    "Dr": "Officer",
    "Rev": "Officer",
    "Jonkheer": "Royalty",
    "Don": "Royalty",
    # NOTE(review): "Dona" is assumed to be the unmapped title behind the null
    # Title that shows up in the test rows — confirm against the printed test titles.
    "Dona": "Royalty",
    "Sir": "Royalty",
    "the Countess": "Royalty",
    "Lady": "Royalty",
    "Mme": "Mrs",
    "Ms": "Mrs",
    "Mrs": "Mrs",
    "Mlle": "Miss",
    "Miss": "Miss",
    "Mr": "Mr",
    "Master": "Master",
}

In [None]:
# Extract the title (the text between the first ',' and the following '.')
# from every Name, then bucket it through Title_Dictionary.
df['Title'] = (
    df['Name']
    .str.split(',').str[1]
    .str.split('.').str[0]
    .str.strip()
    .map(Title_Dictionary)
)
df.head()

In [None]:
# Divide the combined frame back into its train part (df1) and test part (df2).
df1,df2=devide(df)

In [None]:
# Remove the columns that are not used as model features.
df1 = df1.drop(columns=['Name','Ticket','Cabin','PassengerId'])
df1.head()

In [None]:
#converting categorical features into numeric values
df1.Sex=df1.Sex.map({'female':0, 'male':1})
df1.Embarked=df1.Embarked.map({'S':0, 'C':1, 'Q':2,})
df1.Title=df1.Title.map({'Mr':0, 'Miss':1, 'Mrs':2,'Master':3,'Officer':4,'Royalty':5})
df1.Deck=df1.Deck.map({'FG':0,'DE':1,'ABC':2,'M':3})

In [None]:
# Preview the training frame after encoding
df1.head()

In [None]:
# Null count per column of the training frame after cleaning and encoding
df1.isna().sum()

# Feature scaling

In [None]:
# Min-max scale Age and Fare of the training rows to the range [0, 1].
# Series.min()/.max() are vectorized and skip NaN, whereas the Python
# builtins min()/max() iterate over the Series element by element.
age_min, age_max = df1.Age.min(), df1.Age.max()
fare_min, fare_max = df1.Fare.min(), df1.Fare.max()
df1.Age = (df1.Age - age_min) / (age_max - age_min)
df1.Fare = (df1.Fare - fare_min) / (fare_max - fare_min)

In [None]:
df1.describe()

# Data modelling

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the labelled rows for evaluation; stratifying on Survived
# keeps the class ratio the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    df1.drop(columns='Survived'),
    df1['Survived'],
    test_size=0.2,
    random_state=0,
    stratify=df1['Survived'],
)

In [None]:
# Logistic regression: fit on the training part, then report accuracy on the hold-out part
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

clf = LogisticRegression().fit(X_train, y_train)

Y_pred = clf.predict(X_test)
accuracy_score(y_test, Y_pred)

# Confusion matrix

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the hold-out labels against the predictions
cm = confusion_matrix(y_true=y_test, y_pred=Y_pred)
cm

In [None]:
# fmt='d' prints the counts as plain integers; the default annotation format
# renders counts of 100 or more in scientific notation. Axis labels make the
# figure readable on its own (rows of cm are true labels, columns predictions).
sns.heatmap(cm,annot=True,fmt='d').set(
    xlabel='Predicted',
    ylabel='Actual',
    title='Confusion matrix (hold-out split)',
);

In [None]:
# Test data set: preview df2, the test part returned by devide()
df2.head()

In [None]:
# Distinct titles that occur in the test names
titles = {name.split(',')[1].split('.')[0].strip() for name in df2['Name']}
print(titles)

In [None]:
# Recompute Title for the test rows: extract the title from every Name and
# bucket it through Title_Dictionary (titles missing from it become NaN).
df2['Title'] = (
    df2['Name']
    .str.split(',').str[1]
    .str.split('.').str[0]
    .str.strip()
    .map(Title_Dictionary)
)
df2.head()

In [None]:
# Drop the columns that are not used as model features
df2 = df2.drop(columns=['PassengerId','Name','Ticket','Cabin'])

In [None]:
# Converting categorical features to numeric codes
df2.Sex=df2.Sex.map({'female':0, 'male':1})
# Only real port codes are mapped. A string key 'nan' never matches a missing
# value (NaN), and mapping it to the string 'nan' would put text into an
# otherwise numeric column, so no such entry belongs here.
df2.Embarked=df2.Embarked.map({'S':0, 'C':1, 'Q':2})
df2.Title=df2.Title.map({'Mr':0, 'Miss':1, 'Mrs':2,'Master':3,'Officer':4,'Royalty':5})
df2.Deck=df2.Deck.map({'FG':0,'DE':1,'ABC':2,'M':3})
df2.head()

In [None]:
# Checking for null values, counted per column
df2.isna().sum()

In [None]:
# Rows whose Title is null
df2.loc[df2['Title'].isna()]

In [None]:
# Fill a missing Title with 2, the code used for 'Mrs' in the Title encoding.
# The fill is restricted to the Title column: a blanket fillna(2) would also
# silently overwrite a null in any other feature with a meaningless 2.
df2=df2.fillna({'Title': 2})

In [None]:
# Re-check the null counts of the test frame after the fill above
df2.isnull().sum()

In [None]:
# Preview the cleaned test dataset
df2.head()

In [None]:
# feature scaling
# Scale the test features with the TRAINING rows' min/max, the same
# transformation the model's training features went through. Scaling the test
# split with its own min/max would turn identical raw ages or fares into
# different model inputs. The raw (imputed, unscaled) training values are
# taken from the combined frame df, whose first len(train) rows are the
# training rows. Test values outside the training range fall outside [0, 1].
train_age = df['Age'].iloc[:len(train)]
train_fare = df['Fare'].iloc[:len(train)]
df2.Age = (df2.Age - train_age.min()) / (train_age.max() - train_age.min())
df2.Fare = (df2.Fare - train_fare.min()) / (train_fare.max() - train_fare.min())

In [None]:
# Preview the test dataset after feature scaling
df2.head()

# Prediction

In [None]:
# Predict with the fitted classifier on the prepared test features.
# NOTE(review): assumes df2 has the same feature columns, in the same order, as X_train — confirm.
pred = clf.predict(df2)

In [None]:
# Inspect the raw predictions
pred

In [None]:
# Cast the predictions to int.
# NOTE(review): presumably they come back as floats because Survived is float
# in the combined frame — confirm against the output above.
pred1=pred.astype(int)

In [None]:
# Inspect the integer predictions
pred1

In [None]:
# Pair every test PassengerId with its predicted Survived value.
submission = test[["PassengerId"]].assign(Survived=pred1)


In [None]:
# Visualize how many passengers were predicted for each Survived value
sns.countplot(data=submission, x='Survived')