In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

**Loading Dataset**

In [None]:
# Load the Kaggle Titanic training and test splits into two DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('/kaggle/input/titanic/train.csv',
                     '/kaggle/input/titanic/test.csv')
)


**Looking into training data**

In [None]:
# Preview the first rows of the training frame.
train.head()


In [None]:
# (rows, columns) of the training frame.
train.shape

In [None]:
# Summary statistics for the numeric columns.
train.describe()

In [None]:
# Summary for the object-dtype (string) columns only.
train.describe(include=['O'])

In [None]:
# Number of missing values in each training column.
train.isnull().sum()

**Looking into testing data**

In [None]:
# (rows, columns) of the test frame.
test.shape

In [None]:
# Preview the first rows of the test frame.
test.head()

In [None]:
# Column dtypes and non-null counts for the test frame.
test.info()

In [None]:
# Number of missing values in each test column.
test.isnull().sum()

**Relationship between different features and Survival**

In [None]:
survived = train[train['Survived'] == 1]
not_survived = train[train['Survived'] == 0]

In [None]:
print ("Survived: {}({:.1f}%)".format(len(survived), float(len(survived)/len(train)*100.0)))
print ("Not Survived: {}({:.1f}%)".format(len(not_survived), float(len(not_survived)/len(train)*100.0)))
print ("Total:{}".format(len(train)))

**Pclass v/s Survival**

In [None]:
train.Pclass.value_counts()

In [None]:
train.groupby('Pclass').Survived.value_counts()

In [None]:
train[['Pclass', 'Survived']].groupby('Pclass',as_index=False).Survived.mean()

In [None]:
sns.barplot(x='Pclass', y='Survived', data=train)

**Sex v/s Survival**

In [None]:
train.Sex.value_counts()

In [None]:
train.groupby('Sex').Survived.value_counts()

In [None]:
train[['Sex', 'Survived']].groupby(['Sex'], as_index=False).mean()

In [None]:
sns.barplot(x='Sex', y='Survived', data=train)


**Sex and Pclass v/s Survival**

In [None]:
tab = pd.crosstab(train['Pclass'], train['Sex'])
print (tab)

In [None]:
tab.div(tab.sum(1).astype(float), axis=0).plot(kind="bar", stacked=True)
plt.xlabel('Pclass')
plt.ylabel('Percentage')


In [None]:
# Survival rate by sex, one line per ticket class.
# `catplot` defaults to kind='strip', which for a binary target only draws
# jittered dots at 0 and 1. kind='point' plots the group mean (the survival
# rate) with its confidence interval, which is what this comparison needs.
sns.catplot(x='Sex', y='Survived', hue='Pclass', kind='point', height=4, aspect=2, data=train)

**Pclass, Sex and Embarked v/s Survival**

In [None]:
# Survival rate by ticket class, split by sex, one panel per embarkation port.
# kind='point' is set explicitly: the `catplot` default (kind='strip') would
# only scatter the raw 0/1 outcomes instead of showing the mean survival rate.
sns.catplot(x='Pclass', y='Survived', hue='Sex', col='Embarked', kind='point', data=train)

**Embarked v/s Survival**

In [None]:
train.Embarked.value_counts()

In [None]:
train.groupby('Embarked').Survived.value_counts()


In [None]:
train[['Embarked', 'Survived']].groupby(['Embarked'], as_index=False).mean()

In [None]:
sns.barplot(x='Embarked', y='Survived', data=train)


**Parch v/s Survival**

In [None]:
train.Parch.value_counts()

In [None]:
train.groupby('Parch').Survived.value_counts()



In [None]:
train[['Parch','Survived']].groupby('Parch',as_index=False).mean()

In [None]:
sns.barplot(x='Parch', y='Survived', ci=None, data=train)

**SibSp v/s Survival**

In [None]:
train.SibSp.value_counts()

In [None]:
train.groupby('SibSp').Survived.value_counts()

In [None]:
train[['SibSp','Survived']].groupby('SibSp',as_index=False).mean()

In [None]:
sns.barplot(x='SibSp',y='Survived',data=train)

In [None]:
train.Age.value_counts()

In [None]:
train.groupby('SibSp').Survived.value_counts()

In [None]:
sns.barplot(x='Age',y='Survived',data=train)

In [None]:
# Three side-by-side panels: the Age distribution split by outcome, broken
# down by embarkation port, ticket class and sex respectively.
fig, (ax1, ax2, ax3) = plt.subplots(nrows=1, ncols=3, figsize=(15, 5))
violin_kwargs = dict(y="Age", hue="Survived", data=train, split=True)
sns.violinplot(x="Embarked", ax=ax1, **violin_kwargs)
sns.violinplot(x="Pclass", ax=ax2, **violin_kwargs)
sns.violinplot(x="Sex", ax=ax3, **violin_kwargs)

**Feature Extraction**

In [None]:
# Collect every distinct honorific in the training names: the text between
# the comma and the first period of each name.
titles = {
    full_name.split(',')[1].split('.')[0].strip(' ')
    for full_name in train['Name']
}
print(titles)

In [None]:
# Collapse the raw honorifics into five groups: Mr, Mrs, Miss, Master, Other.
title_dict = {
    # common titles and their spelling variants
    'Mr': 'Mr',
    'Mrs': 'Mrs',
    'Mme': 'Mrs',
    'Miss': 'Miss',
    'Mlle': 'Miss',
    'Ms': 'Miss',
    'Master': 'Master',
    # everything else is pooled into a single group
    'Major': 'Other',
    'Col': 'Other',
    'Capt': 'Other',
    'Dr': 'Other',
    'Rev': 'Other',
    'Don': 'Other',
    'Sir': 'Other',
    'Lady': 'Other',
    'the Countess': 'Other',
    'Jonkheer': 'Other',
}

In [None]:
train_test=[train,test]

In [None]:
for dataset in train_test:
    dataset['Title']=dataset['Name'].map(lambda name:name.split(',')[1].split('.')[0].strip() )
    dataset['Title']=dataset.Title.map(title_dict)  

In [None]:
# First rows of the training frame, now including the Title column.
train.head()

In [None]:
# Last rows of the training frame.
train.tail()

In [None]:
# Passenger counts for each grouped title, split by sex.
pd.crosstab(train['Title'],train['Sex'])

In [None]:
# Integer code for each grouped title, numbered from 1 in the order listed.
title_mapping = {
    title: code
    for code, title in enumerate(("Mr", "Miss", "Mrs", "Master", "Other"), start=1)
}

In [None]:
# Replace the grouped title strings with their integer codes, in place, on
# both frames. Run this cell once only: `title_mapping` is keyed by the title
# strings, so mapping the already-encoded column again turns every value
# into NaN.
for dataset in train_test:
    dataset['Title'] = dataset['Title'].map(title_mapping)
    
# Preview the encoded Title column.
train.head()

In [None]:
for dataset in train_test:
    train['Title']=train['Title'].fillna(0)

In [None]:
train.head()

**Cleaning the dataset**

In [None]:
df1=train.drop(['Name','Ticket','PassengerId','Cabin','Embarked'],axis=1)
df1.head()

In [None]:
df1['Sex']=df1.Sex.map({'female':1,'male':0})

In [None]:
df1.head()

In [None]:
df1.isnull().sum()

In [None]:
mean_men=df1[df1['Sex']==0]['Age'].mean()
mean_fem=df1[df1['Sex']==1]['Age'].mean()

In [None]:
df1.loc[(df1.Age.isnull())&(df1['Sex']==1),'Age']=mean_fem
df1.loc[(df1.Age.isnull())&(df1['Sex']==0),'Age']=mean_men

In [None]:
df1.isnull().sum()

**Feature Scaling**

In [None]:
df1.Age=(df1.Age-min(df1.Age))/(max(df1.Age)-min(df1.Age))
df1.Fare=(df1.Fare-min(df1.Fare))/(max(df1.Fare)-min(df1.Fare))

In [None]:
df1.head()

**Data Modelling**

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 25% of the labelled rows for validation, stratified on the target
# so both parts keep the same class balance.
features = df1.drop(columns=['Survived'])
target = df1['Survived']
x_train, x_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.25,
    random_state=0,
    stratify=target,
)

**Logistic Regression**

In [None]:
from sklearn.linear_model import LogisticRegression
clf= LogisticRegression()
clf.fit(x_train,y_train)

In [None]:
from sklearn.metrics import accuracy_score
y_pred= clf.predict(x_test)
accuracy_score (y_test,y_pred)

In [None]:
from sklearn.metrics import confusion_matrix
mat=confusion_matrix(y_test,y_pred)
mat

In [None]:
# Annotated heatmap of the confusion matrix. fmt='d' prints the cell counts
# as plain integers; the default annotation format ('.2g') would render counts
# of 100 or more in scientific notation (e.g. 1.2e+02).
sns.heatmap(mat, annot=True, fmt='d')

**Cleaning the test dataset**

In [None]:
test.head()

In [None]:
df2=test.drop(['Name','Ticket','PassengerId','Cabin','Embarked'],axis=1)
df2.head()

In [None]:
df2['Sex']=df2.Sex.map({'male':0,'female':1})

In [None]:
df2.head()

In [None]:
df2.isnull().sum()

In [None]:
Mean_fem=df2[df2['Sex']==1]['Age'].mean()
Mean_men=df2[df2['Sex']==0]['Age'].mean()

In [None]:
df2.loc[(df2.Age.isnull())&(df2['Sex']==1),'Age']=Mean_fem
df2.loc[(df2.Age.isnull())&(df2['Sex']==0),'Age']=Mean_men

In [None]:
df2.isnull().sum()

In [None]:
df2[df2.Title.isnull()]

In [None]:
df2['Title']=df2.Title.fillna(3)  #filling with value corresponding to Mrs since age is 39

In [None]:
df2.isnull().sum()

In [None]:
df2['Fare']=df2.Fare.fillna(df2.Fare.median())

In [None]:
df2.isnull().sum()

In [None]:
df2.head()

In [None]:
df2['Age']=(df2.Age-min(df2.Age))/(max(df2.Age)-min(df2.Age))

In [None]:
df2['Fare']=(df2.Fare-min(df2.Fare))/(max(df2.Fare)-min(df2.Fare))

In [None]:
df2.head()

**Prediction**

In [None]:
pred=clf.predict(df2)

In [None]:
pred

In [None]:
submit=pd.DataFrame({'PassengerId':test['PassengerId'],'Survived':pred})
submit.to_csv('submit.csv',index=False)

In [None]:
pred_df=pd.read_csv('submit.csv')

In [None]:
sns.countplot(x='Survived',hue='Survived',data=pred_df)