In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        full_path = os.path.join(root, fname)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


## Loading the dataset

In [None]:
# Read the Titanic training and test splits from the competition input directory.
train = pd.read_csv(filepath_or_buffer='../input/titanic/train.csv')
test = pd.read_csv(filepath_or_buffer='../input/titanic/test.csv')
# The bare name below is this cell's displayed output.
train

In [None]:
train.head() # to print first 5 rows of the data set

In [None]:
train.shape # to print number of rows and columns

In [None]:
train.columns # to get the column names

In [None]:
train.describe() 

In [None]:
train.isnull().sum() # number of null values in dataset

In [None]:
test.shape

In [None]:
test.head()

In [None]:
test.isnull().sum()

In [None]:
train.Sex.value_counts()

In [None]:
train.groupby('Sex').Survived.value_counts()

In [None]:
sns.barplot(x='Sex', y='Survived', data=train)

In [None]:
train['Embarked'].value_counts()

In [None]:
train.groupby('Embarked').Survived.value_counts()

In [None]:
sns.countplot(x='Embarked', data=train)

In [None]:
train['Pclass'].value_counts()

In [None]:
sns.countplot(x='Pclass', data=train)

In [None]:
train['SibSp'].value_counts()

In [None]:
train.groupby('SibSp').Survived.value_counts()

In [None]:
sns.countplot(x='SibSp', data=train)

In [None]:
sns.countplot(x='Survived',hue='Pclass',data=train) # survival based on passenger class

In [None]:
# Histogram (30 bins) of the known passenger ages; rows with a missing Age are excluded.
# sns.distplot is deprecated, so its histogram-only equivalent sns.histplot is used
# (counts per bin, no KDE curve -- the same picture distplot(kde=False) produced).
sns.histplot(train['Age'].dropna(), bins=30)

In [None]:
sns.countplot(x='SibSp',data=train) # count based on sibling or spouse

In [None]:
train['Fare'].hist(bins=40,figsize=(10,4))

In [None]:
sns.countplot(x='Parch',data=train)

In [None]:
# Add the complement of Survived (1 where Survived is 0 and vice versa) as a 'Died' column.
train = train.assign(Died=1 - train['Survived'])

In [None]:
# Stacked bar chart of the mean Survived / Died value for each sex.
# Only the two plotted columns are aggregated: averaging every column of the
# frame would also hit the text columns (e.g. Name, Ticket), which raises a
# TypeError on pandas >= 2.0 where non-numeric columns are no longer dropped silently.
train.groupby('Sex')[['Survived', 'Died']].mean().plot(kind='bar', figsize=(10, 5), stacked=True)

Women were more likely to survive than men.

In [None]:
# Bar plot of the mean Survived value for each distinct Fare value.
# NOTE(review): Fare is treated as a continuous quantity elsewhere in this notebook
# (histogram, min-max scaling), so this draws one bar per distinct fare and may be
# hard to read -- consider binning Fare first. TODO confirm intent.
sns.barplot(x='Fare', y='Survived', data=train)

In [None]:
sns.barplot(x='SibSp',y='Survived',data=train)

## Feature extraction

In [None]:
# Collect every distinct title in the training names: the text between the
# first comma and the following period, with surrounding whitespace removed.
titles = {
    full_name.split(',')[1].split('.')[0].strip()
    for full_name in train['Name']
}
print(titles)

In [None]:
# Maps each raw title extracted from a passenger name to one of five groups:
# 'Mr', 'Mrs', 'Miss', 'Master' or 'Other'.
titles_dict = {
    'Mrs': 'Mrs',
    'Major': 'Other',
    'Master': 'Master',
    'Lady': 'Other',
    'Mlle': 'Miss',
    'Dr': 'Other',
    'Col': 'Other',
    'Capt': 'Other',
    'Don': 'Other',
    'the Countess': 'Other',
    'Mme': 'Mrs',
    'Miss': 'Miss',
    'Jonkheer': 'Other',
    'Rev': 'Other',
    'Sir': 'Other',
    'Ms': 'Miss',
    'Mr': 'Mr',
}


In [None]:
# Derive a grouped Title feature: pull the raw title out of each name (the text
# between the first comma and the next period), then collapse it via titles_dict.
train['Title'] = (
    train['Name']
    .map(lambda full_name: full_name.split(',')[1].split('.')[0].strip())
    .map(titles_dict)
)
train.head()

In [None]:
# Modelling frame: the training data without the identifier/text columns and
# without the helper 'Died' column.
df1 = train.drop(columns=['Name', 'Ticket', 'Cabin', 'PassengerId', 'Died'])
df1.head()

In [None]:
# Encode the categorical columns as integer codes for the model.
# Series.map leaves any value that is not a key of the mapping (including a
# missing value) as NaN.
df1['Sex'] = df1['Sex'].map({'female': 0, 'male': 1})
df1['Title'] = df1['Title'].map({"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Other": 5})
# The former 'nan': 'NaN' entry was removed: a missing Embarked value is a float
# NaN, not the string 'nan', so the entry never matched -- and if it ever had,
# it would have put a string into an otherwise numeric column.
df1['Embarked'] = df1['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})

In [None]:
df1

In [None]:
train.Title.value_counts()

In [None]:
# Mean Age per sex code. Sex was encoded earlier as female=0 / male=1, so despite
# the names, `mean_men` is the mean over Sex == 0 rows and `mean_female` the mean
# over Sex == 1 rows. The imputation that follows pairs each value with the same
# sex code it was computed from.
mean_men = df1.loc[df1['Sex'] == 0, 'Age'].mean()
mean_female = df1.loc[df1['Sex'] == 1, 'Age'].mean()

In [None]:
# Fill each missing Age with the mean computed for rows of the same Sex code.
df1.loc[(df1['Sex'] == 1) & df1['Age'].isnull(), 'Age'] = mean_female
df1.loc[(df1['Sex'] == 0) & df1['Age'].isnull(), 'Age'] = mean_men

In [None]:
# Remove, in place, every row of df1 that still contains a missing value in any column.
df1.dropna(inplace=True)

In [None]:
df1.isnull().sum()

## Feature scaling

In [None]:
# Min-max scale Age and Fare: (value - column minimum) / (column maximum - column minimum).
df1['Age'] = (df1['Age'] - df1['Age'].min()) / (df1['Age'].max() - df1['Age'].min())
df1['Fare'] = (df1['Fare'] - df1['Fare'].min()) / (df1['Fare'].max() - df1['Fare'].min())

In [None]:
df1.head()

## Data modelling

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 25% of df1 for evaluation; the split is stratified on Survived and
# seeded (random_state=0) so it is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    df1.drop(columns=['Survived']),
    df1['Survived'],
    test_size=0.25,
    stratify=df1['Survived'],
    random_state=0,
)

## Logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic-regression classifier with default settings, fit on the training split.
clf = LogisticRegression()
clf.fit(X=x_train, y=y_train)

In [None]:
from sklearn.metrics import accuracy_score
# Predict the held-out split and report the fraction of labels predicted correctly.
y_pred = clf.predict(X=x_test)
accuracy_score(y_true=y_test, y_pred=y_pred)


In [None]:
from sklearn.metrics import confusion_matrix
matrix=confusion_matrix(y_test, y_pred)
matrix

In [None]:
# Heatmap of the confusion matrix with the count written in every cell.
# fmt='d' prints the counts as plain integers; the default annotation format
# ('.2g') would show a count of 100 or more in scientific notation.
sns.heatmap(matrix, annot=True, fmt='d')

## Cleaning the test dataset

In [None]:
test.info()

In [None]:
test.isnull().sum()

In [None]:
test.head()

In [None]:
# Collect every distinct title in the test names: the text between the first
# comma and the following period, with surrounding whitespace removed.
titles = {
    full_name.split(',')[1].split('.')[0].strip()
    for full_name in test['Name']
}
print(titles)

In [None]:
titles_dict

In [None]:
# Same Title feature as for the training data: extract the raw title from each
# name, then group it via titles_dict (a title that is not a key becomes NaN).
test['Title'] = (
    test['Name']
    .map(lambda full_name: full_name.split(',')[1].split('.')[0].strip())
    .map(titles_dict)
)
test.head()

In [None]:
# Feature frame for the test data: identifier and text columns removed.
df2 = test.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])

In [None]:
# Encode the categorical columns with the same integer codes used for the
# training frame. Series.map leaves any value that is not a key as NaN.
df2['Sex'] = df2['Sex'].map({'female': 0, 'male': 1})
# The former 'nan': 'Nan' entry was removed: a missing Embarked value is a float
# NaN, not the string 'nan', so the entry never matched -- and if it ever had,
# it would have put a string into an otherwise numeric column.
df2['Embarked'] = df2['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})
df2['Title'] = df2['Title'].map({"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Other": 5})


In [None]:
df2.head()

In [None]:
df2.isnull().sum()

In [None]:
# Mean Age per sex code in the test frame. Sex is encoded female=0 / male=1, so
# despite the names, `mean_female` is the mean over Sex == 1 rows and `mean_men`
# the mean over Sex == 0 rows.
mean_female = df2.loc[df2['Sex'] == 1, 'Age'].mean()
mean_men = df2.loc[df2['Sex'] == 0, 'Age'].mean()

In [None]:
# Fill each missing Age with the mean computed for rows of the same Sex code.
df2.loc[(df2['Sex'] == 1) & df2['Age'].isnull(), 'Age'] = mean_female
df2.loc[(df2['Sex'] == 0) & df2['Age'].isnull(), 'Age'] = mean_men

In [None]:
df2.isnull().sum()

In [None]:
# Replace any missing Fare with the mean of the known Fare values in df2.
df2.loc[df2['Fare'].isnull(), 'Fare'] = df2['Fare'].mean()

In [None]:
df2.isnull().sum()

In [None]:
df2[df2.Title.isnull()]

In [None]:
# Rows whose title was not covered by the mappings have a NaN Title; assign them
# code 3, the code the Title mapping uses for 'Mrs'.
df2.loc[df2['Title'].isnull(), 'Title'] = 3


In [None]:
df2.isnull().sum()

In [None]:
df2.head()

In [None]:
# Min-max scale the test ages with the TRAINING range, not the test set's own.
# clf was fit on df1, whose Age column was scaled with df1's min/max; scaling the
# test data with a different range would map the same raw age to a different
# model input. df1 itself is already scaled, so the range is taken from the raw
# ages in `train` for the rows that were kept in df1. Test values outside the
# training range fall slightly outside [0, 1].
train_age = train.loc[df1.index, 'Age']
df2['Age'] = (df2['Age'] - train_age.min()) / (train_age.max() - train_age.min())

In [None]:
# Min-max scale the test fares with the TRAINING range, not the test set's own.
# clf was fit on df1, whose Fare column was scaled with df1's min/max; scaling the
# test data with a different range would map the same raw fare to a different
# model input. df1 itself is already scaled, so the range is taken from the raw
# fares in `train` for the rows that were kept in df1.
train_fare = train.loc[df1.index, 'Fare']
df2['Fare'] = (df2['Fare'] - train_fare.min()) / (train_fare.max() - train_fare.min())

In [None]:
df2.head()

## Prediction

In [None]:
# Predict Survived for every row of the cleaned test features with the fitted model.
# NOTE(review): assumes df2 has the same feature columns, in the same order, as the
# frame clf was fit on -- this depends on the CSV column layout; TODO confirm.
pred=clf.predict(df2)

In [None]:
pred

In [None]:
# Pair each test PassengerId with its predicted Survived label and write the
# two-column submission file (no index column).
submit = test[['PassengerId']].assign(Survived=pred)
submit.to_csv('./submission.csv', index=False)

In [None]:
pred_df = pd.read_csv('./submission.csv')

In [None]:
sns.countplot(x='Survived', data=pred_df)