Loading Modules

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

Exploratory Data Analysis

In [None]:
# Load the training CSV into a DataFrame.
train = pd.read_csv(filepath_or_buffer='../input/titanic/train.csv')


Looking into training dataset

In [None]:
# Preview the first five rows of the training frame.
train.head(n=5)

In [None]:
# (number of rows, number of columns) of the training frame.
train.shape


In [None]:
# Column labels available in the training frame.
train.columns

In [None]:
# Count the missing values in each column.
train.isna().sum()

Exploring the target and feature distributions in the training dataset

In [None]:
# Tally how many rows carry each value of the Survived column.
train.Survived.value_counts()

In [None]:
# Bar chart of the Survived value counts.
sns.countplot(data=train, x='Survived')

In [None]:
# Tally how many rows carry each value of the Sex column.
train.Sex.value_counts()

In [None]:
# Survived counts, split into one bar per Sex value.
sns.countplot(
    data=train,
    x='Survived',
    hue='Sex',
    palette='winter',
)

In [None]:
# Tally how many rows carry each value of the Pclass column.
train.Pclass.value_counts()

In [None]:
# Survived counts, split into one bar per Pclass value.
sns.countplot(
    data=train,
    x='Survived',
    hue='Pclass',
    palette='PuBu',
)

In [None]:
# Histogram of passenger ages (no KDE overlay). `sns.distplot` is deprecated
# in seaborn; `histplot` is its replacement for plain histograms.
sns.histplot(train['Age'])

In [None]:
# Bar chart of the SibSp value counts.
sns.countplot(data=train, x='SibSp', palette='rocket')

In [None]:
# Histogram of the Parch column using the pandas plotting accessor.
train['Parch'].plot(kind='hist')

Cleaning the train data

In [None]:
# Print dtypes and non-null counts for every column of the training frame.
train.info()

In [None]:
# Count the missing values in each column before cleaning.
train.isna().sum()

In [None]:
# Distribution of Age within each Pclass, used to motivate per-class imputation.
sns.boxplot(data=train, x='Pclass', y='Age')

In [None]:
# Print the mean Age of classes 1, 2 and 3, one value per line.
print(
    *(train.loc[train['Pclass'] == klass, 'Age'].mean() for klass in (1, 2, 3)),
    sep='\n',
)

In [None]:
def fill_in_na_values(cols, class_mean_age=None):
    """Return a passenger's age, imputing a missing one from the class mean.

    Parameters
    ----------
    cols : pandas.Series or sequence
        Two values, in order: the passenger's age and passenger class.
    class_mean_age : mapping, optional
        Lookup from passenger class to mean age. When omitted it is computed
        from the notebook-level ``train`` frame, as the original code did.

    Returns
    -------
    number
        ``age`` unchanged when it is present; otherwise the class mean rounded
        to the nearest integer. A missing age is returned as-is when no usable
        mean exists for the class, instead of silently yielding ``None``.
    """
    # Unpack by position via iteration: integer keys such as ``cols[0]`` are
    # interpreted as labels on a label-indexed row in newer pandas versions.
    age, pclass = list(cols)[:2]
    if not pd.isnull(age):
        return age

    if class_mean_age is None:
        class_mean_age = train.groupby('Pclass')['Age'].mean().to_dict()

    mean_age = class_mean_age.get(pclass)
    if mean_age is None or pd.isnull(mean_age):
        # Unknown class, or a class whose ages are all missing: leave the gap.
        return age
    return round(mean_age)
# Run the imputer over every (Age, Pclass) row and store the result as Age.
train['Age'] = train[['Age', 'Pclass']].apply(fill_in_na_values, axis='columns')

In [None]:
# Re-count the missing values per column after filling Age.
train.isna().sum()

In [None]:
# Heatmap of the boolean missing-value mask (rows x columns).
sns.heatmap(data=train.isna())

In [None]:
# Preview the first five rows after the Age imputation.
train.head(n=5)

In [None]:
# Remove the Cabin column from the training frame in place.
train.drop(columns=['Cabin'], inplace=True)

In [None]:
# Preview the first five rows now that Cabin is gone.
train.head(n=5)

In [None]:
# Count the missing values that remain in each column.
train.isna().sum()

In [None]:
# Drop, in place, every row that still contains at least one missing value.
train.dropna(axis=0, how='any', inplace=True)

In [None]:
# Confirm the per-column missing-value counts after dropping rows.
train.isna().sum()

In [None]:
# Preview the first five rows of the cleaned frame.
train.head(n=5)

In [None]:
# Remove the PassengerId, Name and Ticket columns in place.
train.drop(columns=['PassengerId', 'Name', 'Ticket'], inplace=True)

In [None]:
# Preview the first five rows of the remaining columns.
train.head(n=5)

In [None]:
# Distinct values present in the Sex column.
train.Sex.unique()

In [None]:
# Distinct values present in the Embarked column.
train.Embarked.unique()

In [None]:
# Encode the two categorical columns as integers for the model.
# The former 'nan' -> 'NaN' entry was dropped: real missing values are float
# NaN (never the string 'nan'), and mapping to a string would have put a
# non-numeric value into a numeric feature column.
train['Sex'] = train['Sex'].map({'female': 0, 'male': 1})
train['Embarked'] = train['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})

In [None]:
# Preview the first five rows after encoding Sex and Embarked.
train.head(n=5)

In [None]:
# Scaling the data: min-max normalise Age and Fare into the [0, 1] range.
# Series.min()/.max() are vectorised and skip NaN, whereas the Python builtins
# iterate element by element and give unreliable results if a NaN is present.
train['Age'] = (train['Age'] - train['Age'].min()) / (train['Age'].max() - train['Age'].min())
train['Fare'] = (train['Fare'] - train['Fare'].min()) / (train['Fare'].max() - train['Fare'].min())

In [None]:
# Preview the first five rows after scaling Age and Fare.
train.head(n=5)

Modeling the data

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for evaluation, keeping the Survived class balance
# the same in both parts (stratified) and fixing the seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    train.drop(columns=['Survived']),
    train['Survived'],
    test_size=0.2,
    random_state=0,
    stratify=train['Survived'],
)

In [None]:
# Logistic regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Fit a logistic regression with default settings on the training part.
clf = LogisticRegression().fit(X_train, y_train)

# Score the held-out part: fraction of labels predicted correctly.
Y_pred = clf.predict(X_test)
accuracy_score(y_true=y_test, y_pred=Y_pred)

Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix

# Rows are the true labels, columns the predicted labels.
cm = confusion_matrix(y_true=y_test, y_pred=Y_pred)
cm

In [None]:
# Annotated confusion-matrix heatmap. fmt='d' prints the integer counts as-is;
# the default format switches to scientific notation for counts of 100 or more.
sns.heatmap(cm, annot=True, fmt='d')

Loading and Cleaning test dataset

In [None]:
# Load the test CSV into a DataFrame and preview its first five rows.
test = pd.read_csv(filepath_or_buffer='../input/titanic/test.csv')
test.head(n=5)

In [None]:
# (number of rows, number of columns) of the test frame.
test.shape

In [None]:
# Print dtypes and non-null counts for every column of the test frame.
test.info()

In [None]:
# Count the missing values in each column of the test frame.
test.isna().sum()

Imputing missing values in the test dataset

In [None]:
def fill_in_na_values(cols, class_mean_age=None):
    """Return a passenger's age, imputing a missing one from the class mean.

    This definition replaces the earlier function of the same name and reads
    its default statistics from the notebook-level ``test`` frame.

    Parameters
    ----------
    cols : pandas.Series or sequence
        Two values, in order: the passenger's age and passenger class.
    class_mean_age : mapping, optional
        Lookup from passenger class to mean age. When omitted it is computed
        from the notebook-level ``test`` frame, as the original code did.

    Returns
    -------
    number
        ``age`` unchanged when it is present; otherwise the class mean rounded
        to the nearest integer. A missing age is returned as-is when no usable
        mean exists for the class, instead of silently yielding ``None``.
    """
    # Unpack by position via iteration: integer keys such as ``cols[0]`` are
    # interpreted as labels on a label-indexed row in newer pandas versions.
    age, pclass = list(cols)[:2]
    if not pd.isnull(age):
        return age

    if class_mean_age is None:
        class_mean_age = test.groupby('Pclass')['Age'].mean().to_dict()

    mean_age = class_mean_age.get(pclass)
    if mean_age is None or pd.isnull(mean_age):
        # Unknown class, or a class whose ages are all missing: leave the gap.
        return age
    return round(mean_age)
# Run the imputer over every (Age, Pclass) row of the test frame.
test['Age'] = test[['Age', 'Pclass']].apply(fill_in_na_values, axis='columns')

In [None]:
# Preview the first five rows of the test frame after filling Age.
test.head(n=5)

In [None]:
# Re-count the missing values per column after filling Age.
test.isna().sum()

In [None]:
# Remove the Cabin column from the test frame in place.
test.drop(columns=['Cabin'], inplace=True)

In [None]:
# Count the missing values that remain in each column.
test.isna().sum()

In [None]:
# Replace missing Fare values with the median Fare of the test frame.
test['Fare'] = test.Fare.fillna(value=test.Fare.median())

In [None]:
# Feature frame for prediction; `test` itself keeps PassengerId for the
# submission file built later.
df = test.drop(columns=['PassengerId', 'Name', 'Ticket'])

In [None]:
# Preview the first five rows of the prediction features.
df.head(n=5)

In [None]:
# Encode the two categorical columns with the same integer codes used for the
# training frame. The former 'nan' -> 'NaN' entry was dropped: real missing
# values are float NaN (never the string 'nan'), and mapping to a string would
# have put a non-numeric value into a numeric feature column.
df['Sex'] = df['Sex'].map({'female': 0, 'male': 1})
df['Embarked'] = df['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})

In [None]:
# Min-max normalise Age and Fare into the [0, 1] range.
# Series.min()/.max() are vectorised and skip NaN, whereas the Python builtins
# iterate element by element and give unreliable results if a NaN is present.
# NOTE(review): this uses the test frame's own min/max, while the model was fit
# on features scaled with the training frame's range; consider reusing the
# training statistics so both sets share one scale.
df['Age'] = (df['Age'] - df['Age'].min()) / (df['Age'].max() - df['Age'].min())
df['Fare'] = (df['Fare'] - df['Fare'].min()) / (df['Fare'].max() - df['Fare'].min())

In [None]:
# Preview the first five rows after encoding and scaling.
df.head(n=5)

In [None]:
# Predict a Survived label for every row of the prepared test features.
pred = clf.predict(X=df)

In [None]:
# Display the array of predicted labels.
pred

In [None]:
# Pair each test PassengerId with its predicted label and write the result
# to disk without the index column.
submission = test[["PassengerId"]].assign(Survived=pred)
submission.to_csv(path_or_buf='submission.csv', index=False)

In [None]:
# Read the written submission file back in.
pred_df = pd.read_csv(filepath_or_buffer='submission.csv')

In [None]:
# Bar chart of the predicted Survived value counts.
sns.countplot(data=pred_df, x='Survived')