**Titanic Machine Learning Using Logistic Regression**

> 1. Import Libraries
> 2. Load Data
> 3. Visualization Of Data
> 4. Filling the Missing Values
> 5. Feature Engineering
> 6. Visualization by the Graph
> 7. Convert To Categorical
> 8. Prepare Train And Test Data
> 9. Run the Model

**1. Import Libraries**

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy as sp

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score


from sklearn.preprocessing import OneHotEncoder
from sklearn import model_selection
from sklearn import metrics
import pylab as pl
import random  
import warnings
# Silence every warning for the rest of the session. Note that this is a
# blanket filter: deprecation and convergence warnings raised by later cells
# are hidden as well.
warnings.filterwarnings('ignore')

**2. Load Data**

In [1]:
import os

# Print the full path of every file found anywhere below the input directory.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

In [1]:
# Read train.csv into the `traind` DataFrame.
traind = pd.read_csv('../input/titanic/train.csv')

In [1]:
# Read test.csv into the `testd` DataFrame.
testd = pd.read_csv('../input/titanic/test.csv')

In [1]:
# (rows, columns) of the test table.
testd.shape

In [1]:
# (rows, columns) of the training table.
traind.shape

**3. Visualization Of Data**

In [1]:
# Preview the first rows of the training table.
traind.head()

In [1]:
# Preview the first rows of the test table.
testd.head()

In [1]:
# Show the dtype and the number of non-null values of each training column.
traind.info()

In [1]:
# Count the null values in each column of both tables before any imputation.
train_null_counts = traind.isnull().sum()
test_null_counts = testd.isnull().sum()
print('Train columns with null values : {} \n '.format(train_null_counts))
print('Test columns with null values : {} '.format(test_null_counts))

**4. Filling the Missing Values**

In [1]:
# Summary statistics of the training columns.
traind.describe()

In [1]:
# Summary statistics of the test columns.
testd.describe()

In [1]:
# Impute missing values, each table from its own statistics:
#   Age and Fare -> column median, Embarked -> most frequent value.
#
# The result is assigned back to the column instead of calling
# `frame[col].fillna(..., inplace=True)`. That form is a chained assignment:
# it operates on an intermediate Series and, under pandas Copy-on-Write, leaves
# the DataFrame unchanged (the accompanying warning is suppressed by the global
# warnings filter in this notebook, so the failure would be silent).
for frame in (traind, testd):
    frame['Age'] = frame['Age'].fillna(frame['Age'].median())
    frame['Embarked'] = frame['Embarked'].fillna(frame['Embarked'].mode()[0])
    frame['Fare'] = frame['Fare'].fillna(frame['Fare'].median())

In [1]:
# Re-count the null values per column to confirm the effect of the imputation.
train_null_counts = traind.isnull().sum()
test_null_counts = testd.isnull().sum()
print('Train columns with null values : {} \n'.format(train_null_counts))
print('Test columns with null values : {} \n'.format(test_null_counts))

In [1]:
# Remove the Ticket and Cabin columns from both tables (in place).
drop_columns = ['Ticket', 'Cabin']
for frame in (traind, testd):
    frame.drop(columns=drop_columns, inplace=True)

In [1]:
# Training table after dropping Ticket and Cabin.
traind.head()

In [1]:
# Test table after dropping Ticket and Cabin.
testd.head()

**5. Feature Engineering**

In [1]:
# Derive the engineered columns on both tables:
#   FamilySize - SibSp + Parch + 1 (the passenger counts as one)
#   IsAlone    - 1 when FamilySize is not greater than 1, else 0
#   Title      - the text between ', ' and the first '.' in Name
#   AgeBand    - integer code (0-4) of the age interval the passenger falls in
#
# The five age intervals are computed once, from the training ages, and reused
# for the test table so that a given AgeBand code means the same age range in
# both. The outermost edges are opened to +/- infinity so that ages outside
# the training range still land in the first or last band instead of NaN.
_, age_edges = pd.cut(traind['Age'], 5, retbins=True)
age_edges[0], age_edges[-1] = -np.inf, np.inf

alltables = [traind, testd]
for table in alltables:
    table['FamilySize'] = table['SibSp'] + table['Parch'] + 1

    table['IsAlone'] = 1
    # Single .loc call on the frame itself; `table['IsAlone'].loc[...] = 0`
    # is chained indexing and may write to a temporary copy.
    table.loc[table['FamilySize'] > 1, 'IsAlone'] = 0

    table['Title'] = table['Name'].str.split(', ', expand = True)[1].str.split('.', expand = True)[0]

    table['AgeBand'] = pd.cut(table['Age'], bins = age_edges, labels = False)

# Collapse titles that occur fewer than `stat_min` times in the training table
# into a single 'Misc' category. Only traind is rewritten here; testd keeps
# its raw titles.
stat_min = 10
title_names = (traind['Title'].value_counts() < stat_min)
rare_titles = title_names[title_names].index
traind.loc[traind['Title'].isin(rare_titles), 'Title'] = 'Misc'
print(traind['Title'].value_counts())
print("----------")

traind.info()
testd.info()
traind.head()

**6. Visualization by the Graph**

In [1]:
# Bar chart of how many training passengers fall in each Survived class.
first_axes = plt.gca()
first_axes.set_title('1st Graph')
sns.countplot(data = traind, x = 'Survived', ax = first_axes)

In [1]:
fig, seas = plt.subplots(nrows = 2, ncols = 2, figsize = (16,12))

# One Survived count plot per categorical feature; the 2x2 grid is filled
# row by row in the order listed.
hue_columns = ['Embarked', 'IsAlone', 'Pclass', 'Sex']
for panel, hue_column in zip(seas.flat, hue_columns):
    sns.countplot(data = traind, x = 'Survived', hue = hue_column, ax = panel)

In [1]:
fig, (axis1, axis2, axis3) = plt.subplots(nrows = 1, ncols = 3, figsize = (14,12))

# Fare, Age and FamilySize per passenger class, split by survival.
sns.boxplot(data = traind, x = 'Pclass', y = 'Fare', hue = 'Survived', ax = axis1)
sns.violinplot(data = traind, x = 'Pclass', y = 'Age', hue = 'Survived', split = True, ax = axis2)
sns.boxplot(data = traind, x = 'Pclass', y = 'FamilySize', hue = 'Survived', ax = axis3)

axis1.set_title('Pclass VS. Fare')
axis2.set_title('Pclass VS. Age')
axis3.set_title('Pclass VS. FamilySize')

In [1]:
# Age density curves, one per Survived value, on a single wide facet.
max_age = traind['Age'].max()
a = sns.FacetGrid(traind, hue = 'Survived', aspect=4)
a.map(sns.kdeplot, 'Age', shade= True )
a.set(xlim=(0 , max_age))
a.add_legend()

In [1]:
# Heat map of the pairwise correlation between the numeric columns.
# The frame still holds string columns at this point (Name, Sex, Embarked,
# Title); restricting to numeric dtypes explicitly keeps this working on
# pandas versions where DataFrame.corr() no longer drops them silently.
plt.subplots(figsize = (14, 12))
correlation = traind.select_dtypes(include = [np.number]).corr()
sns.heatmap(correlation, annot = True, cmap = 'coolwarm')

**7. Convert To Categorical**

In [1]:

# One-hot encode the categorical training columns. For each column the
# indicator columns (prefixed with the column name) are appended on the right
# and the source column is removed, in the order listed.
for column in ['Sex', 'Embarked', 'Title', 'AgeBand', 'FamilySize', 'Pclass']:
    indicators = pd.get_dummies(traind[column], prefix=column)
    traind = pd.concat([traind, indicators], axis=1)
    traind.drop(column, axis=1, inplace=True)

Target = ['Survived']

# Name, Age and Fare are removed from both tables.
drop_columns = ['Name','Age','Fare']
for frame in (traind, testd):
    frame.drop(columns=drop_columns, inplace=True)

    

In [1]:
# Training table after one-hot encoding.
traind.head()

In [1]:
# One-hot encode the same categorical columns on the test table: indicator
# columns are appended on the right and the source column is removed, in the
# order listed.
for column in ['Sex', 'Embarked', 'Title', 'AgeBand', 'FamilySize', 'Pclass']:
    indicators = pd.get_dummies(testd[column], prefix=column)
    testd = pd.concat([testd, indicators], axis=1)
    testd.drop(column, axis=1, inplace=True)


In [1]:
# Test table after one-hot encoding.
testd.head()

In [1]:
# Compare the column lists of the two tables. After one-hot encoding, testd
# contains Title_* columns that traind does not have; the next cell reconciles
# the test columns with the training ones.
traind.info()
testd.info()

In [1]:
# Give testd exactly the feature columns of traind, in the same order.
#
# Dropping a hard-coded list of Title_* columns is not enough: rare titles are
# collapsed into 'Misc' only in the training table, so traind has a Title_Misc
# column that testd never gets, and any other column present in only one table
# would leave the two feature matrices misaligned when the model predicts.
feature_columns = traind.columns.drop('Survived')
known_title_columns = [
    column for column in feature_columns
    if column.startswith('Title_') and column != 'Title_Misc'
]

# Columns unknown to the training table are discarded; training columns that
# are missing from the test table are created and filled with 0.
testd = testd.reindex(columns=feature_columns, fill_value=0)

# A test passenger whose title matches none of the titles kept in training
# belongs to the 'Misc' bucket.
if 'Title_Misc' in feature_columns:
    testd['Title_Misc'] = (testd[known_title_columns].sum(axis=1) == 0).astype(int)

**8. Prepare Train And Test Data**

In [1]:
# Target vector is the Survived column; the feature matrix is every other column.
Y_t = traind['Survived']
X_t = traind.drop(columns=['Survived'])

In [1]:
# Hold out 30% of the labelled rows for evaluation. The shuffle is seeded so
# that the split, and therefore the accuracies reported below, are the same on
# every run of the notebook.
x_train, x_test, y_train, y_test = train_test_split(X_t ,Y_t, test_size = 0.3, random_state = 42)

In [1]:
# (rows, columns) of the training features.
x_train.shape

In [1]:
# (rows, columns) of the held-out features.
x_test.shape

In [1]:
# Number of training labels.
y_train.shape

In [1]:
# Number of held-out labels.
y_test.shape

**9. Run the Model**

In [1]:
# LogisticRegression
from sklearn.linear_model import LogisticRegression

# Fit on the training split, then score the predictions on both splits.
Model = LogisticRegression()
history = Model.fit(x_train, y_train)

train_predictions = Model.predict(x_train)
test_predictions = Model.predict(x_test)
train_acc = accuracy_score(y_train, train_predictions)
test_acc = accuracy_score(y_test, test_predictions)

print('train accuracy: '+str(train_acc))
print('test accuracy: '+str(test_acc))


In [1]:
# Predict a label for every row of the test table with the fitted model.
y_predict = Model.predict(testd)

In [1]:
# Pair each test PassengerId with its predicted label and write the result,
# with a header row and without the index, to submission.csv.
submission_columns = {
    'PassengerId': testd['PassengerId'],
    'Survived': y_predict,
}
out = pd.DataFrame(submission_columns)
out.to_csv('submission.csv', header=True, index=False)