In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/gender_submission.csv
/kaggle/input/titanic/test.csv


In [2]:
#Importing supporting libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
#Loading the training dataset
df_train = pd.read_csv('/kaggle/input/titanic/train.csv')
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
#Loading the testing dataset
df_test = pd.read_csv('/kaggle/input/titanic/test.csv')
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [5]:
#Analyze the available features
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [6]:
#Dropping the unimportant features from the dataset
df_train_2 = df_train.drop(columns = ['PassengerId','Name','Ticket','Cabin'])
df_train_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Sex         891 non-null object
Age         714 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Fare        891 non-null float64
Embarked    889 non-null object
dtypes: float64(2), int64(4), object(2)
memory usage: 55.8+ KB


In [7]:
#There are few categorical features in the dataset. We need to convert these features into numerics before applying
#the model
#We are going to handle these categorical features : Pclass, Sex, Embarked, using pandas get_dummies() method
#Since pandas get_dummies() only accepts 'object' data type, we first need to convert Pclass into object
df_train_2['Pclass'] = df_train_2['Pclass'].astype('object')
df_train_final = pd.get_dummies(df_train_2,drop_first=True) 

In [8]:
#Check the final dataset after converting object features into numerics
df_train_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
Survived      891 non-null int64
Age           714 non-null float64
SibSp         891 non-null int64
Parch         891 non-null int64
Fare          891 non-null float64
Pclass_2      891 non-null uint8
Pclass_3      891 non-null uint8
Sex_male      891 non-null uint8
Embarked_Q    891 non-null uint8
Embarked_S    891 non-null uint8
dtypes: float64(2), int64(3), uint8(5)
memory usage: 39.3 KB


In [9]:
#We see there are null values in feature 'Age'. We will handle this by replacing the null values by the mean age
df_train_final['Age'] = df_train_final['Age'].fillna(df_train_final['Age'].mean())
df_train_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
Survived      891 non-null int64
Age           891 non-null float64
SibSp         891 non-null int64
Parch         891 non-null int64
Fare          891 non-null float64
Pclass_2      891 non-null uint8
Pclass_3      891 non-null uint8
Sex_male      891 non-null uint8
Embarked_Q    891 non-null uint8
Embarked_S    891 non-null uint8
dtypes: float64(2), int64(3), uint8(5)
memory usage: 39.3 KB


In [10]:
#Splitting the dataset into X(independent features) and y(target/dependent feature)
X = df_train_final.drop(['Survived'],axis=1)
y = df_train_final['Survived']

In [11]:
#Using sklearn library to further split the dataset into training and validation subset.
#1. Training dataset will be used to train the model
#2. Validation dataset will be used to evaluate our model before submission
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=42)

In [12]:
#Using standard scaling on features to bring them on the same scale
from sklearn.preprocessing import StandardScaler
stdScaler = StandardScaler()

X_train = stdScaler.fit_transform(X_train)
X_test = stdScaler.fit_transform(X_test)

In [13]:
#Import Logistic Regression model from sklearn library
from sklearn.linear_model import LogisticRegression
logReg = LogisticRegression()

In [14]:
#Training the model using Training subset created earlier
logReg.fit(X_train,y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [15]:
#Evaluate the model using Validation subset
#The score is calculated based on the confusion matrix created in the classification model
logReg.score(X_test,y_test)
#print(score)

0.8134328358208955

In [16]:
#Applying data preprocessing on test data similar to what we did on the train data 
#Dropping the unimportant features from the dataset
df_test_2 = df_test.drop(columns = ['PassengerId','Name','Ticket','Cabin'])

#Convert categorical features into numerics
df_test_2['Pclass'] = df_test_2['Pclass'].astype('object')
df_test_final = pd.get_dummies(df_test_2,drop_first=True) 

#Fill missing values in Age column
df_test_final['Age'] = df_test_final['Age'].fillna(df_test_final['Age'].mean())
df_test_final['Fare'] = df_test_final['Fare'].fillna(df_test_final['Fare'].mean())


df_test_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 9 columns):
Age           418 non-null float64
SibSp         418 non-null int64
Parch         418 non-null int64
Fare          418 non-null float64
Pclass_2      418 non-null uint8
Pclass_3      418 non-null uint8
Sex_male      418 non-null uint8
Embarked_Q    418 non-null uint8
Embarked_S    418 non-null uint8
dtypes: float64(2), int64(2), uint8(5)
memory usage: 15.2 KB


In [17]:
#Getting predictions for test data and submitting the predictions
X = df_test_final
X_scaled = stdScaler.fit_transform(X)
predictions = logReg.predict(X_scaled)

output = pd.DataFrame({'PassengerId':df_test['PassengerId'],'Survived':predictions})
output.to_csv('my_titanic_submission',index=False)
print('Your submission was successfully saved')

Your submission was successfully saved
