Titanic dataset is one of the datasets available in sklearn.
You are given:
1. A training dataset CSV file containing both the X train and Y train data.
2. An X test file; you have to predict outcomes for this file and submit the predictions.

Your task is to:
1. Use Logistic Regression to come up with predictions.

### Imports needed

In [65]:
from sklearn import datasets
from sklearn.linear_model import LogisticRegression

import numpy as np
import pandas as pd


### Load training data 

In [66]:
# Read the training CSV (features plus the Survived label) and preview the first rows.
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,2,"Weisz, Mrs. Leopold (Mathilde Francoise Pede)",female,29.0,1,0,228414,26.0,,S,1
1,3,"Williams, Mr. Howard Hugh ""Harry""",male,,0,0,A/5 2466,8.05,,S,0
2,2,"Morley, Mr. Henry Samuel (""Mr Henry Marshall"")",male,39.0,0,0,250655,26.0,,S,0
3,3,"Palsson, Mrs. Nils (Alma Cornelia Berglund)",female,29.0,0,4,349909,21.075,,S,0
4,3,"Sutehall, Mr. Henry Jr",male,25.0,0,0,SOTON/OQ 392076,7.05,,S,0


### Load testing data

In [67]:
# Read the test CSV (same feature columns, no Survived label) and preview the first rows.
df_test = pd.read_csv('test.csv')
df_test.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,2,"Davies, Master. John Morgan Jr",male,8.0,1,1,C.A. 33112,36.75,,S
1,1,"Leader, Dr. Alice (Farnham)",female,49.0,0,0,17465,25.9292,D17,S
2,3,"Kilgannon, Mr. Thomas J",male,,0,0,36865,7.7375,,Q
3,2,"Jacobsohn, Mrs. Sidney Samuel (Amy Frances Chr...",female,24.0,2,1,243847,27.0,,S
4,1,"McGough, Mr. James Robert",male,36.0,0,0,PC 17473,26.2875,E25,S


## Clean our data
Our data contains a lot of missing values (NaN), and some columns are of string type. We need to fill or drop the missing values, convert the string columns we keep into numeric ones, and drop the unnecessary columns.

In [68]:
# Boolean mask of the training frame: True wherever a value is missing.
df.isnull()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,False,False,False,False,False,False,False,False,True,False,False
1,False,False,False,True,False,False,False,False,True,False,False
2,False,False,False,False,False,False,False,False,True,False,False
3,False,False,False,False,False,False,False,False,True,False,False
4,False,False,False,False,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...
663,False,False,False,False,False,False,False,False,True,False,False
664,False,False,False,True,False,False,False,False,True,False,False
665,False,False,False,False,False,False,False,False,True,False,False
666,False,False,False,False,False,False,False,False,True,False,False


In [69]:
# Boolean mask of the test frame: True wherever a value is missing.
df_test.isnull()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,False,False,False,False,False,False,False,False,True,False
1,False,False,False,False,False,False,False,False,False,False
2,False,False,False,True,False,False,False,False,True,False
3,False,False,False,False,False,False,False,False,True,False
4,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
218,False,False,False,False,False,False,False,False,True,False
219,False,False,False,False,False,False,False,False,False,False
220,False,False,False,False,False,False,False,False,False,False
221,False,False,False,False,False,False,False,False,True,False


### Check null value in columns of training data

In [70]:
# Per-column non-null counts and dtypes for the training frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 668 entries, 0 to 667
Data columns (total 11 columns):
Pclass      668 non-null int64
Name        668 non-null object
Sex         668 non-null object
Age         536 non-null float64
SibSp       668 non-null int64
Parch       668 non-null int64
Ticket      668 non-null object
Fare        668 non-null float64
Cabin       154 non-null object
Embarked    667 non-null object
Survived    668 non-null int64
dtypes: float64(2), int64(4), object(5)
memory usage: 44.4+ KB


### Check null value in columns of testing data

In [71]:
# Per-column non-null counts and dtypes for the test frame.
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 223 entries, 0 to 222
Data columns (total 10 columns):
Pclass      223 non-null int64
Name        223 non-null object
Sex         223 non-null object
Age         178 non-null float64
SibSp       223 non-null int64
Parch       223 non-null int64
Ticket      223 non-null object
Fare        223 non-null float64
Cabin       50 non-null object
Embarked    222 non-null object
dtypes: float64(2), int64(3), object(5)
memory usage: 13.1+ KB


In [72]:
# Number of training passengers in each passenger class (Pclass)
df['Pclass'].value_counts()

3    359
1    161
2    148
Name: Pclass, dtype: int64

### Fill missing Age in training data

In [73]:
# Observations from the info() summaries above:
#   1. Age has a few missing values. They can be filled with the overall mean Age,
#      or with the mean Age of the passenger's own Pclass.
#   2. Cabin contains so many null values that it can simply be dropped.
#   3. Embarked contains 1 null value.

# Split the training frame by passenger class (there are 3 classes) ...
df_p1, df_p2, df_p3 = (df[df['Pclass'] == pclass] for pclass in (1, 2, 3))

# ... and take the mean Age within each class.
p1_mean, p2_mean, p3_mean = (
    subset['Age'].mean() for subset in (df_p1, df_p2, df_p3)
)

print(p1_mean, p2_mean, p3_mean)

37.12623188405797 29.549492753623188 25.839423076923076


In [74]:
# Fill a missing Age with the mean Age of the passenger's class (training means)
def fill_null_age(cols):
    """Return the passenger's age, imputing a missing value from the class mean.

    Parameters
    ----------
    cols : pair ``(age, pclass)``. In this notebook it is one row of
        ``df[['Age', 'Pclass']]`` handed over by ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    ``age`` unchanged when it is not null; otherwise ``p1_mean`` for class 1,
    ``p2_mean`` for class 2 and ``p3_mean`` for any other class. The three
    means are read from module-level variables.
    """
    # Unpack by iteration rather than cols[0] / cols[1]: integer keys on a
    # label-indexed Series are a deprecated positional fallback in recent pandas.
    age, pclass = cols

    if not pd.isnull(age):
        return age
    if pclass == 1:
        return p1_mean
    if pclass == 2:
        return p2_mean
    return p3_mean
    

In [75]:
# Row-wise: keep an existing Age, otherwise substitute the mean Age of the row's Pclass.
df['Age'] = df[['Age','Pclass']].apply(fill_null_age , axis=1)

In [76]:
# Re-check the non-null counts after imputing Age.
df.info() # Now age column is completely filled

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 668 entries, 0 to 667
Data columns (total 11 columns):
Pclass      668 non-null int64
Name        668 non-null object
Sex         668 non-null object
Age         668 non-null float64
SibSp       668 non-null int64
Parch       668 non-null int64
Ticket      668 non-null object
Fare        668 non-null float64
Cabin       154 non-null object
Embarked    667 non-null object
Survived    668 non-null int64
dtypes: float64(2), int64(4), object(5)
memory usage: 44.4+ KB


### Fill missing Age in testing data

In [77]:
# Mean Age within each of the 3 passenger classes, computed on the test frame.
df_test_p1, df_test_p2, df_test_p3 = (
    df_test[df_test['Pclass'] == pclass] for pclass in (1, 2, 3)
)

p1_test_mean, p2_test_mean, p3_test_mean = (
    subset['Age'].mean() for subset in (df_test_p1, df_test_p2, df_test_p3)
)

print(p1_test_mean, p2_test_mean, p3_test_mean)

41.416666666666664 31.17142857142857 23.228105263157897


In [78]:
# Fill a missing Age with the mean Age of the passenger's class (test-set means)
def fill_null_test_age(cols):
    """Return the passenger's age, imputing a missing value from the test-set class mean.

    Parameters
    ----------
    cols : pair ``(age, pclass)``, e.g. one row of ``df_test[['Age', 'Pclass']]``
        handed over by ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    ``age`` unchanged when it is not null; otherwise ``p1_test_mean`` for
    class 1, ``p2_test_mean`` for class 2 and ``p3_test_mean`` for any other
    class. The three means are read from module-level variables.

    NOTE(review): no cell visible in this notebook calls this function; the
    test-set imputation cell applies ``fill_null_age`` instead.
    """
    # Unpack by iteration rather than cols[0] / cols[1]: integer keys on a
    # label-indexed Series are a deprecated positional fallback in recent pandas.
    age, pclass = cols

    if not pd.isnull(age):
        return age
    if pclass == 1:
        return p1_test_mean
    if pclass == 2:
        return p2_test_mean
    return p3_test_mean
    

In [79]:
# NOTE(review): this applies fill_null_age, which fills from the TRAINING class means
# (p1_mean / p2_mean / p3_mean), not fill_null_test_age defined in the previous cell,
# so the test-set means computed above are never used. Imputing test data with
# training statistics is the usual practice; confirm this is what was intended.
df_test['Age'] = df_test[['Age','Pclass']].apply(fill_null_age , axis=1)

In [80]:
# Re-check the non-null counts of the test frame after imputing Age.
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 223 entries, 0 to 222
Data columns (total 10 columns):
Pclass      223 non-null int64
Name        223 non-null object
Sex         223 non-null object
Age         223 non-null float64
SibSp       223 non-null int64
Parch       223 non-null int64
Ticket      223 non-null object
Fare        223 non-null float64
Cabin       50 non-null object
Embarked    222 non-null object
dtypes: float64(2), int64(3), object(5)
memory usage: 13.1+ KB


### Drop cabin from training data

In [81]:
# Drop the Cabin column: it is mostly missing (only 154 of 668 values present).
# Name the columns explicitly instead of axis=True, which only worked because True == 1.
df.drop(columns='Cabin', inplace=True)

# Embarked contains 1 null value, so drop that single row as well.
df.dropna(inplace=True)

# Now no null values are present in the training dataframe.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 667 entries, 0 to 667
Data columns (total 10 columns):
Pclass      667 non-null int64
Name        667 non-null object
Sex         667 non-null object
Age         667 non-null float64
SibSp       667 non-null int64
Parch       667 non-null int64
Ticket      667 non-null object
Fare        667 non-null float64
Embarked    667 non-null object
Survived    667 non-null int64
dtypes: float64(2), int64(4), object(4)
memory usage: 46.9+ KB


### Drop cabin from testing data

In [82]:
# Drop the Cabin column: it is mostly missing (only 50 of 223 values present).
# Name the columns explicitly instead of axis=True, which only worked because True == 1.
df_test.drop(columns='Cabin', inplace=True)

# Embarked contains 1 null value here too, but rows are deliberately NOT dropped
# from the test frame, so that it keeps all 223 rows to predict on.
# NOTE(review): that missing Embarked value is not imputed in this cell, so the
# test frame still contains one null after this step.
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 223 entries, 0 to 222
Data columns (total 9 columns):
Pclass      223 non-null int64
Name        223 non-null object
Sex         223 non-null object
Age         223 non-null float64
SibSp       223 non-null int64
Parch       223 non-null int64
Ticket      223 non-null object
Fare        223 non-null float64
Embarked    222 non-null object
dtypes: float64(2), int64(3), object(4)
memory usage: 12.3+ KB


#### Convert the Sex column into an isMale column (an isFemale column would work equally well). Sex contains two string values, male and female, and we need to turn them into 0's and 1's.
#### We do not need two separate columns, because each would just be the complement of the other. For example, if the training data happened to contain only males, isMale would be all 1's and isFemale all 0's, so the model could not predict accurately when female passengers appear at test time.

In [83]:
def get_numerical_gender(sex):
    """Encode a Sex value as a number: 'female' -> 0, 'male' -> 1.

    Any other value matches neither label and yields None.
    """
    # The position of the label in this tuple is its numeric code.
    for code, label in enumerate(('female', 'male')):
        if sex == label:
            return code
    return None

In [84]:
# Add the numeric isMale column to both the training and the testing frame.
for frame in (df, df_test):
    frame['isMale'] = frame['Sex'].apply(get_numerical_gender)

In [85]:
# Number of training passengers per port of embarkation.
df['Embarked'].value_counts()

S    484
C    133
Q     50
Name: Embarked, dtype: int64

In [86]:
# embarked - Port of Embarkation (C = Cherbourg; Q = Queenstown; S = Southampton)
# Embarked has 3 string values. Convert it into separate columns of 0's and 1's,
# dropping the first category (C) as the reference level so only Q and S remain.

# For training data
# dtype=int keeps the dummies as 0/1 integers whatever the pandas default dtype is.
embark = pd.get_dummies(df['Embarked'], drop_first=True, dtype=int)
print(embark)

# For testing data
# Encode every port present in the test file, then align to the training dummy
# columns. This guarantees the test features match the training features even if
# a port is absent from the test file (a separate drop_first would then drop a
# different category). A missing Embarked value becomes all zeros, i.e. it is
# encoded the same as the reference category.
embark_test = (
    pd.get_dummies(df_test['Embarked'], dtype=int)
    .reindex(columns=embark.columns, fill_value=0)
)
print(embark_test)

     Q  S
0    0  1
1    0  1
2    0  1
3    0  1
4    0  1
..  .. ..
663  0  1
664  1  0
665  0  1
666  0  1
667  1  0

[667 rows x 2 columns]
     Q  S
0    0  1
1    0  1
2    1  0
3    0  1
4    0  1
..  .. ..
218  0  1
219  0  1
220  0  0
221  0  1
222  0  1

[223 rows x 2 columns]


### Add new columns

In [87]:
# Append the Embarked dummy columns (Q and S) to the training frame.
# axis=1 places the frames side by side, matching rows by index label.
df = pd.concat([df,embark], axis=1)

# Append the Embarked dummy columns (Q and S) to the testing frame.
df_test = pd.concat([df_test,embark_test], axis=1)

### Drop Unnecessary columns

In [88]:
# Sex and Embarked now have numeric replacements (isMale, Q, S); Name and Ticket
# are not used as features. Remove all four from both the training and testing data.
unused_columns = ['Sex', 'Name', 'Embarked', 'Ticket']
for frame in (df, df_test):
    frame.drop(unused_columns, axis=1, inplace=True)

In [89]:
# Preview the fully numeric training frame.
df.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Survived,isMale,Q,S
0,2,29.0,1,0,26.0,1,0,0,1
1,3,25.839423,0,0,8.05,0,1,0,1
2,2,39.0,0,0,26.0,0,1,0,1
3,3,29.0,0,4,21.075,0,0,0,1
4,3,25.0,0,0,7.05,0,1,0,1


In [90]:
# Preview the fully numeric testing frame.
df_test.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,isMale,Q,S
0,2,8.0,1,1,36.75,1,0,1
1,1,49.0,0,0,25.9292,0,0,1
2,3,25.839423,0,0,7.7375,1,1,0
3,2,24.0,2,1,27.0,0,0,1
4,1,36.0,0,0,26.2875,1,0,1


## Get X_train and Y_train from df

In [91]:
# Labels: copy the Survived column into a NumPy array.
labels = df['Survived']
Y_train = np.array(labels)

# Features: everything that is left once Survived is removed from the frame.
df.drop(columns='Survived', inplace=True)
X_train = df.to_numpy(copy=True)

# Show the shapes of the feature matrix and the label vector.
for arr in (X_train, Y_train):
    print(arr.shape)

(667, 8)
(667,)


## X_test

In [92]:
# Convert the prepared test frame into a NumPy feature matrix.
X_test = df_test.to_numpy()

In [93]:
# Shape check: the column count should match X_train's.
X_test.shape

(223, 8)

## Train Model

In [94]:
# Fit a logistic-regression classifier (liblinear solver) on the training data.
clf = LogisticRegression(solver='liblinear')
clf.fit(X_train, Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [95]:
# Predict a class label for every row of X_test.
y_predictions = clf.predict(X_test)

In [96]:
# Reshape the predictions into a single column (one row per prediction).
y_predictions = y_predictions.reshape(-1,1)
print(y_predictions.shape)

(223, 1)


In [97]:
# Write one prediction per line. The predictions are integer class labels (0 / 1),
# so use an integer format: savetxt's default '%.18e' would write them in
# scientific notation, e.g. 1.000000000000000000e+00.
np.savetxt('predictions.csv', y_predictions, delimiter=',', fmt='%d')