In [1]:
# Importing necessary libraries
import numpy as np
import pandas as pd

In [2]:
# Importing the dataset of titanic and printing first 5 rows
titanic_data = pd.read_csv('C:/Users/Rahul/titanic/train.csv')
titanic_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Now we will check whether there is any null values in the dataset
titanic_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [4]:
# We can see, we have more than 177 passenger whose age is missing. Look also Cabin column, where we have missing 
# data of aroung 687 passenger.

# Now I'm going to drop some columns which is not much useful in predicting.
titanic_new = titanic_data.drop(['Name', 'Ticket', 'Cabin'], axis=1)

In [5]:
# looking at new Dataframe titanic_new.
titanic_new.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,male,22.0,1,0,7.25,S
1,2,1,1,female,38.0,1,0,71.2833,C
2,3,1,3,female,26.0,0,0,7.925,S
3,4,1,1,female,35.0,1,0,53.1,S
4,5,0,3,male,35.0,0,0,8.05,S


In [6]:
# I'm using SimpleImputer to fill all empty numerical column values with their mean values
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='mean')

In [7]:
# To do this, I need to keep only numerical columns
titanic_num = titanic_new.drop(['Embarked', 'Sex'], axis=1) # My new Dataframe of only numerical attributes

In [8]:
imputer.fit(titanic_num) # Now fitting new Dataframe titanic_num

SimpleImputer()

In [9]:
imputer.statistics_ # mean value of each column

array([4.46000000e+02, 3.83838384e-01, 2.30864198e+00, 2.96991176e+01,
       5.23007856e-01, 3.81593715e-01, 3.22042080e+01])

In [12]:
X = imputer.transform(titanic_num) # Transforming it finally and X is our new numerical dataframe without labeled columns 
X[:5, :] # First 5 rows with all columns with no label

array([[ 1.    ,  0.    ,  3.    , 22.    ,  1.    ,  0.    ,  7.25  ],
       [ 2.    ,  1.    ,  1.    , 38.    ,  1.    ,  0.    , 71.2833],
       [ 3.    ,  1.    ,  3.    , 26.    ,  0.    ,  0.    ,  7.925 ],
       [ 4.    ,  1.    ,  1.    , 35.    ,  1.    ,  0.    , 53.1   ],
       [ 5.    ,  0.    ,  3.    , 35.    ,  0.    ,  0.    ,  8.05  ]])

In [13]:
titanic_transform = pd.DataFrame(X, columns=titanic_num.columns, index=titanic_num.index) # Now giving it back to old shape with labeled columns

In [15]:
titanic_transform.head() # Back to old shape

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
0,1.0,0.0,3.0,22.0,1.0,0.0,7.25
1,2.0,1.0,1.0,38.0,1.0,0.0,71.2833
2,3.0,1.0,3.0,26.0,0.0,0.0,7.925
3,4.0,1.0,1.0,35.0,1.0,0.0,53.1
4,5.0,0.0,3.0,35.0,0.0,0.0,8.05


In [16]:
titanic_transform.info() # Now we don't have any null values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    float64
 1   Survived     891 non-null    float64
 2   Pclass       891 non-null    float64
 3   Age          891 non-null    float64
 4   SibSp        891 non-null    float64
 5   Parch        891 non-null    float64
 6   Fare         891 non-null    float64
dtypes: float64(7)
memory usage: 48.9 KB


In [39]:
# Now divide your data into predictor and prediction... here Survivor column is prediction and rest of all are predictors
X = titanic_transform.drop('Survived', axis=1)
y = titanic_transform['Survived'].copy() # It will give 1-D array

In [36]:
X.head() # Predictors(Independant variables)

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
0,1.0,3.0,22.0,1.0,0.0,7.25
1,2.0,1.0,38.0,1.0,0.0,71.2833
2,3.0,3.0,26.0,0.0,0.0,7.925
3,4.0,1.0,35.0,1.0,0.0,53.1
4,5.0,3.0,35.0,0.0,0.0,8.05


In [40]:
y # Prediction (Dependant variable)

0      0.0
1      1.0
2      1.0
3      1.0
4      0.0
      ... 
886    0.0
887    1.0
888    0.0
889    1.0
890    0.0
Name: Survived, Length: 891, dtype: float64

In [50]:
# Now Dividing data into train and test part
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42) # Choosing test size 20% of our train data

In [51]:
# Using Logistic regression method, we are training our model
from sklearn.linear_model import LogisticRegression
LR = LogisticRegression(max_iter=200,random_state=42).fit(X_train, y_train) # Choosing random state so that our train and test data don't get mixed

In [52]:
LR.predict(X_test) # Predicting test data which we kept for testing

array([0., 0., 0., 1., 0., 1., 0., 0., 0., 1., 1., 0., 0., 0., 0., 1., 1.,
       0., 0., 1., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0.,
       0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
       1., 0., 1., 0., 1., 0., 1., 0., 0., 0., 1., 0., 1., 0., 0., 1., 0.,
       0., 0., 0., 1., 1., 1., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0.,
       0., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       1., 0., 0., 0., 0., 1., 0., 1., 1., 0., 1., 1., 1., 0., 0., 0., 0.,
       0., 0., 1., 0., 1., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
       1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 1., 0., 0.,
       1., 1., 1., 0., 0., 0., 0., 1., 0.])

In [53]:
# Let's check the score 
LR.score(X_test, y_test) # It gives 73.18% accuracy

0.7318435754189944

In [54]:
# okay let's try Logistic regression CV once
from sklearn.linear_model import LogisticRegressionCV
LR_CV = LogisticRegressionCV(cv=5, max_iter=300).fit(X_train, y_train) # choosing cross validate = 5 fold

In [55]:
LR_CV.predict(X_test)

array([0., 0., 0., 1., 0., 1., 0., 0., 0., 1., 1., 0., 0., 0., 0., 1., 0.,
       0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
       0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
       1., 0., 1., 0., 1., 0., 1., 0., 0., 0., 1., 0., 1., 0., 0., 1., 0.,
       0., 0., 0., 1., 1., 1., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0.,
       0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       1., 0., 0., 0., 0., 0., 0., 1., 1., 0., 1., 1., 1., 0., 0., 0., 0.,
       0., 0., 1., 0., 1., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
       1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 1., 0., 0.,
       1., 1., 1., 0., 0., 0., 0., 0., 0.])

In [87]:
LR_CV.score(X_test, y_test) # Not much difference

0.7430167597765364

<h3>Now we will add Sex column which we had not included yet</h3>

In [56]:
# Now we will transform our main column of Sex which will play a role to increase our accuracy. We need to convert it into 
# numerical attribute using Ordinalencoding.
from sklearn.preprocessing import OrdinalEncoder

In [58]:
ord_encoder = OrdinalEncoder()
titanic_cat_sex = np.c_[titanic_new['Sex']] # It will give us 2-D array

In [71]:
sex_encoded = ord_encoder.fit_transform(titanic_cat_sex)
sex_encoded[:5]

array([[1.],
       [0.],
       [0.],
       [0.],
       [1.]])

In [76]:
titanic_transform['Sex'] = sex_encoded             #   and adding new numerical sex attribute

In [77]:
titanic_transform.head() # Final transformed data

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Sex
0,1.0,0.0,3.0,22.0,1.0,0.0,7.25,1.0
1,2.0,1.0,1.0,38.0,1.0,0.0,71.2833,0.0
2,3.0,1.0,3.0,26.0,0.0,0.0,7.925,0.0
3,4.0,1.0,1.0,35.0,1.0,0.0,53.1,0.0
4,5.0,0.0,3.0,35.0,0.0,0.0,8.05,1.0


In [78]:
titanic_transform.info() # Checking again for null values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    float64
 1   Survived     891 non-null    float64
 2   Pclass       891 non-null    float64
 3   Age          891 non-null    float64
 4   SibSp        891 non-null    float64
 5   Parch        891 non-null    float64
 6   Fare         891 non-null    float64
 7   Sex          891 non-null    float64
dtypes: float64(8)
memory usage: 55.8 KB


In [79]:
# Again dividing data into predictor and prediction
X_new = titanic_transform.drop(['Survived'], axis=1)
y_new = titanic_transform['Survived'].copy()

In [82]:
# Once again, Training and testing
X_train_new, X_test_new, y_train_new, y_test_new = train_test_split(X_new, y_new, test_size=0.30, random_state=42)
X_train_new.head()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Sex
445,446.0,1.0,4.0,0.0,2.0,81.8583,1.0
650,651.0,3.0,29.699118,0.0,0.0,7.8958,1.0
172,173.0,3.0,1.0,1.0,1.0,11.1333,0.0
450,451.0,2.0,36.0,1.0,2.0,27.75,1.0
314,315.0,2.0,43.0,1.0,1.0,26.25,1.0


In [85]:
# Checking once again prediction after adding Sex attribute
LR = LogisticRegression(max_iter=200).fit(X_train_new, y_train_new)
LR.predict(X_test_new)

array([0., 0., 0., 1., 1., 1., 1., 0., 1., 1., 0., 0., 0., 0., 0., 1., 0.,
       1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 1.,
       0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
       1., 0., 1., 0., 1., 0., 1., 1., 1., 0., 1., 1., 0., 0., 1., 0., 0.,
       0., 1., 1., 1., 1., 1., 0., 0., 1., 1., 1., 0., 0., 1., 1., 0., 0.,
       0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
       1., 0., 0., 0., 1., 0., 0., 0., 1., 0., 1., 0., 1., 0., 0., 0., 1.,
       0., 0., 1., 1., 0., 0., 0., 1., 1., 1., 0., 1., 0., 0., 1., 0., 1.,
       1., 0., 0., 1., 0., 1., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0.,
       1., 0., 0., 0., 1., 1., 1., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0.,
       0., 1., 0., 1., 0., 0., 0., 1., 1., 0., 0., 0., 0., 1., 1., 0., 0.,
       0., 0., 1., 0., 0., 0., 0., 1., 1., 1., 0., 1., 0., 0., 1., 1., 0.,
       0., 1., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 1., 0., 1., 0., 0.,
       1., 0., 0., 0., 1.

In [86]:
LR.score(X_test_new, y_test_new) # Decent change in prediction

0.8097014925373134

In [88]:
test_data = pd.read_csv('C:/Users/Rahul/titanic/test.csv') # Now importing actual test set so that you can predict the survivors
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [89]:
# Again fixing the dataframe for fitting into the model for prediction
test_data = test_data.drop(['Name', 'Ticket','Cabin', 'Embarked'], axis=1)

In [92]:
# Fixing sex column by transforming it using ordinalencoder
sex_cat_test_data = np.c_[test_data['Sex']]
test_data_sex_encoded = ord_encoder.fit_transform(sex_cat_test_data)

In [93]:
test_data_sex_encoded[:5]

array([[1.],
       [0.],
       [1.],
       [1.],
       [0.]])

In [94]:
test_data_num_attr = test_data.drop(['Sex', 'PassengerId'], axis=1)
test_PId = test_data['PassengerId'].copy()
test_data_num_attr

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare
0,3,34.5,0,0,7.8292
1,3,47.0,1,0,7.0000
2,2,62.0,0,0,9.6875
3,3,27.0,0,0,8.6625
4,3,22.0,1,1,12.2875
...,...,...,...,...,...
413,3,,0,0,8.0500
414,1,39.0,0,0,108.9000
415,3,38.5,0,0,7.2500
416,3,,0,0,8.0500


In [95]:
# Fixing nan values
test_data_imputer = imputer.fit_transform(test_data_num_attr)
test_data_imputer

array([[ 3.        , 34.5       ,  0.        ,  0.        ,  7.8292    ],
       [ 3.        , 47.        ,  1.        ,  0.        ,  7.        ],
       [ 2.        , 62.        ,  0.        ,  0.        ,  9.6875    ],
       ...,
       [ 3.        , 38.5       ,  0.        ,  0.        ,  7.25      ],
       [ 3.        , 30.27259036,  0.        ,  0.        ,  8.05      ],
       [ 3.        , 30.27259036,  1.        ,  1.        , 22.3583    ]])

In [99]:
# Transforming back into dataframe
test_data_imputer = pd.DataFrame(test_data_imputer, columns=test_data_num_attr.columns, index=test_data_num_attr.index)
test_data_imputer.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare
0,3.0,34.5,0.0,0.0,7.8292
1,3.0,47.0,1.0,0.0,7.0
2,2.0,62.0,0.0,0.0,9.6875
3,3.0,27.0,0.0,0.0,8.6625
4,3.0,22.0,1.0,1.0,12.2875


In [110]:
# Adding sex , passengerid column and then rearranging columns in exact position like we had trained data in the model
test_data_imputer['Sex'] = test_data_sex_encoded
test_data_imputer['PassengerId'] = test_PId
test_data_imputer = test_data_imputer[['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex']]
test_data_imputer.head()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Sex
0,892,3.0,34.5,0.0,0.0,7.8292,1.0
1,893,3.0,47.0,1.0,0.0,7.0,0.0
2,894,2.0,62.0,0.0,0.0,9.6875,1.0
3,895,3.0,27.0,0.0,0.0,8.6625,1.0
4,896,3.0,22.0,1.0,1.0,12.2875,0.0


In [103]:
# Now prediciton time
test_survived = LR.predict(test_data_imputer)
test_survived[:20]

array([0., 0., 0., 0., 1., 0., 1., 0., 1., 0., 0., 0., 1., 0., 1., 1., 0.,
       0., 1., 1.])

In [104]:
# Adding Survived column to the dataframe for submission
test_data_imputer['Survived'] = test_survived

In [105]:
# making survived column to int form for submission because float value will cause error
test_data_imputer['Survived'] = test_data_imputer['Survived'].astype(int)
test_data_imputer.head()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Sex,Survived
0,892,3.0,34.5,0.0,0.0,7.8292,1.0,0
1,893,3.0,47.0,1.0,0.0,7.0,0.0,0
2,894,2.0,62.0,0.0,0.0,9.6875,1.0,0
3,895,3.0,27.0,0.0,0.0,8.6625,1.0,0
4,896,3.0,22.0,1.0,1.0,12.2875,0.0,1


In [107]:
# final data
final_test_data = test_data_imputer.drop(['Pclass', 'Age', 'SibSp', 'Parch', 'Sex', 'Fare'], axis=1)
final_test_data.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [108]:
# Let's save it and don't forget to set index=False while
# saving it because it cause error while submission(As it will save indexing and making it 3 columns)
final_test_data.to_csv('Kaggle_titanic_submission.csv', index=False)
