# Titanic DataSet Analysis

## Task- Performing Data Cleaning, and analysis on "Titanic" dataset

## Performing Data Cleaning and Analysis
#### 1. Understanding meaning of each column:
<br>Data Dictionary:
<br>**Variable - Description**
1. Survived	- Survived (1) or died (0)
2. Pclass -	Passenger’s class (1 = 1st, 2 = 2nd, 3 = 3rd)
3. Name	- Passenger’s name
4. Sex -	Passenger’s sex
5. Age	- Passenger’s age
6. SibSp -	Number of siblings/spouses aboard
7. Parch -	Number of parents/children aboard (Some children travelled only with a nanny, therefore parch=0 for them.)
8. Ticket -	Ticket number
9. Fare -	Fare
10. Cabin -	Cabin
11. Embarked -	Port of embarkation (C = Cherbourg, Q = Queenstown, S = Southampton)

#### 2. Analysing which columns are completely useless in predicting the survival and deleting them
**Note** - Don't delete a column just because you do not find it useful. Our focus is not on deleting columns. Our focus is on analysing how each column affects the prediction, and, based on that, deciding whether to keep the column, delete it, or fill its null values (and if so, with what values).


In [1]:
import numpy as np
from sklearn.linear_model import LogisticRegression
import pandas as pd
from sklearn import preprocessing

In [2]:
# Load the training split (features plus the Survived label) and the test
# split (features only) from comma-separated files.
train_data = pd.read_csv("training_titanic_x_y_train.csv", sep=",")
test_data = pd.read_csv("test_titanic_x_test.csv", sep=",")

In [3]:
# Preview the first rows of the test set (head() shows 5 rows by default).
test_data.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,2,"Davies, Master. John Morgan Jr",male,8.0,1,1,C.A. 33112,36.75,,S
1,1,"Leader, Dr. Alice (Farnham)",female,49.0,0,0,17465,25.9292,D17,S
2,3,"Kilgannon, Mr. Thomas J",male,,0,0,36865,7.7375,,Q
3,2,"Jacobsohn, Mrs. Sidney Samuel (Amy Frances Chr...",female,24.0,2,1,243847,27.0,,S
4,1,"McGough, Mr. James Robert",male,36.0,0,0,PC 17473,26.2875,E25,S


In [4]:
# Preview the first rows of the training set, which also carries the Survived label.
train_data.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,2,"Weisz, Mrs. Leopold (Mathilde Francoise Pede)",female,29.0,1,0,228414,26.0,,S,1
1,3,"Williams, Mr. Howard Hugh ""Harry""",male,,0,0,A/5 2466,8.05,,S,0
2,2,"Morley, Mr. Henry Samuel (""Mr Henry Marshall"")",male,39.0,0,0,250655,26.0,,S,0
3,3,"Palsson, Mrs. Nils (Alma Cornelia Berglund)",female,29.0,0,4,349909,21.075,,S,0
4,3,"Sutehall, Mr. Henry Jr",male,25.0,0,0,SOTON/OQ 392076,7.05,,S,0


In [5]:
# Remove the free-text Name column from both frames.
train_data.drop(columns=['Name'], inplace=True)
test_data.drop(columns=["Name"], inplace=True)

In [6]:
# Remove the Ticket column from both frames.
train_data.drop(columns=['Ticket'], inplace=True)
test_data.drop(columns=['Ticket'], inplace=True)

In [7]:
# Inspect the training frame's first rows to check the remaining columns.
train_data.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Survived
0,2,female,29.0,1,0,26.0,,S,1
1,3,male,,0,0,8.05,,S,0
2,2,male,39.0,0,0,26.0,,S,0
3,3,female,29.0,0,4,21.075,,S,0
4,3,male,25.0,0,0,7.05,,S,0


In [10]:
# Inspect the test frame's first rows to check the remaining columns.
test_data.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,2,male,8.0,1,1,36.75,N,S
1,1,female,49.0,0,0,25.9292,D17,S
2,3,male,,0,0,7.7375,N,Q
3,2,female,24.0,2,1,27.0,N,S
4,1,male,36.0,0,0,26.2875,E25,S


In [8]:
# Replace missing Cabin values with the placeholder "N" in both frames.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the attribute-accessed Series: that chained in-place
# form is deprecated in pandas 2.x and leaves the frame unchanged under
# Copy-on-Write.
train_data["Cabin"] = train_data["Cabin"].fillna("N")
test_data["Cabin"] = test_data["Cabin"].fillna("N")

In [9]:
# Preview the training frame to check the Cabin column.
train_data.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Survived
0,2,female,29.0,1,0,26.0,N,S,1
1,3,male,,0,0,8.05,N,S,0
2,2,male,39.0,0,0,26.0,N,S,0
3,3,female,29.0,0,4,21.075,N,S,0
4,3,male,25.0,0,0,7.05,N,S,0


In [11]:
# Preview the test frame to check the Cabin column.
test_data.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,2,male,8.0,1,1,36.75,N,S
1,1,female,49.0,0,0,25.9292,D17,S
2,3,male,,0,0,7.7375,N,Q
3,2,female,24.0,2,1,27.0,N,S
4,1,male,36.0,0,0,26.2875,E25,S


In [12]:
# Reduce every Cabin value to its first character (presumably the deck
# letter); a one-character value such as "N" is left unchanged.
train_data["Cabin"] = train_data["Cabin"].map(lambda cabin: cabin[0])
test_data["Cabin"] = test_data["Cabin"].map(lambda cabin: cabin[0])

In [13]:
# Preview the training frame's first rows.
train_data.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Survived
0,2,female,29.0,1,0,26.0,N,S,1
1,3,male,,0,0,8.05,N,S,0
2,2,male,39.0,0,0,26.0,N,S,0
3,3,female,29.0,0,4,21.075,N,S,0
4,3,male,25.0,0,0,7.05,N,S,0


In [14]:
# Preview the test frame's first rows.
test_data.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,2,male,8.0,1,1,36.75,N,S
1,1,female,49.0,0,0,25.9292,D,S
2,3,male,,0,0,7.7375,N,Q
3,2,female,24.0,2,1,27.0,N,S
4,1,male,36.0,0,0,26.2875,E,S


In [16]:
# Encode Sex as an integer: 0 for "female", 1 for every other value.
train_data['Sex'] = train_data['Sex'].ne("female").astype("int64")
test_data['Sex'] = test_data['Sex'].ne("female").astype("int64")

In [15]:
# Preview the training frame's first rows.
train_data.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Survived
0,2,female,29.0,1,0,26.0,N,S,1
1,3,male,,0,0,8.05,N,S,0
2,2,male,39.0,0,0,26.0,N,S,0
3,3,female,29.0,0,4,21.075,N,S,0
4,3,male,25.0,0,0,7.05,N,S,0


In [18]:
# Binary flag: 0 when Cabin is exactly 'N', 1 for any other value.
train_data["has_cabin"] = train_data["Cabin"].ne('N').astype("int64")
test_data["has_cabin"] = test_data["Cabin"].ne('N').astype("int64")

In [17]:
# Preview the test frame's first rows.
test_data.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,2,1,8.0,1,1,36.75,N,S
1,1,0,49.0,0,0,25.9292,D,S
2,3,1,,0,0,7.7375,N,Q
3,2,0,24.0,2,1,27.0,N,S
4,1,1,36.0,0,0,26.2875,E,S


In [23]:
## Derive a binary "child" feature from Age: 1 when Age is below 16, else 0.
## A missing Age compares as False, so those rows get 0.
train_data['child'] = train_data['Age'].lt(16).astype("int64")
test_data['child'] = test_data['Age'].lt(16).astype("int64")

In [22]:
# Preview the training frame's first rows.
train_data.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Survived,has_cabin,family_size
0,2,0,29.0,1,0,26.0,N,S,1,0,2
1,3,1,,0,0,8.05,N,S,0,0,1
2,2,1,39.0,0,0,26.0,N,S,0,0,1
3,3,0,29.0,0,4,21.075,N,S,0,0,5
4,3,1,25.0,0,0,7.05,N,S,0,0,1


In [21]:
# Family size = siblings/spouses + parents/children + 1 (the extra 1
# presumably counts the passenger themselves).
train_data['family_size'] = train_data['SibSp'].add(train_data['Parch']).add(1)
test_data['family_size'] = test_data['SibSp'].add(test_data['Parch']).add(1)

In [20]:
# Preview the training frame's first rows.
train_data.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Survived,has_cabin
0,2,0,29.0,1,0,26.0,N,S,1,0
1,3,1,,0,0,8.05,N,S,0,0
2,2,1,39.0,0,0,26.0,N,S,0,0
3,3,0,29.0,0,4,21.075,N,S,0,0
4,3,1,25.0,0,0,7.05,N,S,0,0


In [19]:
def family_group(size):
    """Bucket a family size into 'loner', 'small' or 'large'.

    A size of at most 1 is 'loner' and a size of at most 4 is 'small'.
    Anything that satisfies neither comparison (larger values, or NaN,
    which compares False) is 'large'.
    """
    for upper_bound, label in ((1, 'loner'), (4, 'small')):
        if size <= upper_bound:
            return label
    return 'large'

In [25]:
# Label each passenger's family size with family_group().
train_data['family_group'] = train_data['family_size'].apply(family_group)
test_data['family_group'] = test_data['family_size'].apply(family_group)

In [24]:
# Preview the training frame's first rows.
train_data.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Survived,has_cabin,family_size,child
0,2,0,29.0,1,0,26.0,N,S,1,0,2,0
1,3,1,,0,0,8.05,N,S,0,0,1,0
2,2,1,39.0,0,0,26.0,N,S,0,0,1,0
3,3,0,29.0,0,4,21.075,N,S,0,0,5,0
4,3,1,25.0,0,0,7.05,N,S,0,0,1,0


In [26]:
# Binary flag: 1 when family_size is below 2, otherwise 0.
train_data['is_alone'] = train_data['family_size'].lt(2).astype("int64")
test_data['is_alone'] = test_data['family_size'].lt(2).astype("int64")

In [28]:
# Preview the training frame's first rows.
train_data.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Survived,has_cabin,family_size,child,family_group,is_alone
0,2,0,29.0,1,0,26.0,N,S,1,0,2,0,small,0
1,3,1,,0,0,8.05,N,S,0,0,1,0,loner,1
2,2,1,39.0,0,0,26.0,N,S,0,0,1,0,loner,1
3,3,0,29.0,0,4,21.075,N,S,0,0,5,0,large,0
4,3,1,25.0,0,0,7.05,N,S,0,0,1,0,loner,1


In [29]:
# Per-person fare: the Fare value divided by the family size.
train_data['calculated_fare'] = train_data['Fare'].div(train_data['family_size'])
test_data['calculated_fare'] = test_data['Fare'].div(test_data['family_size'])

In [30]:
# Preview the training frame's first rows.
train_data.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Survived,has_cabin,family_size,child,family_group,is_alone,calculated_fare
0,2,0,29.0,1,0,26.0,N,S,1,0,2,0,small,0,13.0
1,3,1,,0,0,8.05,N,S,0,0,1,0,loner,1,8.05
2,2,1,39.0,0,0,26.0,N,S,0,0,1,0,loner,1,26.0
3,3,0,29.0,0,4,21.075,N,S,0,0,5,0,large,0,4.215
4,3,1,25.0,0,0,7.05,N,S,0,0,1,0,loner,1,7.05


In [31]:
def fare_group(fare):
    """Bucket a fare into one of five labelled bands.

    The bands are checked in ascending order with inclusive upper bounds:
    <= 4 'Very_low', <= 10 'low', <= 20 'mid', <= 45 'high'. Any value that
    matches none of them (larger fares, or NaN, which compares False) is
    labelled "very_high".
    """
    bands = (
        (4, 'Very_low'),
        (10, 'low'),
        (20, 'mid'),
        (45, 'high'),
    )
    for upper_bound, label in bands:
        if fare <= upper_bound:
            return label
    return "very_high"

In [32]:
# Label each passenger's per-person fare with fare_group().
train_data['fare_group'] = train_data['calculated_fare'].apply(fare_group)
test_data['fare_group'] = test_data['calculated_fare'].apply(fare_group)

In [33]:
# Preview the training frame's first rows.
train_data.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Survived,has_cabin,family_size,child,family_group,is_alone,calculated_fare,fare_group
0,2,0,29.0,1,0,26.0,N,S,1,0,2,0,small,0,13.0,mid
1,3,1,,0,0,8.05,N,S,0,0,1,0,loner,1,8.05,low
2,2,1,39.0,0,0,26.0,N,S,0,0,1,0,loner,1,26.0,high
3,3,0,29.0,0,4,21.075,N,S,0,0,5,0,large,0,4.215,low
4,3,1,25.0,0,0,7.05,N,S,0,0,1,0,loner,1,7.05,low


In [34]:
# One-hot encode the categorical columns; drop_first=True omits the first
# category of each column so it acts as the baseline.
# NOTE(review): the two frames are encoded independently, so a category that
# occurs in only one of them yields a dummy column in only that frame. The
# resulting column sets must be reconciled before the model is used.
train_data = pd.get_dummies(train_data, columns=["Pclass", 'Cabin','Embarked', 'family_group', 'fare_group'], drop_first=True)
test_data = pd.get_dummies(test_data, columns=["Pclass",'Cabin','Embarked', 'family_group', 'fare_group'], drop_first=True)

In [35]:
# Preview the training frame's first rows.
train_data.head()

Unnamed: 0,Sex,Age,SibSp,Parch,Fare,Survived,has_cabin,family_size,child,is_alone,...,Cabin_N,Cabin_T,Embarked_Q,Embarked_S,family_group_loner,family_group_small,fare_group_high,fare_group_low,fare_group_mid,fare_group_very_high
0,0,29.0,1,0,26.0,1,0,2,0,0,...,1,0,0,1,0,1,0,0,1,0
1,1,,0,0,8.05,0,0,1,0,1,...,1,0,0,1,1,0,0,1,0,0
2,1,39.0,0,0,26.0,0,0,1,0,1,...,1,0,0,1,1,0,1,0,0,0
3,0,29.0,0,4,21.075,0,0,5,0,0,...,1,0,0,1,0,0,0,1,0,0
4,1,25.0,0,0,7.05,0,0,1,0,1,...,1,0,0,1,1,0,0,1,0,0


In [36]:
# Show the current column labels of the training frame.
train_data.columns

Index(['Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Survived', 'has_cabin',
       'family_size', 'child', 'is_alone', 'calculated_fare', 'Pclass_2',
       'Pclass_3', 'Cabin_B', 'Cabin_C', 'Cabin_D', 'Cabin_E', 'Cabin_F',
       'Cabin_G', 'Cabin_N', 'Cabin_T', 'Embarked_Q', 'Embarked_S',
       'family_group_loner', 'family_group_small', 'fare_group_high',
       'fare_group_low', 'fare_group_mid', 'fare_group_very_high'],
      dtype='object')

In [37]:
# Preview the test frame's first rows.
test_data.head()

Unnamed: 0,Sex,Age,SibSp,Parch,Fare,has_cabin,family_size,child,is_alone,calculated_fare,...,Cabin_F,Cabin_N,Embarked_Q,Embarked_S,family_group_loner,family_group_small,fare_group_high,fare_group_low,fare_group_mid,fare_group_very_high
0,1,8.0,1,1,36.75,0,3,1,0,12.25,...,0,1,0,1,0,1,0,0,1,0
1,0,49.0,0,0,25.9292,1,1,0,1,25.9292,...,0,0,0,1,1,0,1,0,0,0
2,1,,0,0,7.7375,0,1,0,1,7.7375,...,0,1,1,0,1,0,0,1,0,0
3,0,24.0,2,1,27.0,0,4,0,0,6.75,...,0,1,0,1,0,1,0,1,0,0
4,1,36.0,0,0,26.2875,1,1,0,1,26.2875,...,0,0,0,1,1,0,1,0,0,0


In [38]:
# Show the current column labels of the test frame, for comparison with the training frame.
test_data.columns

Index(['Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'has_cabin', 'family_size',
       'child', 'is_alone', 'calculated_fare', 'Pclass_2', 'Pclass_3',
       'Cabin_B', 'Cabin_C', 'Cabin_D', 'Cabin_E', 'Cabin_F', 'Cabin_N',
       'Embarked_Q', 'Embarked_S', 'family_group_loner', 'family_group_small',
       'fare_group_high', 'fare_group_low', 'fare_group_mid',
       'fare_group_very_high'],
      dtype='object')

In [39]:
# Drop family_size and Fare from both frames, plus the Cabin_T and Cabin_G
# dummy columns from the training frame only.
train_data.drop(columns=['Cabin_T', 'Cabin_G', 'family_size', 'Fare'], inplace=True)
test_data.drop(columns=['family_size', "Fare"], inplace=True)

In [40]:
# Move the Age column to position 0 of the training frame:
# pop() removes the column and returns it, insert() puts it back first.
front = train_data.pop('Age')
train_data.insert(0, 'Age', front)

In [41]:
# Move the Age column to position 0 of the test frame:
# pop() removes the column and returns it, insert() puts it back first.
front = test_data.pop('Age')
test_data.insert(0, 'Age', front)

In [42]:
# Impute missing ages with the mean Age of the same frame.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the attribute-accessed Series: that chained in-place
# form is deprecated in pandas 2.x and leaves the frame unchanged under
# Copy-on-Write.
train_data["Age"] = train_data["Age"].fillna(train_data["Age"].mean())
test_data["Age"] = test_data["Age"].fillna(test_data["Age"].mean())

In [43]:
def age_group_fun(age):
    """Bucket an age into a labelled life-stage band.

    The bands are checked in ascending order with inclusive upper bounds
    (1, 4, 13, 18, 35, 45, 55, 65). Any value that matches none of them
    (older ages, or NaN, which compares False) is labelled 'old'.
    """
    bands = (
        (1, 'infant'),
        (4, 'toddler'),
        (13, 'child'),
        (18, 'teenager'),
        (35, 'Young_Adult'),
        (45, 'adult'),
        (55, 'middle_aged'),
        (65, 'senior_citizen'),
    )
    for upper_bound, label in bands:
        if age <= upper_bound:
            return label
    return 'old'

In [44]:
# Label each passenger's age with age_group_fun().
train_data['age_group'] = train_data['Age'].apply(age_group_fun)
test_data['age_group'] = test_data['Age'].apply(age_group_fun)

In [45]:
# One-hot encode the age bucket. The training frame defines the feature set:
# drop_first=True omits its first category so it acts as the baseline.
train_data = pd.get_dummies(train_data, columns=['age_group'], drop_first=True)
# Encode the test frame with every category kept, then force it onto the
# training feature columns (same names, same order, without the label).
# Encoding each frame on its own with drop_first=True can drop a different
# baseline or produce different columns when a bucket occurs in only one
# frame, which would make the model see mismatched features at predict time.
# Columns the test set lacks are filled with 0; columns unknown to the
# training frame are discarded.
test_data = pd.get_dummies(test_data, columns=['age_group'])
test_data = test_data.reindex(columns=train_data.columns.drop('Survived'), fill_value=0)

In [46]:
# Separate the training frame into the feature matrix and the Survived label.
X_train = train_data.drop(columns=['Survived'])
Y_train = train_data.Survived

In [47]:
# Fit a logistic-regression classifier with default hyper-parameters on the
# training features and labels. LogisticRegression is already imported in the
# first cell; the repeated import here is harmless.
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
clf.fit(X_train,Y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [48]:
# Predict labels for the test set (the values written to the predictions file)
# and for the training rows themselves (used for the metrics computed afterwards).
y_pred = clf.predict(test_data)
y_train_pred = clf.predict(X_train)

In [49]:
# Write the test-set predictions to a text file, each value formatted with
# two decimals (fmt="%.2f").
np.savetxt("Titanic_Predictions.csv",y_pred,fmt="%.2f")

In [50]:
from sklearn.metrics import confusion_matrix

In [51]:
# Confusion matrix of true vs. predicted labels on the training rows. These
# are the rows the model was fitted on, so this measures fit, not
# performance on unseen data.
confusion_matrix(Y_train,y_train_pred)

array([[349,  50],
       [ 69, 200]], dtype=int64)

In [52]:
from sklearn.metrics import classification_report

In [53]:
# Per-class precision, recall and F1 on the training rows (the same data the
# model was fitted on, not a held-out set).
print(classification_report(Y_train,y_train_pred))

              precision    recall  f1-score   support

           0       0.83      0.87      0.85       399
           1       0.80      0.74      0.77       269

   micro avg       0.82      0.82      0.82       668
   macro avg       0.82      0.81      0.81       668
weighted avg       0.82      0.82      0.82       668

