### Feature Construction and Splitting 

In [65]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
from sklearn.linear_model import LogisticRegression


In [66]:
df=pd.read_csv('titanic_dataset.csv',usecols=['Survived','Pclass','SibSp','Parch','Age'])

In [67]:
df.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch
0,0,3,34.5,0,0
1,1,3,47.0,1,0
2,0,2,62.0,0,0
3,0,3,27.0,0,0
4,1,3,22.0,1,1


In [68]:
df.dropna(inplace=True)

In [69]:
X=df.iloc[:,1:]
y=df.iloc[:,0]

In [70]:
X

Unnamed: 0,Pclass,Age,SibSp,Parch
0,3,34.5,0,0
1,3,47.0,1,0
2,2,62.0,0,0
3,3,27.0,0,0
4,3,22.0,1,1
...,...,...,...,...
409,3,3.0,1,1
411,1,37.0,1,0
412,3,28.0,0,0
414,1,39.0,0,0


In [71]:
y

0      0
1      1
2      0
3      0
4      1
      ..
409    1
411    1
412    1
414    1
415    0
Name: Survived, Length: 332, dtype: int64

In [72]:
from sklearn.model_selection import train_test_split
X_train, X_test,y_train, y_test=train_test_split(X,y,test_size=0.2)

In [73]:
X_train

Unnamed: 0,Pclass,Age,SibSp,Parch
187,3,17.0,2,0
119,2,29.0,1,0
238,2,18.0,1,1
348,2,24.0,0,0
405,2,20.0,0,0
...,...,...,...,...
122,1,35.0,1,0
25,3,50.0,1,0
404,1,43.0,1,0
89,2,2.0,1,1


In [74]:
y_train

187    0
119    1
238    1
348    0
405    0
      ..
122    1
25     0
404    0
89     0
112    1
Name: Survived, Length: 265, dtype: int64

In [75]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
np.mean(cross_val_score(LogisticRegression(),X_train,y_train,scoring='accuracy',cv=10))

0.6190883190883191

### Feature Construction 

In [76]:
df.sample(10)

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch
11,0,1,46.0,0,0
179,1,1,64.0,0,2
13,0,2,63.0,1,0
376,1,3,22.0,2,0
217,0,1,57.0,1,1
214,1,3,38.0,4,2
120,1,2,12.0,0,0
62,0,3,18.0,0,0
7,0,2,26.0,1,1
294,0,3,36.0,0,0


In [77]:
df['FamilySize']=df['SibSp']+df['Parch']+1

In [78]:
df.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,FamilySize
0,0,3,34.5,0,0,1
1,1,3,47.0,1,0,2
2,0,2,62.0,0,0,1
3,0,3,27.0,0,0,1
4,1,3,22.0,1,1,3


In [79]:
def family_type(num):
    if num==1:
        return 0 #single
    elif num <=4:
        return 1 #small family
    else:
        return 2 #big family

In [80]:
df['FamilyType']=df['FamilySize'].apply(family_type)

In [81]:
df

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,FamilySize,FamilyType
0,0,3,34.5,0,0,1,0
1,1,3,47.0,1,0,2,1
2,0,2,62.0,0,0,1,0
3,0,3,27.0,0,0,1,0
4,1,3,22.0,1,1,3,1
...,...,...,...,...,...,...,...
409,1,3,3.0,1,1,3,1
411,1,1,37.0,1,0,2,1
412,1,3,28.0,0,0,1,0
414,1,1,39.0,0,0,1,0


In [82]:
df_new=df.drop(['SibSp','Parch','FamilySize'],axis=1)

In [83]:
df_new

Unnamed: 0,Survived,Pclass,Age,FamilyType
0,0,3,34.5,0
1,1,3,47.0,1
2,0,2,62.0,0
3,0,3,27.0,0
4,1,3,22.0,1
...,...,...,...,...
409,1,3,3.0,1
411,1,1,37.0,1
412,1,3,28.0,0
414,1,1,39.0,0


In [84]:
X1=df_new.iloc[:,1:]
y1=df_new.iloc[:,0]

In [85]:
X_train1, X_test1,y_train1, y_test1=train_test_split(X1,y1,test_size=0.2)

In [86]:
np.mean(cross_val_score(LogisticRegression(),X_train1,y_train1,scoring='accuracy',cv=10))

0.626068376068376

### Feature Splitting 

In [88]:
new_df=pd.read_csv('titanic_dataset.csv')

In [89]:
new_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [97]:
new_df['Name']

0                                  Kelly, Mr. James
1                  Wilkes, Mrs. James (Ellen Needs)
2                         Myles, Mr. Thomas Francis
3                                  Wirz, Mr. Albert
4      Hirvonen, Mrs. Alexander (Helga E Lindqvist)
                           ...                     
413                              Spector, Mr. Woolf
414                    Oliva y Ocana, Dona. Fermina
415                    Saether, Mr. Simon Sivertsen
416                             Ware, Mr. Frederick
417                        Peter, Master. Michael J
Name: Name, Length: 418, dtype: object

In [105]:
new_df['Salutation']=new_df['Name'].str.split(',',expand=True)[1].str.split('.',expand=True)[0]

In [106]:
new_df.sample(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,Salutation
77,969,1,1,"Cornell, Mrs. Robert Clifford (Malvina Helen L...",female,55.0,2,0,11770,25.7,C101,S,Mrs,Mrs
78,970,0,2,"Aldworth, Mr. Charles Augustus",male,30.0,0,0,248744,13.0,,S,Mr,Mr
24,916,1,1,"Ryerson, Mrs. Arthur Larned (Emily Maria Borie)",female,48.0,1,3,PC 17608,262.375,B57 B59 B63 B66,C,Mrs,Mrs
409,1301,1,3,"Peacock, Miss. Treasteall",female,3.0,1,1,SOTON/O.Q. 3101315,13.775,,S,Miss,Miss
376,1268,1,3,"Kink, Miss. Maria",female,22.0,2,0,315152,8.6625,,S,Miss,Miss


In [113]:
(new_df.groupby('Salutation')['Survived'].mean()).sort_values(ascending=False)

Salutation
Dona      1.0
Miss      1.0
Mrs       1.0
Ms        1.0
Col       0.0
Dr        0.0
Master    0.0
Mr        0.0
Rev       0.0
Name: Survived, dtype: float64