In [162]:
import re

import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

In [113]:
# Read the training set and discard the columns the rest of the notebook never uses.
unused_columns = ['Cabin', 'PassengerId', 'Ticket']
df = pd.read_csv("train.csv").drop(columns=unused_columns)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       714 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Fare      891 non-null    float64
 8   Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(3)
memory usage: 62.8+ KB


In [53]:
# Count passengers per embarkation port, plus the rows with no port recorded.
embarked = df['Embarked']
for port_label, port_code in (("Southampton:", 'S'),
                              ("Cherbourg:", 'C'),
                              ("Queenstown:", 'Q')):
    print(port_label, (embarked == port_code).sum())
print("Null", embarked.isnull().sum())

Southampton: 644
Cherbourg: 168
Queenstown: 77
Null 2


In [117]:
# Fill Embarked by hand for row labels 61 and 829
# (presumably the two rows counted as null in the previous cell).
df.loc[61, "Embarked"] = "C"   # The name is French, so assign Cherbourg.
# The name here could be from either Ireland or England, so "S" (Southampton)
# is assigned since it is vastly more common.
df.loc[829, "Embarked"] = "S"

In [123]:
def title(name, titles):
    """Return the first entry of ``titles`` that appears in ``name`` as a whole word.

    Matching is done on whole words rather than raw substrings, so ``'Mr'`` is
    not reported for ``'Mrs. ...'`` or for a surname that merely starts with
    those letters, and the result no longer depends on listing longer titles
    before their prefixes.

    Parameters
    ----------
    name : str
        Full passenger name, e.g. ``"Braund, Mr. Owen Harris"``.
    titles : iterable of str
        Candidate titles, tried in iteration order.

    Returns
    -------
    str or None
        The first matching title, or ``None`` when nothing matches or when
        ``name`` is not a string (for example a missing value).
    """
    if not isinstance(name, str):
        return None
    for candidate in titles:
        # Lookarounds instead of \b so a candidate that ends in punctuation
        # (such as "Mr.") is still matched correctly.
        pattern = r"(?<!\w)" + re.escape(candidate) + r"(?!\w)"
        if re.search(pattern, name):
            return candidate
    return None

# Titles to look for in each passenger name; they are tried in this order.
title_list = ['Mrs', 'Mr', 'Miss', 'Master']

# One lookup per name; names with none of the listed titles get None.
df['Title'] = df['Name'].apply(title, args=(title_list,))

def impute_Nones(x):
    """Return a title for one row, guessing from Sex and Age when it is missing.

    Parameters
    ----------
    x : mapping or pandas.Series
        A row exposing ``'Title'``, ``'Sex'`` and ``'Age'``.

    Returns
    -------
    str
        The existing title when present. Otherwise ``'Master'`` / ``'Mr'`` for
        males (aged 12 or under / everything else) and ``'Mrs'`` / ``'Miss'``
        for females (aged 23 or over / everything else).

    Raises
    ------
    ValueError
        If the title is missing and ``Sex`` is neither ``'male'`` nor
        ``'female'`` (for example because the column was already label-encoded).
        Previously this case silently produced ``None``.
    """
    if not pd.isnull(x['Title']):
        return x['Title']

    sex, age = x['Sex'], x['Age']
    # A missing (NaN) age compares False against any threshold, so such rows
    # take the second option of each pair: 'Mr' for males, 'Miss' for females.
    if sex == 'male':
        return 'Master' if age <= 12 else 'Mr'
    if sex == 'female':
        return 'Mrs' if age >= 23 else 'Miss'
    raise ValueError(
        "impute_Nones expects Sex to be 'male' or 'female', got %r" % (sex,)
    )

# Fill in the titles the name lookup could not determine, then drop the raw names.
df['Title'] = df.apply(impute_Nones, axis=1)
del df['Name']

# The loop variable must not be called `title`: at cell level that would rebind
# the `title` function defined in this notebook to a plain string.
for title_name in title_list:
    print(title_name, ': ', sum(df['Title'] == title_name))

Mrs :  129
Mr :  518
Miss :  180
Master :  40


In [104]:
# Split the passengers by title.
mrs = df[df['Title'] == 'Mrs']
mr = df[df['Title'] == 'Mr']
miss = df[df['Title'] == 'Miss']
master = df[df['Title'] == 'Master']

# Mean age within each title group.
mrs_mean_age = mrs['Age'].mean()
mr_mean_age = mr['Age'].mean()
miss_mean_age = miss['Age'].mean()
master_mean_age = master['Age'].mean()

for age_label, age_value in (('mrs_mean_age:', mrs_mean_age),
                             ('mr_mean_age:', mr_mean_age),
                             ('miss_mean_age:', miss_mean_age),
                             ('master_mean_age:', master_mean_age)):
    print(age_label, age_value)

mrs_mean_age: 35.47008547008547
mr_mean_age: 32.98441247002398
miss_mean_age: 21.77777777777778
master_mean_age: 4.574166666666667


In [115]:
def impute_ages(x):
    """Return the row's age, substituting a per-title default when it is missing.

    ``x`` is a row exposing ``'Age'`` and ``'Title'``. A present age is returned
    unchanged. A missing age becomes 35 for 'Mrs', 4.5 for 'Master', 22 for
    'Miss' and 33 for any other title. The defaults appear to be rounded from
    the per-title mean ages printed earlier in the notebook.
    """
    known_age = x['Age']
    if not pd.isnull(known_age):
        return known_age

    fallback_by_title = {'Mrs': 35, 'Master': 4.5, 'Miss': 22}
    return fallback_by_title.get(x['Title'], 33)

# Replace the Age column with its row-wise imputed version.
df = df.assign(Age=df.apply(impute_ages, axis=1))

In [124]:
# Summarise column dtypes and non-null counts, then preview the first rows.
# NOTE(review): the saved output for this cell lists Sex and Embarked as int32 and
# only 867 non-null Title values, which looks like state left over from cells run
# out of order -- confirm with Restart Kernel -> Run All.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    int32  
 3   Age       891 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Embarked  891 non-null    int32  
 8   Title     867 non-null    object 
dtypes: float64(2), int32(2), int64(4), object(1)
memory usage: 55.8+ KB


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,0,3,1,22.0,1,0,7.25,2,Mr
1,1,1,0,38.0,1,0,71.2833,0,Mrs
2,1,3,0,26.0,0,0,7.925,2,Miss
3,1,1,0,35.0,1,0,53.1,2,Mrs
4,0,3,1,35.0,0,0,8.05,2,Mr


In [125]:
# Integer-encode the categorical columns. A separate fitted encoder is kept per
# column so each mapping can still be inspected (`classes_`) or inverted later;
# refitting one shared encoder would keep only the mapping of the last column.
features = ['Sex', 'Embarked', 'Title']
encoders = {}
for feature in features:
    encoder = preprocessing.LabelEncoder()
    df[feature] = encoder.fit_transform(df[feature])
    encoders[feature] = encoder

df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,0,3,1,22.0,1,0,7.25,2,2
1,1,1,0,38.0,1,0,71.2833,0,3
2,1,3,0,26.0,0,0,7.925,2,1
3,1,1,0,35.0,1,0,53.1,2,3
4,0,3,1,35.0,0,0,8.05,2,2


In [203]:
#make test and train data
y = df['Survived']
X = df.drop(['Survived'], axis=1)

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [204]:
# Gradient boosting with 100 depth-2 trees, learning rate 0.05 and a fixed seed.
clf = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.05,
    max_depth=2,
    random_state=0,
)

# 5-fold cross-validation over the full X / y, reported as the mean fold score.
gb_cv_scores = cross_val_score(clf, X, y, cv=5)
print('cross_val_score: ', gb_cv_scores.mean())

# Fit on the training split only.
clf = clf.fit(x_train, y_train)


cross_val_score:  0.824913690289373


In [205]:
# Logistic-regression baseline fitted on the training split.
clf1 = LogisticRegression(max_iter=1000).fit(x_train, y_train)

# The held-out score was previously computed and thrown away (it was neither
# printed nor the last expression of the cell), so report it explicitly.
print('test_score: ', clf1.score(x_test, y_test))
print('cross_val_score: ', np.mean(cross_val_score(clf1, X, y, cv=5)))

cross_val_score:  0.7946331052664617
