In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split 
# import matplotlib.pyplot as plt
# import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [11]:
test = pd.read_csv('test.csv', index_col = 0)
train = pd.read_csv('train.csv', index_col = 0)
# train.head()
# train.info()
# test.info()
train.Embarked.fillna('S', inplace = True)
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       714 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Cabin     204 non-null    object 
 10  Embarked  891 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB


In [12]:
%%time

### encode values: Sex
train.Sex = train.Sex.apply(lambda x: 1 if x=='male' else 0)
test.Sex = test.Sex.apply(lambda x: 1 if x=='male' else 0)

### impute missing or zero Fare values with median Fares for each Pclass
train.loc[(train.Fare==0)&(train.Pclass==1),"Fare"] = train[(train['Fare']!=0)]\
                                                                .groupby('Pclass')['Fare'].describe()['50%'][1]
train.loc[(train.Fare==0)&(train.Pclass==2),"Fare"] = train[(train['Fare']!=0)]\
                                                                .groupby('Pclass')['Fare'].describe()['50%'][2]

train.loc[(train.Fare==0)&(train.Pclass==3),"Fare"] = train[(train['Fare']!=0)]\
                                                                .groupby('Pclass')['Fare'].describe()['50%'][3]

test.loc[((pd.isnull(test['Fare'])==True)|(test.Fare==0))&(test.Pclass==1),"Fare"] = train[(train['Fare']!=0)]\
                                                                .groupby('Pclass')['Fare'].describe()['50%'][1]
test.loc[((pd.isnull(test['Fare'])==True)|(test.Fare==0))&(test.Pclass==2),"Fare"] = train[(train['Fare']!=0)]\
                                                                .groupby('Pclass')['Fare'].describe()['50%'][2]

test.loc[((pd.isnull(test['Fare'])==True)|(test.Fare==0))&(test.Pclass==3),"Fare"] = train[(train['Fare']!=0)]\
                                                                .groupby('Pclass')['Fare'].describe()['50%'][3]

# extract titles from names
def title_extractor(row):
    return row.split(',')[1].strip().split('.')[0]
titles_train = train.Name.apply(title_extractor)
titles_test = test.Name.apply(title_extractor)
titles_train = titles_train.map({'Mr':'Mr','Mrs':'Mrs','Miss':'Miss','Master':'Master','Dr':'Dr','Rev':'Rare_male',
                                   'Don':'Rare_male', 'Mlle':'Miss', 'Lady':'Rare_female', 'Ms':'Mrs', 
                                   'Mme':'Mrs', 'the Countess': 'Rare_female', 'Col':'Military','Major':'Military',
                                  'Sir':'Rare_male','Jonkheer':'Military','Capt':'Military'})
titles_test = titles_test.map({'Mr':'Mr','Mrs':'Mrs','Miss':'Miss','Master':'Master','Dr':'Dr','Rev':'Rare_male',
                                   'Don':'Rare_male', 'Mlle':'Miss', 'Lady':'Rare_female', 'Ms':'Mrs', 
                                   'Mme':'Mrs', 'the Countess': 'Rare_female', 'Col':'Military','Major':'Military',
                                  'Sir':'Rare_male','Jonkheer':'Military','Capt':'Military',
                              'Dona':'Rare_female'})

if 'Title' not in train.columns: #to not overwrite the column
    train['Title'] = titles_train
    title_encoded_train = pd.get_dummies(train.Title, prefix_sep = '_', drop_first = True)
    train = pd.concat([train, title_encoded_train], axis = 1)
if 'Title' not in test.columns:
    test['Title'] = titles_test
    title_encoded_test = pd.get_dummies(test.Title, prefix_sep = '_', drop_first = True)
    test = pd.concat([test, title_encoded_test], axis = 1)

#encode port of embarcation
train = pd.concat([train, pd.get_dummies(train.Embarked, prefix_sep = '_', drop_first = True)], axis = 1)
test = pd.concat([test, pd.get_dummies(test.Embarked, prefix_sep = '_', drop_first = True)], axis = 1)
    
# drop columns
col_types = train.dtypes.to_frame().reset_index()
cat_cols = col_types.loc[col_types[0]=='object']['index']
cat_cols

Wall time: 77.5 ms


2         Name
7       Ticket
9        Cabin
10    Embarked
11       Title
Name: index, dtype: object

In [25]:
np.arange(2,30,2)

array([ 2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28])

In [59]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
import xgboost as xgb

In [98]:

### ### impute Age by prediction with random forest regressor
#y and X sets for Age
y_age = train[pd.isnull(train['Age'])==False]['Age']
X_age = train[pd.isnull(train['Age'])==False].drop(cat_cols,axis=1).drop(['Age','Survived'],axis=1)

#train and val split for Age 
X_age_train, X_age_val, y_age_train, y_age_val = train_test_split(X_age, y_age, test_size = 0.25, random_state = 21)

# select entries with missing Age
X_nullage = train[pd.isnull(train['Age'])==True].drop(cat_cols,axis=1).drop(['Age','Survived'],axis=1)

# instantiate random forest regressor
    # params = {
    #     'max_depth': np.range(10),
    #     'min_samples_leaf': np.arange(2,30,2)
    # }
    # grid_search = GridSearchCV(RandomForestRegressor, params, scoring = 'mean_square_error', cv = 5)
# age_reg = RandomForestRegressor(n_estimators = 59, max_depth=4, min_samples_leaf = 7, random_state=21)
# age_reg = GradientBoostingRegressor(n_estimators = 50, max_depth=2, min_samples_leaf = 10, random_state=21)
age_reg = xgb.XGBRegressor(n_estimators = 25, max_depth=2, random_state=21)
age_reg.fit(X_age_train, y_age_train)

# predict age for train and val sets
y_age_train_pred = age_reg.predict(X_age_train)
y_age_val_pred = age_reg.predict(X_age_val)

# check mean square erorrs and compare to simple median age imputer
print(f'train age MSE: {mean_squared_error(y_age_train, y_age_train_pred)}')
print(f'valid age MSE: {mean_squared_error(y_age_val, y_age_val_pred)}')
print(f'median age MSE: {mean_squared_error(y_age_val, np.asarray([y_age.median() for i in range(len(y_age_val))]))}')

train age MSE: 110.57126156943434
valid age MSE: 115.06043336656192
median age MSE: 236.75963240223462


In [99]:
# predict misisng Age values
age_imputed = age_reg.predict(X_nullage)
# replace null Age with predictions
X_nullage['Age'] = age_imputed
train.Age.loc[X_nullage['Age'].index] = X_nullage['Age'].copy()

### test set age imputation
test_nullage = test[pd.isnull(test['Age'])==True].drop(cat_cols,axis=1).drop('Age',axis=1)
age_test_imputed = age_reg.predict(test[pd.isnull(test['Age'])==True].drop(cat_cols,axis=1).drop('Age',axis=1))
test_nullage['Age'] = age_test_imputed
test.Age.loc[test_nullage['Age'].index] = test_nullage['Age'].copy()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [140]:
### fitting model
X, y = train.drop('Survived', axis = 1), train['Survived']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.2, random_state = 21)
clf = RandomForestClassifier(max_depth=6, random_state=21)
# clf = xgb.XGBClassifier(n_estimators = 75, max_depth = 6, min_samples_leaf = 6, random_state = 21)
clf.fit(X_train.drop(cat_cols, axis=1), y_train)

feature_importance = pd.DataFrame(clf.feature_importances_, 
                                  index = X_train.drop(cat_cols, axis=1).columns,
                                  columns = ['importance']
                                ).sort_values('importance', ascending = False).reset_index()
print(feature_importance[:3])
# sns.catplot(y='importance', x='index', data = feature_importance, kind = 'bar').set_xticklabels(rotation=45)

print(f'train score: {clf.score(X_train.drop(cat_cols, axis=1), y_train)}') #0.83 with mean age imputer
print(f'valid score: {clf.score(X_val.drop(cat_cols, axis=1), y_val)}') #0.807175 mean age imputer for max_depth=4
                                                                        #0.860986 xgbclassifier
test_predictions = clf.predict(test.drop(cat_cols,axis=1))#.drop('Survived',axis=1))



  index  importance
0   Sex    0.215762
1    Mr    0.178715
2  Fare    0.124783
train score: 0.8707865168539326
valid score: 0.8156424581005587


In [141]:
test_prediction = test.copy()
test_prediction['Survived'] = test_predictions
test_prediction['Survived'].to_csv('gender_submission.csv', header = True)