In [90]:
# load the data
import os
import pandas as pd
# Both CSV files live in the sibling "dataset" directory, one level up.
train_csv, test_csv = (
    os.path.join("..", "dataset", file_name)
    for file_name in ("train.csv", "test.csv")
)

# Read each CSV file into its own DataFrame.
titanic_train = pd.read_csv(filepath_or_buffer=train_csv)
titanic_test = pd.read_csv(filepath_or_buffer=test_csv)

## Analysis
### 12 columns: 1 target and 11 predictors
### Columns that do not seem useful: PassengerId, Name, Ticket
### Cabin contains 90% null values, so it is better to drop it

In [99]:
# separate out target and predictors
# The label to predict is the 'Survived' column.
y = titanic_train['Survived']

# Columns judged not useful for the problem; they are excluded from the features.
useless_cols = ['PassengerId','Name', 'Ticket','Cabin']

# Predictors: everything except the target and the excluded columns.
# drop() returns a new frame, so titanic_train itself is left untouched.
X = titanic_train.drop(columns=['Survived'] + useless_cols)

In [100]:
# divide the data into training and testing
from sklearn.model_selection import train_test_split

# Keep 80% of the rows for training; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=0,
)

# Analyze the training and testing splits further to find missing values
### Age has some missing values (approx. 10%) and Embarked has very few missing values
### Age needs to be imputed with the mean value and Embarked with the mode
### In the test split, only Age has missing values


In [101]:
# handle the missing values
from sklearn.impute import SimpleImputer

# Impute missing values: the mean for numeric columns, the most frequent value
# for every other column.
ni = SimpleImputer(strategy='mean')
ci = SimpleImputer(strategy='most_frequent')

# Work on copies so the raw splits stay available for inspection / re-runs.
X_train_i = X_train.copy()
X_test_i = X_test.copy()

# Classify columns with select_dtypes instead of comparing against 'object':
# text columns may also be stored as 'category' or pandas' dedicated string
# dtype, and an `== 'object'` test would send those to the mean imputer.
num_cols = X_train.select_dtypes(include='number').columns.tolist()
cat_cols = X_train.select_dtypes(exclude='number').columns.tolist()
print("Categorical cols are: {}".format(cat_cols))
print("numerical cols are: {}".format(num_cols))

# Each imputer is fitted on the training split only and then re-used on the
# test split, so no statistics leak from the held-out data.
# The guards skip a column group that is empty, which the imputer cannot fit.
if cat_cols:
    X_train_i.loc[:, cat_cols] = ci.fit_transform(X_train_i[cat_cols])
    X_test_i.loc[:, cat_cols] = ci.transform(X_test_i[cat_cols])

if num_cols:
    X_train_i.loc[:, num_cols] = ni.fit_transform(X_train_i[num_cols])
    X_test_i.loc[:, num_cols] = ni.transform(X_test_i[num_cols])

Categorical cols are: ['Sex', 'Embarked']
numerical cols are: ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']


In [115]:
# Now that the missing values have been taken care of, let's focus on encoding the categorical columns
from sklearn.preprocessing import OneHotEncoder
# One-hot encode the categorical columns and assemble fully numeric feature frames.

# Categories unseen during fit are encoded as all zeros instead of raising.
# The dense-output flag was renamed from `sparse` to `sparse_output` in
# scikit-learn 1.2 and the old name was later removed, so rely on the default
# (sparse) output and densify explicitly -- this works on every version.
ohe = OneHotEncoder(handle_unknown='ignore')

# Fit on the training split only; the test split re-uses the learned categories.
ohe_train_values = ohe.fit_transform(X_train_i[cat_cols]).toarray()
ohe_test_values = ohe.transform(X_test_i[cat_cols]).toarray()

# Name each indicator column "<feature>_<category>". Leaving the default integer
# labels (0, 1, 2, ...) would mix int and str column names in the final frame,
# which recent scikit-learn versions reject when the frame is passed to fit().
ohe_cols = [
    "{}_{}".format(col, category)
    for col, categories in zip(cat_cols, ohe.categories_)
    for category in categories
]

# Re-attach the original row index so concat aligns rows correctly
# (a freshly built frame would otherwise get a 0..n-1 RangeIndex).
ohe_train = pd.DataFrame(ohe_train_values, columns=ohe_cols, index=X_train_i.index)
ohe_test = pd.DataFrame(ohe_test_values, columns=ohe_cols, index=X_test_i.index)

# Replace the raw categorical columns with their encoded counterparts.
X_train_ie = X_train_i.drop(columns=cat_cols)
X_test_ie = X_test_i.drop(columns=cat_cols)

X_train_p = pd.concat([X_train_ie, ohe_train], axis=1)
X_test_p = pd.concat([X_test_ie, ohe_test], axis=1)

In [133]:
# Time to build a model for the target class
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

def rfe_eval_score(n_estimators, X_train, X_test, y_train, y_test):
    """Train a random forest regressor and score it on a held-out split.

    Args:
        n_estimators: Number of trees in the forest.
        X_train: Feature matrix used to fit the model.
        X_test: Feature matrix the fitted model predicts on.
        y_train: Targets matching ``X_train``.
        y_test: Targets matching ``X_test``; must provide ``.head()``.

    Returns:
        The mean absolute error between ``y_test`` and the predictions.

    Side effects:
        Prints the first rows of ``y_test`` and the first five predictions
        so they can be compared by eye.
    """
    # NOTE(review): a regressor scored with MAE is used here; if the target is
    # a binary label (as 'Survived' appears to be), a classifier scored with
    # accuracy may be the better fit -- confirm intent.
    forest = RandomForestRegressor(random_state=0, n_estimators=n_estimators)
    forest.fit(X_train, y_train)
    y_pred = forest.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)

    # Quick visual sanity check: true targets first, then predictions.
    for preview in (y_test.head(), y_pred[:5]):
        print(preview)
    return mae



In [134]:
# Score a 500-tree forest on the preprocessed hold-out split and show the error.
score = rfe_eval_score(
    n_estimators=500,
    X_train=X_train_p,
    X_test=X_test_p,
    y_train=y_train,
    y_test=y_test,
)
print(score)



495    0
648    0
278    0
31     1
255    1
Name: Survived, dtype: int64
[0.364 0.019 0.008 0.73  0.184]
0.23129080616684955
