In [13]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.linear_model import LogisticRegression
from random import randrange

In [18]:
def cat2num(x):
    """Translate an embarkation code into an integer category.

    'C' becomes 0, 'Q' becomes 1, and every other value (any other code
    as well as a missing value) becomes 2.
    """
    if x == 'C':
        return 0
    if x == 'Q':
        return 1
    # Fallback bucket: everything that is neither 'C' nor 'Q'.
    return 2

def split4cv(data4split, k = 3, target = 'Survived'):
    """Randomly partition a DataFrame into k disjoint folds.

    Parameters
    ----------
    data4split : pandas.DataFrame
        Frame holding the feature columns plus the `target` column.
    k : int, default 3
        Number of folds. Must satisfy 1 <= k <= number of rows.
    target : str, default 'Survived'
        Name of the label column that is separated from the features.

    Returns
    -------
    tuple
        ``(k_folds, k)`` where ``k_folds[i] == [[rows, labels]]``, i.e.
        ``k_folds[i][0][0]`` is the list of feature rows of fold ``i`` and
        ``k_folds[i][0][1]`` the list of matching labels.

    Raises
    ------
    ValueError
        If `k` is smaller than 1 or larger than the number of rows.
    """
    n_rows = data4split.shape[0]
    if k < 1 or k > n_rows:
        raise ValueError(
            'k must be between 1 and the number of rows ({}), got {}'.format(n_rows, k))

    # Pools the folds are drawn from; rows and labels are popped in lockstep
    # so that every label stays attached to its row.
    X = list(data4split.drop(target, axis=1).values)
    y = list(data4split[target].values)

    # Integer fold sizes that add up to n_rows exactly: the first `extra`
    # folds get one additional row. A fractional size would let early folds
    # take too many rows and leave the last fold drawing from an empty pool.
    base, extra = divmod(n_rows, k)

    k_folds = list() # k_folds = [fold1=[[X,y]], fold2=[[X,y]] ...]
    for i in range(k):
        fold_size = base + 1 if i < extra else base
        tmp_X = list()
        tmp_y = list()
        for _ in range(fold_size):
            index = randrange(len(X))
            tmp_X.append(X.pop(index))
            tmp_y.append(y.pop(index))
        k_folds.append([[tmp_X, tmp_y]])
    return k_folds, k
  
def cross_validation(data4cv, model=None, k=3):
    """Run k-fold cross-validation and report the average empirical risk.

    For every fold the model is refitted (in place) on the remaining folds
    and evaluated on both the training part and the held-out fold. The risk
    is the sum of squared differences between predictions and labels; for
    0/1 labels that is the number of misclassified samples. It is not
    normalised, and the training part holds k-1 folds while the test part
    holds one, so the two printed values are on different scales.

    Parameters
    ----------
    data4cv : pandas.DataFrame
        Data passed to split4cv (features plus the label column).
    model : estimator with fit/predict, optional
        Model to evaluate. When omitted, a RandomForestClassifier is created.
        The default is built lazily so that defining this function does not
        depend on a model object created in a later cell.
    k : int, default 3
        Number of folds (at least 2).

    Returns
    -------
    tuple
        ``(mean_train_risk, mean_test_risk)`` averaged over the k folds.

    Raises
    ------
    ValueError
        If `k` is smaller than 2.
    """
    if k < 2:
        raise ValueError('cross-validation needs at least 2 folds, got {}'.format(k))
    if model is None:
        model = RandomForestClassifier(n_estimators=1000, max_depth=15,
                                       random_state=42, n_jobs=-1)

    k_folds, k = split4cv(data4cv, k)
    # Each fold is stored as [[rows, labels]]; unpack into per-fold arrays.
    X = [np.array(fold[0][0]) for fold in k_folds]
    y = [np.array(fold[0][1]) for fold in k_folds]

    # calculate empirical risk:
    scores4train = []
    scores4test  = []
    for i in range(k):
        # Fold i is held out; all other folds form the training set.
        X_train = np.concatenate([X[j] for j in range(k) if j != i])
        y_train = np.concatenate([y[j] for j in range(k) if j != i])
        X_test = X[i]
        y_test = y[i]
        model.fit(X_train, y_train)
        # empirical risk on train set
        y_pred = model.predict(X_train)
        scores4train.append(np.sum((y_pred - y_train)**2))
        # empirical risk on test set
        y_pred = model.predict(X_test)
        scores4test.append(np.sum((y_pred - y_test)**2))

    mean_train = np.mean(np.array(scores4train))
    mean_test = np.mean(np.array(scores4test))
    print('Average Empirical Risk (TRAIN_SET): {}'.format(mean_train))
    print('Average Empirical Risk (TEST_SET): {}'.format(mean_test))
    return mean_train, mean_test

In [17]:
# Reading data
train_set = pd.read_csv('datasets/titanic/train.csv', index_col = 'PassengerId')
test_set = pd.read_csv('datasets/titanic/test.csv', index_col = 'PassengerId')

# ---- Preprocessing dataset ----
# Delete features that are not used by the models
train_set = train_set.drop(['Cabin', 'Ticket', 'Name'], axis=1)

# Fill missing cells in 'Age' with the column mean.
# The result is assigned back instead of calling fillna(inplace=True) on the
# column selection: the in-place form acts on an intermediate object and does
# not reliably update the frame (pandas warns about it and, with
# Copy-on-Write, leaves train_set unchanged).
train_set['Age'] = train_set['Age'].fillna(train_set['Age'].mean())

# Encode 'Embarked' as integers via cat2num ('C' -> 0, 'Q' -> 1, anything
# else -> 2), so missing cells end up in the same bucket as the other codes.
train_set['Embarked'] = train_set['Embarked'].apply(cat2num)

# Encode the categorical 'Sex' feature to numerical labels
label_encoder = LabelEncoder()
label_encoder.fit(train_set['Sex'])
train_set['Sex'] = label_encoder.transform(train_set['Sex'])

In [16]:
# Feature matrix (every column except the label) and label vector as arrays
X = train_set.drop(columns=['Survived']).values
y = train_set.loc[:, 'Survived'].values

In [67]:
# Creating Random Forest model
# max_features='sqrt' is what the former 'auto' alias meant for classifiers;
# 'auto' is rejected by recent scikit-learn releases.
randomforest = RandomForestClassifier(n_estimators = 1000, criterion ='gini',
                               max_features = 'sqrt', min_samples_split=2, max_depth=15, random_state=42, n_jobs=-1)
# Creating Logistic Regression model
logreg = LogisticRegression(penalty='l2', solver='liblinear')

In [70]:
print('Average Empirical Risk using Logistic Regression model: ')
# No separate logreg.fit(X, y) beforehand: cross_validation() refits the
# estimator on the training part of every split, so an earlier fit on the
# full data would be discarded without affecting the reported risks.
cross_validation(train_set,model=logreg)

Average Empirical Risk using Logistic Regression model: 
Average Empirical Risk (TRAIN_SET): 118.0
Average Empirical Risk (TEST_SET): 60.333333333333336


(118.0, 60.333333333333336)

In [69]:
# Tuning the hyperparameters
# The three lists are parallel: position j of each list forms one
# configuration that is evaluated together (this is not a full grid).
n_forests = [8, 15, 50, 100, 1000] # n_estimators
n_level   = [5, 10, 15, 20, 35]     # max_depth
n_sample_coef = [2, 2, 3, 3, 4]     # min_samples_split
print('CROSS VALIDATION RESULT FOR DIFFERENT HYPERPARAMETERS: ')
print('\n\n')
# NOTE(review): every cross_validation() call draws a fresh random split and
# no seed is set in the visible cells, so differences between configurations
# include split-to-split noise.
for n_trees, depth, min_split in zip(n_forests, n_level, n_sample_coef):
    # Rebinds the notebook-level `randomforest`; after the loop it refers to
    # the last configuration. max_features='sqrt' replaces the former 'auto'
    # alias, which meant the same for classifiers and is rejected by recent
    # scikit-learn releases.
    randomforest = RandomForestClassifier(n_estimators = n_trees, criterion ='gini',
                               max_features = 'sqrt', min_samples_split=min_split, max_depth=depth, random_state=42, n_jobs=-1)
    print('HYPERPARAMETERS: ', 'n_estimators: ' + str(n_trees),'|', 'max_depth:' \
          + str(depth), '|','min_samples_split: ' + str(min_split))
    print(str(cross_validation(data4cv=train_set, model=randomforest)))
    print('\n')

CROSS VALIDATION RESULT FOR DIFFERENT HYPERPARAMETERS: 



HYPERPARAMETERS:  n_estimators: 8 | max_depth:5 | min_samples_split: 2
Average Empirical Risk (TRAIN_SET): 89.0
Average Empirical Risk (TEST_SET): 55.666666666666664
(89.0, 55.666666666666664)


HYPERPARAMETERS:  n_estimators: 15 | max_depth:10 | min_samples_split: 2
Average Empirical Risk (TRAIN_SET): 35.333333333333336
Average Empirical Risk (TEST_SET): 52.666666666666664
(35.333333333333336, 52.666666666666664)


HYPERPARAMETERS:  n_estimators: 50 | max_depth:15 | min_samples_split: 3
Average Empirical Risk (TRAIN_SET): 19.666666666666668
Average Empirical Risk (TEST_SET): 54.333333333333336
(19.666666666666668, 54.333333333333336)


HYPERPARAMETERS:  n_estimators: 100 | max_depth:20 | min_samples_split: 3
Average Empirical Risk (TRAIN_SET): 14.666666666666666
Average Empirical Risk (TEST_SET): 57.666666666666664
(14.666666666666666, 57.666666666666664)


HYPERPARAMETERS:  n_estimators: 1000 | max_depth:35 | min_samples_spli

**In this case (Titanic dataset) the best result was obtained with the Random Forest model (with max_depth of about 15-20 and 1000 trees). The models were compared using k-fold cross-validation (k = 3, the number of folds).**