In [None]:
!pip install fastai==0.7.0

In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import model_selection
from sklearn.preprocessing import LabelEncoder
from fastai.imports import *
from fastai.structured import *
    
import warnings
warnings.filterwarnings("ignore")

  from numpy.core.umath_tests import inner1d


### Preprocessing one-hot encodes all the categoricals except hotel ratings, which are label encoded. This combination allows for a good feature space.

In [3]:
def load_and_prepare_the_train_set(path="train.csv"):
    """Load the training CSV and turn it into an all-numeric table.

    The categorical columns listed below are one-hot encoded with
    ``pd.get_dummies``. Any column that is still of ``object`` dtype
    afterwards is label encoded.

    Parameters
    ----------
    path : str, default "train.csv"
        Location of the training CSV. The default keeps existing
        zero-argument calls working unchanged.

    Returns
    -------
    tuple
        ``(df_train, df_train_x, df_train_y)``: the full encoded frame, the
        same frame without the ``score`` column, and the ``score`` column.
    """
    one_hot_columns = [
        'earthling_country', 'period_of_stay', 'earthling_type',
        'swimming_pool', 'exercise_room', 'basketball_court',
        'yoga_classes', 'club', 'free_wifi', 'hotel_name',
        'earthling_continent', 'review_month', 'review_weekday',
    ]
    df_train = pd.get_dummies(pd.read_csv(path), columns=one_hot_columns)

    # fit_transform refits the encoder on every call, so one instance can be
    # shared across columns.
    le = LabelEncoder()

    # Label encode whatever is still textual after one-hot encoding.
    for col in df_train.columns.values:
        if df_train[col].dtypes == 'object':
            df_train[col] = le.fit_transform(df_train[col])

    df_train_x = df_train.drop(['score'], axis=1)
    df_train_y = df_train['score']

    return df_train, df_train_x, df_train_y

In [4]:
df_train, df_train_x, df_train_y = load_and_prepare_the_train_set()
# Column counts of the full encoded frame and of the feature-only frame.
print(df_train.shape[1])
print(df_train_x.shape[1])

113
112


# Training and validation split with 80:20 ratio

In [5]:
def split_vals(a,n):
    """Cut ``a`` at position ``n`` and return copies of both parts.

    Returns ``(a[:n].copy(), a[n:].copy())``, so modifying either part does
    not affect ``a``.
    """
    head = a[:n]
    tail = a[n:]
    return head.copy(), tail.copy()

# Start from a freshly prepared training set.
df_train, df_train_x, df_train_y = load_and_prepare_the_train_set()

# Hold out the last n_valid rows for validation; the rest is used for training.
n_valid = 70
n_trn = len(df_train_x) - n_valid

# Cut the full frame, the features and the target at the same position.
(raw_train, raw_valid), (X_train, X_valid), (y_train, y_valid) = (
    split_vals(part, n_trn) for part in (df_train, df_train_x, df_train_y)
)

(X_train.shape, y_train.shape, X_valid.shape)

((282, 112), (282,), (70, 112))

# Baselining with leave-one-out cross-validation on a random forest

In [6]:
# Random-forest baseline scored with leave-one-out cross-validation on the
# full training set.
loocv = model_selection.LeaveOneOut()
rf = RandomForestClassifier(
    n_estimators=40,
    min_samples_leaf=5,
    max_features=0.6,
    oob_score=True,
    n_jobs=-1,
    random_state=7,
)
results = model_selection.cross_val_score(rf, df_train_x, df_train_y, cv=loocv)
print("Accuracy: %.3f%% " % (100.0 * results.mean()))

Accuracy: 48.580% 


In [7]:
# Fit on the training split and report hold-out accuracy in percent.
rf.fit(X_train, y_train).score(X_valid, y_valid) * 100.0

38.57142857142858

# Now same thing with gradient boosting machine

In [8]:
# Gradient-boosting baseline scored the same way: leave-one-out
# cross-validation on the full training set.
loocv = model_selection.LeaveOneOut()
gbm = GradientBoostingClassifier(
    n_estimators=40,
    random_state=7,
)
results = model_selection.cross_val_score(gbm, df_train_x, df_train_y, cv=loocv)
print("Accuracy: %.3f%% " % (100.0 * results.mean()))

Accuracy: 48.864% 


In [9]:
# Fit on the training split and report hold-out accuracy in percent.
gbm.fit(X_train, y_train).score(X_valid, y_valid) * 100.0

42.857142857142854

> GBM seems to be a better choice. So let's select the most important features as perceived by GBM.

In [10]:
# Rank the features by the importance the fitted GBM assigns to them.
fi = rf_feat_importance(gbm, X_train)
# Not displayed: a notebook cell only renders its final expression.
fi[:10]
# Keep the names of the features whose importance exceeds 0.005.
to_keep = fi.loc[fi.imp > 0.005, 'cols']
len(to_keep)

47

> Let's keep these 47 features and split the dataset once again.

In [11]:
# Reload the training set, then narrow the features to the selected columns.
df_train, df_train_x, df_train_y = load_and_prepare_the_train_set()
df_train_x = df_train[to_keep].copy()

# Same hold-out scheme as before: the last n_valid rows form the validation set.
n_valid = 70
n_trn = len(df_train_x) - n_valid

(raw_train, raw_valid), (X_train, X_valid), (y_train, y_valid) = (
    split_vals(part, n_trn) for part in (df_train, df_train_x, df_train_y)
)

(X_train.shape, y_train.shape, X_valid.shape)

((282, 47), (282,), (70, 47))

## Fitting the GBM to the dataset with adjusted features

In [65]:
# Refit the GBM on the reduced feature set; hold-out accuracy in percent.
gbm.fit(X_train, y_train).score(X_valid, y_valid) * 100.0

44.285714285714285

In [66]:
# Random forest on the same reduced feature set; hold-out accuracy in percent.
rf.fit(X_train, y_train).score(X_valid, y_valid) * 100.0

41.42857142857143

In [None]:
from scipy.cluster import hierarchy as hc

# NOTE(review): `scipy` and `plt` are not imported explicitly in this
# notebook; presumably they arrive through the fastai star imports. Confirm.

# Spearman rank correlation between every pair of selected features.
corr = np.round(scipy.stats.spearmanr(df_train_x).correlation, decimals=4)

# Convert correlation to a distance (1 - corr) in condensed form, then cluster
# with average linkage.
corr_condensed = hc.distance.squareform(1 - corr)
z = hc.linkage(corr_condensed, method='average')

# Draw the feature hierarchy with leaves labelled by column name.
fig = plt.figure(figsize=(20, 30))
dendrogram = hc.dendrogram(
    z,
    labels=df_train_x.columns,
    orientation='left',
    leaf_font_size=16,
)
plt.show()

# Hyperparameter tuning

In [None]:
gbm

# Test 1

In [13]:
from sklearn.model_selection import GridSearchCV

In [68]:
param_test1 = {'n_estimators':range(20,81,10)}
gsearch1 = GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.1, min_samples_split=500,\
                                                               min_samples_leaf=50,max_depth=8,max_features='sqrt',\
                                                               subsample=0.8,random_state=7), 
param_grid = param_test1, scoring='accuracy',n_jobs=-1,iid=False, cv=model_selection.LeaveOneOut())
%time gsearch1.fit(df_train_x,df_train_y)

gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_

CPU times: user 4.52 s, sys: 248 ms, total: 4.77 s
Wall time: 4min 20s


([mean: 0.44886, std: 0.49738, params: {'n_estimators': 20},
  mean: 0.44886, std: 0.49738, params: {'n_estimators': 30},
  mean: 0.44886, std: 0.49738, params: {'n_estimators': 40},
  mean: 0.44886, std: 0.49738, params: {'n_estimators': 50},
  mean: 0.44886, std: 0.49738, params: {'n_estimators': 60},
  mean: 0.44886, std: 0.49738, params: {'n_estimators': 70},
  mean: 0.44886, std: 0.49738, params: {'n_estimators': 80}],
 {'n_estimators': 20},
 0.44886363636363635)

In [70]:
# NOTE(review): gsearch1 was fitted on df_train_x, and X_valid is a slice of
# that same frame. With GridSearchCV's default refit, best_estimator_ has
# presumably already seen these validation rows, so this is not a clean
# hold-out score. Confirm.
100.0 * gsearch1.best_estimator_.score(X_valid, y_valid)

41.42857142857143

# Test 2

In [14]:
param_test2 = {'max_depth':range(5,16,2), 'min_samples_split':range(200,1001,200)}
gsearch2 = GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.1, n_estimators=60, max_features='sqrt', subsample=0.8, random_state=10), 
param_grid = param_test2, scoring='accuracy',n_jobs=-1,iid=False, cv=model_selection.LeaveOneOut())
%time gsearch2.fit(df_train_x,df_train_y)
gsearch2.grid_scores_, gsearch2.best_params_, gsearch2.best_score_

CPU times: user 25 s, sys: 456 ms, total: 25.5 s
Wall time: 23min 30s


([mean: 0.44886, std: 0.49738, params: {'max_depth': 5, 'min_samples_split': 200},
  mean: 0.44886, std: 0.49738, params: {'max_depth': 5, 'min_samples_split': 400},
  mean: 0.44886, std: 0.49738, params: {'max_depth': 5, 'min_samples_split': 600},
  mean: 0.44886, std: 0.49738, params: {'max_depth': 5, 'min_samples_split': 800},
  mean: 0.44886, std: 0.49738, params: {'max_depth': 5, 'min_samples_split': 1000},
  mean: 0.45739, std: 0.49818, params: {'max_depth': 7, 'min_samples_split': 200},
  mean: 0.44886, std: 0.49738, params: {'max_depth': 7, 'min_samples_split': 400},
  mean: 0.44886, std: 0.49738, params: {'max_depth': 7, 'min_samples_split': 600},
  mean: 0.44886, std: 0.49738, params: {'max_depth': 7, 'min_samples_split': 800},
  mean: 0.44886, std: 0.49738, params: {'max_depth': 7, 'min_samples_split': 1000},
  mean: 0.45170, std: 0.49766, params: {'max_depth': 9, 'min_samples_split': 200},
  mean: 0.44886, std: 0.49738, params: {'max_depth': 9, 'min_samples_split': 400},
  

In [15]:
# NOTE(review): gsearch2 was fitted on df_train_x, and X_valid is a slice of
# that same frame. With GridSearchCV's default refit, best_estimator_ has
# presumably already seen these validation rows, so this score is likely
# optimistic. Confirm.
100.0 * gsearch2.best_estimator_.score(X_valid, y_valid)

61.42857142857143

# Test 3

In [52]:
param_test3 = {'min_samples_split':range(1000,2100,200), 'min_samples_leaf':range(30,71,10)}
gsearch3 = GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.1, n_estimators=60,max_depth=9,max_features='sqrt', subsample=0.8, random_state=10), 
param_grid = param_test3, scoring='accuracy',n_jobs=-1,iid=False, cv=model_selection.LeaveOneOut())
%time gsearch3.fit(df_train_x,df_train_y)
gsearch3.grid_scores_, gsearch3.best_params_, gsearch3.best_score_

CPU times: user 25.5 s, sys: 1.78 s, total: 27.3 s
Wall time: 22min 4s


([mean: 0.44886, std: 0.49738, params: {'min_samples_leaf': 30, 'min_samples_split': 1000},
  mean: 0.44886, std: 0.49738, params: {'min_samples_leaf': 30, 'min_samples_split': 1200},
  mean: 0.44886, std: 0.49738, params: {'min_samples_leaf': 30, 'min_samples_split': 1400},
  mean: 0.44886, std: 0.49738, params: {'min_samples_leaf': 30, 'min_samples_split': 1600},
  mean: 0.44886, std: 0.49738, params: {'min_samples_leaf': 30, 'min_samples_split': 1800},
  mean: 0.44886, std: 0.49738, params: {'min_samples_leaf': 30, 'min_samples_split': 2000},
  mean: 0.44886, std: 0.49738, params: {'min_samples_leaf': 40, 'min_samples_split': 1000},
  mean: 0.44886, std: 0.49738, params: {'min_samples_leaf': 40, 'min_samples_split': 1200},
  mean: 0.44886, std: 0.49738, params: {'min_samples_leaf': 40, 'min_samples_split': 1400},
  mean: 0.44886, std: 0.49738, params: {'min_samples_leaf': 40, 'min_samples_split': 1600},
  mean: 0.44886, std: 0.49738, params: {'min_samples_leaf': 40, 'min_samples_spl

In [54]:
# NOTE(review): gsearch3 was fitted on df_train_x, and X_valid is a slice of
# that same frame. With GridSearchCV's default refit, best_estimator_ has
# presumably already seen these validation rows, so this is not a clean
# hold-out score. Confirm.
100.0 * gsearch3.best_estimator_.score(X_valid, y_valid)

41.42857142857143

# Test 4

In [None]:
# Test 4: search the number of features considered at each split.
# NOTE(review): min_samples_split=1200 is larger than the 352 rows shown by
# the split shapes earlier in the notebook. If no node can be split,
# max_features has nothing to act on. Confirm and lower min_samples_split /
# min_samples_leaf before running.
param_test4 = {'max_features': range(7, 20, 2)}
gsearch4 = GridSearchCV(
    estimator=GradientBoostingClassifier(
        learning_rate=0.1,
        n_estimators=60,
        max_depth=9,
        min_samples_split=1200,
        min_samples_leaf=60,
        subsample=0.8,
        random_state=10,
    ),
    param_grid=param_test4,
    scoring='accuracy',
    n_jobs=-1,
    iid=False,
    cv=model_selection.LeaveOneOut(),
)
gsearch4.fit(df_train_x, df_train_y)
(gsearch4.grid_scores_, gsearch4.best_params_, gsearch4.best_score_)

# Test 5

In [None]:
# Test 5: search the row subsampling fraction used for each boosting stage.
# The subsample=0.8 on the base estimator is replaced by each grid value.
# NOTE(review): min_samples_split=1200 is larger than the 352 rows shown by
# the split shapes earlier in the notebook, so tree nodes presumably cannot be
# split. Confirm and lower it before running.
param_test5 = {'subsample': [0.6, 0.7, 0.75, 0.8, 0.85, 0.9]}
gsearch5 = GridSearchCV(
    estimator=GradientBoostingClassifier(
        learning_rate=0.1,
        n_estimators=60,
        max_depth=9,
        min_samples_split=1200,
        min_samples_leaf=60,
        subsample=0.8,
        random_state=10,
        max_features=7,
    ),
    param_grid=param_test5,
    scoring='accuracy',
    n_jobs=-1,
    iid=False,
    cv=model_selection.LeaveOneOut(),
)
gsearch5.fit(df_train_x, df_train_y)
(gsearch5.grid_scores_, gsearch5.best_params_, gsearch5.best_score_)

In [16]:
# Build the test feature matrix with the same encoding recipe as the
# training set: one-hot encode the listed categoricals, then label encode any
# column that is still textual.
df_test = pd.read_csv("test.csv")
df_test = pd.get_dummies(df_test, columns=['earthling_country','period_of_stay','earthling_type','swimming_pool','exercise_room','basketball_court',\
        'yoga_classes','club','free_wifi','hotel_name','earthling_continent','review_month','review_weekday'])

le = LabelEncoder()

# NOTE(review): the encoder is fitted on the test data alone, so its integer
# codes only match the training codes if both files contain the same set of
# values in these columns. Confirm.
for col in df_test.columns.values:
    if df_test[col].dtypes == 'object':
        df_test[col] = le.fit_transform(df_test[col])

# Select the model's features from the TEST frame, in training order.
# Taking them from df_train here would make the submission a set of
# predictions for the training rows. Dummy columns for categories that never
# occur in test.csv are created and filled with 0; extra columns are dropped.
df_test = df_test.reindex(columns=list(to_keep), fill_value=0)

In [17]:
# Predict a score for every row of df_test with the winner of Test 2.
# GridSearchCV.predict delegates to its refitted best_estimator_.
preds = gsearch2.predict(df_test)

In [18]:
import os

# Create the output folder if it does not exist yet (no error if it does).
os.makedirs(name='submission', exist_ok=True)


In [21]:
# Write the predictions as a single 'score' column, without the row index.
submission = pd.DataFrame({'score': preds})
submission.to_csv('./submission/submission.csv', index=False)

In [None]:
!ls

In [None]:
# Read the written submission back in and display it as a sanity check.
df = pd.read_csv(filepath_or_buffer='submission/submission.csv')
df