In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn import model_selection
from fastai.imports import *
from fastai.structured import *

import warnings
# Suppress all warnings for the rest of the session (no category or module
# filter is given, so this also hides deprecation and convergence warnings).
warnings.filterwarnings("ignore")

In [None]:
# --- Training data: load, one-hot encode, then label-encode whatever is left ---

# Categorical columns that are expanded into indicator (dummy) columns.
train_dummy_cols = [
    'earthling_country', 'period_of_stay', 'earthling_type', 'swimming_pool',
    'exercise_room', 'basketball_court', 'yoga_classes', 'club', 'free_wifi',
    'hotel_name', 'earthling_continent', 'review_month', 'review_weekday',
]
df_train = pd.read_csv("train.csv")
df_train = pd.get_dummies(df_train, columns=train_dummy_cols)

from sklearn.preprocessing import LabelEncoder

# One encoder instance is re-fitted for every column below, so once the loop
# finishes it only remembers the classes of the last column it was fitted on.
le = LabelEncoder()

# Walk over every column; those still stored with dtype 'object' are replaced
# by the integer codes produced by the encoder, all others are left untouched.
for col in df_train.columns.values:
    if df_train[col].dtypes != 'object':
        continue
    df_train[col] = le.fit_transform(df_train[col])
        



In [None]:
# Split the encoded training frame into the feature matrix and the target.
df_train_x = df_train.drop(['score'], axis=1)
df_train_y = df_train['score']

# Baseline model: a small random forest scored with leave-one-out CV.
# random_state is pinned (same seed as the later rf/gbm models in this
# notebook) so the baseline score, and anything later derived from `m`,
# is reproducible from one run to the next.
m = RandomForestClassifier(n_estimators=30, n_jobs=-1, random_state=7)
loocv = model_selection.LeaveOneOut()
# Note: cross_val_score fits clones of `m`; `m` itself stays unfitted.
results = model_selection.cross_val_score(m, df_train_x, df_train_y, cv=loocv)

In [None]:
# Feature selection driven by the baseline forest's feature importances.
#
# cross_val_score only fits clones of the estimator, so `m` must be fitted
# here before its importances can be read. It is fitted on the full feature
# set (rebuilt from df_train) so that re-running this cell after df_train_x
# has been narrowed below still gives the same result.
all_features_x = df_train.drop(['score'], axis=1)
m.fit(all_features_x, df_train_y)

feature_importance = rf_feat_importance(m, all_features_x)
feature_importance[:10]  # first 10 rows; move to its own cell to display them

# Keep only the features whose importance is above the 0.005 threshold.
to_keep = feature_importance[feature_importance.imp>0.005].cols
len(to_keep)  # number of retained features

df_train_x = df_train[to_keep].copy()

# RF with new feature set
results_new = model_selection.cross_val_score(m, df_train_x, df_train_y, cv=loocv)

In [None]:
# Tuned random forest: hand-picked hyperparameters, scored with the same
# leave-one-out scheme as the baseline.
rf_params = dict(
    n_estimators=40,
    min_samples_leaf=5,
    max_features=0.6,
    oob_score=True,
    n_jobs=-1,
    random_state=7,
)
rf = RandomForestClassifier(**rf_params)

loocv = model_selection.LeaveOneOut()
results_after_tuning = model_selection.cross_val_score(
    rf,
    df_train_x,
    df_train_y,
    cv=loocv,
)

In [None]:
# Gradient boosting alternative, cross-validated with the existing `loocv` splitter.
gbm = GradientBoostingClassifier(
    n_estimators=40,
    random_state=7,
)
results_with_gbm = model_selection.cross_val_score(
    gbm,
    df_train_x,
    df_train_y,
    cv=loocv,
)

In [None]:
# --- Test data: load and one-hot encode the same categorical columns as train ---
test_dummy_cols = [
    'earthling_country', 'period_of_stay', 'earthling_type', 'swimming_pool',
    'exercise_room', 'basketball_court', 'yoga_classes', 'club', 'free_wifi',
    'hotel_name', 'earthling_continent', 'review_month', 'review_weekday',
]
df_test_pd = pd.get_dummies(pd.read_csv('test.csv'), columns=test_dummy_cols)

In [None]:
# Label-encode every test column that is still stored with dtype 'object'.
# NOTE(review): `le` is re-fitted on the test values here, so the integer codes
# are derived from the test data alone and may not match the codes assigned to
# the same labels in the training frame — confirm whether any object column
# survives get_dummies and, if so, reuse encoders fitted on the training data.
for col in df_test_pd.columns.values:
    if df_test_pd[col].dtypes != 'object':
        continue
    df_test_pd[col] = le.fit_transform(df_test_pd[col])

In [None]:
# Align the test features with the columns selected on the training set.
# Train and test were one-hot encoded independently, so a category that only
# occurs in train produces a dummy column the test frame does not have; plain
# `df_test_pd[to_keep]` would raise KeyError in that case. reindex() keeps the
# training column order and fills any absent column with 0 (i.e. "category
# not present"), which is the correct value for a missing indicator column.
selected_cols = list(to_keep)
missing_cols = [c for c in selected_cols if c not in df_test_pd.columns]
if missing_cols:
    # Surface the fill so a genuinely missing (non-dummy) feature is noticed.
    print("Filling {} feature column(s) absent from the test set with 0: {}".format(
        len(missing_cols), missing_cols))
df_test_pd = df_test_pd.reindex(columns=selected_cols, fill_value=0)


def _fit_predict_save(model, csv_path):
    """Fit `model` on the training features, predict the test set and save the result.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``
        Fitted in place on ``df_train_x`` / ``df_train_y``.
    csv_path : str
        Destination file; predictions are written without header or index.

    Returns
    -------
    pandas.DataFrame
        Single-column frame holding the predictions for ``df_test_pd``.
    """
    model.fit(df_train_x, df_train_y)
    preds = pd.DataFrame(model.predict(df_test_pd))
    preds.to_csv(csv_path, header=False, index=False)
    return preds


rf_preds = _fit_predict_save(rf, 'rf_preds_submission.csv')
gbm_preds = _fit_predict_save(gbm, 'gbm_preds_submission.csv')