In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
import joblib

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, RobustScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split, learning_curve, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
from sklearn.metrics import classification_report, f1_score

from imblearn.over_sampling import RandomOverSampler

  from pandas import MultiIndex, Int64Index


In [2]:
# Read the training data (it carries the cost_category label), indexed by tour id,
# and discard repeated rows as a precaution. Duplicates are judged on the column
# values only; the Tour_ID index is not part of the comparison.
tanz_df = pd.read_csv('data/Train.csv', index_col='Tour_ID').drop_duplicates()

# Read the test data the final model will predict on, indexed the same way.
X_test = pd.read_csv('data/Test.csv', index_col='Tour_ID')

In [3]:
# Remove TRAINING rows where total_female and/or total_male are null.
#
# The test frame is deliberately left untouched: every Tour_ID in X_test needs a
# prediction, and dropping test rows would silently leave them out of the
# submission file. Missing counts in X_test are safe to keep because the numeric
# branch of the pipeline (RobustScaler) passes NaN through unchanged and XGBoost
# treats NaN as a missing value.
count_cols = ['total_female', 'total_male']

tanz_df = tanz_df.dropna(subset=count_cols)

In [4]:
# Clean up inconsistent category labels, applying exactly the same corrections to
# the training and the test frame:
#   * two country values are mapped onto a single spelling each
#   * one misspelled main_activity value is corrected
country_fixes = {'BURGARIA': 'BULGARIA', 'UNITED ARAB EMIRATES': 'UAE'}
activity_fixes = {'Widlife Tourism': 'Wildlife Tourism'}

for frame in (tanz_df, X_test):
    frame['country'] = frame['country'].replace(country_fixes)
    frame['main_activity'] = frame['main_activity'].replace(activity_fixes)

In [5]:
# Separate the features from the cost_category label (kept as a one-column frame),
# then hold out 10% of the rows for validation. The split is stratified on the
# label and seeded so it is repeatable.
target_col = 'cost_category'

X = tanz_df.drop(columns=[target_col])
y = tanz_df[[target_col]]

X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.1,
    stratify=y,
    random_state=25,
)

In [6]:
# The label classes are unbalanced, so the training split (and only the training
# split) is randomly oversampled. The sampler is seeded for repeatability.
over_sampler = RandomOverSampler(random_state=25)

resampled = over_sampler.fit_resample(X_train, y_train)
X_train_over, y_train_over = resampled

In [7]:
# Turn the textual cost labels into integer codes. The code assigned to a label is
# its position in cost_levels (0 for 'Lower Cost' ... 5 for 'Highest Cost').
cost_levels = ['Lower Cost', 'Low Cost', 'Normal Cost', 'High Cost', 'Higher Cost', 'Highest Cost']
ordinal_cat = [cost_levels]  # one list of categories per encoded column

ordenc = OrdinalEncoder(categories=ordinal_cat, dtype='int')

# Fit on the oversampled training labels, then reuse the fitted encoder for the
# validation labels.
y_train_proc = ordenc.fit_transform(y_train_over)
y_valid_proc = ordenc.transform(y_valid)

In [8]:
# Column groups routed to the different preprocessing branches.
numeric_cols = ['total_female', 'total_male', 'night_mainland', 'night_zanzibar']
inputer_cols = ['travel_with']  # the only column that is imputed before encoding
# Every remaining feature is one-hot encoded; the original column order is kept.
categoric_cols = X_train.drop(columns=numeric_cols + inputer_cols).columns


# Settings shared by both one-hot encoders: drop the first level of each feature,
# return a dense integer array, and do not raise on categories unseen during fit.
# NOTE(review): newer scikit-learn releases rename `sparse` to `sparse_output` —
# confirm the installed version before upgrading.
onehot_kwargs = dict(drop='first', sparse=False, handle_unknown='ignore', dtype='int')

# travel_with: fill missing values with the constant 'Alone', then one-hot encode.
inputer_pipeline = Pipeline(steps=[
    ("impute", SimpleImputer(strategy='constant', fill_value='Alone')),
    ("ohencode", OneHotEncoder(**onehot_kwargs)),
])

# Remaining categorical features: one-hot encode only.
categorical_pipeline = Pipeline(steps=[
    ("ohencode", OneHotEncoder(**onehot_kwargs)),
])

# Numeric features: robust scaling only.
numeric_pipeline = Pipeline(steps=[
    ("scale", RobustScaler()),
])

# Route each column group through its own branch.
ct = ColumnTransformer(
    transformers=[
        ('inputer_pipe', inputer_pipeline, inputer_cols),
        ('categ_pipe', categorical_pipeline, categoric_cols),
        ('num_pipe', numeric_pipeline, numeric_cols),
    ]
)

# Hyper-parameters of the final XGBoost classifier, gathered in one place.
xgb_params = dict(
    objective='multi:softprob',
    eval_metric='mlogloss',
    use_label_encoder=False,
    n_jobs=-1,
    random_state=25,
    alpha=4,
    colsample_bytree=0.7,
    gamma=2,
    max_depth=4,
    min_child_weight=7,
    subsample=0.5,
)

# Preprocessing and model chained together so both are fitted (and later saved)
# as a single object.
final_pipe = Pipeline(
    steps=[
        ('coltr', ct),
        ('model', xgb.XGBClassifier(**xgb_params)),
    ]
)

# Train on the oversampled training features and their integer-encoded labels.
final_pipe.fit(X_train_over, y_train_proc)


# Predict an integer class code for every row of X_test and shape the codes as a
# single column, which is the 2-D layout the fitted ordinal encoder expects.
test_codes = np.asarray(final_pipe.predict(X_test)).reshape(-1, 1)

# Decode the integer codes back into the original cost_category labels.
# The index is taken from X_test but renamed through Index.rename, which returns a
# NEW Index object. Assigning `.index.name` on a frame that shares X_test's Index
# would rename X_test's own index as a side effect.
y_preds = pd.DataFrame(
    data=ordenc.inverse_transform(test_codes),
    columns=['PredictedClass'],
    index=X_test.index.rename('Test_ID'),
)

y_preds



Unnamed: 0_level_0,PredictedClass
Test_ID,Unnamed: 1_level_1
tour_idynufedne,Normal Cost
tour_id9r3y5moe,Higher Cost
tour_idf6itml6g,High Cost
tour_id99u4znru,Low Cost
tour_idj4i9urbx,Lower Cost
...,...
tour_id2deyfjhq,Low Cost
tour_idlenv2rio,Lower Cost
tour_id7wwqrs0p,High Cost
tour_idx80vbw5a,Normal Cost


In [9]:
# Write the decoded predictions (index included) to the submission file.
submission_path = 'Submission.csv'
y_preds.to_csv(submission_path)

In [10]:
# Persist the fitted pipeline (preprocessing + model) to disk.
# NOTE(review): this is a plain string, not an f-string, so the braces are literal
# characters in the file name — confirm that is intended.
model_path = "{sampontiroli}_model.joblib"
joblib.dump(final_pipe, model_path)

['{sampontiroli}_model.joblib']