In [2]:
import pandas as pd
import numpy as np

import os 
import matplotlib.pyplot as plt 
from google.colab import drive
from lightgbm import LGBMClassifier, LGBMRegressor

from random import seed
from sklearn import model_selection
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics

import warnings
warnings.filterwarnings('ignore')

In [None]:
def set_seed(seed_number):
    """Seed the global random number generators used in this notebook.

    Both Python's built-in ``random`` module and NumPy's legacy global
    generator are seeded with the same value.

    Parameters
    ----------
    seed_number : int
        Value handed to ``random.seed`` and ``numpy.random.seed``.
    """
    seed(seed_number)            # Python's `random` module
    np.random.seed(seed_number)  # NumPy's global generator


set_seed(42)

In [None]:
# Folder that holds `Train.csv`, `Test.csv` and `SampleSubmission.csv`.
# It is appended to the Drive root by `get_root_path`; the trailing slash is kept
# because later cells build file paths from the result.
path_to_data = "Zindi/Tanzania_Tourism/"

In [None]:
def get_root_path(data_path, mount_point='/content/gdrive'):
    """Mount Google Drive and return the directory that holds the competition files.

    The returned path is built from ``mount_point`` itself, so it is valid
    regardless of the notebook's current working directory (the previous
    relative ``gdrive/My Drive/...`` path only resolved from ``/content``).

    Parameters
    ----------
    data_path : str
        Location of the data folder relative to the Drive root ("My Drive").
        A trailing slash, if present, is preserved in the result.
    mount_point : str, optional
        Where Google Drive gets mounted. Defaults to ``/content/gdrive``,
        the location this notebook has always used.

    Returns
    -------
    str
        ``<mount_point>/My Drive/<data_path>``.
    """
    drive.mount(mount_point)
    # Strip leading slashes so os.path.join does not discard the mount point
    # when `data_path` happens to look like an absolute path.
    return os.path.join(mount_point, 'My Drive', data_path.lstrip('/'))

def next_output_file_name(path):
    """Return an unused ``submission_<n>.csv`` file name for directory ``path``.

    Numbering starts at (number of non-directory entries in ``path``) + 1 and
    is increased until the resulting name is not already present, so an
    existing submission is never overwritten.

    Parameters
    ----------
    path : str
        Directory in which submissions are stored. It does not have to exist
        yet; a missing directory is treated as empty.

    Returns
    -------
    str
        A bare file name (no directory component), e.g. ``"submission_3.csv"``.
    """
    try:
        with os.scandir(path) as entries:
            existing = {entry.name for entry in entries if not entry.is_dir()}
    except (FileNotFoundError, NotADirectoryError):
        # Nothing has been written yet -> start numbering at 1.
        existing = set()

    next_file = len(existing) + 1
    # The entry count alone can collide with a real file (e.g. the directory
    # only contains submission_2.csv), so skip names that are already taken.
    while f"submission_{next_file}.csv" in existing:
        next_file += 1
    return f"submission_{next_file}.csv"

In [None]:
# Mount Drive and resolve the folders this notebook reads from / writes to.
root_path = get_root_path(path_to_data)
output_path = os.path.join(root_path, "output")

# Join directory and file name as separate components so loading works whether
# or not `root_path` ends with a path separator.
train = pd.read_csv(os.path.join(root_path, 'Train.csv'))
test = pd.read_csv(os.path.join(root_path, 'Test.csv'))
ss = pd.read_csv(os.path.join(root_path, 'SampleSubmission.csv'))

# Preview only the first rows of each frame.
display(train.head())
display(test.head())
display(ss.head())

In [None]:
# Remember how many leading rows come from the training set so the combined
# frame can be split back into train / test after preprocessing.
train_sample = len(train)

# Stack train on top of test (row-wise) so both receive identical preprocessing.
data = pd.concat([train, test])

In [None]:
# Number of missing values per column in the combined frame.
data.isna().sum()

In [None]:
# Replacement used for missing entries, per column.
fill_values = {
    'travel_with': 'Alone',
    'total_female': 0,
    'total_male': 0,
    'most_impressing': 'No comments',
}

for column_name, replacement in fill_values.items():
    data[column_name] = data[column_name].fillna(replacement)

In [None]:
# Re-check the per-column missing-value counts after filling.
data.isna().sum()

In [None]:
# First five rows of the combined frame before label encoding.
data.head(5)

In [None]:
# Columns that are replaced by integer codes via a dedicated LabelEncoder each.
# NOTE(review): `night_mainland` / `night_zanzibar` look like numeric counts by
# name; confirm that label-encoding them (rather than keeping the raw values)
# is intended.
label_encoded_columns = [
    'country',
    'age_group',
    'travel_with',
    'purpose',
    'main_activity',
    'info_source',
    'tour_arrangement',
    'package_transport_int',
    'package_accomodation',
    'package_food',
    'package_transport_tz',
    'package_sightseeing',
    'package_guided_tour',
    'package_insurance',
    'night_mainland',
    'night_zanzibar',
    'payment_mode',
    'first_trip_tz',
    'most_impressing',
]

# Fit one encoder per column on the combined train+test frame and keep every
# fitted encoder so the integer codes can be mapped back later
# (`label_encoders[column].inverse_transform(...)`).
label_encoders = {}
for column in label_encoded_columns:
    encoder = LabelEncoder()
    data[column] = encoder.fit_transform(data[column])
    label_encoders[column] = encoder

# Backward compatibility: keep exposing each encoder under its historical
# notebook-level name `le_<column>` (e.g. `le_country`).
globals().update({f"le_{column}": encoder for column, encoder in label_encoders.items()})

In [None]:
# First five rows of the combined frame after label encoding.
data.head(5)

In [None]:
# Undo the concatenation positionally: the first `train_sample` rows are the
# training set, everything after them is the test set.
train = data.iloc[:train_sample].copy()
test = data.iloc[train_sample:].copy()

In [None]:
display(train.head())

# Remove the target column from the test frame before previewing it.
test = test.drop('total_cost', axis=1)
display(test.head())

In [None]:
# --- 10-fold cross-validation of the LightGBM regressor ---------------------
splits = 10

# Columns that must never be fed to the model: row identifier, fold label, target.
non_feature_columns = ['ID', 'kfold', 'total_cost']

# Shuffle once with a fixed seed; KFold below is used without shuffling, so this
# is what randomises fold membership. Resetting the index makes the positional
# indices returned by `kf.split` valid labels for `.loc`.
train = train.sample(frac=1, random_state=42).reset_index(drop=True)
train["kfold"] = -1
y = train.total_cost.values

kf = model_selection.KFold(n_splits=splits)

# Tag every row with the fold in which it is held out for validation.
for fold_, (_, valid_idx) in enumerate(kf.split(X=train, y=y)):
    train.loc[valid_idx, 'kfold'] = fold_

mae_list = []

for fold_ in range(splits):
    # Rows outside the current fold are used for fitting; `test_df` is the
    # held-out validation fold (not the competition test set).
    train_df = train[train.kfold != fold_].reset_index(drop=True)
    test_df = train[train.kfold == fold_].reset_index(drop=True)

    # NOTE(review): per the LightGBM parameter docs, `subsample` (bagging_fraction)
    # only takes effect when a non-zero `subsample_freq` is also set; confirm
    # whether bagging was meant to be enabled here.
    model = LGBMRegressor(
        random_state=42,
        num_leaves=25,
        max_depth=8,
        subsample=0.95,
        boosting_type='dart',
        num_iterations=200,
        min_data_in_leaf=10,
    )

    # Fit on the training folds, predict total_cost for the held-out fold.
    model.fit(train_df.drop(columns=non_feature_columns), train_df.total_cost)
    preds = model.predict(test_df.drop(columns=non_feature_columns))

    # Mean absolute error on the held-out fold.
    mae = metrics.mean_absolute_error(test_df.total_cost, preds)
    mae_list.append(mae)
    print(f"Fold: {fold_}")
    print(f"MAE = {mae}")

    print("")

# Average MAE across all folds.
print(f"MAE Summary: {np.mean(mae_list)}")

In [None]:
# Feature matrix for the competition test set: every column except the identifier.
test_features = test.drop(columns=['ID'])

# NOTE(review): `model` is whatever estimator the cross-validation loop left
# behind, i.e. the one fitted in its final iteration - confirm that this is the
# model intended for the submission.
test_preds = model.predict(test_features)
test_preds

In [None]:
# Attach the predicted cost to the test rows as a new `total_cost` column.
test = test.assign(total_cost=test_preds)
test

In [None]:
# Keep only the identifier and the prediction, exposing the identifier under
# the `test_id` header.
sub_file = test[['ID', 'total_cost']].rename(columns={'ID': 'test_id'})
sub_file

In [None]:
# Write the submission into the output folder under the next free
# `submission_<n>.csv` name instead of a hard-coded file name, so earlier
# submissions are not silently overwritten on re-runs.
os.makedirs(output_path, exist_ok=True)  # the helper below needs the folder to exist
submission_path = os.path.join(output_path, next_output_file_name(output_path))
sub_file.to_csv(submission_path, index=False)
submission_path