In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
input_root = '/kaggle/input'
for dirname, _, filenames in os.walk(input_root):
    paths = [os.path.join(dirname, filename) for filename in filenames]
    for path in paths:
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# All competition files live in the same read-only input directory; keeping the
# directory in one constant avoids repeating (and mistyping) the long prefix.
DATA_DIR = "../input/tabular-playground-series-jan-2022"

df_train = pd.read_csv(f"{DATA_DIR}/train.csv")
df_test = pd.read_csv(f"{DATA_DIR}/test.csv")
df_sample = pd.read_csv(f"{DATA_DIR}/sample_submission.csv")

# Preview the first rows of the training data.
df_train.head()

In [None]:
# Select the target by name rather than by position, so a reordered or extended
# CSV cannot silently change which column is being predicted.
TARGET_COL = 'num_sold'
targets = df_train[TARGET_COL]

# DataFrame.drop returns a new frame and leaves df_train untouched, so the result
# is kept under its own name instead of being discarded. df_train itself is not
# modified, which keeps this cell safe to re-run.
train_inputs = df_train.drop(columns=TARGET_COL)
train_inputs.head()

In [None]:
# Start with an empty frame for the encoded model inputs; the encoding cell
# further down supplies the columns.
features = pd.DataFrame()

In [None]:
# Show the column labels of the training frame.
df_train.columns

In [None]:
# Distinct values in the 'product' column. Bracket access is required here because
# `df_train.product` resolves to the DataFrame.product() method, not the column.
df_train['product'].unique()

In [None]:
# Integer codes for each categorical column. These dictionaries are reused when
# the test set is encoded, so they stay as notebook-level names.
country = {'Finland':0, 'Norway':1, 'Sweden':2}
store = {'KaggleMart':0, 'KaggleRama':1}
product = {'Kaggle Mug':0, 'Kaggle Hat':1, 'Kaggle Sticker':2}

# Build the frame in a single step so this cell does not depend on the current
# state of `features` and can be re-run on its own (for example after `features`
# has been replaced by a scaled array).
features = pd.DataFrame({
    'country': df_train['country'].map(country),
    'store': df_train['store'].map(store),
    'product': df_train['product'].map(product),
})

# Series.map yields NaN for any value that is missing from the dictionary; fail
# here with a clear message instead of passing NaNs on to the models.
unmapped_columns = features.columns[features.isna().any()].tolist()
if unmapped_columns:
    raise ValueError(f"Unmapped category values in columns: {unmapped_columns}")

features.head()

In [None]:
# Pearson correlation (the pandas default) between each encoded column and the target.
# NOTE(review): the integer codes are arbitrary labels for unordered categories,
# so these values are only a rough signal - confirm before drawing conclusions.
features.corrwith(targets)

In [None]:
# Normalization
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Rescale every encoded column to the [0, 1] range. `features` becomes a NumPy
# array from here on. The fitted scaler object is not kept, so any other data
# fed to models trained on `features` has to be scaled equivalently.
features = MinMaxScaler().fit(features).transform(features)

In [None]:
# Applying the same mechanism to the test set
def encode_categories(frame):
    """Return a new frame holding the integer codes of country, store and product.

    Args:
        frame: DataFrame with 'country', 'store' and 'product' columns.

    Returns:
        DataFrame with the same index and the three columns mapped through the
        `country`, `store` and `product` dictionaries.
    """
    return pd.DataFrame({
        'country': frame['country'].map(country),
        'store': frame['store'].map(store),
        'product': frame['product'].map(product),
    })


# The models are trained on MinMax-scaled inputs, so the test inputs need the
# identical scaling. Without it the raw 0/1/2 codes land on the wrong side of the
# split thresholds learned on 0/0.5/1 and distinct categories get the same
# prediction. The scaler is fit on the training encodings only, never on test data.
test_scaler = MinMaxScaler().fit(encode_categories(df_train))
test_encoded = encode_categories(df_test)
test_set = pd.DataFrame(
    test_scaler.transform(test_encoded),
    columns=test_encoded.columns,
    index=test_encoded.index,
)

test_set.head()

In [None]:
# import matplotlib.pyplot as plt

# plt.figure(dpi=100)
# plt.plot(targets[:500])
# plt.show()

In [None]:
from sklearn.model_selection import train_test_split, cross_validate, RandomizedSearchCV, GridSearchCV
# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    features,
    targets,
    test_size=0.2,
    random_state=1334,
)

In [None]:
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor

In [None]:
# define the parameter space that will be searched over
# param_distribution = {"n_estimators": np.random.randint(1, 101, 5),
#                      "max_depth": np.random.randint(1, 10, 5),
#                      "max_leaf_nodes":np.random.randint(2, 10, 5),
#                      }

# # now create a searchCV object and fit it to that data
# search = RandomizedSearchCV(estimator=RandomForestRegressor(random_state=0),
#                            param_distributions=param_distribution,
#                            n_iter=5)

# rf_model = search.fit(X_train, y_train)
# # call rf_model.best_params to get the parameters obtained by the randomized search

# using the best values obtained
# prev n_e = 66 (presumably the earlier n_estimators value)
# random_state is pinned (0, as in the commented-out search above) because the
# forest's bootstrap sampling otherwise differs on every run, which makes the
# validation metrics impossible to reproduce or compare.
rf_best_model = RandomForestRegressor(n_estimators=100,
                                      max_leaf_nodes=4,
                                      max_depth=9,
                                      random_state=0).fit(X_train, y_train)

# Predictions on the held-out validation rows, used for the metric comparison.
rf_best_predictions = rf_best_model.predict(X_valid)

In [None]:
# ada_distributions = {"n_estimators":np.random.randint(50, 100, 5),
#                     "learning_rate":np.linspace(1e-5, 1, 5),
#                     "loss":['linear', 'square']}

# ada_search = RandomizedSearchCV(estimator=AdaBoostRegressor(random_state=0),
#                                param_distributions=ada_distributions,
#                                n_iter=5)

# ada_model = ada_search.fit(X_train, y_train)
# prev n_e = 62 (presumably the earlier n_estimators value)
# random_state is pinned (0, as in the commented-out search above) so that the
# boosting resampling, and therefore the validation metrics, are reproducible.
ada_best_model = AdaBoostRegressor(n_estimators=100,
                                   loss='linear',
                                   learning_rate=1.0,
                                   random_state=0).fit(X_train, y_train)

# Predictions on the held-out validation rows, used for the metric comparison.
ada_best_prediction = ada_best_model.predict(X_valid)

In [None]:
# grad_params = {"loss":['ls', 'huber'],
#               "learning_rate":np.linspace(1e-5, 1, 10)}

# grad_search = RandomizedSearchCV(estimator=GradientBoostingRegressor(random_state=0),
#                                 param_distributions=grad_params,
#                                 n_iter=5)

# grad_model = grad_search.fit(X_train, y_train)
# prev lr = 0.556 (presumably the earlier learning_rate value)
# random_state is pinned (0, as in the commented-out search above): the tree
# builder permutes features at each split, so tied splits can otherwise be
# resolved differently between runs.
grad_best_model = GradientBoostingRegressor(loss='huber',
                                            learning_rate=0.1,
                                            max_depth=7,
                                            n_estimators=500,
                                            min_samples_leaf=5,
                                            random_state=0).fit(X_train, y_train)

# Predictions on the held-out validation rows, used for the metric comparison.
grad_best_prediction = grad_best_model.predict(X_valid)

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

def print_metrics(name, pred, true_val):
    """Print the MAE and MSE of a set of predictions under a heading.

    Args:
        name: Heading printed above the metrics.
        pred: Predicted values.
        true_val: Ground-truth values to compare ``pred`` against.
    """
    print(name)
    print('.'*30)
    # scikit-learn metrics are declared as metric(y_true, y_pred). MAE and MSE are
    # symmetric, so the printed numbers are unchanged, but passing the arguments
    # in the documented order keeps this correct if an asymmetric metric is added.
    print('MAE: ', mean_absolute_error(true_val, pred))
    print('MSE: ', mean_squared_error(true_val, pred))
    print()
    
# Report the validation metrics of each model, in the order they were trained.
for model_name, model_pred in (
    ('Random Forest', rf_best_predictions),
    ('Ada boost', ada_best_prediction),
    ('Gradient boosting', grad_best_prediction),
):
    print_metrics(name=model_name, pred=model_pred, true_val=y_valid)

In [None]:
# From the evaluation, gradient boosting performs better than the other two,
# so it is the model used for the submission.
# NOTE(review): test_set must have gone through the same preprocessing (encoding
# and scaling) as the training features - confirm before submitting.
predictions = grad_best_model.predict(test_set)

# Use bracket assignment: attribute-style assignment (df_sample.num_sold = ...)
# only updates a column that already exists and would otherwise set a plain
# Python attribute that to_csv ignores. Predictions are rounded to whole units.
df_sample['num_sold'] = np.round(predictions, 0).astype(int)
df_sample.to_csv("submission.csv", index=False)

In [None]:
# Preview the first rows of the frame that was written to submission.csv.
df_sample.head()