In [None]:
# linear algebra
import numpy as np 

# data processing
import pandas as pd 
from pandas import DataFrame

# data visualization
import seaborn as sns
%matplotlib inline
from matplotlib import pyplot as plt
from matplotlib import style
import category_encoders as ce

# Algorithms and models:
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler

# Data categorization/encoding:

import category_encoders as ce 

In [None]:
# Kaggle input directory that holds the competition CSV files.
data_path = '/kaggle/input/competitive-data-science-predict-future-sales/'

# Load the training sales file and the test file from that directory.
train_df, test_df = (
    pd.read_csv(data_path + csv_name)
    for csv_name in ('sales_train.csv', 'test.csv')
)

# Print the column/dtype/non-null summary of both frames.
train_df.info()
test_df.info()

In [None]:
# Preview the first five rows of the training frame.
train_df.head(n=5)

In [None]:
# Preview the first five rows of the test frame.
test_df.head(n=5)

In [None]:
# Parse the day.month.year strings of "date" into real datetimes.
train_df['date'] = pd.to_datetime(train_df['date'], format='%d.%m.%Y')

# Missing-value report for train_df: absolute count and percentage per column,
# both ordered from most to least affected.
null_mask = train_df.isnull()
null_counts = null_mask.sum()

total = null_counts.sort_values(ascending=False)
percent_1 = null_counts / null_mask.count() * 100
percent_2 = percent_1.round(1).sort_values(ascending=False)

missing_data = pd.concat([total, percent_2], axis=1, keys=['Total', '%'])
missing_data.head(5)

In [None]:
# There are no missing values, so let's look for outliers.
#
# Total units sold at each distinct price. Both axes are taken from the same
# grouped Series so that every price is paired with its own total:
# Series.unique() returns prices in order of appearance, whereas groupby
# sorts them, so mixing the two pairs x and y values incorrectly.
# Selecting "item_cnt_day" before summing also avoids summing unrelated
# columns (such as the datetime "date" column).
units_by_price = train_df.groupby('item_price')['item_cnt_day'].sum()

plt.scatter(units_by_price.index, units_by_price.values)
plt.xlabel('item_price')
plt.ylabel('total item_cnt_day')
plt.title('Units sold per price point')
plt.show()

In [None]:
# The plot shows one outlier; remove every row priced above 200000.
extreme_price_rows = train_df.index[train_df['item_price'] > 200000]
train_df.drop(extreme_price_rows, inplace=True)

In [None]:
# Look for nonsensical item prices (below zero).
train_df.loc[train_df['item_price'] < 0]

In [None]:
# Remove the rows with a negative price, and also the rows with a negative
# number of sold items, using a single combined mask.
invalid_rows = (train_df['item_price'] < 0) | (train_df['item_cnt_day'] < 0)
train_df.drop(train_df.index[invalid_rows], inplace=True)

In [None]:
# Count the missing values left in each column of train_df.
train_df.isnull().sum()

In [None]:
# Display train_df after the outlier rows have been dropped.
train_df

In [None]:
# Some feature engineering: split "date" into its calendar parts.
for part in ('year', 'month', 'day'):
    train_df[part] = getattr(train_df['date'].dt, part)

# Re-code the calendar year as a small ordinal (2013 -> 1, 2014 -> 2, 2015 -> 3).
year_codes = {2013: 1, 2014: 2, 2015: 3}
train_df['year'] = train_df['year'].replace(year_codes)

# Add the category of every item from items.csv (left join on "item_id").
items_df = pd.read_csv(data_path + "items.csv")
item_categories = items_df[['item_id', 'item_category_id']]
train_df = train_df.merge(item_categories, how='left', on=['item_id'])

train_df

In [None]:
# Keep only the columns used from here on, in a fixed order
# ("item_cnt_day", used later as the target, comes last).
selected_columns = [
    'date', 'year', 'month', 'day', 'date_block_num',
    'shop_id', 'item_id', 'item_price', 'item_category_id',
    'item_cnt_day',
]
train_df = train_df[selected_columns]

In [None]:
# Inspect the dtype of every remaining column before encoding the ID columns.
train_df.dtypes

In [None]:
# "Shop_id" is an integer number, but actually it is just a label. We should encode it in a more logical way.
# For this we are going to encode it based on the item_price:
# the encoder is fitted with "shop_id" as the feature and "item_price" as the
# target, and its output replaces the original "shop_id" column.
# NOTE(review): `encoder` is reassigned in the next cell, so this fitted shop
# encoder is not kept; test_df's "shop_id" is encoded further down with a
# separate encoder fitted on test_df. Confirm the two encodings are meant to
# be independent, otherwise keep this encoder and call transform() on test_df.

encoder=ce.TargetEncoder(cols='shop_id') 
train_df['shop_id'] = encoder.fit_transform(train_df['shop_id'],train_df['item_price'])

In [None]:
# Same thing for the item_id: fit a target encoder with "item_price" as the
# target and overwrite "item_id" with the encoder's output.
# NOTE(review): from this point train_df['item_id'] no longer holds the raw
# item identifiers. The price lookups built later with groupby('item_id') are
# mapped onto test_df['item_id'], which presumably still holds raw IDs at that
# point - verify that those keys can actually match.

encoder=ce.TargetEncoder(cols='item_id') 
train_df['item_id'] = encoder.fit_transform(train_df['item_id'],train_df['item_price'])
train_df

In [None]:
# Train_df seems to be ready. Let's work with test_df:
# it needs "year", "month", "date_block_num" and "item_price" to be consistent
# with the features of "train_df". The first three are constants here:
# year 2015, month 11 and date_block_num 34.
constant_features = [('year', 2015), ('month', 11), ('date_block_num', 34)]
for position, (column, value) in enumerate(constant_features):
    test_df.insert(position, column, value, allow_duplicates=True)

# "ID" is dropped so that it is not passed on as a feature.
test_df.drop(columns=['ID'], inplace=True)
test_df

In [None]:
# For "item_price" we start with the most recent price recorded for each
# "item_id". It will not fill all the rows, but at least part of them.
#
# Rows are ordered by "date" first (stable sort) so that "last" means
# chronologically last rather than last in file order, and GroupBy.last() is
# called without the stray '1D' argument, which GroupBy.last would take as its
# first positional parameter rather than as a time offset.
# NOTE(review): train_df['item_id'] was overwritten with target-encoded values
# in an earlier cell, so the keys of this lookup are encoded values; confirm
# they can match test_df['item_id'] (the lookup should probably be built
# before the encoding step).
item_price = (
    train_df.sort_values('date', kind='mergesort')
    .groupby('item_id')['item_price']
    .last()
    .to_dict()
)
test_df['item_price'] = test_df['item_id'].map(item_price)
test_df

In [None]:
# There are still NaN values; fill them with the item's average price taken
# over all of its sales (i.e. across shops).
#
# fillna() aligns a Series argument on the row index, so the per-item means
# (indexed by "item_id") are first mapped onto test_df's rows through
# "item_id"; passing them directly would match them against row numbers.
# The result is assigned back instead of using inplace=True on a column
# selection, which does not reliably update the parent frame.
# NOTE(review): train_df['item_id'] was target-encoded in an earlier cell, so
# these keys are encoded values; confirm they can match test_df['item_id'].
mean_price_by_item = train_df.groupby('item_id')['item_price'].mean()
test_df['item_price'] = test_df['item_price'].fillna(
    test_df['item_id'].map(mean_price_by_item)
)
test_df

In [None]:
# Count the missing values still present in each column of test_df.
test_df.isnull().sum()

In [None]:
# There are still NaN prices in "test_df", corresponding to items that are
# either new or have never been sold. They can be filled with the average
# price of the products in the same category. For this:

# 1) Read the items.csv file, which links every item to its category.
items_df = pd.read_csv(f"{data_path}items.csv")
items_df

In [None]:
# 2) Attach the category of each test item (left join on "item_id").
item_categories = items_df[['item_id', 'item_category_id']]
test_df = test_df.merge(item_categories, how='left', on='item_id')
test_df

In [None]:
# Average price of every category, as a two-column frame
# ("item_category_id", "item_price").
category_prices_df = (
    train_df
    .groupby('item_category_id')['item_price']
    .mean()
    .reset_index()
)
category_prices_df

In [None]:
# Fill the remaining NaN prices of "test_df" with the mean price of the
# item's category.
#
# The category means are renamed before the merge so that no "_x"/"_y"
# suffixes are produced, and the filled column is assigned back explicitly:
# fillna(..., inplace=True) on a column selection does not reliably update
# the parent frame (it is a no-op under pandas copy-on-write).
category_means = category_prices_df.rename(columns={'item_price': 'category_mean_price'})
test_df = pd.merge(test_df, category_means, how='left', on=['item_category_id'])
test_df['item_price'] = test_df['item_price'].fillna(test_df['category_mean_price'])
test_df = test_df.drop(columns='category_mean_price')

# Use the same ordinal year code as train_df (2015 -> 3).
test_df['year'] = 3
test_df

In [None]:
# Finally, confirm that test_df has no NaN values left in any column.
test_df.isnull().sum()

In [None]:
# Display train_df to compare its columns with test_df in the next cell.
train_df

In [None]:
# Display test_df; its "shop_id" and "item_id" columns are encoded in the cells below.
test_df

In [None]:
# The feature "shop_id" still needs to be encoded, to be consistent with "train_df".
# NOTE(review): this fits a new encoder on test_df (with test_df's imputed
# "item_price" as the target) instead of reusing the encoder fitted on
# train_df, so the same shop can receive a different value in the two frames.
# Confirm this is intended; otherwise apply the train-fitted encoder's
# transform() here.

encoder=ce.TargetEncoder(cols='shop_id')
test_df['shop_id'] = encoder.fit_transform(test_df['shop_id'],test_df['item_price'])
test_df

In [None]:
# Same thing for "item_id": a new encoder is fitted on test_df with
# "item_price" as the target and its output replaces the "item_id" column.
# NOTE(review): as with "shop_id", this encoder is fitted on test_df rather
# than reusing the one fitted on train_df - confirm the encodings are meant
# to be independent.

encoder=ce.TargetEncoder(cols='item_id')
test_df['item_id'] = encoder.fit_transform(test_df['item_id'],test_df['item_price'])
test_df

In [None]:
# Feature matrix: everything except the raw date, the day of month and the target.
X = train_df.drop(columns=['date', 'day', 'item_cnt_day'])

# The target values from "train_df" are too broad, so they are compressed with
# the natural logarithm (np.log); predictions must be inverted with np.exp.
y = np.log(train_df['item_cnt_day'])
# Any NaN produced by the transform is replaced by the median of the target.
y = y.fillna(y.median())

# 70/30 hold-out split. The split is seeded so that it is reproducible across
# kernel restarts (same seed value as the one given to the XGBoost regressor
# in the next cell).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=20
)

In [None]:
# HYPERPARAMETER OPTIMIZATION FOR XGBOOST
#
# Values previously tried with cross validation on XGBRegressor:
#
#   max_depth:         4, 6, 10
#   learning_rate:     0.01, 0.1, 1
#   n_estimators:      600, 1500, 2000
#   colsample_bytree:  0.5, 0.7, 1
#   tree_method:       gpu_hist
#   alpha:             0, 0.01, 0.1
#   gamma:             0, 0.1, 0.5
#   eta:               0.01, 0.1, 0.2
#   min_child_weight:  1, 2, 6
#
# The best combination found is the one hard-coded below, so the grid search
# in this cell evaluates that single candidate only.
# NOTE(review): XGBoost documents `eta` as an alias of `learning_rate`; both
# are set here with different values (0.01 vs 0.1) - confirm which one the
# installed version actually applies.

import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

best_found = {
    'max_depth': 4,
    'learning_rate': 0.1,
    'n_estimators': 1500,
    'colsample_bytree': 0.5,
    'tree_method': 'gpu_hist',
    'alpha': 0,
    'gamma': 0.1,
    'eta': 0.01,
    'min_child_weight': 1,
}
# GridSearchCV expects a list of candidate values for every parameter.
params = {name: [value] for name, value in best_found.items()}

xgbr = xgb.XGBRegressor(seed=20)
clf = GridSearchCV(
    estimator=xgbr,
    param_grid=params,
    scoring='neg_mean_squared_error',
    verbose=3,
)
clf.fit(X, y)

print("Best parameters:", clf.best_params_)
# best_score_ is a negated MSE; flip the sign and take the square root for the RMSE.
lowest_rmse = (-clf.best_score_) ** 0.5
print("Lowest RMSE: ", lowest_rmse)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] alpha=0, colsample_bytree=0.5, eta=0.01, gamma=0.1, learning_rate=0.1, max_depth=4, min_child_weight=1, n_estimators=1500, tree_method=gpu_hist 
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[CV]  alpha=0, colsample_bytree=0.5, eta=0.01, gamma=0.1, learning_rate=0.1, max_depth=4, min_child_weight=1, n_estimators=1500, tree_method=gpu_hist, score=-0.079, total=  58.4s
[CV] alpha=0, colsample_bytree=0.5, eta=0.01, gamma=0.1, learning_rate=0.1, max_depth=4, min_child_weight=1, n_estimators=1500, tree_method=gpu_hist 
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   58.4s remaining:    0.0s
[CV]  alpha=0, colsample_bytree=0.5, eta=0.01, gamma=0.1, learning_rate=0.1, max_depth=4, min_child_weight=1, n_estimators=1500, tree_method=gpu_hist, score=-0.096, total=  55.4s
[CV] alpha=0, colsample_bytree=0.5, eta=0.01, gamma=0.1, learning_rate=0.1, max_depth=4, min_child_weight=1, n_estimators=1500, tree_method=gpu_hist 
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.9min remaining:    0.0s
[CV]  alpha=0, colsample_bytree=0.5, eta=0.01, gamma=0.1, learning_rate=0.1, max_depth=4, min_child_weight=1, n_estimators=1500, tree_method=gpu_hist, score=-0.080, total=  52.7s
[CV] alpha=0, colsample_bytree=0.5, eta=0.01, gamma=0.1, learning_rate=0.1, max_depth=4, min_child_weight=1, n_estimators=1500, tree_method=gpu_hist 
[CV]  alpha=0, colsample_bytree=0.5, eta=0.01, gamma=0.1, learning_rate=0.1, max_depth=4, min_child_weight=1, n_estimators=1500, tree_method=gpu_hist, score=-0.093, total=  55.1s
[CV] alpha=0, colsample_bytree=0.5, eta=0.01, gamma=0.1, learning_rate=0.1, max_depth=4, min_child_weight=1, n_estimators=1500, tree_method=gpu_hist 
[CV]  alpha=0, colsample_bytree=0.5, eta=0.01, gamma=0.1, learning_rate=0.1, max_depth=4, min_child_weight=1, n_estimators=1500, tree_method=gpu_hist, score=-0.087, total= 1.0min
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  4.7min finished
Best parameters: {'alpha': 0, 'colsample_bytree': 0.5, 'eta': 0.01, 'gamma': 0.1, 'learning_rate': 0.1, 'max_depth': 4, 'min_child_weight': 1, 'n_estimators': 1500, 'tree_method': 'gpu_hist'}
Lowest RMSE:  0.2946685995050872

In [None]:
# Predict on the test set, passing the columns in the same order as the
# training matrix X (raises a KeyError if a training column is missing).
results_df = clf.predict(test_df[X.columns])

# Undo the target transform. The target was built with np.log (natural log),
# so the inverse is np.exp - raising 10 to the prediction would invert a
# base-10 logarithm and inflate every value.
results_df = np.exp(results_df)

# NOTE(review): the model was trained on "item_cnt_day" but the output column
# is named "item_cnt_month" - confirm whether a monthly aggregation is missing.
df = pd.DataFrame(data=results_df, columns=["item_cnt_month"])
# Row position is used as the submission ID.
df['ID'] = np.arange(len(df))
df.to_csv('final_file', sep=',', index=False, index_label=False )