The task is to forecast the total amount of products sold in every shop for the test set. Note that the list of shops and products slightly changes every month. Creating a robust model that can handle such situations is part of the challenge.

# **Importing libraries** 

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
import warnings
warnings.filterwarnings("ignore")
from wordcloud import WordCloud
from wordcloud import STOPWORDS
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

# **Reading the csv files**

In [None]:
# Load the five competition tables; the paths are listed in the same order
# as the names they are unpacked into.
item_categories, items, shops, train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/competitive-data-science-predict-future-sales/item_categories.csv",
        "../input/competitive-data-science-predict-future-sales/items.csv",
        "../input/competitive-data-science-predict-future-sales/shops.csv",
        "../input/competitive-data-science-predict-future-sales/sales_train.csv",
        "../input/competitive-data-science-predict-future-sales/test.csv",
    )
)

# **item_categories**

In [None]:
# Preview the first rows of the item-category table.
item_categories.head()

In [None]:
# (rows, columns) of the table.
item_categories.shape

In [None]:
# Column dtypes and non-null counts.
item_categories.info()

In [None]:
# Descriptive statistics from DataFrame.describe() with default settings.
item_categories.describe()

In [None]:
# Heatmap of the boolean missing-value mask returned by isnull().
sns.heatmap(item_categories.isnull())
plt.show()

# **items**

In [None]:
# Preview the first rows of the items table.
items.head()

In [None]:
# (rows, columns) of the table.
items.shape

In [None]:
# Column dtypes and non-null counts.
items.info()

In [None]:
# Descriptive statistics from DataFrame.describe() with default settings.
items.describe()

In [None]:
# Heatmap of the boolean missing-value mask, drawn with the "mako" colormap.
col1 = sns.color_palette("mako", as_cmap=True)
sns.heatmap(items.isnull(), cmap = col1)
plt.show()

# **Shops**

In [None]:
# Preview the first rows of the shops table.
shops.head()

In [None]:
# (rows, columns) of the table.
shops.shape

In [None]:
# Column dtypes and non-null counts.
shops.info()

In [None]:
# Descriptive statistics from DataFrame.describe() with default settings.
shops.describe()

In [None]:
# Heatmap of the boolean missing-value mask, drawn with the "viridis" colormap.
col2 = sns.color_palette("viridis", as_cmap=True)
sns.heatmap(shops.isnull(), cmap = col2)
plt.show()

# **Train**

In [None]:
# Preview the first rows of the sales (training) table.
train.head()

In [None]:
# (rows, columns) of the table.
train.shape

In [None]:
# Column dtypes and non-null counts.
train.info()

In [None]:
# Descriptive statistics from DataFrame.describe() with default settings.
train.describe()

In [None]:
# Heatmap of the boolean missing-value mask, drawn with a cubehelix colormap.
col3 = sns.cubehelix_palette(as_cmap=True)
sns.heatmap(train.isnull(), cmap = col3)
plt.show()

# **Test**

In [None]:
# Preview the first rows of the test table.
test.head()

In [None]:
# (rows, columns) of the table.
test.shape

In [None]:
# Column dtypes and non-null counts.
test.info()

In [None]:
# Descriptive statistics from DataFrame.describe() with default settings.
test.describe()

In [None]:
# Heatmap of the boolean missing-value mask, drawn with a reversed
# cubehelix colormap.
col4 = sns.cubehelix_palette(start=2, rot=0, dark=0, light=.95, reverse=True, as_cmap=True)
sns.heatmap(test.isnull(), cmap = col4)
plt.show()

In [None]:
# Looking at the number of items in each category.
# countplot tallies the rows per category id. The earlier barplot call drew
# the *mean item_id* per category, which is not a count, and passed x/y
# positionally, which seaborn >= 0.12 no longer accepts.

plt.rcParams['figure.figsize'] = (25,10)
sns.countplot(x = items['item_category_id'], palette = 'colorblind')
plt.title('Count for Different Items Categories', fontsize = 30)
plt.xlabel("Item Categories", fontsize = 15)
plt.ylabel("Items in each Categories", fontsize = 15)
plt.show()

In [None]:
# Number of sales records in each month block (date_block_num).
# The column is passed as x= because seaborn >= 0.12 treats the first
# positional argument as `data`, not as the variable to count.

plt.rcParams["figure.figsize"] = (25,10)
sns.countplot(x = train['date_block_num'])
plt.title('Date Blocks according to months', fontsize = 30)
plt.xlabel("Different blocks of months", fontsize = 15)
plt.ylabel("No. of purchases", fontsize = 15)
plt.show()

In [None]:
# Density histogram with a KDE overlay of the item prices.
# histplot replaces the deprecated distplot; stat="density" and bins=50
# mirror distplot's density scaling and its cap of 50 bins.
plt.rcParams["figure.figsize"] = (13,7)
sns.histplot(x = train["item_price"], bins = 50, stat = "density",
             kde = True, color = "red")
plt.title('Distribution of the price of items',fontsize = 30)
plt.xlabel("Range of price of items", fontsize = 15)
plt.ylabel("Distribution of prices over items", fontsize = 15)
plt.show()

In [None]:
# Density histogram with a KDE overlay of the daily sold quantity.
# histplot replaces the deprecated distplot; stat="density" and bins=50
# mirror distplot's density scaling and its cap of 50 bins.
plt.rcParams["figure.figsize"] = (13,7)
sns.histplot(x = train["item_cnt_day"], bins = 50, stat = "density",
             kde = True, color = "purple")
plt.title('Distribution of the no. of items Sold per Day',fontsize = 30)
plt.xlabel("Range of items sold per day", fontsize = 15)
plt.ylabel("Distributions per day", fontsize = 15)
plt.show()

In [None]:
# Number of distinct (non-missing) item ids that occur in the sales records.
distinct_item_ids = train["item_id"].dropna().unique()
x = len(distinct_item_ids)
print("The No. of Unique items present in the stores available",x)

In [None]:
# checking the no. of unique item categories listed in item_categories
x2 = item_categories["item_category_id"].nunique()
# The label was copy-pasted from the item count above; it now names what is
# actually being counted.
print("The No. of Unique item categories available",x2)

In [None]:
# checking the no. of unique shops that appear in the sales records
x3 = train["shop_id"].nunique()
# The label was copy-pasted from the item count above; it now names what is
# actually being counted.
print("The No. of Unique shops present in the dataset",x3)

# **Word cloud for Item Categories name**

In [None]:
plt.rcParams["figure.figsize"] = (15,12)
stopwords = set(STOPWORDS)
# Build the text from every category name. str(series) only yields the
# console repr of the Series (index labels, the "Name: ..., dtype: ..."
# footer, and a truncated body for long Series), so the cloud was drawn
# from display residue rather than from the names themselves.
category_text = " ".join(item_categories["item_category_name"].astype(str))
wordcloud = WordCloud(background_color = 'lightblue',
                     max_words = 200, stopwords = stopwords,
                     width = 1200, height = 800, random_state = 42).\
generate(category_text)
plt.title("WordCloud for item Category Names", fontsize = 30)
plt.axis("off")
plt.imshow(wordcloud, interpolation = "bilinear")
plt.show()

# **Word Cloud for Item Name**

In [None]:
plt.rcParams["figure.figsize"] = (15,12)
stopwords = set(STOPWORDS)
# Build the text from every item name. str(series) only yields the console
# repr of the Series (index labels, footer, and a truncated body for long
# Series), so almost none of the item names reached the cloud.
item_text = " ".join(items["item_name"].astype(str))
wordcloud = WordCloud(background_color = 'orange',
                     max_words = 200, stopwords = stopwords,
                     width = 1200, height = 800, random_state = 42).\
generate(item_text)
# Title corrected: this cloud shows item names, not category names.
plt.title("WordCloud for Item Names", fontsize = 30)
plt.axis("off")
plt.imshow(wordcloud, interpolation = "bilinear")
plt.show()

# **Word Cloud for Shop Name**

In [None]:
plt.rcParams["figure.figsize"] = (15,12)
stopwords = set(STOPWORDS)
# Build the text from every shop name. str(series) yields the console repr
# of the Series, which mixes index labels and the "Name: ..., dtype: ..."
# footer into the text (and truncates long Series).
shop_text = " ".join(shops["shop_name"].astype(str))
wordcloud = WordCloud(background_color = "violet",
                     max_words = 200, stopwords = stopwords,
                     width = 1200, height = 800, random_state = 42).\
generate(shop_text)
plt.title("WordCloud for Shop Names", fontsize = 30)
plt.axis("off")
plt.imshow(wordcloud, interpolation = "bilinear")
plt.show()

In [None]:
train['date'] = pd.to_datetime(train.date)

In [None]:
# making a new column day
train['day'] = train['date'].dt.day

# making a new column month
train['month'] = train['date'].dt.month

# making a new column year
train['year'] = train['date'].dt.year

# making a new column week
train['week'] = train['date'].dt.week

# checking the new columns
train.columns

In [None]:
# checking which days are most busisiest for the shops

plt.rcParams["figure.figsize"] = (15,7)
sns.countplot(train['day'], palette = "Dark2")
plt.title('The most busisiet days for the shops', fontsize = 30)
plt.xlabel("Days", fontsize = 15)
plt.ylabel("Frequency", fontsize = 15)
plt.show()

In [None]:
# checking which maonths are most busisiest for the shops

plt.rcParams["figure.figsize"] = (15,7)
sns.countplot(train['month'], palette = "Set1")
plt.title('The most busisiet monthss for the shops', fontsize = 30)
plt.xlabel("Months", fontsize = 15)
plt.ylabel("Frequency", fontsize = 15)
plt.show()

In [None]:
# checking which years are most busisiet for the shops

plt.rcParams["figure.figsize"] = (15,7)
sns.countplot(train['year'], palette = "PuBu")
plt.title('The most busisiet years for the shops', fontsize = 30)
plt.xlabel("Years", fontsize = 15)
plt.ylabel("Frequency", fontsize = 15)
plt.show()

# **Feature Enginnering**

In [None]:
train["revenue"] = train["item_price"] * train["item_cnt_day"]
plt.rcParams["figure.figsize"] = (13,7)
sns.distplot(train["revenue"], color = "blue")
plt.title('Distribution of Revenue',fontsize = 30)
plt.xlabel("Range of Revenue", fontsize = 15)
plt.ylabel("Revenue", fontsize = 15)
plt.show()

In [None]:
train.dtypes

In [None]:
# plotting a violin plot for item price and item-cnt-day
plt.rcParams["figure.figsize"] = (13,7)
sns.violinplot(x = train["day"],y = train["revenue"], color = "black")
plt.title('Box plot for Days v/s Revenue',fontsize = 30)
plt.xlabel("Days", fontsize = 15)
plt.ylabel("Revenue", fontsize = 15)
plt.show()

In [None]:
# plotting a box plot for item price and item-cnt-day
plt.rcParams["figure.figsize"] = (13,7)
sns.boxplot(x = train["month"],y = train["revenue"], color = "black")
plt.title('Box plot for Months v/s Revenue',fontsize = 30)
plt.xlabel("Months", fontsize = 15)
plt.ylabel("Revenue", fontsize = 15)
plt.show()

In [None]:
# plotting a box plot for item price and item-cnt-day
plt.rcParams["figure.figsize"] = (13,7)
sns.violinplot(x = train["year"],y = train["revenue"], color = "black")
plt.title('Box plot for Years v/s Revenue',fontsize = 30)
plt.xlabel("Years", fontsize = 15)
plt.ylabel("Revenue", fontsize = 15)
plt.show()

In [None]:
# coverting the data into monthly sales data
#making a dataset with only monthy sales data
data = train.groupby([train["date"].apply(lambda x:x.strftime('%Y-%m')),'item_id','shop_id']).sum().reset_index()

#specifying the important attributes which we want to add to the data
data =  data[["date","item_id","shop_id","item_cnt_day"]]

#at last we can select the specific attributes from the dataset which are important
data = data.pivot_table(index = ["item_id","shop_id"], columns = "date",
                       values = "item_cnt_day", fill_value = 0).reset_index()

#looking at the newly prepared dataset
data.shape

In [None]:
# lets merge the monthly sales data prepared to the test data set

test = pd.merge(test, data, on = ["item_id","shop_id"], how = "left")

#filling the empty values found in the dataset
test.fillna(0, inplace = True)

#checking the dataset
test.head()

In [None]:
test = test.drop(["2015-11","2015-12"], axis =1)

In [None]:
# Preview the first rows of the test frame after the preceding column drop.
test.head()

In [None]:
# now lets create the actual training data

# Monthly sales columns in chronological order ("YYYY-MM" labels sort
# lexicographically). The identifier columns are left out: `ID` is only a
# row number for the submission and must not be used as a model feature.
# NOTE(review): assumes every remaining column of `test` is a month column.
month_cols = sorted(col for col in test.columns
                    if col not in ("ID", "item_id", "shop_id"))

# Name features by their position in the window rather than by calendar
# month. The training window and the prediction window are shifted by one
# month, and a model fitted on a DataFrame can reject prediction input whose
# column names differ from the ones it was trained on.
feature_names = [f"month_{pos:02d}" for pos in range(1, len(month_cols))]

# features: every month except the last one; target: the last month
x_train = test[month_cols[:-1]].copy()
x_train.columns = feature_names
y_train = test[month_cols[-1]]

#deleting the first month so that it can predict the future sales data
x_test = test[month_cols[1:]].copy()
x_test.columns = feature_names

#checking the shapes of the datasets
print("Shape of x_train", x_train.shape)
print("Shape of x_test", x_test.shape)
print("Shape of y_train", y_train.shape)

In [None]:
# Preview the first rows of the training feature matrix.
x_train.head()

In [None]:
# Preview the first rows of the prediction feature matrix.
x_test.head()

In [None]:
# Hold out 20% of the rows as a validation set (fixed seed for repeatability).
split_options = {"test_size": 0.2, "random_state": 0}
x_train, x_valid, y_train, y_valid = train_test_split(x_train, y_train,
                                                      **split_options)

# report the shape of every part of the split
for caption, part in (
    ("Shape of x_train", x_train),
    ("Shape of x_valid", x_valid),
    ("Shape of y_train", y_train),
    ("Shape of y_valid", y_valid),
):
    print(caption, part.shape)

# **Modeling**

In [None]:
from xgboost import XGBRegressor

# Gradient-boosted regressor on the monthly sales window.
# - learning_rate / random_state are the scikit-learn-API names for the
#   native `eta` / `seed` parameters used before.
# - eval_metric and early_stopping_rounds are set on the estimator: passing
#   them to fit() was deprecated in xgboost 1.6 and removed in 2.0.
#   NOTE(review): needs xgboost >= 1.6 for early stopping to take effect.
model = XGBRegressor(
    max_depth=8,
    n_estimators=1000,
    min_child_weight=300,
    colsample_bytree=0.8,
    subsample=0.8,
    learning_rate=0.3,
    random_state=42,
    eval_metric="rmse",
    early_stopping_rounds=10)

# The last entry of eval_set (the validation split) drives early stopping;
# the training split is listed only so its RMSE is logged as well.
model.fit(
    x_train,
    y_train,
    eval_set=[(x_train, y_train), (x_valid, y_valid)],
    verbose=True)

In [None]:
# Predict for the validation rows and for the prediction window, limiting
# every prediction to the range [0, 20].
valid_raw = model.predict(x_valid)
test_raw = model.predict(x_test)
Y_pred = np.clip(valid_raw, 0, 20)
Y_test = np.clip(test_raw, 0, 20)

In [None]:
#Creating the submission file and submit
# Take the identifiers from the test set's own ID column when it exists.
# The positional index only matches the IDs as long as the frame was never
# filtered, re-sorted or re-indexed.
if "ID" in test.columns:
    submission_ids = test["ID"].to_numpy()
else:
    submission_ids = test.index
submission = pd.DataFrame({
    "ID": submission_ids,
    "item_cnt_month": Y_test
})
submission.to_csv('xgb_submission.csv', index=False)