In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Show which files are present in the input directory.
available_inputs = os.listdir("../input")
print(available_inputs)

# Any results you write to the current directory are saved as output.

In [None]:
# Additional Imports
import tensorflow as tf
import sklearn as sk
from matplotlib import pyplot as plt
import seaborn as sns

# Loading and Analyzing the Input Data

## Loading the data into Pandas dataframes

In [None]:
INPUT_DIR = '../input'


def _load_input_csv(file_name):
    """Read one CSV file from INPUT_DIR into a pandas DataFrame."""
    return pd.read_csv(os.path.join(INPUT_DIR, file_name))


# One DataFrame per file shipped with the competition.
input_item = _load_input_csv('items.csv')
input_test = _load_input_csv('test.csv')
input_train = _load_input_csv('sales_train.csv')
input_item_categories = _load_input_csv('item_categories.csv')
input_shops = _load_input_csv('shops.csv')
output_sample = _load_input_csv('sample_submission.csv')

This is what the problem statement says about the provided files. Let's now start the exploration.
* sales_train.csv - the training set. Daily historical data from January 2013 to October 2015.
* test.csv - the test set. You need to forecast the sales for these shops and products for November 2015.
* sample_submission.csv - a sample submission file in the correct format.
* items.csv - supplemental information about the items/products.
* item_categories.csv - supplemental information about the items categories.
* shops.csv - supplemental information about the shops.

The Following is a list of the Data Fields and what they represent
* **ID** - an Id that represents a (Shop, Item) tuple within the test set
* **shop_id** - unique identifier of a shop
* **item_id** - unique identifier of a product
* **item_category_id** - unique identifier of item category
* **item_cnt_day** - number of products sold. You are predicting a monthly amount of this measure
* **item_price** - current price of an item
* **date** - date in format dd/mm/yyyy
* **date_block_num** - a consecutive month number, used for convenience. January 2013 is 0, February 2013 is 1,..., October 2015 is 33
* **item_name** - name of item
* **shop_name** - name of shop
* **item_category_name** - name of item category

## Looking through the files

### Items and Categories

This first file gives us a list of all the items, their names, ids, and corresponding **Category IDs**. There are a total of 84 item categories, some having way more items than the others. Some have about 5000 items, some have as few as 1.

In [None]:
# Preview the first five rows of the item table.
input_item.head(5)

In [None]:
# Distinct category ids, computed once and reused for both the listing and the count.
unique_category_ids = input_item['item_category_id'].unique()
print("Following are the unique Item Category IDs:\n", unique_category_ids, "(" + str(len(unique_category_ids)) + " categories)")

# The column is named through `x=`: newer seaborn releases no longer accept it as a
# leading positional argument, while the keyword form works on old and new versions.
sns.countplot(x='item_category_id', data=input_item).set_title('Number of items in Each Category')
plt.show()

### The Test Set

The following is the test data input: a list of items and the shops to which they belong. All the **shops are equally sampled** for their items. Each *shop-item pair in the test set is unique*.

In [None]:
# Preview the first five rows of the test set.
input_test.head(5)

In [None]:
# Rows per shop in the test set. The column is passed as `x=` because newer seaborn
# releases reject it as a leading positional argument.
sns.countplot(x='shop_id', data=input_test).set_title('Number of items in Each Shop')
plt.show()

# Basic cardinalities of the test set.
print('There are a total of', len(input_test.index), 'Rows.')
print('It contains', len(input_test['item_id'].unique()), 'unique items.')
print('It contains', len(input_test['shop_id'].unique()), 'unique shops.')
# ngroups is the number of distinct (shop_id, item_id) combinations.
print('It contains', input_test.groupby(['shop_id', 'item_id']).ngroups, 'unique shops-item pairs.')

### Training Data

Here is the training set for the sales data. It holds daily sales records for shop-item pairs. This data, **unlike the test set, is not uniformly sampled from all shops**. Also, it has **way more examples of low-price products than high-price products**.

In [None]:
# Preview the first five rows of the training set.
input_train.head(5)

In [None]:
# Each tuple is printed as one space-separated line of summary output.
train_summary_lines = (
    ('There are a total of', len(input_train.index), 'Rows.'),
    ('It contains', len(input_train['item_id'].unique()), 'unique items.'),
    ('It contains', len(input_train['shop_id'].unique()), 'unique shops.'),
    ('It contains', len(input_train['date_block_num'].unique()), 'date blocks.'),
    ('It contains', len(input_train.groupby(['shop_id', 'item_id'])), 'unique shops-item pairs.'),
)
for line_parts in train_summary_lines:
    print(*line_parts)

In [None]:
# Training rows per shop. Columns are passed as `x=` because newer seaborn releases
# reject a column name given as the leading positional argument.
sns.countplot(x='shop_id', data=input_train).set_title('Number of items in Each Shop')
plt.show()

# Price distribution twice: the full range on the left, a zoomed view on the right.
# NOTE(review): distplot is deprecated in newer seaborn releases (histplot/displot
# replace it); it is kept here so the plot looks the same on the version in use.
_, plot_price_axes = plt.subplots(1, 2, figsize=(15, 5))
sns.distplot(input_train['item_price'], bins=100, ax=plot_price_axes[0]).set_title('Price of items (all prices)')
sns.distplot(input_train['item_price'], bins=1000, ax=plot_price_axes[1]).set_title('Price of items (lower price)')
# Zoom the right-hand panel explicitly rather than relying on pyplot's "current axes".
plot_price_axes[1].set_xlim(0, 10000)
plt.show()

# Training rows per month block.
sns.countplot(x='date_block_num', data=input_train).set_title('Number of items in Each Date-Block')
plt.show()

In [None]:
# Price history of every item id below 100 that has at least 50 training rows,
# all drawn onto the same axes.
for item_id in range(100):
    item_rows = input_train.loc[input_train['item_id'] == item_id]
    if len(item_rows) < 50:
        continue
    # The data-field notes describe `date` as day-first (dd/mm/yyyy); without
    # dayfirst=True an ambiguous value such as 02/01/2013 is read as 1 February.
    sale_dates = pd.to_datetime(item_rows['date'], dayfirst=True)
    # Keyword x=/y=: newer seaborn releases do not accept the vectors positionally.
    sns.lineplot(x=sale_dates, y=item_rows['item_price']).set_title('Price with time')
plt.show()

In [None]:
# Distinct daily sale counts, followed by summary statistics of the same column.
daily_counts = input_train['item_cnt_day']
print(daily_counts.unique())
daily_counts.describe()

### Item Category and Shops Names

This is just a list of names, maybe we can further improve our results with some complex text processing to recognize classes of products, but this is mostly additional useless information. Nevertheless, good for fault finding when we are close to done.

In [None]:
# Preview the first five category names.
input_item_categories.head(5)

In [None]:
# Number of rows in the category table.
category_total = input_item_categories.shape[0]
print("Total Categories", category_total)

In [None]:
# Preview the first five shops.
input_shops.head(5)

Of course, we have a single target: a real-number prediction of the expected number of sales of each product.

In [None]:
# Preview the first five rows of the sample submission.
output_sample.head(5)

# Data Insights

## Understanding Each Category

In [None]:
# Category id of every row of items.csv, as a plain array indexed by row position.
# NOTE(review): indexing this array with an item_id assumes that item_id equals the
# row position in items.csv - confirm against the file.
temp_categories = input_item['item_category_id'].values

# One vectorised array lookup instead of a Python-level apply() over every training
# row; the resulting column is the same, it is just computed much faster.
input_train['item_category_id'] = temp_categories[input_train['item_id'].values]

In [None]:
# Per-category price statistics, one row per category id in 0..len(input_item_categories)-1.
# Built with a single groupby instead of DataFrame.append inside a loop: append copies
# the frame on every iteration and no longer exists in pandas 2.x.
price_by_category = input_train.groupby('item_category_id')['item_price']
all_category_ids = pd.RangeIndex(len(input_item_categories), name='item_category_id')

process_categories = (
    pd.DataFrame({
        'price_mean': price_by_category.mean(),
        'items_count': price_by_category.count(),
        'price_min': price_by_category.min(),
        'price_quartile1': price_by_category.quantile(0.25),
        'price_median': price_by_category.median(),
        'price_quartile3': price_by_category.quantile(0.75),
        'price_max': price_by_category.max(),
        'std_dev': price_by_category.std(),
    })
    # Categories with no training rows still get a row: NaN statistics ...
    .reindex(all_category_ids)
    # ... and a sample count of zero rather than NaN.
    .fillna({'items_count': 0})
    .reset_index()
)

# Raise the display limit so the full per-category table is shown.
pd.set_option('display.max_rows', 100)
process_categories.head(len(process_categories))

In [None]:
# One bar chart per statistic, laid out row by row on a 2x4 grid.
_, plot_category_axes = plt.subplots(2, 4, figsize=(20, 10))
category_panels = (
    ('price_mean', 'Mean Price'),
    ('items_count', 'Number of Samples'),
    ('price_min', 'Min Price'),
    ('price_quartile1', 'First Quartile Price'),
    ('price_median', 'Median Price'),
    ('price_quartile3', 'Third Quartile Price'),
    ('price_max', 'Max Price'),
    ('std_dev', 'Price Standard Deviation'),
)
# .flat walks the axes grid in row-major order, matching the order of the panels above.
for panel_ax, (stat_column, panel_title) in zip(plot_category_axes.flat, category_panels):
    sns.barplot(x='item_category_id', y=stat_column, data=process_categories, ax=panel_ax).set_title(panel_title)
plt.show()

# Baseline Classifier Model

## Training and Validation Sets

Here, we are splitting the data into the training set, which is the first 33 months (date blocks 0–32), and the validation set, which is all the data from the last month (date block 33).

In [None]:
# Hold out the most recent month block for validation; everything earlier is training data.
final_block = input_train['date_block_num'].max()
temp_train = input_train[input_train['date_block_num'] < final_block]
temp_valid = input_train[input_train['date_block_num'] == final_block]

# Features are every column whose name does not contain the target's name.
# NOTE(review): the target is the per-row item_cnt_day, while the data-field notes say
# the competition asks for a monthly amount - confirm whether rows should be
# aggregated per month before training.
feature_columns = [col for col in input_train.columns if 'item_cnt_day' not in col]
x_train, y_train = temp_train[feature_columns], temp_train['item_cnt_day']
x_valid, y_valid = temp_valid[feature_columns], temp_valid['item_cnt_day']

# Shapes are reported before the date column is dropped.
print('The Shapes are:', x_train.shape, y_train.shape, x_valid.shape, y_valid.shape)
x_train = x_train.drop(columns=['date'])
x_valid = x_valid.drop(columns=['date'])
x_train.head()

In [None]:
# Separate item_price from the remaining features so it can serve as a target of its own.
train_non_price = [col for col in x_train.columns if 'item_price' not in col]
valid_non_price = [col for col in x_valid.columns if 'item_price' not in col]
z_train, p_train = x_train[train_non_price], x_train['item_price']
z_valid, p_valid = x_valid[valid_non_price], x_valid['item_price']
z_train.head()

## Random Forest Classifier and Feature Importance

We need to get some estimate of how important each of the features are to the result. So we train a bunch of Random Forest regressors and see the weights they output.

In [None]:
from sklearn.ensemble import RandomForestClassifier

def forest_classifier(x_val, y_val, importance=True, n_estimators=3, random_state=0):
    """Fit a random-forest classifier and optionally report its feature importances.

    Parameters
    ----------
    x_val : 2-D feature table (anything exposing ``.shape`` that the forest can fit on).
    y_val : target values, one per row of ``x_val``.
    importance : when true (the default), print the feature ranking and draw a bar
        chart of the importances; when false, only fit the model.
    n_estimators : number of trees in the forest (default 3).
    random_state : seed handed to the forest (default 0).

    Returns
    -------
    The fitted ``RandomForestClassifier``.
    """
    forest = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
    forest.fit(x_val, y_val)

    # Honour the flag: skip all reporting when the caller only wants the model.
    if not importance:
        return forest

    importances = forest.feature_importances_
    # Spread of each feature's importance across the individual trees (used as error bars).
    std = np.std([tree.feature_importances_ for tree in forest.estimators_], axis=0)
    # Feature positions sorted from most to least important.
    indices = np.argsort(importances)[::-1]
    n_features = x_val.shape[1]

    # Print the feature ranking
    print("Feature ranking:")
    for rank in range(n_features):
        print("%d. feature %d (%f)" % (rank + 1, indices[rank], importances[indices[rank]]))

    # Plot the feature importances of the forest
    plt.title("Feature importances")
    plt.bar(range(n_features), importances[indices], color="r", yerr=std[indices], align="center")
    plt.xticks(range(n_features), indices)
    plt.xlim([-1, n_features])
    plt.show()

    return forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

def forest_regressor(x_val, y_val, importance=True, n_estimators=3, random_state=0):
    """Fit a random-forest regressor and optionally report its feature importances.

    Parameters
    ----------
    x_val : 2-D feature table (anything exposing ``.shape`` that the forest can fit on).
    y_val : target values, one per row of ``x_val``.
    importance : when true (the default), print the feature ranking and draw a bar
        chart of the importances; when false, only fit the model.
    n_estimators : number of trees in the forest (default 3).
    random_state : seed handed to the forest (default 0).

    Returns
    -------
    The fitted ``RandomForestRegressor``.
    """
    forest = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
    forest.fit(x_val, y_val)

    # Honour the flag: skip all reporting when the caller only wants the model.
    if not importance:
        return forest

    importances = forest.feature_importances_
    # Spread of each feature's importance across the individual trees (used as error bars).
    std = np.std([tree.feature_importances_ for tree in forest.estimators_], axis=0)
    # Feature positions sorted from most to least important.
    indices = np.argsort(importances)[::-1]
    n_features = x_val.shape[1]

    # Print the feature ranking
    print("Feature ranking:")
    for rank in range(n_features):
        print("%d. feature %d (%f)" % (rank + 1, indices[rank], importances[indices[rank]]))

    # Plot the feature importances of the forest
    plt.title("Feature importances")
    plt.bar(range(n_features), importances[indices], color="r", yerr=std[indices], align="center")
    plt.xticks(range(n_features), indices)
    plt.xlim([-1, n_features])
    plt.show()

    return forest

### Running the Regressors and Plotting Importance

All the predictions are numeric. Forest-1 and Forest-2 work as a pair: Forest-1 predicts the price and Forest-2 predicts the sale count. Forest-3 tries to make the prediction in a single step.

In [None]:
# Forest 1: predict item_price from the non-price features.
forest_1 = forest_regressor(x_val=z_train, y_val=p_train)

In [None]:
# Forest 2: predict item_cnt_day from all features, price included.
forest_2 = forest_regressor(x_val=x_train, y_val=y_train)

In [None]:
# Forest 3: predict item_cnt_day directly from the non-price features.
forest_3 = forest_regressor(x_val=z_train, y_val=y_train)

In [None]:
# Compare both prediction routes against the held-out month.
predict_valid = z_valid.copy()

# Stage one: estimate the price from the non-price features.
predicted_price = forest_1.predict(z_valid)

# Stage two: forest_2 was fitted on x_train, so it must receive exactly those columns,
# in that order, with the predicted price standing in for item_price. Appending the
# price as a new last column would swap the price and category inputs.
count_features = z_valid.assign(item_price=predicted_price)[list(x_train.columns)]

predict_valid['predicted_price'] = predicted_price
predict_valid['predicted_cnt_1'] = forest_2.predict(count_features)
predict_valid['predicted_cnt_2'] = forest_3.predict(z_valid)
predict_valid['actual_price'] = p_valid
predict_valid['actual_cnt'] = y_valid
predict_valid.head(100)

# The Final Output

## In shape to make Predictions

Just adding in the extra columns, date-block and item-category, from the other files so that our model can make its predictions.

In [None]:
# Compare the columns the test set provides with the columns the models were trained on.
for frame in (input_test, z_train):
    print(frame.columns)

In [None]:
# Separate the submission id from the features. The comparison is an exact name match,
# so a column that merely contains "ID" in its name would not be dropped by accident.
i_test = input_test['ID']
x_test = input_test[[col for col in input_test.columns if col != 'ID']]

# The month to forecast is the one right after the last month in the training data.
x_test.insert(0, 'date_block_num', input_train['date_block_num'].max() + 1)
# Vectorised category lookup (one array index instead of a row-by-row apply).
# NOTE(review): assumes item_id equals the row position in items.csv - confirm.
x_test.insert(3, 'item_category_id', temp_categories[input_test['item_id'].values])
x_test.head()

## Making the PREDICTIONS!!!

For now, I am just outputting the **Random Forest Regressor** predictions. Will update to a better model in the future versions of the notebook.

In [None]:
# Select the model inputs by name so the cell can be re-run after the extra columns
# below have already been added to x_test.
test_features = x_test[list(z_train.columns)]

# Stage one: estimated price for every test row.
predicted_price = forest_1.predict(test_features)

# Stage two: forest_2 was fitted on x_train, so it must receive exactly those columns,
# in that order, with the predicted price standing in for item_price.
count_features = test_features.assign(item_price=predicted_price)[list(x_train.columns)]

x_test['price'] = predicted_price
x_test['cnt'] = forest_2.predict(count_features)
# Use the real submission id rather than the row index, and only add it once.
if 'ID' not in x_test.columns:
    x_test.insert(0, 'ID', input_test['ID'])

In [None]:
# Preview the first five test rows together with their predictions.
x_test.head(5)

## Get the Column and Print to File

Output one final `submission.csv`. Job Complete!

In [None]:
# Write the two-column submission file (header: ID,item_cnt_month) in one call
# instead of formatting every row by hand through iterrows().
submission = x_test[['ID', 'cnt']].rename(columns={'cnt': 'item_cnt_month'})
# Ids are written as integers.
submission['ID'] = submission['ID'].astype(int)
submission.to_csv('submission.csv', index=False)

In [None]:
# Echo at most the first 100 lines of the file that was just written. zip() stops
# quietly when the file is shorter than that, where next(file) would raise StopIteration.
with open('submission.csv', 'r') as file:
    for _, line in zip(range(100), file):
        print(line, end='')