In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
from sklearn.preprocessing import StandardScaler
import datetime

import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, name) for name in filenames)
    for path in full_paths:
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the five competition tables in one pass; the targets on the left are
# filled in the same order as the paths listed below.
df_train, df_test, shops, items, items_cat = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/competitive-data-science-predict-future-sales/sales_train.csv',
        '../input/competitive-data-science-predict-future-sales/test.csv',
        '../input/competitive-data-science-predict-future-sales/shops.csv',
        '../input/competitive-data-science-predict-future-sales/items.csv',
        '../input/competitive-data-science-predict-future-sales/item_categories.csv',
    )
)

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
df_train.info()
df_test.info()

In [None]:
# Parse the "day.month.year" strings in a single vectorized pass, then store
# plain datetime.date objects in the column.
parsed_dates = pd.to_datetime(df_train['date'], format='%d.%m.%Y')
df_train['date'] = parsed_dates.dt.date
df_train['date']

In [None]:
# analysis
# Work on a copy so the loaded training frame itself is not modified.
analysis = df_train.copy()

## day sales
# Revenue of each row (price * count), truncated toward zero to a whole number.
analysis['day_sales'] = (analysis['item_price'] * analysis['item_cnt_day']).astype('int64')

## month sales
# Total revenue per date_block_num. The lookup is label-based, so every row gets
# the total of its own month even if the month ids are not a contiguous 0..N-1
# range (positional indexing into an array silently relied on that).
month_sales = analysis.groupby('date_block_num')['day_sales'].sum()
analysis['month_sales'] = analysis['date_block_num'].map(month_sales)

## item name and category
# One row per item_id (the last occurrence wins, as it would when building a dict).
item_lookup = items.drop_duplicates('item_id', keep='last').set_index('item_id')
unknown_items = ~analysis['item_id'].isin(item_lookup.index)
if unknown_items.any():
    # Fail loudly instead of letting the vectorized lookup fill in NaN.
    raise KeyError(
        "item_id values missing from items: "
        f"{analysis.loc[unknown_items, 'item_id'].unique()[:5].tolist()}"
    )
analysis['item_name'] = analysis['item_id'].map(item_lookup['item_name'])
analysis['item_category_id'] = analysis['item_id'].map(item_lookup['item_category_id'])

# item_category_id -> item_category_name
cat_id = dict(zip(items_cat['item_category_id'], items_cat['item_category_name']))
unknown_cats = ~analysis['item_category_id'].isin(list(cat_id))
if unknown_cats.any():
    raise KeyError(
        "item_category_id values missing from items_cat: "
        f"{analysis.loc[unknown_cats, 'item_category_id'].unique()[:5].tolist()}"
    )
analysis['item_category'] = analysis['item_category_id'].map(cat_id)


In [None]:
## shop names
# shop_id -> shop_name lookup table built straight from the two columns.
shop_name = dict(zip(shops['shop_id'], shops['shop_name']))
# Look every row's shop_id up in the table; an unknown id raises KeyError.
analysis['shop_name'] = list(map(shop_name.__getitem__, analysis['shop_id']))

In [None]:
# Attach to every row the summed item_cnt_day of its (shop, item, month) group.
grp = ['shop_id', 'item_id', 'date_block_num']
group_totals = analysis.groupby(grp)['item_cnt_day'].transform('sum')
analysis['shop_item_cnt_month'] = group_totals
analysis.head()

In [None]:
# Use seaborn's "ticks" style with longer major tick marks on both axes.
tick_sizes = {"xtick.major.size": 8, "ytick.major.size": 8}
sns.set_style("ticks", rc=tick_sizes)

In [None]:
# Preview the first five rows of the enriched analysis frame.
analysis.head(n=5)

In [None]:
# month_sales holds the same total on every row of a month, so reduce it to one
# value per month before plotting. Passing every raw row to seaborn makes it
# aggregate (and bootstrap an interval for) each month over the whole frame,
# only to arrive at that same constant. Values are scaled to millions so the
# data matches the "(Million)" axis label.
monthly_totals = analysis.groupby('date_block_num')['month_sales'].first() / 1e6

plt.figure(figsize=(15,6))
ax = plt.subplot()
monthly_sales = sns.barplot(x=monthly_totals.index, y=monthly_totals.values, ax = ax)
monthly_sales.set_title("Monthly sales (RUB)")
# Five evenly spaced ticks from 0 up to the best month.
monthly_sales.set_yticks(np.linspace(0, monthly_totals.max(), 5))
monthly_sales.set(xlabel = "date_block_num", ylabel = "(Million)")

# Bars sit at positions 0..n-1 on the categorical axis, so draw the trend line
# at those same positions.
ms_line = sns.lineplot(x=np.arange(len(monthly_totals)), y=monthly_totals.values, ax = ax, color='green')

In [None]:
# Daily revenue across date blocks 23 and 24, drawn as a line with
# "day-month" labels on the x axis.
plt.figure(figsize=(15,6))
in_window = analysis["date_block_num"].isin([23, 24])
russianchristmas = (
    analysis.loc[in_window]
    .groupby(['date'])['day_sales']
    .sum()
    .reset_index(drop=False)
)
day_month_labels = [f"{d.day}-{d.month}" for d in russianchristmas.date]
ruschr = sns.lineplot(x=day_month_labels, y=russianchristmas.day_sales, color = 'green')
# Keep every second tick so the labels stay readable.
every_other_tick = ruschr.get_xticks()[::2]
ruschr.set_xticks(every_other_tick)
plt.xticks(rotation=60)
plt.show()

In [None]:
# Ten item names that occur in the most rows, shown as a bar chart.
most_popular_items = analysis['item_name'].value_counts().head(10)
plt.xticks(rotation = 90)
mpi = sns.barplot(x=most_popular_items.index, y=most_popular_items.to_numpy())

In [None]:
# Ten shop names that occur in the most rows, shown as a bar chart.
most_popular_shops = analysis['shop_name'].value_counts().head(10)
# Rows belonging to the shop at the top of that ranking.
top_shop = most_popular_shops.index[0]
best_shop_df = analysis.loc[analysis['shop_name'] == top_shop]
plt.xticks(rotation = 90)
mps = sns.barplot(x=most_popular_shops.index, y=most_popular_shops.to_numpy())

In [None]:
# Ten item categories that occur in the most rows, shown as a bar chart.
most_popular_cats = analysis['item_category'].value_counts().head(10)
plt.xticks(rotation = 90)
mpc = sns.barplot(x=most_popular_cats.index, y=most_popular_cats.to_numpy())

In [None]:
# For each date block 0..33, the three categories with the most rows that month.
month_top3_cats = [
    tuple(
        analysis.loc[analysis["date_block_num"] == month, "item_category"]
        .value_counts()
        .index[:3]
    )
    for month in range(34)
]

month_top3_cats

In [None]:
# Monthly totals per (shop, item, month), built from the loaded training frame.
# This cell used to read `train` before anything had defined it, which raises
# NameError on a fresh kernel. Only the numeric sales columns are summed, so the
# object-typed `date` column never reaches the aggregation.
train_grp = (
    df_train
    .groupby(['shop_id', 'item_id', 'date_block_num'])[['item_price', 'item_cnt_day']]
    .sum()
    .reset_index(drop=False)
)

# Two reference windows: date blocks 10 and 22, and the last three blocks.
train_past_novs = train_grp.loc[train_grp['date_block_num'].isin([10, 22])]
train_last_trends = train_grp.loc[train_grp['date_block_num'].isin([31, 32, 33])]

# Stack both windows. pd.concat replaces DataFrame.append, which pandas 2.0 removed.
train = pd.concat([
    train_past_novs[['shop_id', 'item_id', 'item_cnt_day']],
    train_last_trends[['shop_id', 'item_id', 'item_cnt_day']],
])

In [None]:
# Collapse to one row per (shop, item) by summing the remaining columns.
# NOTE(review): this adds up every month selected earlier, so the value is a
# multi-month total rather than a single month's count — confirm that is intended.
train = train.groupby(['shop_id', 'item_id'], as_index=False).sum()

In [None]:
# Give the summed count column the name used for the prediction target.
train = train.rename({'item_cnt_day': "item_cnt_month"}, axis='columns')

In [None]:
#stc = StandardScaler()
#train["item_cnt_month"] = stc.fit_transform(np.array(train["item_cnt_month"]).reshape(-1, 1))

In [None]:
# Split the training table into the target column and the remaining features.
y = train['item_cnt_month']
X = train.drop('item_cnt_month', axis=1)

In [None]:
# Display the feature frame (train without the item_cnt_month column).
X

In [None]:
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import KFold

In [None]:
from sklearn.pipeline import make_pipeline

# SGD is sensitive to feature scale and the features here are raw id columns,
# so standardize them inside a pipeline. The scaler is then re-fit on each
# training fold and applied automatically whenever `sgd` predicts.
# random_state pins the optimizer's shuffling so scores are reproducible.
sgd = make_pipeline(StandardScaler(), SGDRegressor(random_state = 0))
kf = KFold(shuffle = True, random_state = 0)

# One score per fold, as returned by the estimator's .score().
outcomes = []
for train_idx, test_idx in kf.split(X):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]
    # fit() re-trains every pipeline step from scratch, so folds do not leak.
    sgd.fit(X_train, y_train)
    outcomes.append(sgd.score(X_test, y_test))

outcomes
    

In [None]:
# Re-train on the full training set, then predict for the test rows using
# every test column except the ID.
test_features = df_test.drop("ID", axis=1)
sgd = sgd.fit(X, y)
pred = sgd.predict(test_features)


In [None]:
# Turn predictions into whole-number counts. The previous code kept only the
# first two characters of each prediction's text form, so e.g. 123.4 became 12
# and 1.5e+10 became 1, and any single-character form (such as nan) raised.
# Round numerically instead and clamp to a valid range.
# NOTE(review): the [0, 20] bounds follow the competition's stated clipping of
# target values — confirm against the evaluation page.
item_cnt_month = np.clip(np.rint(pred), 0, 20).astype(int)
res = pd.DataFrame({"ID": df_test["ID"], "item_cnt_month": item_cnt_month})
res

In [None]:
# Write the submission file without the DataFrame index column.
res.to_csv(path_or_buf='submission.csv', index=False)

In [None]:
! 