# Импорты

In [None]:
!pip install lightgbm
!pip install xgboost
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost import XGBRegressor

# Конфиг

In [None]:
# Root folder of the competition dataset as mounted inside a Kaggle kernel.
pathToDir = '/kaggle/input/competitive-data-science-predict-future-sales/'

# Logical dataset name -> full path of its CSV file.
# 'pathToDate' keeps the bare data directory itself.
paths = {
    'pathToDate': pathToDir,
    **{
        datasetName: pathToDir + fileName
        for datasetName, fileName in (
            ('categories', 'item_categories.csv'),
            ('items', 'items.csv'),
            ('train', 'sales_train.csv'),
            ('submission', 'sample_submission.csv'),
            ('shops', 'shops.csv'),
            ('test', 'test.csv'),
        )
    },
}

# Single configuration object that wraps the path table.
config = dict(paths=paths)

# Утилиты

In [None]:
def displayInCenter(text):
    """Print `text` centred in a 50-character banner padded with '*'.

    Text longer than 50 characters is printed as-is, without padding.
    """
    print(f'{text:*^50}')
    
def regexFilter(text, regex):
    """Search `text` for the pattern `regex`.

    Parameters:
        text: the value to test. Anything that is not a non-empty
            str/bytes (None, '', NaN, numbers, ...) is treated as "no match".
        regex: pattern passed to `re.search`.

    Returns:
        False when `text` is empty or not text at all; otherwise the result
        of `re.search` (a match object, or None when nothing matched).
    """
    # Missing values coming out of pandas are floats (NaN): they are truthy,
    # so a plain `not text` check would let them reach re.search and raise
    # TypeError. Reject every non-text value up front instead.
    if not isinstance(text, (str, bytes)) or not text:
        return False
    return re.search(regex, text)


# Подгрузка данных

In [None]:

# Read every competition CSV once, keyed by its logical name from `paths`.
initialDatasets = {
    datasetName: pd.read_csv(paths.get(datasetName))
    for datasetName in ('categories', 'items', 'train', 'submission', 'shops', 'test')
}

# Module-level aliases used by the rest of the notebook; each alias is the
# very same DataFrame object that is stored in `initialDatasets`.
categoriesDf = initialDatasets['categories']
itemsDf = initialDatasets['items']
trainDf = initialDatasets['train']
submissionDf = initialDatasets['submission']
shopsDf = initialDatasets['shops']
testDf = initialDatasets['test']

displayInCenter('Данные подгрузились')

# Просматриваем инфо о датафреймах

In [None]:
# Print the .info() summary of every loaded dataset under a banner
# carrying the dataset name.
for datasetName in initialDatasets:
    displayInCenter(datasetName)
    initialDatasets[datasetName].info()

displayInCenter('Информация Выведена')

# Инфо

In [None]:
# Print descriptive statistics of every loaded dataset under a banner
# carrying the dataset name.
for datasetName, frame in initialDatasets.items():
    displayInCenter(datasetName)
    print(frame.describe())

### Видим, что цена товара и количество бывают отрицательными, надо бы убрать эти «выбросы»

# Разбиение колонок

In [None]:
# Add the city: it is the first whitespace-separated token of the shop name.
shopsDf['city'] = shopsDf['shop_name'].map(
    lambda shopName: shopName.split(maxsplit=1)[0]
)

# Беглый просмотр данных

In [None]:
# Check whether any city name contains unexpected characters.
def regFilterForCity(city):
    """Return True when `city` does NOT match the expected shape.

    The expected shape is one or more upper-case Cyrillic letters (А-Я)
    followed by any number of Cyrillic letters, commas or spaces.
    """
    return not regexFilter(city, r'^[А-Я]+[А-Я,а-я, ]*$')


strangeCityDf = shopsDf.loc[shopsDf['city'].apply(regFilterForCity), 'city']

displayInCenter("Странные города")
print(strangeCityDf.unique())

displayInCenter("Все Города")
print(shopsDf['city'].unique())
# "!Якутск" should be replaced; "Интернет-магазин" also looks questionable.

In [None]:
# Look at the distribution of the item price (density curve only, no bars).
# NOTE(review): seaborn marks distplot as deprecated in recent releases;
# consider kdeplot/histplot once the target seaborn version is confirmed.
sns.distplot(
    trainDf['item_price'],
    hist=False,
    kde=True,
    kde_kws=dict(linewidth=3),
)

In [None]:
# Zoom into the right tail: only rows priced above 25000.
expensivePrices = trainDf.loc[trainDf['item_price'] > 25000, 'item_price']
sns.distplot(
    expensivePrices,
    hist=True,
    kde=True,
    kde_kws=dict(linewidth=3),
)

#### Нужно убрать выбросы с огромной ценой на товар

Посмотрим на количество продаж 

In [None]:
# Density curve of the number of items sold per day.
sns.distplot(
    trainDf['item_cnt_day'],
    hist=False,
    kde=True,
)

In [None]:
# Zoom into the right tail: only rows with more than 250 items sold in a day.
largeDailyCounts = trainDf.loc[trainDf['item_cnt_day'] > 250, 'item_cnt_day']
sns.distplot(largeDailyCounts, hist=True, kde=True)

In [None]:
def regFilterForName(name):
    """Return True when the shop `name` does NOT match the expected shape.

    The expected shape is one or more upper-case Cyrillic letters (А-Я)
    followed by any number of Cyrillic letters, commas, spaces or double quotes.
    """
    return not regexFilter(name, r'^[А-Я]+[А-Я,а-я, "]*$')


# Show the shops whose names fail the check, together with their ids.
shopsDf.loc[shopsDf['shop_name'].apply(regFilterForName), ['shop_name', 'shop_id']]

# Совпадают Орджоникидзе 56 и Чкалова 39

In [None]:
# Shops whose city starts with either spelling of Yakutsk ("!Якутск" or "Якутск");
# missing city values count as "no match".
isYakutsk = (
    shopsDf['city'].str.startswith('!Якутск', na=False)
    | shopsDf['city'].str.startswith('Якутск', na=False)
)
shopsDf.loc[isYakutsk, ['shop_name', 'shop_id']]

также совпадают 1 и 58

# Немного очистки данных

In [None]:
# Duplicate shop id -> the id it is merged into.
duplicateShopIds = {0: 57, 1: 58, 10: 11}

# Print each pair of duplicated shop names side by side.
displayInCenter("Дубликаты shop_name")
for duplicateId, keptId in duplicateShopIds.items():
    print(shopsDf['shop_name'][duplicateId], '==', shopsDf['shop_name'][keptId])

# Re-point the duplicated ids, first in the training data, then in the test data.
for salesFrame in (trainDf, testDf):
    for duplicateId, keptId in duplicateShopIds.items():
        salesFrame.loc[salesFrame['shop_id'] == duplicateId, 'shop_id'] = keptId

# Collapse the two spellings of the same city.
shopsDf.loc[shopsDf['city'].eq('!Якутск'), 'city'] = 'Якутск'

In [None]:
# Drop anomalous rows: keep only 0 < item_price < 50000
# and 0 < item_cnt_day < 1000 (all bounds exclusive).
priceInRange = trainDf["item_price"].gt(0) & trainDf["item_price"].lt(50000)
countInRange = trainDf["item_cnt_day"].gt(0) & trainDf["item_cnt_day"].lt(1000)
trainDf = trainDf[priceInRange & countInRange]

# Мерджим данные

In [None]:
# Enrich every sale with its item, category and shop attributes (left joins
# keep all sales rows), then derive calendar month and year from date_block_num.
dfAggregator = (
    trainDf
    .merge(itemsDf, on='item_id', how='left')
    .merge(categoriesDf, on='item_category_id', how='left')
    .merge(shopsDf, on='shop_id', how='left')
    .assign(
        # date_block_num is counted from 0, so block 0 maps to month 1 of 2013.
        month=lambda sales: 1 + sales['date_block_num'] % 12,
        year=lambda sales: 2013 + sales['date_block_num'] // 12,
    )
)
dfAggregator

In [None]:
# Correlation heatmap of the numeric columns only.
# dfAggregator also holds text columns taken from the merged tables
# (e.g. shop_name, city); on pandas >= 2.0 DataFrame.corr() no longer drops
# them silently and raises instead, so select the numeric columns explicitly.
numericCorrelation = dfAggregator.select_dtypes(include='number').corr()
sns.heatmap(numericCorrelation)

In [None]:
# Columns fed to every model below.
features = [
    "item_id",
    "shop_id",
    "month",
    "year",
]

In [None]:
# Sum the daily sales for every (item, shop, month, year, date block)
# combination and expose the total as item_cnt_month.
groupKeys = ["item_id", "shop_id", "month", "year", "date_block_num"]
train = (
    dfAggregator
    .groupby(groupKeys)["item_cnt_day"]
    .sum()
    .reset_index()
    .rename(columns={'item_cnt_day': 'item_cnt_month'})
)
train

In [None]:
# Stamp every test row with the calendar period being predicted.
for calendarColumn, calendarValue in (("year", 2015), ("month", 11)):
    testDf[calendarColumn] = calendarValue
testDf

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the aggregated rows for validation; the fixed seed makes
# the split repeatable.
train_X, val_X, train_y, val_y = train_test_split(
    train[features],
    train["item_cnt_month"],
    test_size=0.2,
    random_state=0,
)

# XGBRegressor

In [None]:
# Fit a default XGBoost regressor, then score it on the validation split.
xgb = XGBRegressor()
xgb.fit(train_X, train_y)
xgb.score(val_X, val_y)


In [None]:
from sklearn.metrics import mean_squared_error

# NOTE(review): this measures the error on the training split (train_X),
# not on the validation split — confirm that this is intended.
y_pred = xgb.predict(train_X)
mean_squared_error(y_true=train_y, y_pred=y_pred)

# LGBM

In [None]:
# Fit a default LightGBM regressor.
lgb = LGBMRegressor().fit(train_X, train_y)

# Only the last expression of a cell is rendered, so the validation score has
# to be printed explicitly; otherwise it is computed and thrown away.
print('LGBM validation score:', lgb.score(val_X, val_y))

# NOTE(review): this measures the error on the training split (train_X),
# not on the validation split — confirm that this is intended.
y_pred = lgb.predict(train_X)
mean_squared_error(train_y, y_pred)

# Linear Regression Model

In [None]:
# Fit an ordinary least-squares baseline, then score it on the validation split.
lrm = LinearRegression()
lrm.fit(train_X, train_y)
lrm.score(val_X, val_y)

In [None]:
# NOTE(review): this measures the error on the training split (train_X),
# not on the validation split — confirm that this is intended.
y_pred = lrm.predict(train_X)
mean_squared_error(y_true=train_y, y_pred=y_pred)

# RFR

In [None]:
# Fit a 50-tree random forest, then score it on the validation split.
# The forest is randomised; random_state pins its randomness (same seed as
# the train/validation split) so the score and the submission built from
# this model do not vary between runs.
rfr = RandomForestRegressor(n_estimators=50, random_state=0).fit(train_X, train_y)
rfr.score(val_X, val_y)

In [None]:
# NOTE(review): this measures the error on the training split (train_X),
# not on the validation split — confirm that this is intended.
y_pred = rfr.predict(train_X)
mean_squared_error(y_true=train_y, y_pred=y_pred)

In [None]:
# Predict the test rows with the random forest and write the submission
# file: one row per test ID with its predicted item_cnt_month.
pred = rfr.predict(testDf[features])
submission = testDf[["ID"]].assign(item_cnt_month=pred)
submission.to_csv("submission.csv", index=False)