In [None]:
import numpy as np 
import pandas as pd
import seaborn as sns
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from itertools import product
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import lightgbm as lgb
from bayes_opt import BayesianOptimization
from sklearn.feature_extraction.text import TfidfVectorizer
# Raise pandas' display limits so wide/long frames are shown without truncation.
pd.options.display.max_rows = 231
pd.options.display.max_columns = 100

In [None]:
# Load the raw tables. `items` and `categories` are read from the
# English-translation dataset, everything else from the competition dataset.
COMPETITION_DIR = '../input/competitive-data-science-predict-future-sales'
TRANSLATION_DIR = '../input/predict-future-sales-eng-translation'

# `parse_dates=True` only asks pandas to parse the *index*, so with the default
# integer index no date was ever parsed. Name the column explicitly instead.
# NOTE(review): assumes the date column is called `date` and is written
# day-first (dd.mm.yyyy) -- confirm against sales_train.csv.
sales = pd.read_csv(f'{COMPETITION_DIR}/sales_train.csv', parse_dates=['date'], dayfirst=True)
test = pd.read_csv(f'{COMPETITION_DIR}/test.csv')
items = pd.read_csv(f'{TRANSLATION_DIR}/items.csv')
shops = pd.read_csv(f'{COMPETITION_DIR}/shops.csv')
categories = pd.read_csv(f'{TRANSLATION_DIR}/categories.csv')

## Separating category names into main categories and sub-categories

In [None]:
# Category names are split on '-': the first piece is the main category, the
# second piece (or the first again, when there is no '-') is the sub-category.
# Both are then label-encoded to int16 ids.
le = LabelEncoder()
main_categories = categories['category_name'].str.split('-')

main_labels = main_categories.map(lambda parts: parts[0].strip())
categories['main_category_id'] = le.fit_transform(main_labels).astype('int16')

sub_labels = main_categories.map(
    lambda parts: (parts[1] if len(parts) > 1 else parts[0]).strip()
)
categories['sub_category_id'] = le.fit_transform(sub_labels).astype('int16')

## Removing Outliers

In [None]:
# Drop extreme outliers: keep rows priced below 100000 and with fewer than
# 1001 units sold in a day. Copy so the assignments below write to an
# independent frame rather than to a slice of the raw one.
sales = sales.loc[(sales.item_price < 100000) & (sales.item_cnt_day < 1001)].copy()

# Remove duplicate shops: each duplicate id is folded into the id that is kept.
duplicate_shop_ids = {
    0: 57,   # Yakutsk, Ordzhonikidze 56
    1: 58,   # Yakutsk, "Tsentralny" shopping centre
    10: 11,  # Zhukovsky, Chkalova St. 39 m²
    40: 39,  # Rostov-on-Don, "Megacentre Horizon" mall
}
sales['shop_id'] = sales['shop_id'].replace(duplicate_shop_ids)
# Apply the same remap to the test set: otherwise a test row that still uses a
# folded-away id refers to a shop with no sales history at all, and every
# history-based feature for it is empty.
test['shop_id'] = test['shop_id'].replace(duplicate_shop_ids)

In [None]:
# Drop the shop ids that were folded into their duplicates in the sales data.
merged_away_shop_ids = [0, 1, 10, 40]
shops = shops[~shops['shop_id'].isin(merged_away_shop_ids)]

In [None]:
# Mean sale price per item. Select the column *before* aggregating: averaging
# every column of `sales` is wasted work, and since pandas 2.0 it raises on any
# non-numeric column (e.g. a string date column) instead of silently dropping it.
item_prices = sales.groupby('item_id')['item_price'].mean()

# Earliest date_block_num in which each (shop, item) pair appears in `sales`,
# indexed by (shop_id, item_id).
first_month_sales = (
    sales.groupby(['shop_id', 'item_id'])['date_block_num']
    .min()
    .astype('int16')
    .to_frame('month_of_first_sale')
)

## Grouping by month

In [None]:
# Collapse the daily rows to one row per (month, shop, item):
#   item_cnt      - total units sold in the month
#   revenue       - total of item_cnt_day * item_price in the month
#   item_cnt_mean - mean of the daily unit counts in the month
sales = (
    sales
    .assign(revenue=sales['item_cnt_day'] * sales['item_price'])
    .groupby(['date_block_num', 'shop_id', 'item_id'], as_index=False)
    .agg(
        item_cnt=('item_cnt_day', 'sum'),
        revenue=('revenue', 'sum'),
        item_cnt_mean=('item_cnt_day', 'mean'),
    )
)

In [None]:
# Sanity check: compare the shop ids seen in the training sales and in the test set.
train_shop_ids = set(sales.shop_id.unique())
test_shop_ids = set(test.shop_id.unique())
print('shops in test but not in train: ', test_shop_ids - train_shop_ids)
print('shops in train but not in test: ', train_shop_ids - test_shop_ids)

# Feature Creation

In [None]:
def downcast_dtypes(df):
    """Shrink the numeric columns of ``df`` to smaller dtypes to save memory.

    float64 columns become float32. int64/int32 columns become int16 when all
    of their values fit in int16, otherwise int32 when they fit in int32;
    columns that fit neither are left unchanged. (A blind cast to int16 would
    silently wrap any value outside [-32768, 32767].)

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to downcast. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for convenient reassignment.
    """
    float_cols = [c for c in df if df[c].dtype == "float64"]
    int_cols = [c for c in df if df[c].dtype in ["int64", "int32"]]

    for col in float_cols:
        df[col] = df[col].astype(np.float32)

    for col in int_cols:
        values = df[col]
        # Try the narrowest type first and stop at the first one that can hold
        # the column's full value range.
        for target in (np.int16, np.int32):
            limits = np.iinfo(target)
            if values.empty or (values.min() >= limits.min and values.max() <= limits.max):
                df[col] = values.astype(target)
                break
    return df

# Downcast the three frames used below to reduce their memory footprint.
sales, test, items = (downcast_dtypes(frame) for frame in (sales, test, items))



In [None]:
# Build the training grid: for every month, one row for each combination of a
# shop and an item that both had at least one sale in that month.
grid_blocks = []
for block_num in sales['date_block_num'].unique():
    month_sales = sales.loc[sales['date_block_num'] == block_num]
    cur_shops = month_sales['shop_id'].unique()
    cur_items = month_sales['item_id'].unique()
    grid_blocks.append(np.array(list(product(cur_shops, cur_items, [block_num]))))

df = pd.DataFrame(np.vstack(grid_blocks), columns=['shop_id', 'item_id', 'date_block_num'])

# Append the test pairs as month 34. This works on a derived frame instead of
# deleting `ID` from `test` in place, so the cell can be re-run without a
# KeyError; `ignore_index=True` gives the combined frame a clean 0..n-1 index.
test_grid = test.drop(columns=['ID']).assign(date_block_num=34)
df = pd.concat([df, test_grid], ignore_index=True).fillna(0)


In [None]:
# Attach the monthly sales figures (0 where a grid row had no sales), then the
# item columns other than the name, then the encoded category ids.
grid_keys = ['shop_id', 'item_id', 'date_block_num']
df = (
    df.merge(sales, how='left', on=grid_keys)
    .fillna(0)
    .merge(items.drop(columns=['item_name']), how='left', on='item_id')
    .merge(categories.drop(columns=['category_name']), how='left', on='category_id')
)
# Disabled features, kept for reference:
#df['mean_item_price'] = df['item_id'].map(item_prices)
#df = pd.merge(df, first_month_sales, on=['shop_id', 'item_id'], how='left')
del categories

## Advanced Feature: Using k means to cluster the shops 
* Seven clusters were found to give the best results in initial tests
* However ultimately in the final model, this feature decreased performance on the public leaderboard so was not included

#clustering shops
# Every (shop, category) combination present in df, so that a shop with no
# sales in some category still gets a row for it.
shops_cats = pd.DataFrame(
    np.array(list(product(*[df['shop_id'].unique(), df['category_id'].unique()]))),
    columns =['shop_id', 'category_id']
)
# Total item_cnt per (category, shop) ...
temp = df.groupby(['category_id', 'shop_id']).agg({'item_cnt':'sum'}).reset_index()
# ... and per shop overall.
temp2 = temp.groupby('shop_id').agg({'item_cnt':'sum'}).rename(columns={'item_cnt':'shop_total'})
temp = temp.join(temp2, on='shop_id')
# Share of each shop's total item_cnt that falls in each category.
temp['category_proportion'] = temp['item_cnt']/temp['shop_total']
temp = temp[['shop_id', 'category_id', 'category_proportion']]
shops_cats = pd.merge(shops_cats, temp, on=['shop_id','category_id'], how='left')
# Missing proportions after the left merge are treated as 0.
shops_cats = shops_cats.fillna(0)

# Reshape to one row per shop with one column per category, then cluster the
# shops on those rows into 7 groups (fixed random_state for repeatability).
shops_cats = shops_cats.pivot(index='shop_id', columns=['category_id'])
kmeans = KMeans(n_clusters=7, random_state=0).fit(shops_cats)
shops_cats['shop_cluster'] = kmeans.labels_.astype('int8')

#adding these clusters to the shops dataframe
shops = shops.join(shops_cats['shop_cluster'], on='shop_id')

In [None]:
# Attach the per-shop columns (everything in `shops` except the name) to each row.
df = pd.merge(df, shops.drop('shop_name', axis=1), on='shop_id', how='left')

# `month_of_first_sale` only exists if the `first_month_sales` merge has been
# applied to df; that merge is currently commented out earlier in the notebook,
# so accessing the column unconditionally raises KeyError. Guard on its presence.
# The filled column is assigned back explicitly: `fillna(inplace=True)` on a
# column selection is chained assignment and is not guaranteed to update df.
if 'month_of_first_sale' in df.columns:
    df['month_of_first_sale'] = df['month_of_first_sale'].fillna(-1).astype('int16')

## TFIDF Features
* Looking for common words in the item name column 
* Choose top 25 most common words

In [None]:
# Copy so that the new columns are added to an independent frame rather than
# to a slice of `items`.
items_subset = items[['item_id', 'item_name']].copy()
feature_count = 25
tfidf = TfidfVectorizer(max_features=feature_count)

# Dense TF-IDF matrix with one row per item. Reusing `items_subset.index` ties
# each feature row to its item by position, whatever index `items` carries.
items_df_item_name_text_features = pd.DataFrame(
    tfidf.fit_transform(items_subset['item_name']).toarray(),
    index=items_subset.index,
)
# Name the columns from the matrix's actual width: the vectorizer yields fewer
# than `feature_count` columns when the vocabulary is smaller than that.
items_df_item_name_text_features.columns = [
    'item_name_tfidf_' + str(i)
    for i in range(items_df_item_name_text_features.shape[1])
]
items_subset = pd.concat([items_subset, items_df_item_name_text_features], axis=1)


In [None]:
# The raw item name is no longer needed once its TF-IDF columns exist; keep
# only item_id plus the features and attach them to every grid row.
items_subset = items_subset.drop(columns=['item_name'])
df = pd.merge(df, items_subset, how='left', on='item_id')
df.head()

## Adding Lagged Features

* Total item count lag

In [None]:
def add_lag(df, month):
    """Add the ``item_cnt`` of ``month`` months earlier as ``cnt_lag_{month}``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``shop_id``, ``item_id``, ``date_block_num`` and
        ``item_cnt``. It is not modified.
    month : int
        How many months back to look.

    Returns
    -------
    pandas.DataFrame
        A new frame with the extra lag column. Every NaN in the result --
        including lag values for rows with no matching earlier row -- is
        replaced by 0.
    """
    keys = ['shop_id', 'item_id', 'date_block_num']
    # Shift each row's month forward by `month`, so that joining on the keys
    # lines up an earlier month's count with the current row. `assign` builds
    # a new frame instead of writing into a column slice of `df`, which avoids
    # pandas' SettingWithCopyWarning.
    shifted = (
        df[keys + ['item_cnt']]
        .assign(date_block_num=lambda frame: frame['date_block_num'] + month)
        .rename(columns={'item_cnt': f'cnt_lag_{month}'})
    )
    df = pd.merge(df, shifted, how='left', on=keys)
    return df.fillna(0)

# Item-count lags for the previous one, two and three months.
for lag in (1, 2, 3):
    df = add_lag(df, lag)

* Mean item count per month lag

In [None]:
def add_lag_mean(df, month):
    """Add the ``item_cnt_mean`` of ``month`` months earlier as ``cnt_mean_lag_{month}``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``shop_id``, ``item_id``, ``date_block_num`` and
        ``item_cnt_mean``. It is not modified.
    month : int
        How many months back to look.

    Returns
    -------
    pandas.DataFrame
        A new frame with the extra lag column. Every NaN in the result --
        including lag values for rows with no matching earlier row -- is
        replaced by 0.
    """
    keys = ['shop_id', 'item_id', 'date_block_num']
    # Shift each row's month forward by `month`, so that joining on the keys
    # lines up an earlier month's mean with the current row. `assign` builds
    # a new frame instead of writing into a column slice of `df`, which avoids
    # pandas' SettingWithCopyWarning.
    shifted = (
        df[keys + ['item_cnt_mean']]
        .assign(date_block_num=lambda frame: frame['date_block_num'] + month)
        .rename(columns={'item_cnt_mean': f'cnt_mean_lag_{month}'})
    )
    df = pd.merge(df, shifted, how='left', on=keys)
    return df.fillna(0)

# Mean-daily-count lags for the previous one, two and three months.
for lag in (1, 2, 3):
    df = add_lag_mean(df, lag)

## Extracting city names from shop names 
* This led to a performance decrease so was removed in the final version

# Normalise shop names: lower-case, strip punctuation and digits, trim spaces.
# `regex=True` is required here: since pandas 2.0 `str.replace` treats the
# pattern as a literal string by default, which would make both calls no-ops.
# Raw strings keep `\w`, `\s` and `\d` from being read as string escapes.
shops['shop_name'] = (
    shops['shop_name']
    .str.lower()
    .str.replace(r'[^\w\s]', '', regex=True)
    .str.replace(r'\d+', '', regex=True)
    .str.strip()
)
# The first word of the cleaned name is taken as the city.
shops['shop_city'] = shops['shop_name'].str.partition(' ')[0]

# Look the city up by shop_id explicitly. Mapping through `shops.shop_city`
# directly keys on the frame's index, which is only correct when the index
# happens to equal shop_id.
df['shop_city'] = df['shop_id'].map(shops.set_index('shop_id')['shop_city'])
Le = LabelEncoder()
df['shop_city'] = Le.fit_transform(df['shop_city'])

# Mean Encodings
Mean encodings were implemented; however, they did not improve performance and were therefore removed in the final version.

# Create training and test sets

In [None]:
# Time-based split on the month index: months 4-32 for training, month 33 for
# validation, month 34 (the appended test rows) for prediction.
month_idx = df['date_block_num']
train = df[month_idx.gt(3) & month_idx.lt(33)]
validation = df[month_idx.eq(33)]
test = df[month_idx.eq(34)]

In [None]:
# Targets are the monthly counts clipped to [0, 20].
y_train = train['item_cnt'].clip(lower=0, upper=20)
y_valid = validation['item_cnt'].clip(lower=0, upper=20)

# Columns that are the target itself, are derived from the same month's sales,
# or are the month index are excluded from the feature matrices.
non_feature_cols = ['item_cnt', 'date_block_num', 'revenue', 'item_cnt_mean']
X_train = train.drop(columns=non_feature_cols)
X_valid = validation.drop(columns=non_feature_cols)
X_test = test.drop(columns=non_feature_cols)

In [None]:
# The full grid and the unsplit frames are no longer needed; release them.
del df, train, validation, test

In [None]:
# Columns passed to LightGBM as categorical features when the model is fitted.
cat_features = ['shop_id']

# Predictions 

In [None]:
# LightGBM settings: RMSE objective and metric, up to 1000 boosting rounds with
# early stopping after 10 rounds without improvement on the validation set.
params = dict(
    objective='rmse',
    metric='rmse',
    num_leaves=31,
    min_data_in_leaf=10,
    feature_fraction=1,
    learning_rate=0.05,
    num_rounds=1000,
    early_stopping_rounds=10,
    seed=1,
)
clf_lgb = lgb.LGBMRegressor(**params)
cat_features = ['shop_id']
clf_lgb.fit(
    X_train,
    y_train,
    eval_set=[(X_valid, y_valid)],
    eval_metric='rmse',
    categorical_feature=cat_features,
)

In [None]:
# Persist the fitted booster so it can be reloaded without retraining.
lgb_model_path = 'lgb_model.txt'
clf_lgb.booster_.save_model(lgb_model_path)

In [None]:
# RMSE of the LightGBM predictions on the validation month.
lgb_preds = clf_lgb.predict(X_valid)
lgb_valid_rmse = np.sqrt(mean_squared_error(y_valid, lgb_preds))
print(lgb_valid_rmse)

In [None]:
# Predict the test month with LightGBM and write the submission file.
predictions = clf_lgb.predict(X_test)
sample_submission = pd.read_csv('../input/competitive-data-science-predict-future-sales/sample_submission.csv')
# Clip to [0, 20] and assign the clipped values directly. Calling
# `.clip(..., inplace=True)` on a column selection is chained assignment: under
# pandas copy-on-write it modifies a temporary and leaves the frame unclipped.
sample_submission['item_cnt_month'] = np.clip(predictions, 0, 20)
sample_submission.to_csv('lgb.csv', index=False)

In [None]:
from catboost import CatBoostRegressor

# Load a previously trained CatBoost model from the attached dataset.
catboost_model_path = '../input/catboost-model/catboost_model.txt'
cb = CatBoostRegressor()
cb.load_model(catboost_model_path)

In [None]:
# Validation-set predictions from the loaded CatBoost model, used below when
# searching for the blend weight.
cb_preds = cb.predict(X_valid)

In [None]:
# Sweep the blend weight: alpha is the weight of the LightGBM predictions and
# (1 - alpha) the weight of the CatBoost predictions. Record validation RMSE.
alphas = np.linspace(0.2, 0.8, 13)
results = []
for alpha in alphas:
    final_preds = alpha * lgb_preds + (1 - alpha) * cb_preds
    blend_rmse = np.sqrt(mean_squared_error(y_valid, final_preds))
    results.append(blend_rmse)

In [None]:
# Validation RMSE as a function of the blend weight. Axis labels and a title
# are set so the figure can be read on its own.
fig, ax = plt.subplots()
ax.plot(alphas, results)
ax.set(
    xlabel='alpha (weight of the LightGBM predictions)',
    ylabel='validation RMSE',
    title='Blend weight vs. validation RMSE',
);

In [None]:
# Blend the two models on the test month: alpha weights LightGBM and
# (1 - alpha) weights CatBoost.
alpha = 0.4
final_preds = cb.predict(X_test)*(1-alpha) + clf_lgb.predict(X_test)*alpha
sample_submission = pd.read_csv('../input/competitive-data-science-predict-future-sales/sample_submission.csv')
# Clip to [0, 20] and assign the clipped values directly. Calling
# `.clip(..., inplace=True)` on a column selection is chained assignment: under
# pandas copy-on-write it modifies a temporary and leaves the frame unclipped.
sample_submission['item_cnt_month'] = np.clip(final_preds, 0, 20)
sample_submission.to_csv('final.csv', index=False)


# Final Submission 

Although ensembling reduced the loss on the validation dataset, it did not improve the public leaderboard position, possibly due to overfitting. Consequently, the final submission was produced using the LightGBM model alone, with the optimized hyperparameters given above.