In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the translated-dataset directory and print the full path of every file in it.
for folder, _subdirs, names in os.walk('/kaggle/input/eng-translations'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# English-translated lookup tables.
categories, items, shops = (
    pd.read_csv(path)
    for path in (
        "../input/eng-translations/categories_eng.csv",
        "../input/eng-translations/items_eng.csv",
        "../input/eng-translations/shops_eng.csv",
    )
)

# Files shipped with the competition itself.
sales, test, submission = (
    pd.read_csv(path)
    for path in (
        "../input/competitive-data-science-predict-future-sales/sales_train.csv",
        "../input/competitive-data-science-predict-future-sales/test.csv",
        "../input/competitive-data-science-predict-future-sales/sample_submission.csv",
    )
)

In [None]:
def downcast1(df, verbose=True):
    
    """
    Function to reduce the memory used by a dataframe by downcasting its numeric columns
    to less memory-intensive data types.

    The dataframe is modified in place and is also returned.

    * bool columns become int8.
    * Integer columns, and float columns that only hold whole numbers, are downcast to the
      smallest integer type that fits.
    * Remaining numeric columns are downcast to the smallest float type that fits.
    * Non-numeric columns (object, string, category, datetime, ...) are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe to compress.
    verbose : bool, default True
        When True, print the percentage of memory saved.

    Returns
    -------
    pandas.DataFrame
        The same dataframe object that was passed in.
    """
    
    start_mem = df.memory_usage().sum() / 1024**2
    
    for col in df.columns:
        column = df[col]
        dtype_name = column.dtype.name
        if dtype_name == 'bool':
            df[col] = column.astype('int8')
        elif not pd.api.types.is_numeric_dtype(column):
            # round() / to_numeric are not defined for these dtypes, so skip them
            # instead of raising.
            continue
        elif dtype_name.startswith('int') or (column.round() == column).all():
            df[col] = pd.to_numeric(column, downcast='integer')
        else:
            df[col] = pd.to_numeric(column, downcast='float')
    
    end_mem = df.memory_usage().sum() / 1024**2
    
    if verbose:
        # Guard against a zero-sized starting footprint.
        saved = 100 * (start_mem - end_mem) / start_mem if start_mem > 0 else 0.0
        print('{:.1f}% compressed'.format(saved))
    
    return df

In [None]:
# Compress every raw table; downcast1 rewrites the columns of the frame it is given,
# so the return value does not need to be kept.
all_df = [sales, shops, items, categories, test]
for df in all_df:
    downcast1(df)

# Data Preparation and Cleaning

## Shops

In [None]:
shops.sample(10)

In [None]:
import re

def cleans(i):
    
    """
    Function to clean strings, keeping only runs of ASCII letters and digits.

    The input is converted with str(); every alphanumeric run is emitted preceded by a
    single space, so a non-empty result always starts with a space and an input without
    any alphanumeric character yields "".
    """
    
    tokens = re.findall(r'[A-Za-z0-9]+', str(i))
    
    return "".join(f" {token}" for token in tokens)

In [None]:
# Keep only the alphanumeric tokens of each shop name.
shops["clean"] = shops["shop_name"].map(cleans)
shops.head()

In [None]:
# Deal with obsolete shop_ids: each old id is rewritten to the id that replaces it,
# first in the training sales and then in the test set.

for frame in (sales, test):
    for old_id, new_id in ((0, 57), (1, 58), (10, 11), (39, 40)):
        frame.loc[frame["shop_id"] == old_id, "shop_id"] = new_id

In [None]:
# Only keep training rows whose shop also appears in the test data

unique_test_shops = test["shop_id"].unique()
shop_in_test = sales["shop_id"].isin(unique_test_shops)
sales = sales[shop_in_test]

print(f"Number of Unique Shops in Test Data:{len(unique_test_shops)}\nNumber of Unique Shops in Sales Data:{len(sales['shop_id'].unique())}")

In [None]:
# The raw shop name is no longer needed once the cleaned version exists.
shops.drop(columns="shop_name", inplace=True)

In [None]:
# The first whitespace-separated token of the cleaned name is used as the city.
shops["city"] = shops["clean"].map(lambda cleaned_name: cleaned_name.split()[0])

In [None]:
# Use LabelEncoder to convert categorical variables into numerical variables

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

# Replace the city strings with integer codes, then discard the helper column.
city_codes = le.fit(shops["city"]).transform(shops["city"])
shops["city"] = city_codes
shops.drop(columns="clean", inplace=True)


In [None]:
# Final shops dataframe
shops.sample(10)

## Items

In [None]:
# Lower-case the item names, keep their alphanumeric tokens, and drop the raw column.
lowered_names = items["item_name"].str.lower()
items["item_name_clean"] = lowered_names.map(cleans)
items.drop(columns="item_name", inplace=True)

In [None]:
# Label-encode the first five characters of the cleaned item name
# (the cleaned string carries the leading space added by `cleans`).

name_prefixes = items["item_name_clean"].str[:5]
items["item_name_five"] = le.fit_transform(name_prefixes)
items.drop(columns="item_name_clean", inplace=True)

In [None]:
# Create first_sale_date feature: the earliest date_block_num in which each item was sold

first_sale_by_item = sales.groupby("item_id")["date_block_num"].min()

# Look the value up through the item_id column. Assigning the grouped Series directly
# aligns it on the row index of `items`, which is only correct while every row label
# happens to equal that row's item_id.
items["first_sale_date"] = items["item_id"].map(first_sale_by_item)
items

In [None]:
# Items with no first_sale_date were never sold in the training months, so they are
# given 34 (the date_block_num of the test period)

never_sold = items["first_sale_date"].isna()
items["first_sale_date"] = items["first_sale_date"].mask(never_sold, 34)

## Categories

In [None]:
# The first whitespace-separated word of the category name is the coarse category.
categories["category"] = categories["category_name"].map(lambda full_name: full_name.split()[0])
categories

In [None]:
categories["category"].value_counts()

In [None]:
# Cleaning: fold the singular "Game" label into "Games".
# The column selector is required: `.loc[mask] = value` without it overwrites every
# column of the matching rows, including category_id and category_name.

categories.loc[categories["category"] == "Game", "category"] = "Games"

In [None]:
def make_misc(x):
    
    """
    Function to rename low frequency categories to 'Misc'.

    Returns x unchanged when at least five rows of the global `categories` frame carry
    that value in their 'category' column, otherwise returns 'Misc'.
    """
    
    occurrences = (categories['category'] == x).sum()
    return x if occurrences >= 5 else 'Misc'
    
categories["cats"] = categories["category"].map(make_misc)

categories

In [None]:
# Only the grouped 'cats' label is kept from the text columns.
categories.drop(columns=["category", "category_name"], inplace=True)

In [None]:
# Label-encode the 'cats' feature, then remove the text column

encoded_cats = le.fit_transform(categories["cats"])
categories["cats_le"] = encoded_cats
categories.drop(columns="cats", inplace=True)

## Remove Outliers from the Sales Dataframe

In [None]:
# Keep rows with 0 < item_price < 50000 and 0 < item_cnt_day < 1000.
price_ok = (sales["item_price"] > 0) & (sales["item_price"] < 50000)
count_ok = (sales["item_cnt_day"] > 0) & (sales["item_cnt_day"] < 1000)

# .copy() makes the filtered frame independent, so the assignment below does not write
# into a slice of the previous frame (SettingWithCopyWarning).
sales = sales[price_ok & count_ok].copy()

# Vectorised rounding instead of a per-row Python lambda.
sales["item_price"] = sales["item_price"].round(2)
sales

## Data Combinations

In [None]:
# Create a dataframe of the Cartesian Product of the unique shops and unique items for each month

from itertools import product

index_feats = ["date_block_num", "shop_id", "item_id"]

train = []

for i in range(34):
    
    # Sales recorded in month i
    month_sales = sales[sales["date_block_num"] == i]
    cur_shops = month_sales["shop_id"].unique()
    cur_items = month_sales["item_id"].unique()
    
    # Every (month, shop, item) combination of the shops and items active in that month
    month_grid = np.array(list(product([i], cur_shops, cur_items)))
    train.append(month_grid)

train = pd.DataFrame(np.vstack(train), columns=index_feats)
    

In [None]:
# Total units of each item sold per shop and month. This is the form the target variable will take.

group = (
    sales.groupby(index_feats, as_index=False)["item_cnt_day"]
    .sum()
    .rename(columns={"item_cnt_day": "item_cnt_month"})
)

train = train.merge(group, on=index_feats, how="left")
train

In [None]:
# Use garbage collection to minimise memory usage

import gc

del group

gc.collect()

In [None]:
# Add a column counting the sales rows recorded per (month, shop, item).

group = (
    sales.groupby(index_feats, as_index=False)["item_cnt_day"]
    .count()
    .rename(columns={"item_cnt_day": "item_cnt"})
)

train = train.merge(group, on=index_feats, how="left")

train.sample(5)

In [None]:
del group, sales
gc.collect()

## Add Test Data to Overall Dataframe

In [None]:
# The test rows are treated as month 34, directly after the last training month.
test["date_block_num"] = 34

# `keys` is not passed: it has no effect together with ignore_index=True, and giving
# three keys for two frames is a length mismatch that recent pandas versions reject.
all_data = pd.concat([train, test.drop("ID", axis=1)], ignore_index=True)

# Missing values (grid cells without sales, and the target columns of the test rows) become 0.
all_data = all_data.fillna(0)

all_data.sample(10)

In [None]:
# Attach the shop, item and category attributes, one left join per lookup table

for lookup, key in ((shops, "shop_id"), (items, "item_id"), (categories, "category_id")):
    all_data = all_data.merge(lookup, on=key, how="left")

all_data.sample(10)

In [None]:
# Compress the merged frame, then release the lookup tables that are no longer needed.
all_data = downcast1(all_data)

del categories, items, shops

gc.collect()

## Create Mean Features

In [None]:
def add_mean_feats(df, mean_feats, index_features, agg_col="item_cnt_month", agg_func="mean"):
    
    """
    Function to add a feature holding `agg_col` aggregated with `agg_func` over the groups
    defined by `index_features`.

    The new column is named from every grouping column after the first one, followed by
    the aggregated column and the aggregation, e.g. ["date_block_num", "item_id"] gives
    "item_id_item_cnt_month_mean". Any number of grouping columns is supported.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing `index_features` and `agg_col`.
    mean_feats : list
        List the new feature name is appended to (modified in place).
    index_features : list of str
        Columns to group by.
    agg_col : str, default "item_cnt_month"
        Column to aggregate.
    agg_func : default "mean"
        Aggregation passed to DataFrameGroupBy.agg.

    Returns
    -------
    tuple
        (df with the new column, after downcast1; mean_feats)
    """
    
    # Joining all trailing grouping columns keeps the 2- and 3-column names unchanged
    # and gives distinct names for longer (or single-column) groupings.
    feature_name = "_".join(list(index_features[1:]) + [agg_col, str(agg_func)])
        
    group = (
        df.groupby(index_features)
        .agg({agg_col: agg_func})
        .reset_index()
        .rename(columns={agg_col: feature_name})
    )
    
    df = pd.merge(df, group, on=index_features, how="left")
    
    df = downcast1(df)
    
    mean_feats.append(feature_name)
    
    del group
    gc.collect()
    
    return df, mean_feats


In [None]:
# Names of the item-level mean features created below are collected in this list.
item_mean_features = []

all_data, item_mean_features = add_mean_feats(
    df=all_data,
    mean_feats=item_mean_features,
    index_features=["date_block_num", "item_id"],
)

all_data

In [None]:
# Mean target per month, item and city.
all_data, item_mean_features = add_mean_feats(
    df=all_data,
    mean_feats=item_mean_features,
    index_features=["date_block_num", "item_id", "city"],
)

In [None]:
# Names of the shop-level mean features created below are collected in this list.
shop_mean_features = []

all_data, shop_mean_features = add_mean_feats(
    df=all_data,
    mean_feats=shop_mean_features,
    index_features=["date_block_num", "shop_id", "category_id"],
)

In [None]:
# Names of the category-level mean features created below are collected in this list.
cat_mean_features = []

all_data, cat_mean_features = add_mean_feats(
    df=all_data,
    mean_feats=cat_mean_features,
    index_features=["date_block_num", "category_id"],
)

In [None]:
# Mean target per month and encoded category group.
all_data, cat_mean_features = add_mean_feats(
    df=all_data,
    mean_feats=cat_mean_features,
    index_features=["date_block_num", "cats_le"],
)

In [None]:
all_data.sample(10)

## Create Lag Features

In [None]:
def add_lags(df, lag_features, index_features, lag_feature, lags=(1, 2, 3), clip=False):
    
    """
    Function to add lagged copies of `lag_feature` to `df`.

    For every lag k in `lags` a column "<lag_feature>_lag<k>" is added. It holds the value
    `lag_feature` had k date blocks earlier for the same `index_features` key; rows
    without a match are filled with 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing `index_features` and `lag_feature`.
    lag_features : list
        List the new column names are appended to when `clip` is True (modified in place).
    index_features : list of str
        Merge keys; must include "date_block_num".
    lag_feature : str
        Column to lag.
    lags : iterable of int, default (1, 2, 3)
        Number of date blocks to shift by.
    clip : bool, default False
        Whether the new column names are recorded in `lag_features`.

    Returns
    -------
    tuple
        (df with the lag columns, after downcast1; lag_features)
    """
    
    # One row per key: group-level features repeat on every row of the group.
    base = df[index_features + [lag_feature]].drop_duplicates()
    
    for i in lags:
        
        feat_name = lag_feature + "_lag" + str(i)
        
        # Shift a fresh copy of the unshifted base on every iteration. Shifting one shared
        # frame in place would accumulate the offsets (1, then 1+2, then 1+2+3).
        shifted = base.rename(columns={lag_feature: feat_name})
        shifted["date_block_num"] = shifted["date_block_num"] + i
        
        df = pd.merge(df, shifted, on=index_features, how="left")
        df[feat_name] = df[feat_name].fillna(0)
        
        if clip:
            # Record into the list that was passed in, not into a module-level global.
            lag_features.append(feat_name)
            
    df = downcast1(df)
    del base
    gc.collect()
    
    return df, lag_features

In [None]:
lag_feats_to_clip = []
index_features = ["date_block_num", "shop_id", "item_id"]

# Lag both target-derived columns on the (month, shop, item) key.
for target_col in ("item_cnt_month", "item_cnt"):
    all_data, lag_feats_to_clip = add_lags(all_data, lag_feats_to_clip, index_features, target_col, clip=True)

all_data.sample(10)

In [None]:
# Leak check: sum the item mean features over the test month (date_block_num 34)

X_test_temp = all_data.loc[all_data["date_block_num"] == 34]
X_test_temp[item_mean_features].sum()

In [None]:
# Now use the lists that were saved while creating the mean features to create additional lags

for item in item_mean_features:
    all_data, lag_feats_to_clip = add_lags(
        df=all_data,
        lag_features=lag_feats_to_clip,
        index_features=index_features,
        lag_feature=item,
        clip=True,
    )

In [None]:
# Shop-level means are lagged on the key they were computed on.
shop_lag_keys = ["date_block_num", "shop_id", "category_id"]

for shop in shop_mean_features:
    all_data, lag_feats_to_clip = add_lags(
        df=all_data,
        lag_features=lag_feats_to_clip,
        index_features=shop_lag_keys,
        lag_feature=shop,
        clip=True,
    )
    

In [None]:
# Category-level means are lagged on the (month, category_id) key.
for cat in cat_mean_features:
    all_data, lag_feats_to_clip = add_lags(
        df=all_data,
        lag_features=lag_feats_to_clip,
        index_features=["date_block_num", "category_id"],
        lag_feature=cat,
        lags=[1, 2, 3],
        clip=True,
    )
    

In [None]:
# Only the lagged versions of the mean features are kept.
unlagged_means = item_mean_features + shop_mean_features + cat_mean_features
all_data = all_data.drop(columns=unlagged_means)

# Remove the rows of the first three months (date_block_num 0-2).
all_data = all_data[~(all_data["date_block_num"] < 3)]

In [None]:
all_data.sample(10)

In [None]:
del X_test_temp
gc.collect()

## Additional Features

In [None]:
# Row-wise mean of the three item_cnt_month lag columns

recent_lags = [f"item_cnt_month_lag{k}" for k in (1, 2, 3)]
all_data["item_cnt_month_3lag_mean"] = all_data[recent_lags].mean(axis=1)

In [None]:
# Clip the recorded lag columns, the 3-lag mean and the target to the range [0, 20].
cols_to_clip = lag_feats_to_clip + ["item_cnt_month_3lag_mean", "item_cnt_month"]
all_data[cols_to_clip] = all_data[cols_to_clip].clip(0, 20)

In [None]:
# Create lag gradient features: ratio of each lag to the next older one.
# Infinite and undefined ratios (division by zero) are stored as 0.

gradient_specs = (
    ("lag_grad_1", "item_cnt_month_lag1", "item_cnt_month_lag2"),
    ("lag_grad_2", "item_cnt_month_lag2", "item_cnt_month_lag3"),
)

for grad_col, newer_lag, older_lag in gradient_specs:
    ratio = all_data[newer_lag] / all_data[older_lag]
    all_data[grad_col] = ratio.replace([np.inf, -np.inf], np.nan).fillna(0)

In [None]:
# True when the row's month is the month the item was first sold.
all_data["new_items"] = all_data["first_sale_date"].eq(all_data["date_block_num"])

In [None]:
# Months elapsed since the item's first sale; the source column is then removed.
all_data["time_since_first_sale"] = all_data["date_block_num"].sub(all_data["first_sale_date"])

all_data.drop(columns="first_sale_date", inplace=True)

In [None]:
# date_block_num modulo 12 (0-11).
all_data["month"] = all_data["date_block_num"].mod(12)

In [None]:
# Only the lagged versions of item_cnt are kept as features.
all_data.drop(columns="item_cnt", inplace=True)

In [None]:
# Compress the finished feature frame and show its dtypes and memory use.
all_data = downcast1(all_data, verbose=True)
all_data.info()

In [None]:
# Store category_id as int8 (per the original note it arrives here with object dtype)

all_data = all_data.astype({"category_id": "int8"})

## Creating the Machine Learning Models

In [None]:
# Time-based split: months before 33 train, month 33 validates, month 34 is the test set.
is_train = all_data["date_block_num"] < 33
is_val = all_data["date_block_num"] == 33
is_test = all_data["date_block_num"] == 34

y_train = all_data.loc[is_train, "item_cnt_month"]
X_train = all_data[is_train].drop("item_cnt_month", axis=1)

y_val = all_data.loc[is_val, "item_cnt_month"]
X_val = all_data[is_val].drop("item_cnt_month", axis=1)

X_test = all_data[is_test].drop("item_cnt_month", axis=1)

# The combined frame is no longer needed once the splits exist.
del all_data
gc.collect()


In [None]:
def preds(model, test, name):
    
    """
    Function to predict with `model` on `test` and write a submission file.

    The predictions are clipped to [0, 20], stored in the "item_cnt_month" column of the
    sample submission read from the competition input directory, and saved as
    "<name>.csv" in the working directory without the index.
    """
    
    clipped_prediction = model.predict(test).clip(0,20)
    
    submission_frame = pd.read_csv("../input/competitive-data-science-predict-future-sales/sample_submission.csv")
    submission_frame["item_cnt_month"] = clipped_prediction
    submission_frame.to_csv(f"{name}.csv", index=False)
    
    print("Complete.")

In [None]:
# Try Light Gradient Boosting Machine, parameters can be altered for further accuracy.

import lightgbm as lgb

params = {'metric': 'rmse',
          'num_leaves': 255,
          'learning_rate': 0.005,
          'feature_fraction': 0.75,
          'bagging_fraction': 0.75,
          'bagging_freq': 5,
          'force_col_wise' : True,
          'random_state': 10,
          'num_rounds':1500,
          'early_stopping':150}

lgb_train = lgb.Dataset(X_train, y_train)

lgb_val = lgb.Dataset(X_val, y_val)

# Progress is logged every 50 iterations through the log_evaluation callback. The
# `verbose_eval` keyword this replaces was removed from lgb.train in LightGBM 4.x;
# the callback is available from LightGBM 3.3.
model = lgb.train(
    params=params,
    train_set=lgb_train,
    valid_sets=[lgb_train, lgb_val],
    callbacks=[lgb.log_evaluation(period=50)],
)

In [None]:
preds(model, X_test, "lgb_model")

In [None]:
del lgb_train, lgb_val
gc.collect()

In [None]:
def plot_features(booster, figsize):
    
    """
    Function to draw a feature importance plot for `booster` on a new figure of size
    `figsize` and return the result of plot_importance.
    """
    
    _figure, axes = plt.subplots(nrows=1, ncols=1, figsize=figsize)
    importance_axes = plot_importance(booster=booster, ax=axes)
    return importance_axes

In [None]:
# Try an Extreme Gradient Boosting model

from xgboost import XGBRegressor, plot_importance
import matplotlib.pyplot as plt


# eval_metric is given to the estimator rather than to fit(): passing it to fit() is
# deprecated and no longer accepted by current XGBoost releases.
xgb_model = XGBRegressor(
    max_depth=8,
    n_estimators=100,
    min_child_weight=300, 
    colsample_bytree=0.8, 
    subsample=0.8, 
    eta=0.1,    
    seed=42,
    eval_metric="rmse")

xgb_model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_val, y_val)])

# Fixed output name: the previous f"xgb_{i}" relied on a loop variable left over from
# an earlier cell.
preds(xgb_model, X_test, "xgb_model")

plot_features(xgb_model, (10, 14))