In [1]:
import numpy as np
import pandas as pd 
from tqdm import tqdm_notebook
import os
import sys
import datetime
import os.path
from calendar import monthrange

import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib import rc
from cycler import cycler
%matplotlib inline

from dateutil import rrule
from datetime import date
 
# Global plotting defaults: custom colour cycle, serif fonts, large figures.
# NOTE(review): '#009999' occurs twice in the cycle below — confirm whether the
# second occurrence was meant to be a different colour.
mpl.rcParams['axes.prop_cycle'] = cycler('color', [
    '#ff0000', '#0000ff', '#00ffff', '#ffA300', '#00ff00', '#ff00ff',
    '#990000', '#009999', '#999900', '#009900', '#009999',
])

rc('font', size=16, family='serif', serif=['Computer Modern'])
mpl.rcParams['text.usetex'] = False
mpl.rcParams['figure.figsize'] = (12, 10)
mpl.rcParams['axes.linewidth'] = .5
mpl.rcParams['lines.linewidth'] = 1.75

# Silence pandas' chained-assignment warning (its default setting is 'warn').
pd.set_option('mode.chained_assignment', None)

sys.path.append("../src") # Adds directory to python modules path.
from models import utils
from data import distributions

In [2]:
from googletrans import Translator
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

In [3]:
# Read the three raw lookup tables in one pass: items, shops, item categories.
items, shops, item_cats = (
    utils.load_raw_data(filename)
    for filename in ('items.csv', 'shops.csv', 'item_categories.csv')
)

In [4]:
# Add English versions of the Russian category and shop names.
translator = Translator()


def _ru_to_en(text):
    """Return the text of ``translator``'s translation of ``text`` (source language 'ru')."""
    return translator.translate(text, src='ru').text


item_cats['item_category_name_en'] = item_cats['item_category_name'].apply(_ru_to_en)
shops['shop_name_en'] = shops['shop_name'].apply(_ru_to_en)

In [7]:
# Meta category: by default the part of the English category name before ' - '.
names_en = item_cats['item_category_name_en']
meta_cat = names_en.apply(lambda name: name.split(' - ')[0])

# Two rows take a different piece of their own name.
meta_cat.iloc[0] = names_en.iloc[0].split(' / ')[0]
meta_cat.iloc[25] = names_en.iloc[25].split(' - ')[1]

# Remaining exceptions are assigned a fixed label by row position.
for position, label in (
    (26, 'Phone Games'),
    (27, 'Phone Games'),
    (32, 'Payment Cards'),
    (40, 'Movies'),
    (41, 'Movies'),
    (81, 'Clean'),
    (82, 'Clean'),
):
    meta_cat.iloc[position] = label

# Store the meta category as an integer code.
item_cats['meta_cat'] = LabelEncoder().fit_transform(meta_cat)

In [9]:
def translate_list(l):
    """Translate every entry of ``l`` with source language 'ru'.

    A fresh ``Translator`` is created per call and each entry is translated
    individually. Returns a list with the translated text of each entry, in
    the same order as ``l``.
    """
    ru_translator = Translator()
    translated = []
    for word in l:
        translated.append(ru_translator.translate(word, src='ru').text)
    return translated

In [10]:
def tokenize_doc(doc):
    """Split a document into a set of lower-cased unigrams and bigrams.

    The characters , ( ) [ ] " ! . are deleted outright (not replaced by a
    space), the remainder is lower-cased and split on whitespace, and a lone
    '-' token is discarded. Bigrams are formed from adjacent surviving tokens
    and joined with a single space.

    Returns a ``set`` of strings, so repeated tokens collapse to one entry.
    """
    # Delete the punctuation characters in a single pass.
    stripped = doc.translate(str.maketrans('', '', ',()[]"!.'))

    words = [tok for tok in stripped.lower().split() if tok not in ('-', '')]
    # Pair each token with its successor to build the bigrams.
    pairs = {' '.join(pair) for pair in zip(words, words[1:])}

    return set(words).union(pairs)
    
    
def get_tfidf(s):
    """Build a TF-IDF feature frame for a Series of text documents.

    Documents are tokenised with ``tokenize_doc``. The vectorizer is configured
    with ``min_df=0.1`` and ``max_df=0.5`` (document-frequency bounds) and
    ``norm='l2'``. Column names are the vocabulary terms passed through
    ``translate_list``.

    Parameters
    ----------
    s : pandas.Series
        The documents to vectorise.

    Returns
    -------
    pandas.DataFrame
        One row per element of ``s`` (sharing ``s``'s index) and one column
        per retained term.
    """
    vectorizer = TfidfVectorizer(
        norm='l2', tokenizer=tokenize_doc,
        min_df=0.1, max_df=0.5,
    )
    response = vectorizer.fit_transform(s.values)

    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on releases that predate it.
    if hasattr(vectorizer, 'get_feature_names_out'):
        terms = vectorizer.get_feature_names_out().tolist()
    else:
        terms = vectorizer.get_feature_names()

    # Keep the input's index so an index-based join back onto the source
    # frame lines up row for row even when that index is not 0..n-1.
    df = pd.DataFrame(response.toarray(), index=s.index)
    df.columns = translate_list(terms)
    return df


def add_tfidf(df, column):
    """Return ``df`` joined with TF-IDF features computed from ``df[column]``.

    Feature columns are prefixed with the initials of ``column``'s
    underscore-separated parts followed by '_' (for example 'item_name'
    gives the prefix 'in_'). The join is performed on the index.
    """
    initials = ''.join(part[0] for part in column.split('_'))
    features = get_tfidf(df[column])
    features.columns = ['{}_{}'.format(initials, name) for name in features.columns]
    return df.join(features)

In [11]:
# Extend each lookup table with TF-IDF columns built from its name column.
# Each name is rebound to the extended frame returned by add_tfidf.
items = add_tfidf(df=items, column='item_name')
shops = add_tfidf(df=shops, column='shop_name')
item_cats = add_tfidf(df=item_cats, column='item_category_name')

In [12]:
# Attach the category-level features to every item (inner join on category id).
items_ext = pd.merge(items, item_cats, how='inner', on='item_category_id')

## Feature selection

In [13]:
# Pass both lookup tables through utils.downcast_dtypes before the large merge.
items_ext, shops = (
    utils.downcast_dtypes(frame) for frame in (items_ext, shops)
)

In [14]:
# Assemble the modelling table: monthly data joined with shop and item features.
data = distributions.fix_train_distribution(utils.load_monthly_data())

print('merging shops')
data = pd.merge(data, shops, how='inner', on='shop_id')

print('merging items')
data = pd.merge(data, items_ext, how='inner', on='item_id')

data = utils.downcast_dtypes(data)
data.info()

merging shops
merging items
<class 'pandas.core.frame.DataFrame'>
Int64Index: 10906930 entries, 0 to 10906929
Data columns (total 23 columns):
date_block_num           int32
shop_id                  int32
item_id                  int32
item_cnt_month           float32
shop_name                object
shop_name_en             object
sn_mega                  float32
sn_Moscow                float32
sn_moscow tc             float32
sn_trc                   float32
sn_tsh                   float32
sn_tc mega               float32
item_name                object
item_category_id         int32
in_PC                    float32
in_version               float32
item_category_name       object
item_category_name_en    object
meta_cat                 int32
icn_games                float32
icn_books                float32
icn_gifts                float32
icn_numeral              float32
dtypes: float32(13), int32(5), object(5)
memory usage: 1.2+ GB


In [15]:
# Identifier, target and free-text columns that must not be fed to the model.
rem_cols = [
    'date_block_num', 'shop_id', 'item_id', 'item_cnt_month', 'item_category_id',
    'shop_name', 'shop_name_en', 'item_name', 'item_name_en',
    'item_category_name', 'item_category_name_en'
]
cols = [c for c in data.columns if c not in rem_cols]

# Draw the training sample with a fixed seed so the feature-importance run is
# reproducible, and never request more rows than exist (sampling without
# replacement raises otherwise).
samp_data = data.sample(min(2**21, len(data)), random_state=0)
x_train = samp_data[cols]
y_train = samp_data['item_cnt_month']

In [16]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Fit a small random-forest regressor, used here only to rank the features.
clf = RandomForestRegressor(n_estimators=10, random_state=0)
clf.fit(x_train, y_train)

# Print each feature name together with its importance in the fitted forest.
for name, importance in zip(x_train.columns, clf.feature_importances_):
    print((name, importance))

# RMSE of the forest on the same sample it was trained on (not a held-out score).
mse = mean_squared_error(y_train, clf.predict(x_train))
print('RMSE: %.4f' % (mse ** 0.5))

# Baseline for comparison: predict zero for every row.
mse = mean_squared_error(y_train, np.zeros(y_train.shape))
print('All zeros RMSE: %.4f' % (mse ** 0.5))

('sn_mega', 0.012185496289381648)
('sn_Moscow', 0.15207052774481017)
('sn_moscow tc', 0.025179021843648834)
('sn_trc', 0.13228441108425501)
('sn_tsh', 0.087075802634659583)
('sn_tc mega', 0.0072733920793189161)
('in_PC', 0.032075496496021635)
('in_version', 0.084729891364759585)
('meta_cat', 0.26893544390704122)
('icn_games', 0.16817464698925494)
('icn_books', 0.0067735790799959656)
('icn_gifts', 0.015752851767275165)
('icn_numeral', 0.0074894387195772391)
RMSE: 3.5497
All zeros RMSE: 3.5999


## Extra features

In [17]:
# Collapse the two-word city in this shop name so the city is a single token.
shops.loc[shops['shop_name'] == 'Сергиев Посад ТЦ "7Я"', 'shop_name'] = 'СергиевПосад ТЦ "7Я"'

# The city is taken to be the first space-separated token of the shop name.
city = shops['shop_name'].str.split(' ').apply(lambda tokens: tokens[0])
# One entry carries a leading '!'; map it onto the plain city name.
city.loc[city == '!Якутск'] = 'Якутск'
shops['city_code'] = LabelEncoder().fit_transform(city)

# The raw and translated names are dropped once the city code exists.
to_drop = ['shop_name', 'shop_name_en']
shops.drop(columns=to_drop, errors='ignore', inplace=True)

In [18]:
# Derive integer "type" and "subtype" codes from the category name.
# NOTE(review): this splits on a bare '-', whereas the English category names
# are split on ' - ' when building meta_cat; a hyphen inside a word would also
# be treated as a separator here — confirm that is intended.
cats_split = items_ext['item_category_name'].str.split('-')

cats_type = cats_split.apply(lambda parts: parts[0].strip())
items_ext['type_code'] = LabelEncoder().fit_transform(cats_type)

# A category with no second part reuses its type as the subtype.
cats_subtype = cats_split.apply(
    lambda parts: (parts[1] if len(parts) > 1 else parts[0]).strip()
)
items_ext['subtype_code'] = LabelEncoder().fit_transform(cats_subtype)

# Text columns are dropped once the codes have been extracted.
to_drop = ['item_name', 'item_category_name', 'item_category_name_en']
items_ext.drop(columns=to_drop, errors='ignore', inplace=True)

## Save features

In [19]:
# Summarise the item feature table (columns, dtypes, memory) before it is saved.
items_ext.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22170 entries, 0 to 22169
Data columns (total 11 columns):
item_id             22170 non-null int32
item_category_id    22170 non-null int32
in_PC               22170 non-null float32
in_version          22170 non-null float32
meta_cat            22170 non-null int32
icn_games           22170 non-null float32
icn_books           22170 non-null float32
icn_gifts           22170 non-null float32
icn_numeral         22170 non-null float32
type_code           22170 non-null int64
subtype_code        22170 non-null int64
dtypes: float32(6), int32(3), int64(2)
memory usage: 1.3 MB


In [20]:
# Summarise the shop feature table (columns, dtypes, memory) before it is saved.
shops.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60 entries, 0 to 59
Data columns (total 8 columns):
shop_id         60 non-null int32
sn_mega         60 non-null float32
sn_Moscow       60 non-null float32
sn_moscow tc    60 non-null float32
sn_trc          60 non-null float32
sn_tsh          60 non-null float32
sn_tc mega      60 non-null float32
city_code       60 non-null int64
dtypes: float32(6), int32(1), int64(1)
memory usage: 2.2 KB


In [22]:
# Persist only the selected shop and item features for downstream use.
data_path = '../data/processed/'
shops_columns = ['shop_id', 'sn_Moscow', 'city_code']
items_columns = [
    'item_id', 'item_category_id', 'in_PC', 'in_version', 'icn_games', 
    'type_code', 'subtype_code', 'meta_cat'
]

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(data_path, exist_ok=True)

shops[shops_columns].to_csv(os.path.join(data_path, 'shops.csv'), index=False)
items_save = items_ext[items_columns]
items_save.to_csv(os.path.join(data_path, 'items.csv'), index=False)