# Preprocessing

In [1]:
import calendar
import warnings
warnings.filterwarnings('ignore')

from datetime import datetime, timedelta
import pandas as pd
import numpy as np

from constants import CATEGORY_CODES, CATEGORY_COLUMN, DATE_COLUMN, FURTHER_DAYS, TARGET_COLUMN, TOTAL_COLUMNS

In [2]:
def date_format(x):
    return datetime.strptime(x, '%d.%m.%Y')

data = pd.read_csv('sales_train.csv', date_parser=date_format, parse_dates=[0])
data.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,2013-01-02,0,59,22154,999.0,1.0
1,2013-01-03,0,25,2552,899.0,1.0
2,2013-01-05,0,25,2552,899.0,-1.0
3,2013-01-06,0,25,2554,1709.05,1.0
4,2013-01-15,0,25,2555,1099.0,1.0


In [3]:
items = pd.read_csv('items.csv', index_col='item_id')['item_category_id']
categories = pd.read_csv('item_categories.csv', index_col='item_category_id', squeeze=True)
category_names = items.map(categories)

In [4]:
def preprocess_data(df: pd.DataFrame, category_names: pd.Series) -> pd.DataFrame:
    df = df[df[TARGET_COLUMN] > 0]
    df[CATEGORY_COLUMN] = df['item_id'].map(category_names)
    df = df[df[CATEGORY_COLUMN].isin(CATEGORY_CODES.values())]
    df = df.set_index('date').groupby(CATEGORY_COLUMN).resample('D').sum().reset_index()
    to_concat = list()
    for code, category in CATEGORY_CODES.items():
        df_ = df[df[CATEGORY_COLUMN] == category].set_index(DATE_COLUMN)
        print(f'{code} max date: {df_.index.max()}')
        date_index = pd.date_range(
            start=df_.index.min(),
            end=(df_.index.max() + timedelta(days=FURTHER_DAYS)),
            freq='D'
        )
        df_ = df_.reindex(date_index)
        df_[CATEGORY_COLUMN] = category
        df_.reset_index(inplace=True)
        df_ = df_.rename(columns={'index': DATE_COLUMN})
        to_concat.append(df_)
    df = pd.concat(to_concat)
    df['month'] = df['date'].dt.month
    df['month'] = df['date'].dt.day
    df['month_range'] = df['date'].apply(lambda date: calendar.monthrange(date.year, date.month)[1])
    df['month_cos'] = np.cos(df['month'] - 1 / 12 * 2 * np.pi)
    df['month_sin'] = np.sin(df['month'] - 1 / 12 * 2 * np.pi)
    df['day'] = df['date'].apply(lambda date: date.day)
    df['day_cos'] = np.cos(df['day'] - 1 / df['month_range'] * 2 * np.pi)
    df['day_sin'] = np.sin(df['day'] - 1 / df['month_range'] * 2 * np.pi)
    df['day_of_week'] = df['date'].dt.dayofweek
    df['day_of_week_cos'] = np.cos(df['day_of_week'] / 7 * 2 * np.pi)
    df['day_of_week_sin'] = np.sin(df['day_of_week'] / 7 * 2 * np.pi)
    
    return df[TOTAL_COLUMNS]

In [5]:
data = preprocess_data(data, category_names)
data.to_csv('../data.tsv', sep='\t')
data.head()

movies_dvd max date: 2015-10-31 00:00:00
music_cd_local max date: 2015-10-31 00:00:00
games_ps_3 max date: 2015-10-31 00:00:00
games_xbox_360 max date: 2015-10-31 00:00:00
presents_softtoy max date: 2015-10-31 00:00:00
presents_boardgame max date: 2015-10-31 00:00:00


Unnamed: 0,day_cos,day_sin,month_cos,month_sin,day_of_week_cos,day_of_week_sin,category,date,item_cnt_day
0,0.698629,0.715484,0.888651,0.458584,0.62349,0.781831,Кино - DVD,2013-01-01,492.0
1,-0.224588,0.974454,0.094255,0.995548,-0.222521,0.974928,Кино - DVD,2013-01-02,1984.0
2,-0.94132,0.337515,-0.786799,0.61721,-0.900969,0.433884,Кино - DVD,2013-01-03,1802.0
3,-0.792607,-0.609733,-0.944473,-0.328588,-0.900969,-0.433884,Кино - DVD,2013-01-04,1680.0
4,0.084826,-0.996396,-0.233803,-0.972284,-0.222521,-0.974928,Кино - DVD,2013-01-05,1588.0


In [81]:
# import streamlit_authenticator as stauth
# passwords = ['qwerty12345']
# hashed_passwords = stauth.Hasher(passwords).generate()
# print(hashed_passwords)

# import pickle
# with open('hashed_pwd.pickle', 'wb') as f:
#     pickle.dump(hashed_passwords, f)


['$2b$12$n9Kfl0QyUmjlXDhP4MIkt.FpWiVEPvMPSHv5ZHYkZpfOll0zl26/e']
