In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout,TimeDistributed,GRU,SimpleRNN
from keras.preprocessing.sequence import TimeseriesGenerator
import keras
from keras.models import Sequential
from keras.layers import Dense
import tensorflow as tf

In [None]:
pd.set_option('display.max_columns', 50)

In [None]:
articles_df=pd.read_csv(r'articles.csv')
customers_df=pd.read_csv(r'customers.csv')
transcations_df=pd.read_csv(r'transactions_train.csv')

In [None]:
articles_df.head()

In [None]:
articles_df.info()

In [None]:
customers_df.head()

In [None]:
customers_df.info()

In [None]:
transcations_df.head()

In [None]:
transcations_df.head()

In [None]:
def missing_data(func_df):
    total=func_df.isnull().sum().sort_values(ascending=False)
    percent=(func_df.isnull().sum()/func_df.isnull().count()*100).sort_values(ascending=False)
    return pd.concat([total,percent],axis=1,keys=['Total','Percent'])

In [None]:
missing_data(articles_df)

In [None]:
missing_data(customers_df)

In [None]:
missing_data(transcations_df)

In [None]:
def unique_values(func_df):
    total=func_df.count()
    tt=pd.DataFrame(total)
    tt.columns=['Total']
    uniques=[]
    for col in func_df.columns:
        unique=func_df[col].nunique()
        uniques.append(unique)
    tt['Uniques']=uniques
    return tt

In [None]:
unique_values(articles_df)

In [None]:
unique_values(customers_df)

In [None]:
unique_values(transcations_df)

### Here i am dropping this columns because it has the column which represents in the number format in simple words to reduce multicollinearity
### Eg product_name column has product_name which is same 

In [None]:
cols=['prod_name','product_type_name','graphical_appearance_name','colour_group_name','perceived_colour_value_name',
      'department_name','index_name','index_group_name','section_name','garment_group_name']
articles_df.drop(columns=cols,axis=1,inplace=True)

In [None]:
articles_df.head()

In [None]:
label_encoder = preprocessing.LabelEncoder()
cols=['product_group_name','perceived_colour_master_name','index_code']
for col in cols:
    articles_df[col]= label_encoder.fit_transform(articles_df[col]) 

### I will be dropping detail_desc column 

In [None]:
articles_df.drop(['detail_desc'],axis=1,inplace=True)

In [None]:
articles_df

In [None]:
customers_df.head()

In [None]:
customers_df.shape

In [None]:
customers_df['fashion_news_frequency'].value_counts()

In [None]:
print("The precentage of null value in FN column is {}%".format(round(((customers_df['FN'].isna().sum()/customers_df.shape[0])*100),2)))
print("The precentage of null value in Active column is {}%".format(round(((customers_df['Active'].isna().sum()/customers_df.shape[0])*100),2)))

### I am dropping the FN and Active column because it has more than 65 % null values 

In [None]:
customers_df.drop(['FN','Active'],axis=1,inplace=True)

In [None]:
customers_df['club_member_status'].isna().sum()

In [None]:
customers_df['club_member_status'].fillna('ACTIVE',axis=0,inplace=True)

In [None]:
customers_df['club_member_status'].isna().sum()

In [None]:
customers_df['fashion_news_frequency'].replace(to_replace='NONE',value='None',inplace=True)

In [None]:
customers_df['fashion_news_frequency'].value_counts()

In [None]:
customers_df['postal_code'].nunique()

### Since we has so many unique values in postal_code so  i will be dropping this column

In [None]:
customers_df.drop(['postal_code'],axis=1,inplace=True)

In [None]:
label_encoder = preprocessing.LabelEncoder()
cols=['club_member_status','fashion_news_frequency']
for col in cols:
    customers_df[col]= label_encoder.fit_transform(customers_df[col]) 

In [None]:
customers_df.head()

In [None]:
transcations_df.head()

### There is no null values in transcations 

In [None]:
transcations_df['t_dat']=pd.to_datetime(transcations_df["t_dat"])

In [None]:
transcations_df.tail()

In [None]:
transcations_df.shape

### Here i have selected only the transcations after the year 2020

In [None]:
transcations_df=transcations_df[transcations_df['t_dat']>'2020-01-01'].copy()
transcations_df.shape

In [None]:
transcations_df.head()

In [None]:
inner_merged=pd.merge(transcations_df,articles_df)#inner merge 

In [None]:
inner_merged.shape

In [None]:
inner_merged.head()

In [None]:
final_merged=pd.merge(inner_merged,customers_df)

In [None]:
final_merged.shape

In [None]:
final_merged.head()

### We dont need article_id and customer_id anymore so we will be dropping that too

In [None]:
final_merged.drop(['customer_id','article_id'],axis=1,inplace=True)

In [None]:
# final_merged.to_csv('cleaned_h_m_dataset',encoding='utf-8',index=False)

In [3]:
final_merged=pd.read_csv(r'cleaned_h_m_dataset')

In [4]:
final_merged.head()

Unnamed: 0,t_dat,price,sales_channel_id,product_code,product_type_no,product_group_name,graphical_appearance_no,colour_group_code,perceived_colour_value_id,perceived_colour_master_id,perceived_colour_master_name,department_no,index_code,index_group_no,section_no,garment_group_no,club_member_status,fashion_news_frequency,age
0,2020-01-02,0.030492,2,797565,306,16,1010016,42,7,18,13,1338,1,1,61,1017,0,2,25.0
1,2020-01-02,0.030492,2,797565,306,16,1010016,9,4,5,1,1338,1,1,61,1017,0,2,25.0
2,2020-08-14,0.033881,2,751471,272,6,1010016,9,4,5,1,1722,0,1,15,1009,0,2,25.0
3,2020-08-14,0.050831,2,781613,264,7,1010016,9,4,5,1,1222,0,1,15,1008,0,2,25.0
4,2020-04-24,0.025407,2,855823,265,5,1010016,9,4,5,1,1344,3,2,53,1013,0,2,25.0


In [5]:
final_merged.t_dat.max()

'2020-09-22'

In [6]:
df_train=final_merged[(final_merged['t_dat']>'2020-08-28') & (final_merged['t_dat']<'2020-09-20')].copy()
df_test=final_merged[final_merged['t_dat']>='2020-09-20'].copy()

In [7]:
df_train.t_dat.max()

'2020-09-19'

In [8]:
df_train.drop(['t_dat'],axis=1,inplace=True)
df_test.drop(['t_dat'],axis=1,inplace=True)

In [9]:
y_train=df_train['product_code'].tolist()
y_test=df_test['product_code'].tolist()
df_train.drop(['product_code'],axis=1,inplace=True)
df_test.drop(['product_code'],axis=1,inplace=True)
x_train=df_train.to_numpy().tolist()
x_test=df_test.to_numpy().tolist()

In [10]:
y_test

[707269,
 812364,
 814230,
 764073,
 764073,
 761575,
 884220,
 903735,
 855834,
 842028,
 842028,
 870328,
 831302,
 925509,
 786022,
 892555,
 807747,
 778534,
 687347,
 751471,
 751471,
 851400,
 851400,
 895991,
 895991,
 902999,
 685814,
 685814,
 653275,
 910241,
 873884,
 918892,
 863583,
 863583,
 891886,
 790368,
 790368,
 894780,
 874891,
 873279,
 881570,
 448509,
 448509,
 448509,
 781613,
 736049,
 873771,
 898713,
 714790,
 714790,
 923037,
 762796,
 886566,
 902265,
 902265,
 927172,
 909320,
 677930,
 543054,
 687524,
 685044,
 827968,
 827968,
 718278,
 598515,
 782734,
 903487,
 903487,
 861558,
 763842,
 873884,
 873884,
 865939,
 907951,
 872537,
 832307,
 832307,
 547780,
 868823,
 826492,
 903773,
 751288,
 903306,
 855080,
 855080,
 855080,
 855080,
 855080,
 928907,
 928907,
 921697,
 758034,
 685604,
 685604,
 926921,
 926921,
 621381,
 621381,
 803969,
 902388,
 803757,
 857778,
 874754,
 859400,
 896064,
 708138,
 909588,
 857440,
 904022,
 898694,
 695632,
 

In [11]:
train_generator = TimeseriesGenerator(x_train, y_train, length=7, sampling_rate = 1, batch_size = 32)
test_generator = TimeseriesGenerator(x_test, y_test, length=7, sampling_rate = 1, batch_size = 32)

In [12]:
# df_product_code_train=df_train.copy()
# scaler=MinMaxScaler()
# df_train=scaler.fit_transform(df_train)
# df_test=scaler.transform(df_test)
# scaler_pred=MinMaxScaler()
# df_product_code=scaler_pred.fit_transform(df_product_code_train.product_code.values.reshape(-1,1))

In [13]:
df_train.shape

(816929, 17)

In [14]:
keras.backend.clear_session()
np.random.seed(42)
tf.random.set_seed(42)
model = keras.models.Sequential([
    keras.layers.GRU(512, input_shape = (7, 17), dropout=0.2, return_sequences=True),
    keras.layers.GRU(512, dropout=0.2, return_sequences=True),
    keras.layers.GRU(512, dropout=0.2, return_sequences=True),
    keras.layers.GRU(512, dropout=0.2, return_sequences=True),
    keras.layers.GRU(512, dropout=0.2, return_sequences=True),
    keras.layers.Dense(1)
])

In [15]:
model.compile(loss="mean_squared_logarithmic_error", optimizer= "adam", metrics=["mae"])
history = model.fit_generator(train_generator, validation_data = test_generator, epochs = 10)

Epoch 1/10


  history = model.fit_generator(train_generator, validation_data = test_generator, epochs = 10)


Epoch 2/10

KeyboardInterrupt: 

In [16]:
predictions = model.predict_generator(test_generator)

  predictions = model.predict_generator(test_generator)
