In [None]:
import pandas as pd
import numpy as np

# Data Preparation

In [None]:
# Read the three competition CSV files (articles, customers, transactions)
# into DataFrames, in that order.
articles, customers, transactions = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/h-and-m-personalized-fashion-recommendations/articles.csv",
        "../input/h-and-m-personalized-fashion-recommendations/customers.csv",
        "../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv",
    )
)

In [None]:
# Print a concise summary of the transactions frame (columns, dtypes, non-null counts).
transactions.info()

In [None]:
# Preview the first rows of the transactions frame.
transactions.head()

In [None]:
# Largest value in the t_dat column.
# NOTE(review): t_dat is read from CSV without date parsing, so this is presumably
# a string comparison; it equals the latest date only if the values are
# ISO-formatted (YYYY-MM-DD) — confirm.
transactions['t_dat'].max()

In [None]:
# Draw a random sample of 10000 transactions to keep the experiment small.
# random_state pins the draw so that Restart & Run All reproduces the same
# sample (and therefore the same downstream statistics and model).
df_sample = transactions.sample(n=10000, random_state=42)
df_sample.shape

In [None]:
# one month sample of data
# df_sample = transactions[transactions['t_dat'] > '2020-09-10']

# df_sample.shape

In [None]:
# Attach the customer columns to each sampled transaction, matching rows on
# customer_id (default inner join).
df_sample = df_sample.merge(customers, on='customer_id')

# The equivalent join with the article table is currently disabled:
# df_sample =pd.merge(df_sample, articles, on='article_id')

In [None]:
# Preview the first rows of the sample after the customer join.
df_sample.head()

In [None]:
# Percentage of missing values per column: the mean of the boolean NA mask is
# the fraction of missing rows, scaled here to percent.
df_sample.isna().mean() * 100

In [None]:
# Print the distinct values found in the FN and Active columns, one array per column.
for flag_col in ('FN', 'Active'):
    print(df_sample[flag_col].unique())

In [None]:
# Replace missing entries in the FN and Active columns with 0.
flag_cols = ['FN', 'Active']
df_sample[flag_cols] = df_sample[flag_cols].fillna(0)

# Re-check the percentage of missing values per column after the fill.
df_sample.isna().mean() * 100

In [None]:
# Impute the remaining missing values with each column's most frequent value.
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='most_frequent')

# fit_transform returns a plain NumPy array; for a frame that mixes strings and
# numbers that array has dtype=object and carries no column labels or index.
# Rebuild the frame with the original columns and index, then cast each column
# back to its original dtype so numeric columns stay numeric instead of
# silently becoming generic objects.
imputed_df_sample = pd.DataFrame(
    imputer.fit_transform(df_sample),
    columns=df_sample.columns,
    index=df_sample.index,
).astype(df_sample.dtypes.to_dict())

# Percentage of missing values per column after imputation.
print(imputed_df_sample.isnull().sum()/imputed_df_sample.isnull().count()*100)

In [None]:
# Parse the transaction date and ordinal-encode the remaining object-dtype columns.
from sklearn.preprocessing import OrdinalEncoder

# Convert t_dat to datetime BEFORE encoding. If t_dat is still an object column
# when the encoder runs, it is replaced by ordinal codes, and pd.to_datetime
# would then interpret those numbers as nanoseconds since the epoch, producing
# meaningless dates.
imputed_df_sample['t_dat'] = pd.to_datetime(imputed_df_sample['t_dat'])

ordinal_encoder = OrdinalEncoder()

# Columns still stored as objects are the ones that need a numeric encoding.
object_cols = imputed_df_sample.select_dtypes(include=['object']).columns

# Guard against an empty selection (e.g. when this cell is re-run after the
# columns were already encoded); the encoder cannot be fit on zero columns.
if len(object_cols) > 0:
    # Wrap the encoder output in a DataFrame so the assignment replaces each
    # column (giving it a float dtype) rather than writing into the old
    # object-dtype storage.
    imputed_df_sample[object_cols] = pd.DataFrame(
        ordinal_encoder.fit_transform(imputed_df_sample[object_cols]),
        columns=object_cols,
        index=imputed_df_sample.index,
    )

imputed_df_sample.info()

In [None]:
# Display the imputed and encoded frame to inspect the result of the preparation steps.
imputed_df_sample

In [None]:
# Remove the price column; the stated reason is to prevent data leakage.
imputed_df_sample = imputed_df_sample.drop(columns=['price'])

In [None]:
# Plot the correlation matrix of the numeric columns as a heatmap.
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl

# Restrict to numeric columns before calling corr(): the frame also holds the
# datetime column t_dat, and recent pandas versions no longer drop non-numeric
# columns automatically when computing correlations.
numeric_corr = imputed_df_sample.select_dtypes(include='number').corr()

plt.figure(figsize=[7,5])
sns.heatmap(numeric_corr)
plt.title('Correlation matrix of numeric features')
plt.show()

# Random Forest

In [None]:
from sklearn.model_selection import train_test_split

# Target: the purchased article. Features: a small set of transaction and
# customer columns.
y = imputed_df_sample['article_id']
selected_columns = ['sales_channel_id', 'fashion_news_frequency' , 'postal_code']

# Split into 70% training and 30% validation data. random_state fixes the
# shuffle so the split (and every score derived from it) is reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    imputed_df_sample[selected_columns],
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Create the model: a random forest of 150 trees, each limited to depth 7,
# built with a single job.
from sklearn.ensemble import RandomForestClassifier

# random_state seeds the bootstrap sampling and feature selection so repeated
# runs train the same forest.
model = RandomForestClassifier(n_estimators=150, n_jobs=1, max_depth=7, random_state=42)


In [None]:
# Train the random forest on the training split.
model.fit(X_train, y_train)

In [None]:
# Predict an article label for every row of the validation split and print the
# resulting array.
predict_labels = model.predict(X_valid)
print(predict_labels)

In [None]:
# Evaluate the classifier on the validation split.
# The target is a categorical article label, so the numeric distance between
# two labels carries no meaning and mean absolute error is not a valid score.
# Report the fraction of exactly matching predictions instead.
# mean_absolute_error stays imported in case later cells still reference it.
from sklearn.metrics import accuracy_score, mean_absolute_error
accuracy_score(y_valid, predict_labels)