In [None]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from plotly.subplots import make_subplots

import warnings as w
import gc

# Notebook-wide display configuration.
# All warnings are silenced to keep cell output compact.
w.filterwarnings(action='ignore')
plt.style.use('ggplot')
# Show every column when a DataFrame is displayed. The exact option name is
# used (instead of the 'display.max_column' shorthand) so the call cannot
# become ambiguous if pandas adds a similarly named option.
pd.set_option('display.max_columns', None)

In [None]:
# Load the three competition tables, keeping only the columns used below.
DATA_DIR = '../input/h-and-m-personalized-fashion-recommendations'

# The transaction file is too big to load in full, so only the first
# TRANSACTION_NROWS rows are read.
TRANSACTION_NROWS = 15000000

# `usecols` expects a list of column names. The original code passed
# {column: dtype} dicts here; pandas only iterated the keys, so the dtypes
# were never applied. They are intentionally not moved to `dtype=`: several of
# them could not be applied as written (FN / Active contain missing values
# that are filled later, index_code is label-encoded later, and
# NOTE(review): the id columns presumably exceed the uint8 range - confirm
# before adding narrower dtypes).
CUSTOMER_COLUMNS = [
    'customer_id',
    'FN',
    'Active',
    'club_member_status',
    'fashion_news_frequency',
    'age',
    'postal_code',
]
ARTICLE_COLUMNS = [
    'article_id',
    'product_code',
    'product_type_no',
    'graphical_appearance_no',
    'colour_group_code',
    'perceived_colour_value_id',
    'perceived_colour_master_id',
    'department_no',
    'index_code',
    'index_group_no',
    'section_no',
    'garment_group_no',
]
TRANSACTION_COLUMNS = [
    't_dat',
    'customer_id',
    'article_id',
    'price',
    'sales_channel_id',
]

customers = pd.read_csv(DATA_DIR + '/customers.csv',
                        low_memory=True,
                        usecols=CUSTOMER_COLUMNS)
article = pd.read_csv(DATA_DIR + '/articles.csv',
                      low_memory=True,
                      usecols=ARTICLE_COLUMNS)
transaction = pd.read_csv(DATA_DIR + '/transactions_train.csv',
                          low_memory=True,
                          usecols=TRANSACTION_COLUMNS,
                          nrows=TRANSACTION_NROWS)

print('Customer:',customers.shape)
print('article:',article.shape)
print('transaction',transaction.shape)
gc.collect()

## Feature explanation
 * Customers (user field)
     1. customer_id : personal id
     2. FN : Fashion News?
     3. Active : ?
     4. club_member_status : literally the club member status
     5. fashion_news_frequency : literally the fashion news frequency
     6. age 
     7. postal_code : store address id? The postal code in which the customer bought?

In [None]:
# Preview the first five customer rows.
customers.head(n=5)

In [None]:
# Summary of the customers frame (columns, dtypes, non-null counts).
customers.info()

In [None]:
# Number of missing values per customer column.
customers.isnull().sum()

In [None]:
# Treat a missing FN flag as 0.
# Assign the filled column back instead of calling fillna(inplace=True) on the
# `customers.FN` accessor: the in-place form acts on an intermediate Series,
# is deprecated in pandas 2.x and does not update `customers` under
# Copy-on-Write.
customers['FN'] = customers['FN'].fillna(0)

In [None]:
# Bar chart of how often each FN value occurs.
customers['FN'].value_counts().plot.bar()

In [None]:
# Treat a missing Active flag as 0.
# Assign the filled column back instead of calling fillna(inplace=True) on the
# `customers.Active` accessor: the in-place form acts on an intermediate
# Series, is deprecated in pandas 2.x and does not update `customers` under
# Copy-on-Write.
customers['Active'] = customers['Active'].fillna(0)

In [None]:
# Bar chart of how often each Active value occurs.
customers['Active'].value_counts().plot.bar()

In [None]:
# Distribution of club_member_status: bar chart plus the raw counts.
status_counts = customers['club_member_status'].value_counts()
status_counts.plot.bar()
print(status_counts)

### I don't think I will do any further data handling on these columns (drop the missing values in age and postal_code)

In [None]:
# Keep only customers without any missing value.
customers = customers.dropna()

### It looks like maybe 40,000 rows are dropped

In [None]:
# Check the result of the drop: remaining shape and per-column null counts.
# The null counts must be the last expression of the cell to be displayed;
# previously they came first and their result was silently discarded.
print(customers.shape)
customers.isna().sum()

## Feature Explanation
   * Article (item field)
      1. article_id 
      2. product_code : item category
      3. prod_name : product name
      4. product_type_no :  
      5. product_group_name : product category name
      6. graphical_appearance_no 
      7. graphical_appearance_name : product pattern style?
      8. colour_group_code
      9. colour_group_name : product color
      10. perceived_colour_value_id
      11. perceived_colour_value_name : product detail color
      12. etc.
      

In [None]:
# Preview the first five article rows.
article.head(n=5)

In [None]:
# Summary of the article frame (columns, dtypes, non-null counts).
article.info()

In [None]:
# Number of missing values per article column.
article.isnull().sum()

In [None]:
# Keep only articles without any missing value.
article = article.dropna()

### Feature Explain
   * transaction
      1. sales_channel_id : 1 is offline, 2 is online 

In [None]:
# Preview the first five transaction rows.
transaction.head(n=5)

In [None]:
# Share of transactions per sales channel, as a bar chart.
transaction['sales_channel_id'].value_counts(normalize=True).plot.bar()

In [None]:
# Attach customer attributes to every transaction; the inner join keeps only
# customer_ids present in both frames.
user_field = pd.merge(customers, transaction, on='customer_id', how='inner')

In [None]:
# The source frames are no longer needed once merged; release their memory.
del transaction
del customers
gc.collect()

In [None]:
# Look at the item-side frame once more before merging.
article.head(n=5)

In [None]:
# Look at the user-side frame before merging.
user_field.head(n=5)

### merge User_field & item_field

In [None]:
# Join item attributes onto the user/transaction rows via article_id.
df = pd.merge(article, user_field, on='article_id', how='inner')
df.head(n=5)

In [None]:
# Keep a single row per customer (the first occurrence of each customer_id).
df = df.drop_duplicates(subset='customer_id')

In [None]:
# `article` has been merged into `df`; drop it and reclaim memory.
del article
gc.collect()

In [None]:
# Order the rows by the FN flag and display the frame.
df = df.sort_values(by='FN')
df

### user_field 
   * customer_id, FN, Active
   * club_member_status
   * fashion_news_frequency
   * age
   * postal_code
   * t_dat
   * price
   * sales_channel_id

### item_field
   * product_code
   * product_type_no
   * graphical_appearance_no
   * colour_group_code
   * perceived_colour_value_id
   * perceived_colour_master_id
   * department_no
   * index_code
   * index_group_no
   * section_no
   * garment_group_no

In [None]:
# The transaction date is not used as a feature.
df = df.drop(columns=['t_dat'])

In [None]:
# Split the columns of `df` into model feature groups by position.
# NOTE(review): these slices depend on the column order produced by the merges
# above; check the printed lists whenever the loaded columns change.
dense_features = df.columns[1:11].tolist()
sparse_features = df.columns[12:].tolist()
# Remove the second entry of the sparse list
# (presumably the target column 'FN' - confirm against the printed output).
del sparse_features[1]
sparse_features.append('article_id')
target = ['FN']

print(dense_features)
print('-'*58)
print(sparse_features)

In [None]:
from sklearn.preprocessing import LabelEncoder
# One shared encoder instance; it is re-fitted for each column it is applied to.
encoder = LabelEncoder()

In [None]:
# Label-encode the categorical columns to contiguous integer codes 0..n-1.
#
# Besides the columns encoded originally, every entry of `sparse_features` is
# encoded as well: the model cell declares each sparse feature with
# vocabulary_size = df[feat].nunique(), so the stored values must lie in
# [0, nunique). Raw values break that contract - e.g. sales_channel_id holds
# 1 and 2 with a vocabulary of 2 - and make the embedding lookup go out of
# range.
columns_to_encode = [
    'index_code',
    'club_member_status',
    'fashion_news_frequency',
    'customer_id',
    'postal_code',
    'article_id',
]
columns_to_encode += [feat for feat in sparse_features
                      if feat not in columns_to_encode]

for column in columns_to_encode:
    # fit_transform re-fits the shared encoder, so each column gets its own codes.
    df[column] = encoder.fit_transform(df[column])

In [None]:
# Correlation heatmap of all columns, ordered by their correlation with FN.
corr_data = df[df.columns]
cmap = plt.cm.PuBu

# Rank the columns by the 'FN' column of the correlation matrix.
n_columns = len(df.columns)
fn_ranking = corr_data.corr().nlargest(n_columns, 'FN')
cols = fn_ranking['FN'].index

# Correlation matrix recomputed in that column order.
cm = np.corrcoef(corr_data[cols].values.T)

f, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(
    cm,
    vmax=1,
    vmin=-1,
    annot=True,
    square=True,
    linewidths=0.1,
    cmap=cmap,
    xticklabels=cols.values,
    yticklabels=cols.values,
)

In [None]:
# Release the objects that were only needed for the heatmap.
del corr_data
del cm
del cols
gc.collect()

In [None]:
!pip install deepctr_torch

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset,DataLoader

from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split

from deepctr_torch.inputs import SparseFeat, DenseFeat, get_feature_names
from deepctr_torch.models import *

In [None]:
# Describe each model input: sparse features get a vocabulary size equal to
# their number of distinct values, dense features have dimension 1.
sparse_columns = [SparseFeat(feat, df[feat].nunique()) for feat in sparse_features]
dense_columns = [DenseFeat(feat, 1) for feat in dense_features]
fixlen_feature_columns = sparse_columns + dense_columns

In [None]:
# The linear part and the DNN part use the same feature columns
# (both names refer to the same list object).
linear_feature_columns = dnn_feature_columns = fixlen_feature_columns

In [None]:
# Feature names for the combined column lists; used below as the keys of the
# model input dicts.
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

In [None]:
# Hold out 20% of the rows for testing (fixed random_state for a repeatable split).
train, test = train_test_split(df, test_size=0.2, random_state=2020)

# Model inputs: one column per feature name, keyed by that name.
train_model_input = dict((name, train[name]) for name in feature_names)
test_model_input = dict((name, test[name]) for name in feature_names)

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
# DeepFM with library-default hyperparameters.
# NOTE(review): the next cell rebuilds `model` before this one is compiled or
# fitted, so this instance looks redundant - confirm and consider removing.
model = DeepFM(
    linear_feature_columns=linear_feature_columns,
    dnn_feature_columns=dnn_feature_columns,
    task='binary',
    device=device,
)

In [None]:
# Build, train and evaluate the DeepFM binary classifier for the FN target.
model = DeepFM(
    linear_feature_columns=linear_feature_columns,
    dnn_feature_columns=dnn_feature_columns,
    task='binary',
    l2_reg_embedding=1e-5,
    device=device,
)
model.compile(
    "adagrad",
    "binary_crossentropy",
    metrics=["binary_crossentropy", "auc"],
)

# Labels as arrays; 20% of the training rows are held out for validation by fit().
train_labels = train[target].values
test_labels = test[target].values

history = model.fit(
    train_model_input,
    train_labels,
    batch_size=512,
    epochs=50,
    verbose=1,
    validation_split=0.2,
)

# Score the held-out test rows (prediction batch size 256).
pred_ans = model.predict(test_model_input, 256)
test_logloss = round(log_loss(test_labels, pred_ans), 4)
test_auc = round(roc_auc_score(test_labels, pred_ans), 4)

print("")
print("test LogLoss", test_logloss)
print("test AUC", test_auc)