### DeepFM :https://github.com/shenweichen/DeepCTR-Torch
### Paper : https://www.ijcai.org/proceedings/2017/0239.pdf
 * DeepFM does not need domain knowledge because the model does not rely on manual feature engineering
 * So we run it!

In [None]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import random
import gc

# Draw every figure below with the ggplot look.
plt.style.use('ggplot')

import warnings as w

# Silence all warnings and never truncate the column list when a frame is shown.
w.filterwarnings('ignore')
pd.options.display.max_columns = None

In [None]:
# Column dtypes handed to pd.read_csv for the training file: `id` and `click`
# are read as integers, every other column as a string.
# `np.str` was only an alias of the builtin `str` and has been removed from
# NumPy, so `hour` uses np.dtype(str) like the rest of the string columns.
dtype = {
    'id': np.dtype(int),
    'click': np.dtype(int),
    'hour': np.dtype(str),
    'C1': np.dtype(str),
    'banner_pos': np.dtype(str),
    'site_id': np.dtype(str),
    'site_domain': np.dtype(str),
    'site_category': np.dtype(str),
    'app_id': np.dtype(str),
    'app_domain': np.dtype(str),
    'app_category': np.dtype(str),
    'device_id': np.dtype(str),
    'device_ip': np.dtype(str),
    'device_model': np.dtype(str),
    'device_type': np.dtype(str),
    'device_conn_type': np.dtype(str),
    'C14': np.dtype(str),
    'C15': np.dtype(str),
    'C16': np.dtype(str),
    'C17': np.dtype(str),
    'C18': np.dtype(str),
    'C19': np.dtype(str),
    'C20': np.dtype(str),
    'C21': np.dtype(str),
}
# Row budget for the random subsample of the training file.
# presumably num_records is the number of data rows in train.gz — TODO confirm
num_records = 40428967
sample_size = 5000000
# Draw (without replacement) the row numbers to skip, sorted ascending, so that
# read_csv only loads a random subset. No seed is set, so every run draws a
# different subset.
# NOTE(review): range(1, num_records) never yields 0 (presumably the header
# line) and also never yields num_records itself, so a row with that number,
# if the file has one, is always kept — confirm this is intended.
skip_values = sorted(random.sample(range(1,num_records), num_records - sample_size))
def parse_date(val):
    """Parse ``hour`` values written as YYMMDDHH (e.g. ``'14102100'``).

    ``pd.datetime`` has been removed from pandas, so the conversion goes
    through ``pd.to_datetime`` with the same format string.  A single string
    yields a ``pd.Timestamp`` (a ``datetime`` subclass); a list or array of
    strings is converted in one vectorised call.
    """
    return pd.to_datetime(val, format='%y%m%d%H')

In [None]:
# Training file: skip the rows drawn in skip_values and parse 'hour' into datetimes.
train = pd.read_csv(
    "../input/avazu-ctr-prediction/train.gz",
    dtype=dtype,
    skiprows=skip_values,
    parse_dates=['hour'],
    date_parser=parse_date,
)
# Test set and sample submission are read in full with pandas' default options.
test = pd.read_csv('../input/avazu-ctr-prediction/test.gz')
submission = pd.read_csv('../input/avazu-ctr-prediction/sampleSubmission.gz')

# Report (rows, columns) for each frame.
for label, frame in (('Train dataset:', train),
                     ('Test dataset:', test),
                     ('Submission:', submission)):
    print(label, frame.shape)

### Feature Explanation
  1. id : personal_id
  2. click : target
  3. hour : datetime
  4. C1 : anonymized categorical variable
  5. banner_pos : showing banner
  6. site domain
  7. site_category 
  8. app_id 
  9. app_category
  10. device_id 
  11. device_ip
  12. device_model
  13. device_type
  14. device_conn_type
  15. C14 ~ C21 : anonymized categorical variables

In [None]:
# Preview the first rows of the training sample.
train.head()

In [None]:
# Preview the first rows of the test set.
test.head()

In [None]:
# Count the missing values in each training column.
train.isna().sum()

In [None]:
# Tally the target once, then show it as a bar chart and as raw numbers.
click_counts = train['click'].value_counts()
click_counts.plot(kind='bar')
print(click_counts)

## The target feature is imbalanced

In [None]:
# Number of training rows for each banner_pos value.
train.banner_pos.value_counts()

In [None]:
# Row counts per banner position, split by click / no click.
plt.figure(figsize=(15, 10))
sns.countplot(data=train, x='banner_pos', hue='click')

### Banner position does not seem to be effective at increasing user clicks  

In [None]:
# Row counts per hour value, split by click / no click.
plt.figure(figsize=(15, 10))
sns.countplot(data=train, x='hour', hue='click')

### The date range is too long to identify patterns, so a subset of the data is examined instead

In [None]:
# Keep only the rows whose hour falls before 2014-10-22, then preview them.
ex_data = train.loc[train['hour'] < '2014-10-22']
ex_data.head()

In [None]:
# Same per-hour count plot, drawn on the subset and on a much wider canvas.
plt.figure(figsize=(70, 20))
sns.countplot(data=ex_data, x='hour', hue='click')

### Clicks don't seem to be affected by time

In [None]:
# Drop the temporary subset and run a garbage collection pass.
del ex_data
gc.collect()

In [None]:
# Move 'hour' out of the columns and make it the index of train (in place).
train.set_index('hour',inplace=True)

In [None]:
# Move 'hour' out of the columns and make it the index of test (in place).
test.set_index('hour',inplace=True)

In [None]:
# Preview the training frame again now that 'hour' is the index.
train.head()

### sparse feature
 * C1
 * banner_pos
 * site_id
 * site_domain
 * site_category
 * app_id
 * app_domain
 * app_category
 * device_id
 * device_ip
 * device_model
 * device_type
 * device_conn_type

### Dense feature
 * C14 ~ C21

In [None]:
!pip install deepctr_torch

In [None]:
import torch
from deepctr_torch.inputs import SparseFeat, DenseFeat, get_feature_names
from deepctr_torch.models import *

In [None]:
# Alias only (no copy): anything written through `data` also changes `train`.
data = train

In [None]:
# Categorical (sparse) model inputs, listed by name.
# The previous positional slices (columns 2:14 and 15:) skipped column 14,
# so 'device_conn_type' was silently dropped from the model even though it is
# a categorical field that the test-side feature list does include.
sparse_features = [
    'C1', 'banner_pos',
    'site_id', 'site_domain', 'site_category',
    'app_id', 'app_domain', 'app_category',
    'device_id', 'device_ip', 'device_model',
    'device_type', 'device_conn_type',
]
# Fields C14..C21 are fed to the model as dense (scaled numeric) inputs.
dense_features = ['C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21']
# 'id' is also treated as a categorical input, appended last as before.
sparse_features.append('id')
# Label column, kept as a one-element list so train[target] is a 2-D frame.
target = ['click']

In [None]:
# Preview the test frame to check its column layout.
test.head()

In [None]:
# Split the test columns by position: the first 14 names are treated as
# categorical inputs, everything after them as dense inputs.
test_column_names = test.columns.tolist()
test_sparse_features = test_column_names[:14]
test_dense_features = test_column_names[14:]

In [None]:
from sklearn.preprocessing import MinMaxScaler,LabelEncoder
# Scaler that rescales each dense column into the [0, 1] range.
scaler = MinMaxScaler(feature_range=(0,1))
# Encoder that replaces category values with integer codes.
encoder = LabelEncoder()

In [None]:
def _encode_with_unknown(train_values, test_values, label_encoder):
    """Label-encode the train values and apply the SAME mapping to the test values.

    The encoder is fitted on ``train_values`` only.  Test values that never
    occurred in train get one shared extra code (``len(classes)``), so every
    code stays below the returned vocabulary size.

    Returns:
        (train_codes, test_codes, vocabulary_size)
    """
    train_codes = label_encoder.fit_transform(train_values)
    known = pd.Index(label_encoder.classes_)
    test_codes = known.get_indexer(test_values)  # -1 marks unseen values
    unknown_code = len(known)
    test_codes[test_codes == -1] = unknown_code
    return train_codes, test_codes, unknown_code + 1


# Previously the encoder and the scaler were re-fitted on the test frame.  That
# gave test rows integer codes unrelated to the ones the embeddings were
# trained on (and possibly larger than the embedding table), and scaled the
# dense columns with different min/max values than the training data.
vocabulary_sizes = {}
for feat in sparse_features:
    # Cast both sides to str: train columns were read as strings while the
    # test file was read with inferred dtypes, so raw values would not match.
    train_codes, test_codes, vocabulary_sizes[feat] = _encode_with_unknown(
        data[feat].astype(str), test[feat].astype(str), encoder)
    data[feat] = train_codes
    test[feat] = test_codes

# Fit the scaler on train only and reuse its parameters for test.
data[dense_features] = scaler.fit_transform(data[dense_features])
test[dense_features] = scaler.transform(test[dense_features])

# One embedding-backed column per sparse feature (vocabulary includes the
# extra "unseen" code) and one 1-dimensional column per dense feature.
fixlen_feature_columns = (
    [SparseFeat(feat, vocabulary_sizes[feat]) for feat in sparse_features]
    + [DenseFeat(feat, 1) for feat in dense_features]
)

In [None]:
# fixlen_feature_columns = [SparseFeat(feat,data[feat].nunique()) 
#                           for feat in sparse_features] + [DenseFeat(feat,1,)
#                                                          for feat in dense_features]

In [None]:
# Display the feature column definitions.
fixlen_feature_columns

In [None]:
# The deep part and the linear part get the same feature columns; both names
# refer to the same list object.
dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns

In [None]:
# Collect the input names for the combined linear + deep feature columns and
# display them.
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)
feature_names

In [None]:
# Model input for training: one entry per feature name, holding that column of `data`.
train_model_input = {name: data[name]for name in feature_names}

In [None]:
# Model input for prediction: one entry per feature name, holding that column of `test`.
test_model_input = {name: test[name] for name in feature_names}

In [None]:
# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
# Build a DeepFM binary classifier on the selected device, with dropout 0.7 in
# its DNN part, then display the model.
model = DeepFM(
    linear_feature_columns,
    dnn_feature_columns,
    task='binary',
    dnn_dropout=0.7,
    device=device,
)
model

In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss, and report
# both binary cross-entropy and AUC as metrics.
model.compile(optimizer = 'adam',
              loss = 'binary_crossentropy',
              metrics = ['binary_crossentropy','auc'])

In [None]:
%%time
# Train for 10 epochs in batches of 1024, holding out 20% of the rows for validation.
history = model.fit(
    train_model_input,
    train[target].values,
    batch_size=1024,
    epochs=10,
    verbose=1,
    validation_split=0.2,
)

In [None]:
# Predict on the test inputs.
# NOTE(review): the second positional argument (1024) is presumably the batch
# size — confirm against predict()'s signature.
pred_ans = model.predict(test_model_input, 1024)

In [None]:
# Write the predictions into the submission's 'click' column.
# assumes pred_ans has one value per submission row, in the same row order — TODO confirm
submission['click'] = pred_ans

In [None]:
# Display the filled-in submission frame.
submission

In [None]:
# Save the submission as CSV without writing the row index.
submission.to_csv('submission.csv',index=False)