# ML Project Report

# EDA & Feature Engineering

### Import Required Packages

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import matplotlib.pyplot as plt

### Loading Training Dataset



In [2]:
path = 'train_dataset.csv'
df = pd.read_csv(path)
df.head()

Unnamed: 0,Sale,SalesAmountInEuro,time_delay_for_conversion,click_timestamp,nb_clicks_1week,product_price,product_age_group,device_type,audience_id,product_gender,...,product_category(3),product_category(4),product_category(5),product_category(6),product_category(7),product_country,product_id,product_title,partner_id,user_id
0,0,-1.0,-1,2020-08-04 04:07:56,-1,0.0,-1,7E56C27BFF0305E788DA55A029EC4988,-1,-1,...,-1,-1,-1,-1,-1,57A1D462A03BD076E029CF9310C11FC5,B69E439E41E0BEAD764ABF16D7FD96C9,-1,E3DDEB04F8AFF944B11943BB57D2F620,5E2C678F6586B67F61A377E1534E01FC
1,0,-1.0,-1,2020-08-04 01:47:40,-1,0.0,-1,7E56C27BFF0305E788DA55A029EC4988,-1,-1,...,-1,-1,-1,-1,-1,57A1D462A03BD076E029CF9310C11FC5,404D3D9D03297504F3509032DCFA02F0,-1,E3DDEB04F8AFF944B11943BB57D2F620,E8247702C0DD294E0AE6B5B5E2F9E810
2,0,-1.0,-1,2020-08-04 16:54:31,-1,0.0,4C90FD52FC53D2C1C205844CB69575AB,FF2C446555E3822B0E0FC3406116E86D,-1,C45A9AC6D102ACAEEDF0D6F78636D84A,...,-1,-1,-1,-1,-1,2AC62132FBCFA093B9426894A4BC6278,B09E51338E0EED59C5A859B13631C370,3DF2BEDE6A8FDFA7F97B97FFF6EF38CC 516EE9C34B839...,12E43E51784BDE3CB9E0EF6310A7D5C5,9FD001258907F541D497040C64383696
3,0,-1.0,-1,2020-08-03 20:34:28,-1,0.0,-1,D7D1FB49049702BF6338894757E0D959,-1,-1,...,-1,-1,-1,-1,-1,57A1D462A03BD076E029CF9310C11FC5,080614393A57816D7A655695E2DBE728,-1,E3DDEB04F8AFF944B11943BB57D2F620,225741ACF2DBB2E5948268F5D5D352E6
4,0,-1.0,-1,2020-08-03 20:54:17,1064,0.0,4C90FD52FC53D2C1C205844CB69575AB,7E56C27BFF0305E788DA55A029EC4988,-1,1B491180398E2F0390E6A588B3BCE291,...,1E629AECC2FB9BEF43331CBE8F2D7C08,341C9BD18A3277E6B104CAFC177DE796,-1,-1,-1,989EEF92F2A525DD896557425EA7C4C7,350271BAFD7C7AAF6FB1424CF3DD4827,097C90F8BF5398AC7C486804F0801DEE E09E084DB8937...,F0FD783189F55BAFC331AD347EAE6863,160A90377E54124D0BD31DB6735F0B33


### Fill Missing Values with standard value

Fill missing values (based on project doc) with ```np.nan``` for numerical column and
```pd.NA``` for categorical column

In [3]:
def get_numerical_categorical_columns(data):
    numerical_columns_df = data.select_dtypes(include=np.number)
    numerical_column_names = numerical_columns_df.columns.tolist()

    column_types_df = pd.DataFrame(data.columns, columns=['column name'])

    column_types_df['numerical or categorical'] = np.where(column_types_df['column name'].isin(numerical_column_names),
                                                           'numerical', 'categorical')

    num_columns = column_types_df[column_types_df['numerical or categorical'] == 'numerical']['column name']
    cat_columns = column_types_df[column_types_df['numerical or categorical'] == 'categorical']['column name']

    return num_columns, cat_columns


numerical_columns, categorical_columns = get_numerical_categorical_columns(df)

df[numerical_columns] = df[numerical_columns].apply(lambda col: col.replace({-1: np.nan}))

df['product_price'] = df['product_price'].replace({0: np.nan})

df[categorical_columns] = df[categorical_columns].apply(lambda col: col.replace({'-1': np.nan}))

### Getting Statistical Insight

Find numerical and categorical columns and calculate statistics like:

- Mean, Std, Quantiles, Number(And Percentage) Of Missing Values and etc for each column
- Number of unique values and count number of values per each unique value for each categorical column


In [4]:
def numerical_columns_statistical_info(numerical_columns_df):
    return numerical_columns_df.describe().append(
        pd.Series(numerical_columns_df.isnull().sum(), name='n of missing values')
    ).append(
        pd.Series(numerical_columns_df.isnull().sum() / len(numerical_columns_df * 100), name='p of missing values')
    )


def categorical_columns_statistical_info(categorical_columns_df):
    values_count_df: pd.DataFrame = categorical_columns_df.apply(pd.Series.value_counts)
    values_count_df = values_count_df.fillna('-')

    values_count_df.loc['n of not exist values'] = categorical_columns_df.isnull().sum()
    values_count_df.loc['p of not exist values'] = categorical_columns_df.isnull().sum() / len(categorical_columns_df) * 100
    values_count_df.loc['n of unique values'] = categorical_columns_df.nunique()

    return values_count_df


numerical_columns_stats = numerical_columns_statistical_info(df[numerical_columns.tolist()])
categorical_columns_stats = categorical_columns_statistical_info(df[categorical_columns.tolist()])

In [5]:
numerical_columns_stats

Unnamed: 0,Sale,SalesAmountInEuro,time_delay_for_conversion,nb_clicks_1week,product_price,product_category(7)
count,100000.0,13661.0,13606.0,53940.0,12784.0,0.0
mean,0.13661,117.030507,321366.6,439.389006,85.491137,
std,0.343437,383.010444,588429.3,1541.251393,165.115302,
min,0.0,0.0,8.0,0.0,0.18,
25%,0.0,23.02379,697.0,6.0,15.83,
50%,0.0,51.87,4277.5,39.0,35.565,
75%,0.0,124.99,348467.5,198.0,87.42,
max,1.0,23691.22498,2554631.0,25390.0,3928.0,
n of missing values,0.0,86339.0,86394.0,46060.0,87216.0,100000.0
p of missing values,0.0,0.86339,0.86394,0.4606,0.87216,1.0


Now, we plot haetmap of correlation between numerical columns to see how correlated these features are.

In [None]:
corr = df[numerical_columns.tolist()].corr()
sns.heatmap(corr, xticklabels=corr.columns, yticklabels=corr.columns, annot=True, cmap=sns.diverging_palette(220, 20, as_cmap=True))

As you see in the above heatmap, *product_price* has correlation about 0.3 with *nb_clicks_1week* and *SalesAmountInEuro* that is for now, the biggest correlation between two distinct features.

Histogram for numerical columns would give us more information about how distributed the values of these columns are

In [None]:
df[numerical_columns.tolist()].hist(bins=8, figsize=(10, 10));

As you see in above histograms, most of data in each numerical column is null value, which for these columns *0* is symbol of null data.

Also it'll be good that if we plot these numerical columns' pairplot to get a better insight about these kind of columns

In [None]:
sns.pairplot(df[numerical_columns.tolist()])

what we've just seen is only too much null data and sparse dataset.

In [6]:
categorical_columns_stats

Unnamed: 0,click_timestamp,product_age_group,device_type,audience_id,product_gender,product_brand,product_category(1),product_category(2),product_category(3),product_category(4),product_category(5),product_category(6),product_country,product_id,product_title,partner_id,user_id
0000FF7BF953B3D7281341F4C7B98E56,-,-,-,-,-,-,-,-,-,-,-,-,-,1.0,-,-,-
00011AB400BCC95F3CCAF0779669E61C,-,-,-,-,-,-,-,-,-,-,-,-,-,1.0,-,-,-
000158282ACE81B560694CBC1AA8EF8E,-,-,-,-,-,-,-,-,-,-,-,-,-,1.0,-,-,-
00017F6C780F948ED3D61F8312618978,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,1.0
000190215E00C3D5AF4E421CB2468C65,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
FFFF8E415120D1B5424786E1085D12BD,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,1.0
FFFFE3EC68D8E06C3CE530F84D2E30FF,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,1.0
n of not exist values,0,75603,39,71793,75554,65800,45502,45517,52316,71222,92009,98832,23754,23714,45763,0,0
p of not exist values,0.0,75.603,0.039,71.793,75.554,65.8,45.502,45.517,52.316,71.222,92.009,98.832,23.754,23.714,45.763,0.0,0.0


For Categorical features, we draw barplot for different values these columns have, and as you saw before, some categorical columns have so many distinct values hence we consider these barplots only for 10 values which have maximum counts.

In [None]:
from tqdm import tqdm

fig, axs = plt.subplots(17, figsize=(50, 400))
for index, column in enumerate(tqdm(categorical_columns.tolist())):
    axs[index].set_title(column)
    axs[index].bar(df[column].value_counts().index[:10], df[column].value_counts()[:10])
plt.show()


### Columns are completely marked from target column

In [7]:
print(np.count_nonzero((df['Sale'] == 1) & (df['SalesAmountInEuro'] != -1)) / np.count_nonzero(df['Sale'] == 1))
print(np.count_nonzero((df['Sale'] == 1) & (df['time_delay_for_conversion'] != -1)) / np.count_nonzero(df['Sale'] == 1))

1.0
1.0


### Drop Columns

Description of the columns that will be dropped:
- SalesAmountInEuro: This column is completely marked
from column Sale. so i drop this column. (86% missing)
- time_delay_for_conversion: This column is almost
completely marked from column Sale. so i also drop this column. (86% missing)
- click_timestamp: i extract day and hour from this column and then drop it.
- product_category(7): (100% missing)
- audience_id: doc didnt explain this column.
- product_title: this column has ~27k unique value while dataset size has 100k rows. so
this column isnt useful.
- user_id: this column has ~97k unique value while dataset size has 100k rows. so
this column isnt useful.
- product_id: this column has ~45k unique value while dataset size has 100k rows. so
this column isnt useful.

> Sale: this is target column and will be appended to dataset after eda and feature engineering done.

In [8]:

target_column_name = 'Sale'
target = df[target_column_name]

# df['click_timestamp'] = pd.to_datetime(df['click_timestamp'])
# df['day'] = df['click_timestamp'].dt.day
# df['hour'] = df['click_timestamp'].dt.hour

df.drop(
    [
        target_column_name,
        'product_category(7)',
        'SalesAmountInEuro',
        'time_delay_for_conversion',
        'audience_id',
        'click_timestamp',
        'product_title',
        'user_id',
        'product_id',
    ],
    axis=1, inplace=True
)

### Drop Rows have less than ```threshold``` non-missing values in columns

In [9]:
threshold = 6
df = df.dropna(thresh=threshold)

### Fill Missing Values

Fill categorical columns missing values with column median and numerical columns with column mode (most frequent value)

In [10]:
numerical_columns, categorical_columns = get_numerical_categorical_columns(df)

for column in numerical_columns:
    df.loc[df[column].isnull(), column] = df.loc[~df[column].isnull(), column].median()

for column in categorical_columns:
    df.loc[df[column].isnull(), column] = df.loc[~df[column].isnull(), column].mode().iat[0]

I also used ```IterativeImputer``` from ```sklearn``` to
fill missing values but its not better than basic last strategy.

```
to_impute_columns = categorical_columns

initial_strategy = 'most_frequent'
# estimator = RandomForestClassifier(n_estimators=5, random_state=0)
estimator = KNeighborsClassifier(n_neighbors=9)
imputer: IterativeImputer = IterativeImputer(random_state=0,
                                             estimator=estimator,
                                             initial_strategy=initial_strategy,
                                             max_iter=10,
                                             verbose=2)

imputed = imputer.fit_transform(data_frame[to_impute_columns])
data_frame.loc[:, to_impute_columns] = imputed

# Numeric Columns
to_impute_columns = to_impute_columns.append(numerical_columns)

initial_strategy = 'median'
estimator = KNeighborsRegressor(n_neighbors=9)
imputer: IterativeImputer = IterativeImputer(random_state=0,
                                             estimator=estimator,
                                             initial_strategy=initial_strategy,
                                             max_iter=10,
                                             verbose=2)

imputed = imputer.fit_transform(data_frame[to_impute_columns])
data_frame.loc[:, to_impute_columns] = imputed
```

> [Handle Missing Values](https://towardsdatascience.com/7-ways-to-handle-missing-values-in-machine-learning-1a6326adf79e)

### Encode Categorical Columns

I encode categorical columns with ```LabelEncoder``` class of ```
sklearn``` lib. i dont use the One-Hot strategy because the number of unique values in some columns is too large.
label encoder strategy works in this regard and greatly reduces the amount of data.

> [Encode Categorical Columns](https://machinelearningmastery.com/one-hot-encoding-for-categorical-data/)

In [11]:
# Categorical Columns Encode
for c_column in categorical_columns:
    le = LabelEncoder()
    encoded: pd.Series = le.fit_transform(df.loc[~df[c_column].isnull(), c_column])
    df.loc[~df[c_column].isnull(), c_column] = encoded

I also used ```OneHotEncoder``` from ```sklearn``` lib
 but the result was no different from the one mentioned above.
 In addition, this method will require less memory to save.

```
enc = OneHotEncoder()
enc_df = pd.DataFrame(enc.fit_transform(data_frame[categorical_columns]).toarray())
data_frame.drop(categorical_columns, axis=1, inplace=True)
data_frame = data_frame.join(enc_df)
```

### Data Normalization

I normalized the numerical columns (not categorical columns!) with the help of ```StandardScaler``` class of
```sklearn``` lib.
 ```StandardScaler``` applies the following conversion to
 the data:
### <center>$Tx = \frac{x - \mu}{\sigma}$</center>

> [Data Normalization](https://towardsdatascience.com/scale-standardize-or-normalize-with-scikit-learn-6ccc7d176a02)

In [12]:
# normalizer = StandardScaler()
# normalizer = MinMaxScaler()
# df.loc[:, numerical_columns] = normalizer.fit_transform(df[numerical_columns])

### Append Target Column

In [13]:
df[target_column_name] = target
cols = df.columns.tolist()
cols = cols[-1:] + cols[:-1]
data_frame = df[cols]

### Save Modified Dataset

In [14]:
print(f'numerical_columns len: {len(numerical_columns)}')
print(f'numerical_columns: {numerical_columns.tolist()}')

print(f'categorical_columns len: {len(categorical_columns)}')
print(f'categorical_columns: {categorical_columns.tolist()}')

print('dataset dimensions: {data_frame.shape[0]}')
pd.DataFrame.to_csv(data_frame,
                    'modified_dataset.csv',
                    sep='\t',
                    index=False,
                    header=False
                    )

numerical_columns len: 2
numerical_columns: ['nb_clicks_1week', 'product_price']
categorical_columns len: 12
categorical_columns: ['product_age_group', 'device_type', 'product_gender', 'product_brand', 'product_category(1)', 'product_category(2)', 'product_category(3)', 'product_category(4)', 'product_category(5)', 'product_category(6)', 'product_country', 'partner_id']
dataset dimensions: {data_frame.shape[0]}


***
# Model

I have taught three models for this project.
These three models are as follows:

- Extreme Deep Factorization Model
> [Extreme Deep Factorization Model](https://towardsdatascience.com/extreme-deep-factorization-machine-xdeepfm-1ba180a6de78)

- Wide And Deep Model
> [Wide And Deep Model](https://medium.com/analytics-vidhya/wide-deep-learning-for-recommender-systems-dc99094fc291)

- XGBoost Model
> [XGBoost Model](https://medium.com/sfu-cspmp/xgboost-a-deep-dive-into-boosting-f06c9c41349)






The first two models use the ```pytorch-fm``` lib and
the third model uses the ```xgb``` library,
which is installed with the help of the following commands.

## Extreme Deep Factorization Model (xDeepFM)

Each model is described in detail in the following sections:

### Hybrid Approach (DNN + FM)

The current recommendation landscape is
dominated by FM/DNN based models. But
some good hybrid architectures which fuse FM
and DNN based systems are also coming up.

1. Factorization Machine (FM) based approach

    - Learns pattern on combinatorial features automatically
    - Generalize well to unseen features
    - Tries to capture all feature interactions which results in learning of useless interactions. This might introduce noise.

2. Deep Neural Network (DNN) based approach

    - Learns sophisticated and selective feature interactions
    - Feature interactions are modeled at an elemental level. One hot encoding is used for categorical variables to represent them in dimension D. This will be fed into a fully connected layer. This is in stark contrast to FM based approaches which models feature interactions at a vector level (User vector * Item Vector).

3. Hybrid approach (DNN + FM)

    - Learns both low/high order feature interactions which can capture both memorization and generalization.

    > Memorisation can be loosely defined as learning the frequent co-occurrence of items or features and exploiting the correlation available in the historical data.
    > 1. Memorisation of feature interactions through a wide set of cross-product feature transformations are effective and interpretable, while generalisation requires more feature engineering effort.
    > 2. Recommendations based on memorisation are usually more topical and directly relevant to the items on which users have already performed actions.
    > 3. Memorisation can be achieved effectively using cross-product transformations over sparse features. This explains how the co-occurrence of a feature pair correlates with the target label.
    > 4. One limitation of cross-product transformations is that they do not generalise to query-item feature pairs that have not appeared in the training data.
    > 5. Wide linear models can effectively memorise sparse feature interactions using cross-product feature transformations.
    >  Generalisation, on the other hand, is based on transitivity of correlation and explores new feature combinations that have never or rarely occurred in the past.
    > 1. With less feature engineering, deep neural networks can generalise better to unseen feature combinations through low-dimensional dense embeddings learned for the sparse features.
    > 2. However, deep neural networks with embeddings can over-generalise and recommend less relevant items when the user-item interactions are sparse and high-rank.
    > 3. Generalisation tends to improve the diversity of the recommended items. Generalisation can be added by using features that are less granular , but manual feature engineering is often required.
    > 4. For massive-scale online recommendation and ranking systems in an industrial setting, generalised linear models such as logistic regression are widely used because they are simple, scalable and interpretable. The models are often trained on binarised sparse features with one-hot encoding.

    > [Generalization And Memorization](https://medium.com/analytics-vidhya/memorization-and-deep-neural-networks-5b56aa9f94b8 )

### Extreme Deep Factorization Machine

#### Model Architectures

![xDeepFM Model Architectures](figures/xdfm.png)
We omit the description of
the FM component for
brevity.

> [Factorization Machine](https://d2l.ai/chapter_recommender-systems/fm.html)

xDeepFM comprises of 3 parts:
1. The linear model ( Directly work on top of raw input features )
2. Plain DNN (Works on top of dense feature embeddings)
3. Introducing Compressed Interaction Network (CNN) (Works on top of dense feature embeddings)
Out of these 3, CIN is unique to xDeepFM.

CIN characteristics:

CIN is inducted by xDeepFM due to the following benefits:
- It learns feature interactions at a vector wise level, not at a bitwise level.
- It measures high order feature interactions explicitly.
- Its complexity does not grow exponentially with the degree of interactions.

> [Compressed Interaction Network](https://towardsdatascience.com/recsys-series-part-5-neural-matrix-factorization-for-collaborative-filtering-a0aebfe15883)


Linear, CIN, and DNN are all trained parallelly

Linear model, plain DNN, and CIN
are trained in parallel, and the output of
the model is to apply the sigmoid function to the linear output of these three models.

#### Trained Model Architectures

##### TorchViz Result:
![Extreme Deep Factorization Machine](deep_model_archits/Extreme%20Deep%20Factorization%20Model%20tz%20result.png)

##### Hidden Layer Result:
![Extreme Deep Factorization Machine](deep_model_archits/Extreme%20Deep%20Factorization%20Model%20hl%20result.png)

#### Implement

xDeepFM model has been implemented in the DeepModel.py (set DEEP_MODEL_NAME parameter to XDFM_STR constant) and
With the help of following command, xDeepFM model is trained:

> The code has been commented.

#### Hyper-parameters tuning

Model hyper-parameters tuning is
 done with the ```optuna```
 library.

The BayesianSearch algorithm used in this
section has better performance than GridSearch and RandomSearch algorithm.

Bayesian optimization methods are efficient because they select hyperparameters in an informed manner. By prioritizing hyperparameters that appear more promising from past results, Bayesian methods can find the best hyperparameters in lesser time (in fewer iterations) than both grid search and random search.

This library

> [Grid search, Random search, Bayesian optimization](https://medium.com/analytics-vidhya/comparison-of-hyperparameter-tuning-algorithms-grid-search-random-search-bayesian-optimization-5326aaef1bd1#:~:text=Bayesian%20optimization%20methods%20are%20efficient,grid%20search%20and%20random%20search.)

```
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler())

study.optimize(objective_function, n_trials=15)

def objective_function(trial: optuna.Trial):
    params = {
        'learning_rate': trial.suggest_discrete_uniform('learning_rate', 1e-4, 6 * 1e-4, q=1 * 1e-4),
        'eps': trial.suggest_categorical('eps', [1e-8, ]),
        'weight_decay': trial.suggest_discrete_uniform('weight_decay', 1 * 1e-6, 5 * 1e-6, q=1 * 1e-6),
        'dropout': trial.suggest_discrete_uniform('dropout', 0.15, 0.35, q=0.1),
        'batch_size': trial.suggest_categorical('batch_size', [512]),
        'amsgrad': trial.suggest_categorical('amsgrad', [False, True]),
        'epochs': trial.suggest_categorical('epochs', [25]),
    }
    # Code logic
```

#### Result

With the help of ```MLFlow```
 Library, viewing
the results is very simple. To view
the results, just run the following command in
the terminal. (In extreme deep factorization machine model section.)

To view the results, the following link should be opened in the browser:

##### Metrics and Parameters:

Metrics are used in the evaluation is:

- F-Score
- Accuracy
- AUC (Area Under Curve)
- Binary Cross Entropy Loss


![Metrics and Parameters](results_image/xdfm_metrics_result.png)

A good fit is the goal of the learning algorithm and exists between an overfit and underfit model.

A good fit is identified by a training and validation loss that decreases to a point of stability with a minimal gap between the two final loss values.

The loss of the model will almost always be lower on the training dataset than the validation dataset. This means that we should expect some gap between the train and validation loss learning curves. This gap is referred to as the “generalization gap.”

A plot of learning curves shows a good fit if:

- The plot of training loss decreases to a point of stability.
- The plot of validation loss decreases to a point of stability and has a small gap with the training loss.
Continued training of a good fit will likely lead to an overfit.

##### Loss And F1-Score in epochs (Tunned Model):
![Loss And F1-Score](results_image/xdfm_perepoch.png)

Learning curves (LCs) are deemed effective tools for monitoring the performance of workers exposed to a new task. LCs provide a mathematical representation of the learning process that takes place as task repetition occurs.

##### Comparing Runs:
![Comparing Runs](results_image/xdfm_comaring_models.png)

##### Hyperparameters Tuning Parallel Coordinates Plot:
![Hyperparameters Tuning Parallel Coordinates Plot](results_image/xdfm_p_plot.png)

***

## Wide & Deep Model

#### Model Architectures

![Wide & Deep Model Architectures](figures/w&dm.png)

In generalisation scenario Embedding-based models, such as factorization machines or deep neural networks, can generalize to previously unseen query-item feature pairs by learning a low-dimensional dense embedding vector for each query and item feature, with less burden of feature engineering. However, it is difficult to learn effective low-dimensional representations for queries and items when the underlying query-item matrix is sparse and high-rank, such as users with specific preferences or niche items with a narrow appeal.
In such cases, there should be no interactions between most query-item pairs, but dense embeddings will lead to nonzero predictions for all query-item pairs, and thus can over-generalize and make less relevant recommendations.
In memorisation scenario
On the other hand, linear models with cross-product feature transformations can memorize these “exception rules” with much fewer parameters.
deep neural networks can generalize to previously unseen feature interactions through low dimensional embeddings.

1. **Wide Component**: The wide component is a generalized linear model
    of the form $\mathbf{y = w^t x + b}$, as illustrated
    in above Figure (left).
    y is the prediction, $\mathbf{x = [x_1, x_2, …, x_d]}$ is a vector
    of d features, $\mathbf{w = [w_1, w_2, …, w_d]}$ are the model parameters
    and b is the bias. The feature set includes raw input
    features and transformed. One of the most important
    transformations is the cross-product transformation,
    which is defined as:

    ### <center>$\phi_k (x) = \displaystyle{\prod_{i=1}^{d}} \; x_i^{c_{k_i}}$</center>

2. **Deep Component**
    The deep component is a feed-forward neural network,
    as shown in Above Figure (right). For categorical features,
    the original inputs are feature strings
    (e.g., “language=en”). Each of these sparse,
    high-dimensional categorical features are first converted
    into a low-dimensional and dense real-valued vector,
    often referred to as an embedding vector. The dimensionality
    of the embeddings are usually on the order of O(10) to
    O(100). The embedding vectors are initialized randomly and
    then the values are trained to minimize the final loss
    function during model training. These low-dimensional
    dense embedding vectors are then fed into the hidden layers
    of a neural network in the forward pass. Specifically,
    each hidden layer performs the following computation:

    ### <center>$a^{(l+1)} = f(W^{(l)}a^{(l)} + b^{(l)}) $</center>

    where l is the layer number and $f$
    is the activation function, often ReLUs.
    $a^{(l)}, b^{(l)}, W^{(l)}$ are the activations, bias, and model weights at l-th layer.

    deep neural networks can generalize to previously unseen feature interactions through low dimensional embeddings.

    > [Wide And Deep Model Addition Source](https://medium.com/analytics-vidhya/wide-deep-learning-for-recommender-systems-dc99094fc291)

#### Trained Model Architectures

##### TorchViz Result:
![Wide & Deep Model](deep_model_archits/Wide%20&%20Deep%20Model%20tz%20result.png)

##### Hidden Layer Result:
![Wide & Deep Model](deep_model_archits/Wide%20&%20Deep%20Model%20hl%20result.png)

#### Implement

Wide & Deep model has been implemented in the DeepModel.py (set DEEP_MODEL_NAME parameter to WADM_STR constant) and
With the help of following command, xDeepFM model is trained:

> The code has been commented.

#### Hyper-parameters tuning

Model hyper-parameters tuning is
 done with the ```optuna```
 library.

The BayesianSearch algorithm used in this
section has better performance than GridSearch and RandomSearch algorithm.

Bayesian optimization methods are efficient because they select hyperparameters in an informed manner. By prioritizing hyperparameters that appear more promising from past results, Bayesian methods can find the best hyperparameters in lesser time (in fewer iterations) than both grid search and random search.

This library

> [Grid search, Random search, Bayesian optimization](https://medium.com/analytics-vidhya/comparison-of-hyperparameter-tuning-algorithms-grid-search-random-search-bayesian-optimization-5326aaef1bd1#:~:text=Bayesian%20optimization%20methods%20are%20efficient,grid%20search%20and%20random%20search.)

```
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler())

study.optimize(objective_function, n_trials=15)

def objective_function(trial: optuna.Trial):
    params = {
        'learning_rate': trial.suggest_discrete_uniform('learning_rate', 1e-4, 6 * 1e-4, q=1 * 1e-4),
        'eps': trial.suggest_categorical('eps', [1e-8, ]),
        'weight_decay': trial.suggest_discrete_uniform('weight_decay', 1 * 1e-6, 5 * 1e-6, q=1 * 1e-6),
        'dropout': trial.suggest_discrete_uniform('dropout', 0.15, 0.35, q=0.1),
        'batch_size': trial.suggest_categorical('batch_size', [512]),
        'amsgrad': trial.suggest_categorical('amsgrad', [False, True]),
        'epochs': trial.suggest_categorical('epochs', [35]),
    }
    # Code logic
```

#### Result

With the help of ```MLFlow```
 Library, viewing
the results is very simple. To view
the results, just run the following command in
the terminal. (In wide and deep model section.)

To view the results, the following link should be opened in the browser:

##### Metrics and Parameters:


Metrics are used in the evaluation is:

- F-Score
- Accuracy
- AUC (Area Under Curve)
- Binary Cross Entropy Loss


![Metrics and Parameters](results_image/wadm_metrics_result.png)

A good fit is the goal of the learning algorithm and exists between an overfit and underfit model.

A good fit is identified by a training and validation loss that decreases to a point of stability with a minimal gap between the two final loss values.

The loss of the model will almost always be lower on the training dataset than the validation dataset. This means that we should expect some gap between the train and validation loss learning curves. This gap is referred to as the “generalization gap.”

A plot of learning curves shows a good fit if:

- The plot of training loss decreases to a point of stability.
- The plot of validation loss decreases to a point of stability and has a small gap with the training loss.
Continued training of a good fit will likely lead to an overfit.

##### Loss And F1-Score in epochs (Tunned Model):
![Loss And F1-Score](results_image/wadm_perepoch.png)

Learning curves (LCs) are deemed effective tools for monitoring the performance of workers exposed to a new task. LCs provide a mathematical representation of the learning process that takes place as task repetition occurs.


##### Comparing Runs:
![Comparing Runs](results_image/wadm_comparing_models.png)

##### Hyperparameters Tuning Parallel Coordinates Plot:
![Hyperparameters Tuning Parallel Coordinates Plot](results_image/wadm_p_plot.png)

***

## XGBoost Model

![XGBoost Model](figures/xgb.jpeg)

XGBoost is a decision-tree-based
ensemble Machine Learning algorithm that
uses a gradient boosting framework.
In prediction problems involving
unstructured data (images, text, etc.)
artificial neural networks tend to
outperform all other algorithms or
frameworks. However, when it comes
to small-to-medium structured/tabular
data, decision tree based algorithms
are considered best-in-class
right now.

![XGBoost Model](figures/xgb2.png)

1. **Decision Tree**: Every hiring manager has a set of criteria such as education level, number of years of experience, interview performance. A decision tree is analogous to a hiring manager interviewing candidates based on his or her own criteria.
2. **Bagging**: Now imagine instead of a single interviewer, now there is an interview panel where each interviewer has a vote. Bagging or bootstrap aggregating involves combining inputs from all interviewers for the final decision through a democratic voting process.
3. **Random Forest**: It is a bagging-based algorithm with a key difference wherein only a subset of features is selected at random. In other words, every interviewer will only test the interviewee on certain randomly selected qualifications (e.g. a technical interview for testing programming skills and a behavioral interview for evaluating non-technical skills).
4. **Boosting**: This is an alternative approach where each interviewer alters the evaluation criteria based on feedback from the previous interviewer. This ‘boosts’ the efficiency of the interview process by deploying a more dynamic evaluation process.
5. **Gradient Boosting**: A special case of boosting where errors are minimized by gradient descent algorithm e.g. the strategy consulting firms leverage by using case interviews to weed out less qualified candidates.
6. **XGBoost**: Think of XGBoost as gradient boosting on ‘steroids’ (well it is called ‘Extreme Gradient Boosting’ for a reason!). It is a perfect combination of software and hardware optimization techniques to yield superior results using less computing resources in the shortest amount of time.

#### Performance Comparison

![XGBoost Performance Comparison](figures/xgb3.jpeg)

As demonstrated in the chart above, XGBoost model has the best combination of prediction performance and processing time compared to other algorithms. Other rigorous benchmarking studies have produced similar results. No wonder XGBoost is widely used in recent Data Science competitions.

> [XGBoost Algorithm](https://towardsdatascience.com/https-medium-com-vishalmorde-xgboost-algorithm-long-she-may-rein-edd9f99be63d)

#### Implement

XGBoost model has been implemented in the XGBModel.py and
With the help of following command, XGBoost model is trained:

> The code has been commented.

#### Hyper-parameters tuning

Model hyper-parameters tuning is
 done with the ```scikit-optimize```
 library.

The BayesianSeach algorithm used in this
section has better performance than GridSearch and RandomSearch algorithm.

Bayesian optimization methods are efficient because they select hyperparameters in an informed manner. By prioritizing hyperparameters that appear more promising from past results, Bayesian methods can find the best hyperparameters in lesser time (in fewer iterations) than both grid search and random search.

This library

> [Grid search, Random search, Bayesian optimization](https://medium.com/analytics-vidhya/comparison-of-hyperparameter-tuning-algorithms-grid-search-random-search-bayesian-optimization-5326aaef1bd1#:~:text=Bayesian%20optimization%20methods%20are%20efficient,grid%20search%20and%20random%20search.)

```
cross_validation_n_fold = 5
opt = BayesSearchCV(
    xgb.XGBClassifier(),
    search_spaces={
        'n_estimator': Integer(low=500, high=1500, prior='uniform'),
        'max_depth': Integer(low=5, high=10, prior='uniform'),
        'booster': Categorical(['dart', 'gbtree']),
        'use_label_encoder': Categorical([False]),
        'eval_metric': Categorical(['logloss']),
    },
    n_iter=50,
    cv=cross_validation_n_fold,
    scoring=lambda estimator, X, y: f1_score(y, estimator.predict(X)),
    return_train_score=True,
    verbose=0
)
```

This library also supports Cross-Validation and
 has been used in this project.

#### Result

With the help of ```MLFlow```
 Library, viewing
the results is very simple. To view
the results, just run the following command in
the terminal.

To view the results, the following link should be opened in the browser:

##### Metrics and Parameters:

Metrics are used in the evaluation is:

- F-Score
- Accuracy
- AUC (Area Under Curve)
- Log Loss

![Metrics and Parameters](results_image/xgbmetrics_result.png)

##### Comparing Runs:
![Comparing Runs](results_image/xgb_comparing_models.png)

##### Hyperparameters Tuning Parallel Coordinates Plot:
![Hyperparameters Tuning Parallel Coordinates Plot](results_image/xgb_p_plot.png)


## Containers Pipeline and Deployment

For pipeline instructions please visit our github repository: