In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import cross_val_predict
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from numpy import savetxt
from scipy import stats
from keras import Sequential
from keras.layers.core import Dense, Dropout
import tensorflow as tf
from keras.wrappers.scikit_learn import KerasClassifier

## **1. Data Preprocessing**

### **1.1. Preprocessing of training data**

In [None]:
# Load the tab-separated training set into a DataFrame.
train_csv_path = "/kaggle/input/les-variables-des-images-de-publicit/data_train.csv"
df_train = pd.read_csv(train_csv_path, sep='\t')

In [None]:
# Show (n_rows, n_columns) of the training frame as loaded, before any cleaning.
print(df_train.shape)

#### **1.1.1. Removing the duplicate values**

In [None]:
# Flag every row that repeats an earlier row on all columns (the first occurrence is not flagged):
is_repeated_row = df_train.duplicated()
df_duplicateRows = df_train[is_repeated_row]
print("Duplicate rows:", df_duplicateRows)
print(len(df_duplicateRows)) # 562 rows / 2459 rows

# Keep only the first occurrence of each row:
df_train = df_train.drop_duplicates() # 2459 rows - 562 rows = 1897 rows
# print(df_train.shape)

In [None]:
# Duplicate columns are detected on the transposed frame, where every column becomes a row.
df_train_tr = df_train.T
duplicate_col_mask = df_train_tr.duplicated()
df_tr_duplicateCols = df_train_tr[duplicate_col_mask]
# print("Duplicate columns:", df_tr_duplicateCols)
print(len(df_tr_duplicateCols)) # 820 cols / 1559 cols

In [None]:
# Remove duplicate columns:
# The duplicate mask is computed on the transposed frame, but the surviving columns are
# selected from the ORIGINAL frame. Transposing back (as before) would give every column
# the single common dtype produced by the first transpose (object when dtypes are mixed),
# which breaks or slows down the numeric steps that follow.
is_duplicate_col = df_train_tr.duplicated()   # indexed by the column labels of df_train
df_train_tr = df_train_tr.drop_duplicates()   # 1559 cols - 820 cols = 739 cols
df_train = df_train.loc[:, ~is_duplicate_col].copy()  # .copy(): later cells modify df_train
print(df_train.shape)

#### **1.1.2. Filling the missing values**

#### Ratio of the missing values:

In [None]:
# Number of missing entries in each column.
missing_values_count = df_train.isna().sum()
print("missing values count:\n", missing_values_count)

# Total number of cells vs. total number of missing cells.
# `DataFrame.size` (rows * columns) replaces `np.product(df_train.shape)`:
# `np.product` was deprecated in NumPy 1.25 and removed in NumPy 2.0.
total_cells = df_train.size
total_missing = missing_values_count.sum()

print("total_cells:", total_cells)
print("total_missing:",total_missing)

#### Fill the missing values in the first four columns (X1–X4) with the median of each column:

In [None]:
# Per-column medians of the four numeric columns (NaNs are skipped by `median`).
median_x1 = df_train['X1'].median()
median_x2 = df_train['X2'].median()
median_x3 = df_train['X3'].median()
median_x4 = df_train['X4'].median()

# print("median_x1:", median_x1) # 60.0
# print("median_x2:", median_x2) # 114.0
# print("median_x3:", median_x3) # 2.2708
# print("median_x4:", median_x4) # 1.0

# Fill each column with its own median in a single call.
# `df[col].fillna(..., inplace=True)` operates on a temporary Series: it raises a
# FutureWarning on recent pandas and leaves the frame untouched under copy-on-write,
# so the filled result is assigned back explicitly instead.
df_train = df_train.fillna({
    'X1': median_x1,
    'X2': median_x2,
    'X3': median_x3,
    'X4': median_x4,
})

### **1.2. Preprocessing of test data**

In [None]:
# Load the tab-separated test set into a DataFrame.
test_csv_path = "/kaggle/input/les-variables-des-images-de-publicit/data_test.csv"
df_test = pd.read_csv(test_csv_path, sep='\t')

#### **1.2.1. Filling the missing values**

In [None]:
# Rows of the test set that contain at least one missing value.
df_nans = df_test[df_test.isnull().any(axis=1)] # 394 rows have one or more NaN values
print(df_nans)

# Values used to impute the test set. They are the TRAINING medians computed earlier
# (median_x1 ... median_x4), so train and test go through the same transformation
# instead of each being imputed with its own statistics.
median_x1_t = median_x1
median_x2_t = median_x2
median_x3_t = median_x3
median_x4_t = median_x4

# print("median_x1:", median_x1_t) 
# print("median_x2:", median_x2_t) 
# print("median_x3:", median_x3_t) 
# print("median_x4:", median_x4_t) 

# Assign the filled frame back: chained `df[col].fillna(..., inplace=True)` is deprecated
# and leaves the frame untouched under pandas copy-on-write.
df_test = df_test.fillna({
    'X1': median_x1_t,
    'X2': median_x2_t,
    'X3': median_x3_t,
    'X4': median_x4_t,
})

In [None]:
# Display the first rows of the test frame after imputation.
df_test.head() 

### **1. 3. Normalization of numeric dataframes of training and test data**

#### **1.3.1. Separating the numeric and boolean dataframes**

#### Creating two dataframes, as the first part is continuous variables (floating) and the second part is boolean variables :

In [None]:
# The first four columns hold the continuous features; everything after them is boolean.
n_numeric_cols = 4

df_train_fl = df_train.iloc[:, :n_numeric_cols]
# print("df_pubData_Train_fl:\n", df_train_fl.head())

df_train_bool = df_train.iloc[:, n_numeric_cols:]
# print("boolean dataframe:\n", df_train_bool.head())

#### Same division of dataframes for the test data:

In [None]:
# Same positional split for the test set: four continuous columns, then the boolean ones.
n_numeric_cols = 4

df_test_fl = df_test.iloc[:, :n_numeric_cols]
# print("df_pubData_Train_fl:\n", df_test_fl.head())

df_test_bool = df_test.iloc[:, n_numeric_cols:]
# print("boolean dataframe:\n",df_test_bool.head())

#### **1. 3. 2. Removing outliers of numeric training dataframe**

#### Using the box plot below, we can easily visualize the outliers of the numerical columns as individual points.

In [None]:
# X1_column = df_test_fl["X1"]
# X2_column = df_test_fl["X2"]
# X3_column = df_test_fl["X3"]

# train_fl_columns = [X1_column, X2_column, X3_column]

# fig, ax = plt.subplots()
# ax.boxplot(train_fl_columns)
# plt.show()

#### The script below computes the Z-score of each value in the column, relative to the column mean and standard deviation.
#### While calculating the Z-score, we re-scale and center the data and look for data points which are too far from zero. If the Z-score is greater than 3 or less than -3, the data point is identified as an outlier.

In [None]:
# The string below is a bare expression used as a note; the filtering code under it is
# intentionally disabled so that no test row is dropped.
''' we better NOT remove the outliers of the test dataset '''
# print(df_test_fl.shape)
# df_test_fl_filtered = df_test_fl[(np.abs(stats.zscore(df_test_fl)) < 3).all(axis=1)] # "all(axis=1)" ensures that all column satisfy the constraint.
# print(df_test_fl_filtered.shape)

In [None]:
print(df_train_fl.shape)
# Absolute z-score of every value relative to its column mean / standard deviation.
abs_z_scores = np.abs(stats.zscore(df_train_fl))
# A row is kept only if all of its numeric columns lie within 3 standard deviations.
within_three_sigma = (abs_z_scores < 3).all(axis=1)
df_train_fl_filtered = df_train_fl[within_three_sigma]
# NOTE(review): the later cells in this notebook keep using `df_train_fl`, not this filtered frame.
print(df_train_fl_filtered.shape)

In [None]:
# https://stackoverflow.com/questions/23199796/detect-and-exclude-outliers-in-pandas-data-frame
    
# X1_filtered = X1_column[X1_column.between(X1_column.quantile(.05), X1_column.quantile(.95))]
# print(X1_filtered.shape)
# X1_filtered.plot.box()

# X2_filtered = X2_column[X2_column.between(X2_column.quantile(.05), X2_column.quantile(.95))]
# print(X2_filtered.shape)
# X2_filtered.plot.box()

# X3_filtered = X3_column[X3_column.between(X3_column.quantile(.05), X3_column.quantile(.95))]
# print(X3_filtered.shape)
# X3_filtered.plot.box()

#### **1. 3. 3. Scaling of numeric dataframes of training and test data**

In [None]:
# Min-max scaler mapping each training column onto [0, 1].
scaler = MinMaxScaler(feature_range=(0,1))

cols = ["X1", "X2", "X3", "X4"]
# Learn the per-column min/max on the TRAINING data and scale it.
df_train_fl = pd.DataFrame(scaler.fit_transform(df_train_fl), columns = cols)
# Scale the test data with the min/max learned on the training data. Re-fitting on the
# test set (as `fit_transform` did) maps the same raw value to different scaled values
# in train and test. Test values outside the training range may fall outside [0, 1].
df_test_fl = pd.DataFrame(scaler.transform(df_test_fl), columns = cols)

### **1. 4. Dimensionality reduction of training and test data**

#### While applying dimensionality reduction, first we fit the model using training data, and then we can use to transform the training and validation data.
#### (We can't apply oversampling to validation data.)

#### **1. 4. 1. SVD for the dimensionality reduction of sparse boolean features** 

#### "Singular Value Decomposition, or SVD, is one of the most popular techniques for dimensionality reduction for sparse data (data with many zero values)."
#### https://machinelearningmastery.com/dimensionality-reduction-algorithms-with-python/
#### https://machinelearningmastery.com/singular-value-decomposition-for-dimensionality-reduction-in-python/

#### Separating label column from training boolean dataframe:

In [None]:
# Take the binary label by NAME rather than by position, so that re-running this cell
# does not strip one more feature column from `df_train_bool` each time.
# Kept as a one-column DataFrame, as before.
label_train = df_train[['outcome']]
# label_train.shape # (1895, 1)
print(label_train.value_counts()) # 0: 1585, 1: 312

# drop the "outcome" column (binary label column); a no-op if it is already gone:
df_train_bool = df_train_bool.drop(columns=['outcome'], errors='ignore')
df_train_bool.head()
# df_pubData_bool.shape # (1895, 1556)

#### **1. 4. 2. Optimizing "n_components" parameter in TruncatedSVD**

#### **For training boolean data:**

#### The script below belongs to Chris Albon: https://chrisalbon.com/machine_learning/feature_engineering/select_best_number_of_components_in_tsvd/

In [None]:
# Fit a TruncatedSVD using one component fewer than the number of boolean features.
n_bool_features = df_train_bool.shape[1]
tsvd = TruncatedSVD(n_components=n_bool_features - 1)
X_tsvd = tsvd.fit(df_train_bool)

# Explained-variance ratio of each fitted component
tsvd_var_ratios = tsvd.explained_variance_ratio_

# Calculating number of components required to pass threshold
def select_n_components(var_ratio, goal_var: float) -> int:
    """Count how many leading components are needed to reach a variance goal.

    Args:
        var_ratio: iterable of per-component explained-variance ratios, in order.
        goal_var: cumulative explained variance to reach (e.g. 0.95).

    Returns:
        The number of components consumed when the running sum first becomes
        >= ``goal_var``; if the goal is never reached, the total number of
        components (0 for an empty input).
    """
    running_sum = 0.0
    used = 0

    for used, ratio in enumerate(var_ratio, start=1):
        running_sum += ratio
        if running_sum >= goal_var:
            break

    return used

# Number of components needed to explain 95% of the variance of the training boolean features.
select_n_components(tsvd_var_ratios, 0.95) # 290

#### **For test boolean data:** 

In [None]:
# Fit a TruncatedSVD on the test boolean features, again with n_features - 1 components.
n_test_bool_features = df_test_bool.shape[1]
tsvd = TruncatedSVD(n_components=n_test_bool_features - 1)
X_tsvd = tsvd.fit(df_test_bool)

# Explained-variance ratio of each fitted component
tsvd_var_ratios = tsvd.explained_variance_ratio_

# Calculating number of components required to pass threshold
def select_n_components(var_ratio, goal_var: float) -> int:
    """Return how many leading components reach the cumulative variance ``goal_var``.

    NOTE(review): this repeats the definition from the training-data cell;
    running this cell simply rebinds the same name.

    Args:
        var_ratio: iterable of per-component explained-variance ratios, in order.
        goal_var: cumulative explained variance to reach (e.g. 0.95).

    Returns:
        Number of components consumed when the running sum first becomes
        >= ``goal_var``, or the total number of components if it never does
        (0 for an empty input).
    """
    cumulative = 0.0
    n_used = 0

    for n_used, ratio in enumerate(var_ratio, start=1):
        cumulative += ratio
        if cumulative >= goal_var:
            break

    return n_used

# Number of components needed to explain 95% of the variance of the test boolean features.
select_n_components(tsvd_var_ratios, 0.95) # 206

#### **1. 4. 3. Applying SVD encoding to boolean dataframes:**

In [None]:
# Fit ONE SVD on the training boolean features; it is reused below for the test set.
# random_state pins the randomized solver so the projection is reproducible.
SVD_model = TruncatedSVD(n_components=290, random_state=42).fit(df_train_bool)
print(df_train_bool.shape)

df_train_bool_reduced = pd.DataFrame(SVD_model.transform(df_train_bool)) # pd.DataFrame is used to retain as data frame object
print(df_train_bool_reduced.shape)

# apply the SAME fitted transformation to the boolean test features.
# Fitting a second SVD on the test set (with a different n_components) produced
# components unrelated to the training ones and a different feature count (206 vs 290),
# so models trained on the reduced training data could not be applied to x_test.
# The test frame is first restricted to the columns the SVD was fitted on
# (assumes the test file contains every retained training column; a KeyError here
# means the two schemas differ).
df_test_bool_aligned = df_test_bool[df_train_bool.columns]
print(df_test_bool.shape)

df_test_bool_reduced = pd.DataFrame(SVD_model.transform(df_test_bool_aligned))
print(df_test_bool_reduced.shape)

In [None]:
# df_test_bool_reduced.head()

#### Putting dataframes of test data together:

In [None]:
# indexes are not matching after preprocessing so we need to drop:
# Give both parts a fresh 0..n-1 index so they line up row by row when concatenated
# (assigned back instead of mutated in place, so the cell can be re-run safely):
df_test_bool_reduced = df_test_bool_reduced.reset_index(drop=True)
df_test_fl = df_test_fl.reset_index(drop=True)

# concatenation of boolean with numeric test dataframes:
x_test = pd.concat( [df_test_fl, df_test_bool_reduced], axis=1 )
# The numeric part has string column names ("X1".."X4") and the SVD part has integer
# ones (0, 1, ...). scikit-learn >= 1.2 raises a TypeError on mixed name types, so
# every column name is converted to a string.
x_test.columns = x_test.columns.astype(str)
print((x_test.shape)) # (n_test_rows, 4 + number of SVD components)
x_test.head()

### **1. 5. Balancing the highly imbalanced dataset**

#### **1. 5. 1. Visualizing the ratio of classes**

In [None]:
# Class frequencies of the label column.
target_count = df_train.outcome.value_counts()
n_class0 = target_count[0]
n_class1 = target_count[1]
print('Class 0:', n_class0)
print('Class 1:', n_class1)
print('Proportion:', round(n_class0 / n_class1, 2), ': 1')

# Bar chart of the two class counts.
target_count.plot.bar(title='Count (target)')

#### **1. 5. 2. Applying several oversampling methods and comparing with cross validation**

In [None]:
# oversample = RandomOverSampler(sampling_strategy='minority')
# x_over, y_over = oversample.fit_resample(df_train_bool, label_train)

# sm = SMOTE(random_state = 42)
# x_sm, y_sm = sm.fit_resample(df_train_bool, label_train)

# sm = ADASYN()
# x_syn, y_syn = sm.fit_resample(df_train_bool, label_train)

# # print("sizes before:", df_train_bool.shape, label_train.shape)
# # print("sizes after RandomOverSampler resampling:",x_over.shape, y_over.shape)
# # print("sizes after SMOTE resampling:",x_sm.shape, y_sm.shape)
# # print("sizes after ADASYN resampling:",x_syn.shape, y_syn.shape)

In [None]:
# # define the pipeline
# steps = [('svd', TruncatedSVD(n_components=272)), ('m', LogisticRegression())] 
# model = Pipeline(steps=steps)

# # evaluate model
# cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=1, random_state=1)

# y_pred_ROS = cross_val_predict(model, x_over, y_over, cv=cv, n_jobs=-1) # 0.97 recall for nonad
# print("classification report after RandomOverSampler:\n", classification_report(y_over, y_pred_ROS))

# y_pred_sm = cross_val_predict(model, x_sm, y_sm, cv=cv, n_jobs=-1)
# print("classification report after SMOTE:\n", classification_report(y_sm, y_pred_sm))

# y_pred_syn = cross_val_predict(model, x_syn, y_syn, cv=cv, n_jobs=-1)
# print("classification report after ADASYN:\n", classification_report(y_syn, y_pred_syn))

#### RandomOverSampler gives the highest recall and f1-score on boolean datasets, so it will be the sampling method to apply:

In [None]:
# A fixed integer random_state is required here. This sampler is applied twice, once to
# the boolean part and once to the numeric part, against the same labels. With an
# integer seed every `fit_resample` call re-seeds and duplicates the SAME minority rows,
# so the two resampled parts stay row-aligned when they are concatenated afterwards.
# Without a seed the two calls pick different rows and the combined features get mixed up.
oversample = RandomOverSampler(sampling_strategy='minority', random_state=42)
# oversampling of boolean dataframe (SVD-reduced features + one-column label frame):
x_over_bool, y_over = oversample.fit_resample(df_train_bool_reduced, label_train)
print((x_over_bool.shape), (y_over.shape))

#### Dataframe sizes before oversampling: (1897, 290), (1897, 1) ; after oversampling: (3170, 290), (3170, 1)

In [None]:
# Oversample the numeric (float) features against the same labels, reusing the
# `oversample` sampler created in the previous cell; `y_over` is overwritten.
x_over_fl, y_over = oversample.fit_resample(df_train_fl, label_train)
print(x_over_fl.shape, y_over.shape)

#### Putting dataframes of train data together:

In [None]:
# concatenation of boolean with numeric training dataframes:
x_over = pd.concat( [x_over_fl, x_over_bool], axis=1 )
# The numeric part has string column names ("X1".."X4") and the SVD part has integer
# ones (0, 1, ...). scikit-learn >= 1.2 raises a TypeError on mixed name types, so
# every column name is converted to a string.
x_over.columns = x_over.columns.astype(str)
# print((x_over.shape))
x_over.head()

## **2. Evaluating classification models with 10-fold cross validation**

#### For every sample, `cross_val_predict` returns the prediction made while that sample was in the held-out fold. The classification report below is therefore computed once over all of these out-of-fold predictions, rather than as one score per fold.

In [None]:
# 10-fold stratified splitter (a single repetition, fixed seed); reused by the next cell.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=1, random_state=1)

# Out-of-fold predictions of a default logistic regression on the oversampled data.
model = LogisticRegression()
y_pred_logr = cross_val_predict(model, x_over, y_over, cv=cv, n_jobs=-1)
logr_report = classification_report(y_over, y_pred_logr)
print("classification report for logistic regression classifier:\n", logr_report)

In [None]:
# Out-of-fold predictions of a default decision tree, using the same `cv` splitter as above.
model = DecisionTreeClassifier()
y_pred_dt = cross_val_predict(model, x_over, y_over, cv=cv, n_jobs=-1)
dt_report = classification_report(y_over, y_pred_dt)
print("classification report for decision tree classifier:\n", dt_report)

#### In the classification reports above, both methods result in the same *average accuracy* score. Since our data has a bias for the minority class "nonad", we need to compare the *recall* and *precision* values to be sure that this class is correctly classified. For the non-ad class, the *recall* value is 0.93 with the decision tree classifier and 0.98 with the logistic regression classifier. Therefore, the logistic regression classifier outperforms the decision tree classifier on this dataset.

## **3. Predictions based on several models**

### **3. 1. Fitting a logistic regression model**

In [None]:
# Sanity check: the feature matrix and the labels must have the same number of rows.
print(x_over.shape)
print(y_over.shape)

In [None]:
# Fit a default logistic regression on the full oversampled training set.
logreg_model = LogisticRegression()
# `y_over` is a one-column frame/array; flatten it to 1-D, which is the shape the
# estimator expects (a column vector triggers a DataConversionWarning).
logreg_model.fit(x_over, np.ravel(y_over))

#### Predicting the test set results:

In [None]:
# Predict a class label for every row of the test set with the fitted logistic regression.
y_pred = logreg_model.predict(x_test) # output: numpy.ndarray

In [None]:
# # save to csv file
# savetxt('y_pred.csv', y_pred, delimiter=',', fmt=('%s'))

### **3. 2. Fitting a deep neural network model**

In [None]:
# training dataset: x_over, y_over
# testing dataset: x_test

# Encode the labels as 0/1 integers for the network; `lb` is kept so predictions can be
# mapped back to the original labels later.
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelBinarizer.html
from sklearn import preprocessing
lb = preprocessing.LabelBinarizer()
lb.fit(y_over)
y_over = lb.transform(y_over) # 'numpy.ndarray' object

In [None]:
# Split the data to get validation dataset (67% train / 33% validation).
# random_state fixes the shuffle so that a Restart & Run All reproduces the same split
# and therefore comparable training curves.
x_train, x_valid, y_train, y_valid = train_test_split(
    x_over, y_over, test_size=0.33, shuffle=True, random_state=42
)
# print(x_train.shape)
# print(y_train.shape)
# print(x_valid.shape)
# print(y_valid.shape)

#### **To try in future:** We can model 2 different networks by using 2 different input sets we have (boolean and numeric) and then concatenate these at the end: 
https://www.pyimagesearch.com/2019/02/04/keras-multiple-inputs-and-mixed-data/ 

In [None]:
# how to choose between sigmoid or relu activation function?
#"Relu is less susceptible to vanishing gradients that prevent deep models from being trained, although it can suffer from other problems like saturated or “dead” units."
# https://machinelearningmastery.com/choose-an-activation-function-for-deep-learning/

# Dropout regularization to prevent overfitting https://machinelearningmastery.com/dropout-regularization-deep-learning-models-keras/
# "You are likely to get better performance when dropout is used on a larger network, giving the model more of an opportunity to learn independent representations."

# create a multilayer perceptron model with number of layers 7:
def create_model():
    """Build and compile the multilayer perceptron used for ad / non-ad classification.

    The input width is read from the global ``x_train`` (number of feature columns).
    The network is a stack of 7 Dense layers (128-64-32-16-8-2-1) with Dropout after
    the first four; the final single sigmoid unit outputs the probability of the
    positive class.

    Returns:
        A compiled ``Sequential`` model trained with binary cross-entropy and Adam,
        reporting recall, precision and accuracy.
    """
    model = Sequential()
    model.add(Dense(128, input_dim = x_train.shape[1], activation = 'sigmoid'))
    model.add(Dropout(.2))
    model.add(Dense(64, activation = 'relu'))
    model.add(Dropout(.2))
    model.add(Dense(32, activation = 'relu'))
    model.add(Dropout(.2))
    model.add(Dense(16, activation = 'relu'))
    model.add(Dropout(.1))
    model.add(Dense(8, activation = 'relu'))
    model.add(Dense(2, activation = 'relu'))
    model.add(Dense(1, activation = 'sigmoid'))

    # compile model:
    # Binary cross-entropy loss is used for binary (0 or 1) classification applications.
    # BinaryAccuracy thresholds the sigmoid output at 0.5 before comparing with the label.
    # The generic `Accuracy` metric checks exact equality between the label and the raw
    # probability, which is almost never true and reports a meaningless near-zero value.
    # It is named 'accuracy' so the history keys 'accuracy' / 'val_accuracy' are unchanged.
    model.compile(
        loss = 'binary_crossentropy',
        optimizer = 'adam',
        metrics = [
            tf.keras.metrics.Recall(),
            tf.keras.metrics.Precision(),
            tf.keras.metrics.BinaryAccuracy(name='accuracy'),
        ],
    )
    return model

In [None]:
# Wrap the model factory in the scikit-learn style classifier.
# "verbose=2" to see the training progress for each epoch.
keras_model = KerasClassifier(build_fn = create_model, verbose=1)

# Training options, forwarded unchanged to the underlying Keras fit call.
fit_options = dict(
    epochs=300,
    batch_size=50,
    validation_data=(x_valid, y_valid),
    shuffle=True,
)

# fit the keras model on the dataset; `history` holds the per-epoch metrics
history = keras_model.fit(x_train, y_train, **fit_options)

In [None]:
# history_dict = history.history
# print(history_dict.keys())

In [None]:
def _plot_history_curves(fit_history, metric, title):
    """Plot the per-epoch training and validation curves of one recorded metric.

    Args:
        fit_history: object returned by the Keras fit call; its ``.history`` dict must
            contain ``metric`` and ``'val_' + metric``.
        metric: name of the recorded quantity, e.g. ``'accuracy'`` or ``'loss'``.
        title: figure title; the y-axis is labelled with ``metric``.
    """
    fig, ax = plt.subplots()
    ax.plot(fit_history.history[metric])
    ax.plot(fit_history.history['val_' + metric])
    ax.set_title(title)
    ax.set_ylabel(metric)
    ax.set_xlabel('epoch')
    # The second curve comes from the validation split passed to fit(), not from the
    # test set, so it is labelled 'validation'.
    ax.legend(['train', 'validation'], loc='upper left')
    plt.show()

# summarize history for accuracy
_plot_history_curves(history, 'accuracy', 'model accuracy')
# summarize history for loss
_plot_history_curves(history, 'loss', 'model loss')

In [None]:
# Predict the test set with the trained network; this overwrites the `y_pred`
# produced earlier by the logistic regression model.
y_pred = keras_model.predict(x_test) # output: numpy.ndarray
# print(y_pred)

In [None]:
# reverse transform the prediction to string value:
# `lb` is the LabelBinarizer fitted on the training labels; `y_pred` is overwritten
# with the decoded labels.
# NOTE(review): re-running this cell alone would pass the already-decoded labels back
# into inverse_transform; re-run the prediction cell first.
y_pred = lb.inverse_transform(y_pred, threshold=None)
# print(y_pred)