## Libraries

In [None]:
import numpy as np
import pandas as pd
import random

import datetime
import os

import bokeh

from bokeh.io import output_notebook
from bokeh.plotting import figure, output_file, show
from bokeh.models import ColumnDataSource, FactorRange, HoverTool, BasicTicker, ColorBar, LinearColorMapper
from bokeh.palettes import Spectral11, colorblind, Inferno, BuGn, brewer, Category20, Viridis256
from bokeh.layouts import row, column, grid



import tensorflow as tf
from tensorflow import keras

import seaborn as sns
import matplotlib.pyplot as plt
# Configure Bokeh so that show() renders figures inline in the notebook output.
output_notebook()

For this month, let's explore the data and create a basic classifier using Tensorflow.

- ***First Question: Are all of these 100 features useful? How can we use them?***
- ***Second Question: How do we create a simple data pipeline and model using TensorFlow?***


Let's check this

## Load data

In [None]:
# Collect the path of every file available under the Kaggle input directory.
files = [
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
]

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session


def _find_input_file(basename, fallback_index):
    """Return the path in `files` whose file name equals `basename`.

    os.walk yields directory entries in an arbitrary, filesystem-dependent
    order, so selecting a file purely by its position can silently swap the
    train and test sets. The position is therefore only used as a fallback
    when no collected file carries the expected name.

    Parameters
    ----------
    basename : str
        File name to look for (without directory).
    fallback_index : int
        Index into `files` used when no file is named `basename`.

    Raises
    ------
    IndexError
        If the fallback is needed and `files` has no such index.
    """
    for path in files:
        if os.path.basename(path) == basename:
            return path
    return files[fallback_index]


# NOTE(review): 'train.csv' / 'test.csv' are the assumed file names; if the
# dataset uses different names the original positional choice is kept.
train_df = pd.read_csv(_find_input_file('train.csv', 1), index_col = 'id')
test_df = pd.read_csv(_find_input_file('test.csv', 2), index_col = 'id')

# pop() removes the label column from train_df, leaving only the features.
train_target = train_df.pop('target')

In [None]:
# Preview the first five rows (5 is the default for head()).
train_df.head()

In [None]:
# Number of missing entries in each column.
missing_per_column = train_df.isna().sum()
print(missing_per_column)

# Author's conclusion from this output: there are no empty values to fill.

Let's explore the data distribution of each feature

In [None]:
def plot_extremum(dataframe):
    """Build a scatter plot with one marker per column of `dataframe`.

    Each marker is placed at (column maximum, column minimum), which makes
    columns living on a very different scale stand out at a glance.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose columns are summarised by their max and min.

    Returns
    -------
    The Bokeh figure (not yet shown). Hovering a marker displays its
    (max, min) pair and the column name.
    """
    source = ColumnDataSource(data = dict(
        x = dataframe.max().to_list(),
        y = dataframe.min().to_list(),
        desc = dataframe.columns.to_list(),
    ))

    hover = HoverTool(tooltips = [
        ("(x,y)" , "(@x, @y)"),
        ('desc', '@desc')
    ])

    p = figure(width = 800, height = 800, tools = [hover], title = 'Exploring possible outilers for each features', toolbar_location = 'right')

    # Without labels the reader cannot tell which axis carries which statistic.
    p.xaxis.axis_label = 'max'
    p.yaxis.axis_label = 'min'

    # scatter() draws circle markers by default; circle() with a `size`
    # argument is deprecated in recent Bokeh releases.
    p.scatter('x', 'y', size = 15, alpha = 0.4, source = source, fill_color = 'navy')

    return p

p = plot_extremum(train_df)

show(p)

It clearly appears that features `f35`, `f2` and `f44` might have some outliers; let's inspect them closely.


For more information, we built a scatter plot mapping (max, min) for each feature. <br>
We can't conclude anything from this plot **ALONE**, however. <br>
It mainly helps to gather information rapidly when handling a large number of features.

In [None]:
def _strip_plot(feature, n_samples = 10000):
    """Return a figure showing a random sample of one feature on a single line.

    All points are drawn at y = 0 so only their spread along x is visible.

    Parameters
    ----------
    feature : str
        Name of the train_df column to plot.
    n_samples : int
        Number of rows to draw at random; clamped to the number of rows
        available because Series.sample raises when asked for more.
    """
    n_rows = min(n_samples, len(train_df))
    values = train_df[feature].sample(n_rows).to_list()

    source = ColumnDataSource(data = dict(x = values, y = [0.0] * len(values)))

    fig = figure(width = 500, height = 400, title = f'Distribution of {feature} values', toolbar_location = 'right')

    # scatter() draws circle markers by default; circle() with a `size`
    # argument is deprecated in recent Bokeh releases.
    fig.scatter('x', 'y', source = source, size = 6, fill_color = 'black', line_color = 'grey')

    return fig


# One figure per feature that stood out in the min/max scatter plot.
p, p1, p2 = [_strip_plot(feature) for feature in ('f2', 'f35', 'f44')]

show(row([p, p1, p2]))

Finally,  these three features are just not on the same scale as others, so we must scale them down. <br>

However before doing so, we will analyse the other features

In [None]:
# Same min/max scatter as before, restricted to the features other than
# f2, f35 and f44. The reduced frame is built once instead of being
# recomputed for every statistic.
reduced_df = train_df.drop(columns = ['f2','f35', 'f44'])

list_x = reduced_df.max().to_list()
list_y = reduced_df.min().to_list()

desc = reduced_df.columns.to_list()

source  = ColumnDataSource(data = dict( x= list_x, y= list_y, desc = desc))

hover = HoverTool(tooltips = [
    ("(x,y)" , "(@x, @y)"),
    ('desc', '@desc')
])

p = figure(width = 600, height = 600, tools = [hover], title = 'Exploring possible outilers for each features', toolbar_location = 'right')

# x carries each column's maximum and y its minimum.
p.xaxis.axis_label = 'max'
p.yaxis.axis_label = 'min'

# scatter() draws circle markers by default; circle() with a `size`
# argument is deprecated in recent Bokeh releases.
p.scatter('x', 'y', size = 15, alpha = 0.4, source = source, fill_color = 'navy', line_color = 'white')

show(p)

Most of our features contain values within `[-5,10]` <br>
To go further, we could analyse each feature that falls outside this range to check whether its extreme values are outliers <br>

We will skip this part here

## A close look at distributions


Let's analyse the group of features within `[-5,10]` using a violin plot! <br>
- For visibility, we are going to plot only 10 randomly chosen features

In [None]:
def plot_violin(dataframe, n = 10, sample_size = 10000, max_value = 10):
    """Draw violin plots for a random selection of bounded columns.

    Only columns whose maximum is at most `max_value` are eligible. `n` of
    them are picked at random and `sample_size` random rows are plotted.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Source of the values to plot.
    n : int
        Number of columns to draw; clamped to the number of eligible columns.
    sample_size : int
        Number of rows to sample; clamped to the number of rows available.
    max_value : float
        Upper bound on a column's maximum for it to be eligible.

    Raises
    ------
    ValueError
        If no column has a maximum of at most `max_value`.
    """
    column_max = dataframe.max()
    candidates = column_max[column_max <= max_value].index.to_list()

    if not candidates:
        raise ValueError(f'no column has a maximum <= {max_value}')

    # random.sample raises when asked for more items than the population holds.
    chosen = random.sample(candidates, min(n, len(candidates)))

    # DataFrame.sample (without replacement) raises when n exceeds the row count.
    n_rows = min(sample_size, len(dataframe))

    # Long format: one row per (column name, value) pair, as seaborn expects.
    distrib_df = dataframe[chosen].sample(n_rows).melt(var_name = 'Column', value_name = 'Raw')

    fig, ax = plt.subplots(figsize = (16, 8))
    sns.violinplot(x = 'Column', y = 'Raw', data = distrib_df, ax = ax)

    plt.show()

plot_violin(train_df)

We cannot learn much from this graph: the values are too spread out, mostly because we have so many of them, <br>
so outliers are very likely to occur <br>



As expected in this kind of problem, we are going to standardise the features (subtract each column's mean and divide by its standard deviation)

- *We could also apply min-max normalisation to our data* 

In [None]:
# Per-column statistics of the training features; the same values are applied
# to the test set later in the notebook.
means_df = train_df.mean()
std_df = train_df.std()

# z-score: centre every column on its mean, then scale by its standard deviation.
normalised_df = train_df.sub(means_df, axis = 'columns').div(std_df, axis = 'columns')

Let's quickly re-run our previous visualisations 

In [None]:
# Same (max, min) scatter as earlier, now computed on the scaled features.
p5 = plot_extremum(normalised_df)


show(p5)

In [None]:
# Violin plots of 10 randomly chosen columns of the scaled features.
plot_violin(normalised_df,10)

Even if several features have some huge outliers, let's keep it like this for the moment

___

## Building a simple Neural Net

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Number of samples per batch produced by build_dataset.
BATCH_SIZE = 256

# Number of epochs passed to model.fit.
EPOCHS = 30


# Size of the model's input layer.
# NOTE(review): presumably must equal the number of feature columns — confirm.
NUMBER_FEATURE = 100

In [None]:
# Hold out 10% of the rows for validation. A fixed random_state makes the
# split identical on every run of the notebook.
x_train, x_val, y_train, y_val = train_test_split(np.asarray(normalised_df),
                                                  np.asarray(train_target),
                                                  test_size = 0.1,
                                                  random_state = 42)


def build_dataset(features_matrix, target, training = True):
    """Wrap a feature matrix and its labels into a batched tf.data pipeline.

    Parameters
    ----------
    features_matrix : array-like
        Features, sliced along the first axis into individual samples.
    target : array-like
        Labels, sliced along the first axis in step with the features.
    training : bool
        When True (default) samples are shuffled and the last incomplete
        batch is dropped. When False the data is served in order and in
        full, so evaluation metrics cover every sample.

    Returns
    -------
    tf.data.Dataset yielding (features, target) batches of BATCH_SIZE.
    """
    dataset = tf.data.Dataset.from_tensor_slices((features_matrix, target))

    if training:
        dataset = dataset.shuffle(buffer_size = 1000)

    # Dropping the remainder is only acceptable for training batches; doing it
    # for validation would silently exclude samples from the metrics.
    dataset = dataset.batch(BATCH_SIZE, drop_remainder = training).prefetch(1)

    return dataset


train_dataset = build_dataset(x_train, y_train)

val_dataset = build_dataset(x_val, y_val, training = False)

1. We use `tf.data.Dataset` to build our datasets, so we can apply functions on them much faster
2. The **`batch_size`** and **`buffer_size`** are chosen arbitrarily since we are working in a Kaggle notebook



___


- Regarding the model, *Dropout layers* are used to simplify the model and thus reduce overfitting.
- 4 hidden layers were used, but you can obtain more or less the same accuracy with only 3.
- The Dropout rate wasn't really tweaked here.
- I didn't try the Adam optimizer; you may achieve better performance with it.
- `ReduceLROnPlateau` is usually useful to gain some performance, but to obtain even better results, creating your own learning-rate schedule is the best solution.

In [None]:
def build_model(train_dataset, val_dataset, input_size):
    """Build, compile and train the dense binary classifier.

    The network stacks four ReLU hidden layers (128, 64, 32, 16 units), with
    dropout after the first three, and ends in a single sigmoid unit. It is
    trained for EPOCHS epochs with Nesterov SGD, and the learning rate is
    reduced by ReduceLROnPlateau when the validation loss stops improving.

    Parameters
    ----------
    train_dataset : tf.data.Dataset
        Already-batched training data.
    val_dataset : tf.data.Dataset
        Already-batched data evaluated at the end of every epoch.
    input_size : int
        Number of features per sample.

    Returns
    -------
    (history, model) : the object returned by model.fit and the trained model.
    """
    model = tf.keras.Sequential([
        keras.layers.InputLayer(input_shape = (input_size,)),
        keras.layers.Dense(units = 128, activation = 'relu'),
        keras.layers.Dropout(0.4),
        keras.layers.Dense(units = 64, activation = 'relu'),
        keras.layers.Dropout(0.4),
        keras.layers.Dense(units = 32, activation = 'relu'),
        keras.layers.Dropout(0.4),
        keras.layers.Dense(units = 16, activation = 'relu'),
        keras.layers.Dense(units = 1, activation = 'sigmoid')
    ])

    model.compile(optimizer = keras.optimizers.SGD(learning_rate = 4e-3, momentum = 0.9, nesterov = True),
                  loss = 'binary_crossentropy',
                  metrics = ['accuracy'])

    # Divide the learning rate by 10 after 3 epochs without val_loss improvement.
    reduce_lr = keras.callbacks.ReduceLROnPlateau(monitor = 'val_loss', patience = 3, factor = 0.1, verbose = 1)

    # batch_size is deliberately not passed: the datasets already yield
    # batches, and Keras documents that it must not be specified for them.
    history = model.fit(train_dataset,
                        epochs = EPOCHS,
                        validation_data = val_dataset,
                        callbacks = [reduce_lr])

    return history, model



history, mymodel = build_model(train_dataset = train_dataset,
                              val_dataset = val_dataset,
                              input_size = NUMBER_FEATURE)

In [None]:
def plot_metrics(history,epochs):
    """Show loss and accuracy curves for training and validation in a 2x2 grid.

    `history` is the object returned by model.fit; `epochs` is the number of
    points on the x axis and must match the length of each recorded metric.
    """
    # (panel title, key in history.history), in display order.
    panels = [
        ('Training loss', 'loss'),
        ('Validation loss', 'val_loss'),
        ('Training Accuracy', 'accuracy'),
        ('Validation Accuracy', 'val_accuracy'),
    ]
    palette = Inferno[4]

    figures = []
    for position, (panel_title, metric_key) in enumerate(panels):
        panel = figure(width = 600, height = 400, title = panel_title)

        # Training panels use palette[1], validation panels palette[2].
        line_color = palette[position % 2 + 1]
        panel.line(np.arange(epochs), history.history[metric_key], line_width = 4, color = line_color)

        figures.append(panel)

    # Top row: losses; bottom row: accuracies.
    top_row, bottom_row = figures[:2], figures[2:]
    show(grid([top_row, bottom_row]))

plot_metrics(history,EPOCHS)


- By increasing the batch_size, validation loss is much less volatile

In [None]:
# The learning-rate callback stores its value in the training history under
# 'lr' in tf.keras 2.x and under 'learning_rate' in Keras 3; accept either.
# A KeyError is still raised if neither key is present.
lr_key = 'lr' if 'lr' in history.history else 'learning_rate'
lr_history = history.history[lr_key]

p = figure( width = 600, height = 400, title = 'Learning Rate Evolution when using ReduceLROnPlateau')

# The x axis is derived from the recorded values so both columns always have
# the same length.
p.line(np.arange(len(lr_history)), lr_history, line_width = 4, color = Inferno[4][2])

show(p)

## Correlation and feature importance

---


Checking features impact is always good, and might help to remove some useless features

In [None]:
# Attach the target as an extra column so that its correlation with every
# feature is part of the matrix. NOTE: this mutates normalised_df in place;
# the column is removed again in a later cell.
normalised_df['target'] = train_target


# Pairwise correlation between all columns (features and target).
corr_df = normalised_df.corr()


In [None]:

# Name both axes so that stacking produces the columns 'Features1' / 'Features2'.
corr_df.index.name = 'Features1'
corr_df.columns.name = 'Features2'

# Long format: one row per (Features1, Features2) pair with its correlation.
stacked_corr = corr_df.stack()
corr_matrix = pd.DataFrame(stacked_corr, columns=['correlation']).reset_index()

# Colour scale spanning the observed correlation range.
corr_low = corr_matrix['correlation'].min()
corr_high = corr_matrix['correlation'].max()
mapper = LinearColorMapper(palette=Viridis256, low=corr_low, high=corr_high)

TOOLS = "hover,save,pan,box_zoom,reset,wheel_zoom"

# Both axes list the same names in the same order.
axis_factors = corr_matrix['Features1'].drop_duplicates().to_list()

p = figure(
    title="Correlation Matrix",
    x_range=list(axis_factors),
    y_range=list(axis_factors),
    x_axis_location="below",
    width=1200,
    height=1200,
    tools=TOOLS,
    toolbar_location='left',
)

# Strip grid lines, axis lines and ticks; keep small labels close to the cells.
p.grid.grid_line_color = None
p.axis.axis_line_color = None
p.axis.major_tick_line_color = None
p.axis.major_label_text_font_size = "7px"
p.axis.major_label_standoff = 0
p.xaxis.major_label_orientation = np.pi / 3

# One unit square per pair, coloured by its correlation.
cell_fill = {'field': 'correlation', 'transform': mapper}
p.rect(x='Features1', y="Features2", width=1, height=1,
       source=corr_matrix,
       fill_color=cell_fill,
       line_color=None)

color_bar = ColorBar(
    color_mapper=mapper,
    major_label_text_font_size="7px",
    ticker=BasicTicker(desired_num_ticks=256),
    border_line_color=None,
)
p.add_layout(color_bar, 'right')

show(p)


In [None]:
# Remove the target column that was attached for the correlation matrix.
# errors='ignore' keeps the cell re-runnable: without it a second execution
# raises KeyError because the column is already gone.
normalised_df.drop(columns = ['target'], inplace = True, errors = 'ignore')

Apparently features are not correlated. We just need to find the most useful features, to simplify our model (not done yet)

In [None]:
# Scale the test features with the mean and standard deviation of the
# training set, so their distributions match the ones seen during training.
normalised_test_df = test_df.sub(means_df, axis = 'columns').div(std_df, axis = 'columns')

# Row identifiers, kept for the submission file.
ids = normalised_test_df.index

In [None]:
# Run the trained model on the scaled test features, 128 rows at a time.
test_matrix = np.asarray(normalised_test_df)
predictions = mymodel.predict(test_matrix, batch_size = 128, verbose = 1)



In [None]:
# Round the model outputs to the nearest integer and flatten to one value per row.
# NOTE(review): if the competition is scored on a ranking metric, submitting the
# raw model outputs instead of rounded values may score better — confirm.
rounded_target = predictions.round().reshape(-1,)

submission_columns = {'id' : ids, 'target' : rounded_target}
submission_df = pd.DataFrame(data = submission_columns).set_index('id')

# Written to the current working directory.
submission_df.to_csv('submission.csv')