Trusted
Jupyter Server: local
Python 3: Idle





row_num: a number uniquely identifying each row.

• locale: the platform of the session.

• day_of_week: Mon-Fri, the day of the week of the session.

• hour_of_day: 00-23, the hour of the day of the session.

• agent_id: the device used for the session.

• entry_page: describes the landing page of the session.

• path_id_set: shows all the locations that were visited during the session.

• traffic_type: indicates the channel the user came through, e.g. search engine, email, ...

• session_duration: the duration in seconds of the session.

• hits: the number of interactions with the trivago page during the session.

Task: Note that the column “hits” has missing values. Use this data to build a model that predicts the number of hits per session, depending on the given parameters.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import numpy.random as nr
import math
from sklearn import preprocessing
import sklearn.model_selection as ms
from sklearn import linear_model
import sklearn.metrics as sklm
import numpy.random as nr
import scipy.stats as ss
import math

In [None]:
def hist_plot(df, cols):
    """
    Draws one histogram per requested column and shows each figure

    Args:
        df: data frame holding the data
        cols: iterable with the names of the columns to plot
    """
    for name in cols:
        values = df[name]
        sns.distplot(values)
        plt.title('Histogram of ' + name)
        plt.xlabel('Value')
        plt.ylabel('Density')
        # one figure per column
        plt.show()


def plot_box(data, col, col_y):
    """
    Draws a box plot of a quantitative column grouped by a categorical one

    data is a pandas dataframe
    col is the column with the categorical values (x axis)
    col_y is the column with the quantitative values (y axis)

    NOTE: a later definition of plot_box in this file replaces this one.
    """
    sns.set_style("whitegrid")
    # x and y are passed by keyword: recent seaborn releases accept only
    # `data` positionally and raise a TypeError on the positional form.
    sns.boxplot(x=col, y=col_y, data=data)
    plt.xlabel(col)
    plt.ylabel(col_y)

def frequency_table(data, cols):
    """
    Prints, for every column in cols, its value counts followed by
    the number of distinct values it holds
    """
    for name in cols:
        column = data[name]
        print('\nFor column ' + name)
        print(column.value_counts())
        n_unique = len(column.unique())
        print('There are %s unique values' % n_unique)

def plot_bars(df, cols):
    """
    Bars for categorical variables, one figure per column
    df: data frame
    cols: columns to plot

    Returns the figure and axes of the last column plotted, or
    (None, None) when cols is empty.
    """
    # Defined up front so that an empty `cols` does not hit an unbound
    # local at the return statement.
    fig, ax = None, None
    for col in cols:
        fig = plt.figure(figsize=(6, 6))
        ax = fig.gca()
        counts = df[col].value_counts()
        counts.plot.bar(ax=ax, color='blue')
        # separator added: the title used to read e.g. "Countslocale"
        ax.set_title('Counts of ' + col)
        ax.set_xlabel(col)
        ax.set_ylabel('Counts')
        plt.show()

    return fig, ax

def plot_scatter(df, cols, col_y , alpha = 1.0):
    """
    Scatter plot of col_y against each column in cols, one figure per column
    df: data frame
    cols: columns for the x axis
    col_y: column for the y axis
    alpha: transparency of the points
    """
    for feature in cols:
        figure = plt.figure(figsize=(7, 6))
        axes = figure.gca()
        df.plot.scatter(x=feature, y=col_y, ax=axes, alpha=alpha)
        title = 'Scatter plot of ' + col_y + ' vs. ' + feature
        axes.set_title(title)
        axes.set_xlabel(feature)
        axes.set_ylabel(col_y)
        plt.show()

def path_splitter(data):
    """
    Return the number of path_id in one cell

    data: the cell content; a string of ';'-separated ids, a missing
          value (NaN or None), or a single non-string value
    Returns the number of ';'-separated items for a string, 0 for a
    missing value and 1 for anything else.
    """
    if isinstance(data, str):
        return len(data.split(';'))

    # None is treated as missing too; math.isnan(None) would raise TypeError
    if data is None or math.isnan(data):
        return 0

    # a single non-string value counts as one id
    return 1


def plot_box(df, col, col_y = 'hits'):
    """
    Draws a box plot of col_y grouped by the categories of col

    df: pandas data frame
    col: column with the categorical values (x axis)
    col_y: column with the quantitative values (y axis)

    Returns the figure and the axes so that the caller can adjust them.
    """
    # The style must be set before the axes are created to affect them.
    sns.set_style("whitegrid")
    fig, ax = plt.subplots()
    # x and y are passed by keyword: recent seaborn releases accept only
    # `data` positionally and raise a TypeError on the positional form.
    sns.boxplot(x=col, y=col_y, data=df, ax=ax)
    plt.xlabel(col)
    plt.ylabel(col_y)
    plt.show()

    return fig, ax

def group_cat(df, col, threshold ):
    """
    Finds the rare categories of a column, i.e. the ones to be put
    into a group called "other"
    Args:
    df:data_frame
    col:column to inspect
    threshold: categories whose share of the rows is below this value are selected
    Returns:
    the array of selected categories and the table with the
    frequency and share ('perc') of every category
    """
    table = df[col].value_counts().to_frame().reset_index()
    table.columns = [col, 'frequency']
    table['perc'] = table['frequency'] / table['frequency'].sum()
    is_rare = table['perc'] < threshold
    rare_values = table.loc[is_rare, col].values

    return rare_values, table

def cat_aggregation(x,group):
    """
    Maps a category to 'other' when it belongs to group, otherwise
    returns the category converted to a string
    """
    return 'other' if x in group else str(x)

def encode_string(cat_feature):
    """
    One-hot encodes a categorical feature and returns the dummy
    variables as a dense array.
    """
    ## Step 1: map the categories to integer labels
    label_encoder = preprocessing.LabelEncoder()
    labels = label_encoder.fit(cat_feature).transform(cat_feature)
    ## Step 2: one hot encode the labels, given as a single column
    label_column = labels.reshape(-1, 1)
    one_hot_encoder = preprocessing.OneHotEncoder()
    one_hot = one_hot_encoder.fit(label_column).transform(label_column)
    return one_hot.toarray()

def print_metrics(y_true, y_predicted):
    """
    Prints the usual regression metrics (MSE, RMSE, MAE, median
    absolute error and R^2) of the predictions against the true values.
    """
    ## Compute R^2 and the mean square error once, up front
    r2 = sklm.r2_score(y_true, y_predicted)
    mse = sklm.mean_squared_error(y_true, y_predicted)

    ## Print the usual metrics and the R^2 value
    print('Mean Square Error      = ' + str(mse))
    print('Root Mean Square Error = ' + str(math.sqrt(mse)))
    mean_abs = sklm.mean_absolute_error(y_true, y_predicted)
    print('Mean Absolute Error    = ' + str(mean_abs))
    median_abs = sklm.median_absolute_error(y_true, y_predicted)
    print('Median Absolute Error  = ' + str(median_abs))
    print('R^2                    = ' + str(r2))

def hist_resids(y_test, y_score):
    """
    Plots the histogram of the residuals (y_test - y_score)

    y_test: true values (array, list or pandas Series)
    y_score: predicted values, same length as y_test
    """
    ## first compute vector of residuals.
    ## np.asarray lets pandas Series and lists through, which have no .reshape
    resids = np.asarray(y_test).ravel() - np.asarray(y_score).ravel()
    ## now make the residual plots
    sns.distplot(resids)
    plt.title('Histogram of residuals')
    plt.xlabel('Residual value')
    ## distplot draws a normalised histogram with a KDE, i.e. a density
    plt.ylabel('Density')

def resid_qq(y_test, y_score):
    """
    Draws the probability (Q-Q) plot of the residuals (y_test - y_score)

    y_test: true values (array, list or pandas Series)
    y_score: predicted values, same length as y_test
    """
    ## first compute vector of residuals.
    ## np.asarray lets pandas Series and lists through, which have no .reshape
    resids = np.asarray(y_test).ravel() - np.asarray(y_score).ravel()
    ## now make the residual plots
    ss.probplot(resids, plot = plt)
    ## labels describe the Q-Q plot (they used to be the ones of the
    ## residuals vs. predicted values plot)
    plt.title('Q-Q plot of residuals')
    plt.xlabel('Theoretical quantiles')
    plt.ylabel('Ordered residuals')

def resid_plot(y_test, y_score):
    """
    Scatter plot of the residuals (y_test - y_score) against the predictions

    y_test: true values (array, list or pandas Series)
    y_score: predicted values, same length as y_test
    """
    ## first compute vector of residuals.
    ## np.asarray lets pandas Series and lists through, which have no .reshape
    predicted = np.asarray(y_score).ravel()
    resids = np.asarray(y_test).ravel() - predicted
    ## now make the residual plots
    ## x and y by keyword: recent seaborn releases reject them positionally
    sns.regplot(x=predicted, y=resids, fit_reg=False)
    plt.title('Residuals vs. predicted values')
    plt.xlabel('Predicted values')
    plt.ylabel('Residual')

# Data cleaning and basic statistics

In [None]:
# Load the raw sessions file and give its columns the names used in this notebook
column_names = [
    "row_num", "locale", "day_of_week", "hour_of_day", "agent_id",
    "entry_page", "path_id", "traffic_type", "session_duration", "hits",
]
df_original = pd.read_csv('MLDataScientistCaseStudyData2020.csv')
df_original.columns = column_names
df_original.head()

Checking if there are wrong cells

In [None]:
# Per column: is there any cell holding the placeholder '?'
# (builtin `object` is used because the `np.object` alias was removed from NumPy)
(df_original.astype(object) == '?').any()

After a quick search on trivago and a look at the page source code, "locale" seems to be related to the country version of the website, e.g. [UK, DE, IT], etc.

In [None]:
df_original = pd.read_csv('MLDataScientistCaseStudyData2020.csv')
df_original.columns = ["row_num","locale","day_of_week","hour_of_day","agent_id","entry_page","path_id","traffic_type","session_duration","hits"]

# Rows whose label is the missing marker '\N' are the ones to predict (df_goal);
# the rows with a known number of hits go to df_test.
# Explicit copies are taken so that the column assignments below act on
# independent frames instead of on slices of df_original.
hits_missing = df_original['hits'] == '\\N'
df_goal = df_original.loc[hits_missing].copy()
df_test = df_original.loc[~hits_missing].copy()

# Now I can cast the hits into int64
df_test['hits'] = df_test['hits'].astype('int64')

# Deleting a couple of missing session duration
df_test = df_test.loc[df_test['session_duration'] != '\\N'].copy()
df_test['session_duration'] = df_test['session_duration'].astype('int64')

# Working on 80% of the data

In [None]:
# Fraction of the labelled rows kept for the analysis below
frac = 0.8
# use the variable defined above instead of repeating the literal
df_reduced = df_test.sample(frac = frac)
df_test.dtypes


In [None]:
# Split the sample by whether path_id is present.
# Copies are taken because new columns are assigned to df_complete later on;
# assigning to a slice of df_reduced triggers SettingWithCopyWarning.
has_path = df_reduced['path_id'].notna()
df_complete = df_reduced[has_path].copy()
df_incomplete = df_reduced[~has_path].copy()

In [None]:
# Define which are the categorical variables
# Columns with a string dtype are taken as the categorical variables
is_string = pd.api.types.is_string_dtype
cat_cols = [name for name in df_complete.columns if is_string(df_complete[name])]
print(cat_cols)

# Columns with a numeric dtype are taken as the numerical variables
is_numeric = pd.api.types.is_numeric_dtype
numeric_cols = [name for name in df_complete.columns if is_numeric(df_complete[name])]
print(numeric_cols)

In [None]:
# Value counts and number of distinct values for every column
frequency_table(data=df_complete, cols=df_complete.columns)


Special care is required with path_id, as its values are a mix of strings and floats. Moreover, some rows have no value at all: these empty entries are the floats (NaN), and I will assume that such rows contain no path ids.

In [None]:
# Number of path ids of every session
df_complete['n_ids'] = df_complete['path_id'].apply(path_splitter)
df_complete.dtypes

In [None]:
# Value counts of the number of path ids per session
frequency_table(data=df_complete, cols=["n_ids"])

In [None]:
# Summary statistics of the number of path ids per session
n_ids_column = df_complete['n_ids']
n_ids_column.describe()

Need to explore what happens with the rows with a missing path_id...
First, let's find out how many there are: it is just 2 percent. Therefore, maybe I can find a relation with the other columns such that I could fill the missing values.

In [None]:
# Share of the labelled rows without a path_id
n_missing = df_test['path_id'].isna().sum()
n_rows = len(df_test.index)
print(n_missing / n_rows)

# Share of the rows to predict without a path_id
n_missing_goal = df_goal['path_id'].isna().sum()
n_rows_goal = len(df_goal.index)
print(n_missing_goal / n_rows_goal)


Building a subset of the data that is complete, to study whether there is a smart way of filling the gaps.
One option is to use the nearest neighbours, or to forward- or backward-fill the missing values.
Missing values can be imputed by various methods: the mean or the median can be used,
some sort of interpolation such as a trend value is possible,
and there are much more sophisticated methods out there as well.
For machine learning, the relationship of greatest interest is between the features and the label. It can also be useful to examine the relationships between features to determine if the features are co-variate or not. Such a procedure can prove more reliable than simply computing correlation when the relationship is nonlinear.

# Data exploration

# Categorical variables


In [None]:
# Number of sessions per locale
sns.countplot(data=df_complete, x="locale")

In [None]:
# Distribution of the label (default col_y) per locale
fig, ax = plot_box(df_complete, col='locale')

In [None]:
# Zoom the y axis of the previous box plot and display the figure again
ax.set_ylim((0, 100))
fig

# Traffic type

In [None]:
# Number of sessions per traffic type
sns.countplot(data=df_complete, x="traffic_type")


In [None]:
# Distribution of the label (default col_y) per traffic type
fig, ax = plot_box(df_complete, col='traffic_type')

In [None]:
# Zoom the y axis of the previous box plot and display the figure again
ax.set_ylim((0, 100))
fig

Aggregating the categories of categorical variables to reduce their number. Categorical features or labels with too many unique categories will limit the predictive power of a machine learning model. Aggregating categories can improve this situation, sometimes greatly. However, one must be careful. It only makes sense to aggregate categories that are similar in the domain of the problem. Thus, domain expertise must be applied.

In [None]:
# Traffic types with a share below 0.1 are merged into 'other'
group, frequency = group_cat(df_complete, 'traffic_type', threshold=0.1)
df_complete['traffic_type_aggr'] = df_complete['traffic_type'].apply(cat_aggregation, args=(group,))
sns.countplot(data=df_complete, x="traffic_type_aggr")

In [None]:
# Distribution of the label (default col_y) per aggregated traffic type
fig, ax = plot_box(df_complete, col='traffic_type_aggr')

In [None]:
# Zoom the y axis of the previous box plot and display the figure again
ax.set_ylim((0, 100))
fig

# Agent Id

In [None]:
# Number of sessions per agent id
sns.countplot(data=df_complete, x="agent_id")

In [None]:
# Distribution of the label (default col_y) per agent id
fig, ax = plot_box(df_complete, col='agent_id')

In [None]:
# Zoom the y axis of the previous box plot and display the figure again
ax.set_ylim((0, 100))
fig

In [None]:
# Agent ids with a share below 0.03 are merged into 'other'
group, frequency = group_cat(df_complete, 'agent_id', threshold=0.03)
df_complete['agent_aggr'] = df_complete['agent_id'].apply(cat_aggregation, args=(group,))
sns.countplot(data=df_complete, x="agent_aggr")

In [None]:
# Distribution of the label (default col_y) per aggregated agent id
fig, ax = plot_box(df_complete, col='agent_aggr')

In [None]:
# Zoom the y axis of the previous box plot and display the figure again
ax.set_ylim((0, 100))
fig

# Entry Page

In [None]:
# Number of sessions per entry page
sns.countplot(data=df_complete, x="entry_page")

In [None]:
# Entry pages with a share below 0.01 are merged into 'other'
group, frequency = group_cat(df_complete, 'entry_page', 0.01)
df_complete['entry_aggr'] = df_complete['entry_page'].apply(lambda x:cat_aggregation(x,group))
# (a stray character after this call used to make the cell a SyntaxError)
sns.countplot(x="entry_aggr", data=df_complete)

In [None]:
# Distribution of the label (default col_y) per aggregated entry page
fig, ax = plot_box(df_complete, col='entry_aggr')

In [None]:
# Zoom the y axis of the previous box plot and display the figure again
ax.set_ylim((0, 100))
fig

This one might have an influence, as the medians differ between the categories and some of the distributions are skewed.

# Transforming numeric variables
To improve performance of machine learning models transformations of the values are often applied. Typically, transformations are used to make the relationships between variables more linear. In other cases, transformations are performed to make distributions closer to Normal, or at least more symmetric. These transformations can include taking logarithms, exponential transformations and power transformations. 

## Session duration


In [None]:
# Distribution of the raw session duration
hist_plot(df_complete, cols=['session_duration'])

In [None]:
# How often every session duration occurs
duration_column = df_complete['session_duration']
duration_column.value_counts()

As there are some values with 0 seconds, actually most of them, I will apply log(x+1)

In [None]:
# log(x + 1) of the session duration, so that durations of 0 seconds are defined.
# Vectorised np.log1p replaces DataFrame.applymap, which is deprecated in recent pandas.
df_complete['log_duration'] = np.log1p(df_complete['session_duration'])
hist_plot(df_complete, ['log_duration'])

## n_ids

In [None]:
# Distribution of the number of path ids per session
hist_plot(df_complete, cols=['n_ids'])

In [None]:
# Natural logarithm of the number of path ids.
# Vectorised np.log replaces DataFrame.applymap(math.log); applymap is deprecated
# in recent pandas. NOTE: np.log yields -inf (with a warning) for a value of 0,
# where math.log raised a ValueError.
df_complete['log_n_ids'] = np.log(df_complete['n_ids'])
hist_plot(df_complete, ['log_n_ids'])

## Time

In [None]:
# Distribution of the sessions over the hours of the day
hist_plot(df_complete, cols=['hour_of_day'])

Turning into a cyclical variable

In [None]:
# Map the hour onto a circle with a period of 24 and keep both coordinates
hour_angle = 2*np.pi*df_complete['hour_of_day']/24
df_complete['sin_hour'] = np.sin(hour_angle)
df_complete['cos_hour'] = np.cos(hour_angle)
hist_plot(df_complete, cols=['sin_hour'])

## Day of the week


In [None]:
# Number the days of the week from Monday (0) to Sunday (6)
day_names = ('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday')
dict_day = {day: number for number, day in enumerate(day_names)}
# a day name that is not in dict_day raises a KeyError
df_complete['day_of_week_num'] = [dict_day[day] for day in df_complete['day_of_week']]

In [None]:
# Map the day number onto a circle with a period of 7 and keep both coordinates
day_angle = 2*np.pi*df_complete['day_of_week_num']/7
df_complete['sin_day'] = np.sin(day_angle)
df_complete['cos_day'] = np.cos(day_angle)
df_complete.plot(kind='scatter', x='day_of_week_num', y='cos_day')

## Hits(Label)

In [None]:
# Distribution of the label
hist_plot(df_complete, cols=['hits'])


In [None]:
# Natural logarithm of the label.
# Vectorised np.log replaces DataFrame.applymap(math.log); applymap is deprecated
# in recent pandas. NOTE: np.log yields -inf (with a warning) for a value of 0,
# where math.log raised a ValueError.
df_complete['log_hits'] = np.log(df_complete['hits'])
hist_plot(df_complete, ['log_hits'])