In [19]:
# import libraries

import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow import keras

In [20]:
# import dataset
#
# The CSV location can be overridden with the ONLINE_SHOP_TRAIN_CSV
# environment variable so the notebook is not tied to one machine's
# filesystem; when the variable is unset the original path is used.

DF_PATH = os.environ.get(
    'ONLINE_SHOP_TRAIN_CSV',
    '/home/rkfd/code/mlp-configurations/online_shop_train.csv',
)
df = pd.read_csv(DF_PATH)

In [21]:
# display the dimensions of the dataset as a (rows, columns) tuple
df.shape

(16626, 13)

In [22]:
# preview the first 10 rows of the dataset
df.head(10)

Unnamed: 0,Administrative,Informational,ProductRelated,ExitRates,PageValues,SpecialDay,Month,Region,TrafficType,VisitorType0,VisitorType1,Weekend,Revenue
0,0,0,12,0.069444,0.0,0.8,5,2,2,0.0,1.0,0,0
1,0,0,44,0.032143,0.0,0.0,12,1,1,0.0,1.0,0,0
2,2,0,7,0.011111,0.0,0.0,3,8,2,0.0,0.0,0,0
3,3,0,37,0.004274,47.931282,0.0,5,3,2,0.0,0.0,1,1
4,0,0,1,0.2,0.0,0.0,5,3,3,0.0,1.0,1,0
5,3,0,9,0.045455,0.0,0.0,9,6,13,0.0,1.0,0,0
6,8,1,19,0.021816,21.318,0.0,5,4,2,0.0,1.0,1,1
7,12,3,336,0.013316,1.17899,0.0,11,1,11,0.0,1.0,0,1
8,1,0,6,0.044444,0.0,0.0,11,3,3,0.0,1.0,0,0
9,0,0,39,0.028205,0.0,0.4,5,9,3,0.0,1.0,0,0


In [23]:
# display summary statistics (count, mean, std, min, quartiles, max) for each numeric column
df.describe()

Unnamed: 0,Administrative,Informational,ProductRelated,ExitRates,PageValues,SpecialDay,Month,Region,TrafficType,VisitorType0,VisitorType1,Weekend,Revenue
count,16626.0,16626.0,16626.0,16626.0,16626.0,16626.0,16626.0,16626.0,16626.0,16626.0,16626.0,16626.0,16626.0
mean,2.297907,0.457296,37.82395,0.033312,14.074354,0.038374,8.129737,2.838807,3.556358,0.004451,0.82359,0.238181,0.5
std,3.168297,1.160715,50.922376,0.040125,27.136885,0.159034,3.354549,2.277844,3.514761,0.066568,0.38118,0.425983,0.500015
min,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,1.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,10.0,0.012372,0.0,0.0,5.0,1.0,2.0,0.0,1.0,0.0,0.0
50%,1.0,0.0,21.0,0.020534,0.0,0.0,10.0,2.0,2.0,0.0,1.0,0.0,0.5
75%,4.0,0.0,44.0,0.03515,17.857937,0.0,11.0,4.0,4.0,0.0,1.0,0.0,1.0
max,27.0,24.0,705.0,0.2,361.763742,1.0,12.0,9.0,20.0,1.0,1.0,1.0,1.0


In [24]:
# display the index range, the non-null count and dtype of every column, and the memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16626 entries, 0 to 16625
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Administrative  16626 non-null  int64  
 1   Informational   16626 non-null  int64  
 2   ProductRelated  16626 non-null  int64  
 3   ExitRates       16626 non-null  float64
 4   PageValues      16626 non-null  float64
 5   SpecialDay      16626 non-null  float64
 6   Month           16626 non-null  int64  
 7   Region          16626 non-null  int64  
 8   TrafficType     16626 non-null  int64  
 9   VisitorType0    16626 non-null  float64
 10  VisitorType1    16626 non-null  float64
 11  Weekend         16626 non-null  int64  
 12  Revenue         16626 non-null  int64  
dtypes: float64(5), int64(8)
memory usage: 1.6 MB


In [25]:
# cast the two VisitorType indicator columns to integer dtype

for visitor_col in ('VisitorType0', 'VisitorType1'):
    df[visitor_col] = df[visitor_col].astype(int)

In [26]:
# partition the dataset: 20% of the rows are held out as the test set, then
# 20% of the remaining rows become the validation set; the rest is for training

split_options = {'test_size': 0.2, 'random_state': 0}
df_train_valid, df_test = train_test_split(df, **split_options)
df_train, df_valid = train_test_split(df_train_valid, **split_options)

In [27]:
# dimensions of the test split as (rows, columns)
df_test.shape

(3326, 13)

In [28]:
# dimensions of the training split as (rows, columns)
df_train.shape

(10640, 13)

In [29]:
# dimensions of the validation split as (rows, columns)
df_valid.shape

(2660, 13)

In [30]:
# separate the training features (every column but the last) from the
# training target (the last column)

x_train, y_train = df_train.iloc[:, :-1], df_train.iloc[:, -1]

In [31]:
# separate the validation features (every column but the last) from the
# validation target (the last column)

x_valid, y_valid = df_valid.iloc[:, :-1], df_valid.iloc[:, -1]

In [32]:
# separate the test features (every column but the last) from the
# test target (the last column)

x_test, y_test = df_test.iloc[:, :-1], df_test.iloc[:, -1]

In [33]:
# seed the NumPy and TensorFlow global random generators with the same value

seed_value = 42
np.random.seed(seed_value)
tf.random.set_seed(seed_value)

In [34]:
# one-hot encode the training target
#
# The labels are read from the last column of df_train rather than from
# y_train itself, so re-running this cell cannot one-hot encode an array that
# is already encoded. num_classes is fixed at 2 (the target is 0/1) so the
# encoded width does not depend on which labels happen to be in this split.
y_train = keras.utils.to_categorical(df_train.iloc[:, -1], num_classes=2, dtype='float32')
y_train.shape

(10640, 2)

In [35]:
# one-hot encode the validation target
#
# The labels are read from the last column of df_valid rather than from
# y_valid itself, so re-running this cell cannot one-hot encode an array that
# is already encoded. num_classes is fixed at 2 (the target is 0/1) so the
# encoded width does not depend on which labels happen to be in this split.
y_valid = keras.utils.to_categorical(df_valid.iloc[:, -1], num_classes=2, dtype='float32')
y_valid.shape

(2660, 2)

In [36]:
# one-hot encode the test target
#
# The labels are read from the last column of df_test rather than from
# y_test itself, so re-running this cell cannot one-hot encode an array that
# is already encoded. num_classes is fixed at 2 (the target is 0/1) so the
# encoded width does not depend on which labels happen to be in this split.
y_test = keras.utils.to_categorical(df_test.iloc[:, -1], num_classes=2, dtype='float32')
y_test.shape

(3326, 2)