## Objective : In this Notebook we will perform data transformation as per requirement

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import sidetable

import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import MinMaxScaler , StandardScaler 
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

%load_ext autoreload
%autoreload 2

  import pandas.util.testing as tm


### Import Function File

In [8]:
import Functions_File as my_ff

Functions_File is successfully imported.


#  Transform  Data

In [49]:
train_data=pd.read_csv(r'C:\Users\psahu\Documents\My_projects\online_shoppers\online_shoppers_intention_modified.csv' )

In [50]:
train_data.shape

(12330, 18)

In [51]:
train_data.columns

Index(['Administrative', 'Administrative_Duration', 'Informational',
       'Informational_Duration', 'ProductRelated', 'ProductRelated_Duration',
       'BounceRates', 'ExitRates', 'PageValues', 'SpecialDay', 'Month',
       'OperatingSystems', 'Browser', 'Region', 'TrafficType', 'VisitorType',
       'Weekend', 'Revenue'],
      dtype='object')

Note : We will  treat below columns as categorical features:
'SpecialDay', 'OperatingSystems', 'Browser', 'Region', 'TrafficType'

In [52]:
train_data[['SpecialDay', 'OperatingSystems', 'Browser', 'Region', 'TrafficType','Revenue']] = train_data[['SpecialDay', 'OperatingSystems', 'Browser', 'Region', 'TrafficType','Revenue']].astype('str')

In [53]:
#Drop Co-related Features
train_data.drop(['Administrative',  'Informational',  'ProductRelated',  'BounceRates'] ,axis =1 ,inplace = True)

In [54]:
train_data.columns

Index(['Administrative_Duration', 'Informational_Duration',
       'ProductRelated_Duration', 'ExitRates', 'PageValues', 'SpecialDay',
       'Month', 'OperatingSystems', 'Browser', 'Region', 'TrafficType',
       'VisitorType', 'Weekend', 'Revenue'],
      dtype='object')

### Save Target Feature

In [55]:
Target_var = train_data['Revenue']
train_data.drop('Revenue' ,axis=1 ,inplace = True)

### Feature Engineering 

In [56]:
#Check columns types (change below features into  categorical)

In [57]:
train_data[['SpecialDay', 'OperatingSystems', 'Browser', 'Region', 'TrafficType']] = train_data[['SpecialDay', 'OperatingSystems', 'Browser', 'Region', 'TrafficType']].astype('str')

In [58]:
numerical_feats ,categorical_feats = my_ff.column_type(train_data)

print('-------------------------------------------')
print('Numeric Features:'  , list(numerical_feats))
print('-------------------------------------------')
print('Categorical Features:'  , list(categorical_feats))

Number of Numerical features:  5
Number of Categorical features:  8
-------------------------------------------
Numeric Features: ['Administrative_Duration', 'Informational_Duration', 'ProductRelated_Duration', 'ExitRates', 'PageValues']
-------------------------------------------
Categorical Features: ['SpecialDay', 'Month', 'OperatingSystems', 'Browser', 'Region', 'TrafficType', 'VisitorType', 'Weekend']


### Dummy Encoding of Categorical Variable

In [59]:
Encoded_vars_train = pd.get_dummies(train_data[categorical_feats] , drop_first =True)

In [60]:
train_data = pd.concat([train_data ,Encoded_vars_train ],axis =1)

In [61]:
train_data.drop(categorical_feats , axis =1 , inplace = True)
train_data

Unnamed: 0,Administrative_Duration,Informational_Duration,ProductRelated_Duration,ExitRates,PageValues,SpecialDay_0.2,SpecialDay_0.4,SpecialDay_0.6,SpecialDay_0.8,SpecialDay_1.0,...,TrafficType_3,TrafficType_4,TrafficType_5,TrafficType_6,TrafficType_7,TrafficType_8,TrafficType_9,VisitorType_Other,VisitorType_Returning_Visitor,Weekend_YES
0,0.0,0.0,0.000000,0.200000,0.000000,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0.0,0.0,64.000000,0.100000,0.000000,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,0.0,0.0,0.000000,0.200000,0.000000,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
3,0.0,0.0,2.666667,0.140000,0.000000,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
4,0.0,0.0,627.500000,0.050000,0.000000,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12325,145.0,0.0,1783.791667,0.029031,12.241717,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
12326,0.0,0.0,465.750000,0.021333,0.000000,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,1
12327,0.0,0.0,184.250000,0.086667,0.000000,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
12328,75.0,0.0,346.000000,0.021053,0.000000,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


### Split and save Data

In [62]:
y = Target_var
X = train_data

In [63]:
#Check : stratify = y for good class distribution
import sklearn.model_selection as model_selection
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.2, random_state=1 , stratify = y)
X_train, X_val, y_train, y_val = model_selection.train_test_split(
    X_train, y_train, test_size=0.25, random_state=1 )
 

In [64]:
print(y_train.value_counts(normalize=True)) 
print(y_val.value_counts(normalize=True)) 
print(y_test.value_counts(normalize=True))

0    0.847662
1    0.152338
Name: Revenue, dtype: float64
0    0.8382
1    0.1618
Name: Revenue, dtype: float64
0    0.845093
1    0.154907
Name: Revenue, dtype: float64


In [65]:
X_train.shape , X_test.shape , X_val.shape , y_train.shape , y_test.shape , y_val.shape 

((7398, 68), (2466, 68), (2466, 68), (7398,), (2466,), (2466,))

In [None]:
#This data is ready for Tree - based algorithms

In [66]:
X_train.to_csv(r'C:\Users\psahu\Documents\My_projects\online_shoppers\data\Transformed_X_train_shoppersData.csv' , index =False)
X_test.to_csv(r'C:\Users\psahu\Documents\My_projects\online_shoppers\data\Transformed_X_test_shoppersData.csv' , index =False)
X_val.to_csv(r'C:\Users\psahu\Documents\My_projects\online_shoppers\data\Transformed_X_val_shoppersData.csv' , index =False)
y_train.to_csv(r'C:\Users\psahu\Documents\My_projects\online_shoppers\data\Transformed_y_train.csv' , index =False)
y_test.to_csv(r'C:\Users\psahu\Documents\My_projects\online_shoppers\data\Transformed_y_test.csv' , index =False)
y_val.to_csv(r'C:\Users\psahu\Documents\My_projects\online_shoppers\data\Transformed_y_val.csv' , index =False)