In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, LabelEncoder, KBinsDiscretizer, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin


# Load both data splits. The frame called `df_test` is read from the
# validation CSV and serves as the held-out set in the cells below.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('ecommerce_data_train.csv', 'ecommerce_data_val.csv')
)


In [None]:
# Separate the regression target from the predictor columns in both splits.
target_col = 'Monthly Revenue'

X_train, y_train = df_train.drop(columns=target_col), df_train[target_col]
X_test, y_test = df_test.drop(columns=target_col), df_test[target_col]


In [None]:
# Question 1
#
# Train a model using all features except for season - use all features linearly
#
# `DataFrame.drop` returns a new frame, so X_train / X_test stay untouched
# and still contain the 'Season' column for the later questions.
Xq1_train = X_train.drop(columns='Season')
Xq1_test = X_test.drop(columns='Season')

# Now train with (y_train,Xq1_train)
# YOUR CODE HERE...
# 


In [None]:
# Question 2
#
# Now add season to the model as a one-hot coded feature - do you do better on test data?
#
Xq2_train, Xq2_test = X_train.copy(), X_test.copy()

# One-hot encode 'Season' (dropping its first level) and pass every other
# column through unchanged.
one_hot_encoder = ColumnTransformer(
    transformers=[
        ('encoder', OneHotEncoder(drop='first'), ['Season']),
    ],
    remainder='passthrough',
)

# Fit on the training frame only; the fitted encoder is reused for the test frame.
Xq2_train_encoded = one_hot_encoder.fit_transform(Xq2_train)
Xq2_test_encoded = one_hot_encoder.transform(Xq2_test)

# YOUR CODE HERE...
# 



In [None]:
# Question 3: Two models with 'Number of Website Visits' as numerical and categorical

# Numerical model
#
# this is just the same as in question 1

# Categorical model (using KBinsDiscretizer for binning)
Xq3_train = Xq1_train.copy()
Xq3_test = Xq1_test.copy()

# Number of equal-width bins. It also determines the full set of ordinal bin
# labels (0 .. N_BINS-1) that the one-hot encoder is told about below.
N_BINS = 5

# Pipeline for discretizing and then one-hot encoding.
#
# The bin labels are passed to OneHotEncoder explicitly instead of being
# inferred from the training data: with strategy='uniform' an interior bin can
# contain no training rows, in which case an encoder fitted with inferred
# categories would not know that label and `transform` would raise an
# unknown-category error for any test row falling into that bin. When every bin
# is populated in training the resulting encoding is the same as before.
pipe = Pipeline([
    ('kbins', KBinsDiscretizer(n_bins=N_BINS, encode='ordinal', strategy='uniform', subsample=None)),
    ('onehot', OneHotEncoder(categories=[np.arange(N_BINS, dtype=float)], drop='first'))
])

# ColumnTransformer: bin + encode 'Number of Website Visits', pass the
# remaining columns through unchanged.
transformer = ColumnTransformer(
    transformers=[
        ('discretize_and_encode', pipe, ['Number of Website Visits'])
    ],
    remainder='passthrough'
)

# Bin edges are learned from the training frame only and reused for the test frame.
Xq3_train_encoded = transformer.fit_transform(Xq3_train)
Xq3_test_encoded = transformer.transform(Xq3_test)


# YOUR CODE HERE...
# 


In [None]:
# Question 4: Try to discretize Average Order Value


# Categorical model (using KBinsDiscretizer for binning)
Xq4_train = Xq1_train.copy()
Xq4_test = Xq1_test.copy()

# Number of equal-width bins. It also determines the full set of ordinal bin
# labels (0 .. N_BINS-1) that the one-hot encoder is told about below.
N_BINS = 5

# Pipeline for discretizing and then one-hot encoding.
#
# The bin labels are passed to OneHotEncoder explicitly instead of being
# inferred from the training data: with strategy='uniform' an interior bin can
# contain no training rows, in which case an encoder fitted with inferred
# categories would not know that label and `transform` would raise an
# unknown-category error for any test row falling into that bin. When every bin
# is populated in training the resulting encoding is the same as before.
pipe = Pipeline([
    ('kbins', KBinsDiscretizer(n_bins=N_BINS, encode='ordinal', strategy='uniform', subsample=None)),
    ('onehot', OneHotEncoder(categories=[np.arange(N_BINS, dtype=float)], drop='first'))
])

# ColumnTransformer: bin + encode 'Average Order Value', pass the remaining
# columns through unchanged.
transformer = ColumnTransformer(
    transformers=[
        ('discretize_and_encode', pipe, ['Average Order Value'])
    ],
    remainder='passthrough'
)

# Bin edges are learned from the training frame only and reused for the test frame.
Xq4_train_encoded = transformer.fit_transform(Xq4_train)
Xq4_test_encoded = transformer.transform(Xq4_test)

# YOUR CODE HERE...
# 


In [None]:
# Question 5

# create interaction: product of 'Average Order Value' and 'Number of Items Sold',
# appended as a new column. `assign` returns a new frame, so the Question 1
# frames are not modified.
Xq5_train = Xq1_train.assign(
    ItemsSold_AvgOrder=Xq1_train['Average Order Value'].mul(Xq1_train['Number of Items Sold'])
)
Xq5_test = Xq1_test.assign(
    ItemsSold_AvgOrder=Xq1_test['Average Order Value'].mul(Xq1_test['Number of Items Sold'])
)


# YOUR CODE HERE...
# 