In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Silence every Python warning for the rest of the session.
# NOTE(review): a blanket "ignore" also hides deprecation and numerical
# warnings raised by the libraries used below; consider a narrower filter.
import warnings
warnings.filterwarnings(action="ignore")

# Use seaborn's "whitegrid" style for any plots drawn later.
sns.set(style="whitegrid")

In [2]:
# Read the e-commerce workbook (path is relative to the notebook's working
# directory) into a DataFrame, then show it as the cell output.
data_path = "Ecommerce_Cleaned_Data.xlsx"
df = pd.read_excel(io=data_path)
df

Unnamed: 0,Category,Price,Discount,Final_Price,Payment_Method
0,5,36.53,15,31.05,3
1,2,232.79,20,186.23,3
2,5,317.02,25,237.76,1
3,6,173.19,25,129.89,4
4,0,244.80,20,195.84,3
...,...,...,...,...,...
3655,0,486.79,0,486.79,4
3656,6,212.87,15,180.94,0
3657,4,389.76,0,389.76,3
3658,3,447.66,30,313.36,4


**Data Modelling**

In [3]:
# Target is the Discount column; every other column of df is a feature.
# NOTE(review): in the rows displayed above, Final_Price equals
# Price * (1 - Discount / 100), so keeping Final_Price as a feature lets the
# model reconstruct the target from its inputs -- confirm this is intended.
target_col = "Discount"
y = df[target_col]
X = df.drop(columns=[target_col])

**Train-Test Split**

In [5]:
from sklearn.model_selection import train_test_split

# 80% of the rows go to training and the remainder to testing; the fixed
# random_state makes the shuffle reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=10,
)

**Modelling**

In [6]:
from sklearn.linear_model import Ridge

# Baseline ridge regression. alpha is spelled out at the library default
# (1.0) so the baseline regularisation strength is visible to the reader.
# NOTE(review): the ridge penalty depends on feature scale and no scaling
# step appears in the visible cells -- confirm whether the features should
# be standardised before fitting.
ridge_base = Ridge(alpha=1.0)
ridge_base.fit(X_train, y_train)

**Predictions**

In [8]:
# Baseline model predictions for the training rows and the held-out rows,
# computed in that order.
train_predictions, test_predictions = (
    ridge_base.predict(features) for features in (X_train, X_test)
)

**Evaluation**

In [9]:

from sklearn.model_selection import cross_val_score

# Score the baseline on both sides of the split, plus a 5-fold
# cross-validation run. Note that the cross-validation is performed on the
# full X / y (training and held-out rows together), not on X_train alone.
train_r2 = ridge_base.score(X_train, y_train)
test_r2 = ridge_base.score(X_test, y_test)
cv_scores = cross_val_score(ridge_base, X, y, cv=5)

print("Train R2:", train_r2)
print("Test R2:", test_r2)
print("Cross Validation Score:", cv_scores.mean())

Train R2: 0.7631505249430042
Test R2: 0.7643534170750234
Cross Validation Score: 0.7626555161453462


**Hyperparameter Tuning**

In [15]:
from sklearn.model_selection import GridSearchCV

# Estimator whose regularisation strength is being tuned
estimator = Ridge()

# Candidate alpha values to try
param_grid = {
    "alpha": [0.1, 0.2, 0.5, 0.7, 1, 10, 50, 100, 1000, 10000],
}

# 5-fold grid search on the training split, ranking candidates by negative
# mean squared error (higher is better).
# NOTE(review): in the recorded run the winner is alpha=10000, the largest
# candidate. A winner on the edge of the grid means the search has not
# bracketed the optimum -- consider adding larger alpha values.
model_hp = GridSearchCV(
    estimator=estimator,
    param_grid=param_grid,
    scoring="neg_mean_squared_error",
    cv=5,
).fit(X_train, y_train)

# Best parameter setting found (shown as the cell output)
model_hp.best_params_


{'alpha': 10000}

* The best alpha found (10000) is the largest value in the search grid, so a definite optimum cannot be determined from this grid; hence we will continue with this alpha value.

**Rebuilding model with ideal parameter**

In [16]:
# Modelling
# Take alpha from the grid-search result instead of hard-coding it, so this
# cell keeps following the search if the data or the candidate grid changes.
# With the search result recorded above this is alpha=10000, as before.
best_alpha = model_hp.best_params_["alpha"]
ridge_best = Ridge(alpha=best_alpha)
ridge_best.fit(X_train, y_train)

# Predictions
train_predictions = ridge_best.predict(X_train)
test_predictions = ridge_best.predict(X_test)

# Evaluation
from sklearn.model_selection import cross_val_score

# Same three metrics as for the baseline model, so the numbers are directly
# comparable. The cross-validation runs on the full X / y.
print("Train R2:", ridge_best.score(X_train, y_train))
print("Test R2:", ridge_best.score(X_test, y_test))
print("Cross Validation Score:", cross_val_score(ridge_best, X, y, cv=5).mean())


Train R2: 0.763137976668051
Test R2: 0.7640977909436479
Cross Validation Score: 0.7629914965975356


**Prediction on New Data**

In [17]:
# Four hand-written records to score with the tuned model. The columns are
# the non-target columns of df, in the order shown in the dataset display.
new_records = {
    "Category": [5, 1, 2, 3],
    "Price": [36.50, 365.90, 109.26, 890.56],
    "Final_Price": [35.20, 228.6, 89.50, 459.23],
    "Payment_Method": [0, 2, 1, 4],
}
input_data = pd.DataFrame(new_records)
input_data

Unnamed: 0,Category,Price,Final_Price,Payment_Method
0,5,36.5,35.2,0
1,1,365.9,228.6,2
2,2,109.26,89.5,1
3,3,890.56,459.23,4


In [18]:
# Predicted Discount for each row of input_data (shown as the cell output).
# NOTE(review): in the recorded run the last row is predicted at ~95.7,
# while its Price / Final_Price pair implies roughly 48 if
# Final_Price = Price * (1 - Discount / 100) holds as it does in the
# displayed dataset rows -- the linear fit looks unreliable for this input.
ridge_best.predict(X=input_data)

array([17.42842399, 38.81231444, 18.72565844, 95.73865162])