In [1]:
# Standard library
import warnings

# Third-party: numerics, tabular data, plotting
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide configuration
sns.set(style="whitegrid")         # seaborn styling with the "whitegrid" look
warnings.filterwarnings('ignore')  # silences every warning for the rest of the session

In [2]:
# Read the cleaned e-commerce data from the Excel workbook (path is relative
# to the notebook's working directory), then show it as the cell output.
DATA_FILE = "Ecommerce_Cleaned_Data.xlsx"
df = pd.read_excel(DATA_FILE)
df

Unnamed: 0,Category,Price,Discount,Final_Price,Payment_Method
0,5,36.53,15,31.05,3
1,2,232.79,20,186.23,3
2,5,317.02,25,237.76,1
3,6,173.19,25,129.89,4
4,0,244.80,20,195.84,3
...,...,...,...,...,...
3655,0,486.79,0,486.79,4
3656,6,212.87,15,180.94,0
3657,4,389.76,0,389.76,3
3658,3,447.66,30,313.36,4


In [4]:
# Column labels as a plain Python list
list(df.columns)

['Category', 'Price', 'Discount', 'Final_Price', 'Payment_Method']

**Data Modelling**

In [6]:
# The Discount column is the target; every remaining column is a feature.
# NOTE(review): Final_Price looks derived from Price and Discount
# (first row: 36.53 * (1 - 15/100) ~= 31.05), which would leak the target
# into X -- confirm before trusting the downstream scores.
target_col = 'Discount'
y = df[target_col]
X = df.drop(columns=[target_col])

**Train-Test Split**

In [14]:
from sklearn.model_selection import train_test_split

# Keep 80% of the rows for training; the fixed random_state makes the
# partition identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=10,
)

**Modelling (Linear Regression)**

In [15]:
from sklearn.linear_model import LinearRegression

# Create the estimator and train it in one step (fit() returns the estimator)
model = LinearRegression().fit(X_train, y_train)

# Report the learned parameters
for label, value in (("Intercept:", model.intercept_), ("Coefficients:", model.coef_)):
    print(label, value)

Intercept: 18.967670723780678
Coefficients: [ 0.02215396  0.23852548 -0.2951763   0.03279585]


**Prediction**

In [16]:
# Predictions from the fitted model for both partitions (train first, then test)
y_pred_train, y_pred_test = (model.predict(features) for features in (X_train, X_test))

**Evaluation**

In [17]:
from sklearn.model_selection import cross_val_score

# R2 on the rows the model was fitted on, and on the held-out rows
train_r2 = model.score(X_train, y_train)
test_r2 = model.score(X_test, y_test)

# Mean score over 5 cross-validation folds of the training partition
cv_r2 = cross_val_score(model, X_train, y_train, cv=5).mean()

print("Train R2:", train_r2)
print("Test R2:", test_r2)
print("Cross Validation Score:", cv_r2)

Train R2: 0.7631505249434279
Test R2: 0.7643534445694037
Cross Validation Score: 0.7619766096662193


**Variables Significance**

In [18]:
import statsmodels.formula.api as smf

# Spell the regression out by column name instead of "y ~ X":
#  * the summary rows are labelled Category, Price, ... rather than X[0]..X[3];
#  * the fit no longer depends on whatever the notebook-level X and y hold at
#    the moment the cell runs (both names are rebound further down).
# The predictors, and their order, are the same as the feature matrix defined
# earlier (every column of df except Discount), so the estimates are unchanged.
# Note: this OLS is fitted on all rows of df, not on the training partition.
model1 = smf.ols(
    "Discount ~ Category + Price + Final_Price + Payment_Method",
    data=df,
).fit()
model1.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.763
Model:,OLS,Adj. R-squared:,0.763
Method:,Least Squares,F-statistic:,2950.0
Date:,"Sat, 19 Jul 2025",Prob (F-statistic):,0.0
Time:,08:40:14,Log-Likelihood:,-12400.0
No. Observations:,3660,AIC:,24810.0
Df Residuals:,3655,BIC:,24840.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,18.8023,0.350,53.777,0.000,18.117,19.488
X[0],0.0710,0.059,1.195,0.232,-0.046,0.188
X[1],0.2406,0.002,101.457,0.000,0.236,0.245
X[2],-0.2973,0.003,-108.580,0.000,-0.303,-0.292
X[3],0.0161,0.084,0.192,0.848,-0.149,0.181

0,1,2,3
Omnibus:,710.812,Durbin-Watson:,1.931
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2335.854
Skew:,0.97,Prob(JB):,0.0
Kurtosis:,6.4,Cond. No.,1120.0


In [19]:
# Show df again for reference; no visible cell above modifies it in place, so
# this should be the same frame that was loaded from the Excel file.
df

Unnamed: 0,Category,Price,Discount,Final_Price,Payment_Method
0,5,36.53,15,31.05,3
1,2,232.79,20,186.23,3
2,5,317.02,25,237.76,1
3,6,173.19,25,129.89,4
4,0,244.80,20,195.84,3
...,...,...,...,...,...
3655,0,486.79,0,486.79,4
3656,6,212.87,15,180.94,0
3657,4,389.76,0,389.76,3
3658,3,447.66,30,313.36,4


In [20]:
# Calculating VIF values of independent variables

def compute_vif(data, predictors):
    """Return the variance inflation factor of each predictor as a DataFrame.

    Every predictor is regressed (OLS with intercept, via the formula API) on
    all the other predictors, and VIF = 1 / (1 - R^2) of that auxiliary fit.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing every column named in ``predictors``.
    predictors : list of str
        Column names to evaluate; each must be usable as a name in a formula.

    Returns
    -------
    pandas.DataFrame
        Columns 'Variables' and 'VIF', one row per predictor, in input order.
    """
    rows = []
    for target in predictors:
        others = [col for col in predictors if col != target]
        # With no other predictors, fall back to an intercept-only regression.
        rhs = ' + '.join(others) if others else '1'
        rsq = smf.ols(f"{target} ~ {rhs}", data=data).fit().rsquared
        # A perfectly collinear predictor has R^2 = 1; report an infinite VIF
        # instead of dividing by zero.
        vif = 1 / (1 - rsq) if rsq < 1 else float('inf')
        rows.append({'Variables': target, 'VIF': vif})
    return pd.DataFrame(rows, columns=['Variables', 'VIF'])


# Storing VIF values in a data frame
vif_frame = compute_vif(df, ['Category', 'Price', 'Final_Price', 'Payment_Method'])
vif_frame

Unnamed: 0,Variables,VIF
0,Category,1.000256
1,Price,8.03754
2,Final_Price,8.038386
3,Payment_Method,1.0007


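* `Price` and `Final_Price` each have a VIF of about 8, which signals strong collinearity between them, so we will have to drop the 'Price' and 'Final_Price' variables.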

In [33]:
# Reduced frame without the two price columns flagged by the VIF table
df1 = df.drop(columns=['Price', 'Final_Price'])
df1

Unnamed: 0,Category,Discount,Payment_Method
0,5,15,3
1,2,20,3
2,5,25,1
3,6,25,4
4,0,20,3
...,...,...,...
3655,0,0,4
3656,6,15,0
3657,4,0,3
3658,3,30,4


### Final Model

**Train-Test Split**

In [34]:
# Features and target for the reduced model.
# Note: this rebinds the notebook-level X and y that the first model used.
feature_cols = ['Category', 'Payment_Method']
X = df1[feature_cols]
y = df1['Discount']

In [35]:
from sklearn.model_selection import train_test_split

# Same 80/20 proportions and seed as the first split, applied to the
# current X and y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=10,
)

**Modelling (Linear Regression)**

In [36]:
from sklearn.linear_model import LinearRegression

# Create the estimator and train it in one step (fit() returns the estimator).
# Note: this rebinds `model`, so the previously fitted estimator is no longer
# reachable under that name.
model = LinearRegression().fit(X_train, y_train)

# Report the learned parameters
for label, value in (("Intercept:", model.intercept_), ("Coefficients:", model.coef_)):
    print(label, value)

Intercept: 18.607603969798426
Coefficients: [-0.04761443  0.15613433]


**Prediction**

In [37]:
# Predictions from the current model for both partitions (train first, then test)
y_pred_train, y_pred_test = (model.predict(features) for features in (X_train, X_test))

**Evaluation**

In [38]:
from sklearn.model_selection import cross_val_score

# R2 on the rows the model was fitted on, and on the held-out rows
train_r2 = model.score(X_train, y_train)
test_r2 = model.score(X_test, y_test)

# Mean score over 5 cross-validation folds of the training partition
cv_r2 = cross_val_score(model, X_train, y_train, cv=5).mean()

print("Train R2:", train_r2)
print("Test R2:", test_r2)
print("Cross Validation Score:", cv_r2)

Train R2: 0.00026578791555276204
Test R2: 0.0004844191835359224
Cross Validation Score: -0.001542199242988529


* Dropping these variables leads to a severe drop in R² (from about 0.76 to about 0.00), hence we will go with the original DataFrame and model.

* Train R2: 0.7631505249434279
* Test R2: 0.7643534445694037
* Cross Validation Score: 0.7619766096662193

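This can be considered a good model, with one caveat: `Final_Price` appears to be derived from `Price` and `Discount` (e.g. 36.53 × (1 − 0.15) ≈ 31.05), so the R² of about 0.76 may reflect target leakage rather than genuine predictive power and should be verified.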