In [1]:
# Notebook-wide imports and global display/warning configuration.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's "whitegrid" style to all subsequent plots.
sns.set(style="whitegrid")

import warnings                    # standard-library warnings control
# Silences every warning for the rest of the session (no category or module filter
# is given), so deprecation and numerical warnings from the libraries are hidden too.
warnings.filterwarnings('ignore')

In [2]:
# Load the cleaned e-commerce table from the Excel workbook (path is relative to
# the notebook's working directory) and display it.
data_path = "Ecommerce_Cleaned_Data.xlsx"
df = pd.read_excel(data_path)
df

Unnamed: 0,Category,Price,Discount,Final_Price,Payment_Method
0,5,36.53,15,31.05,3
1,2,232.79,20,186.23,3
2,5,317.02,25,237.76,1
3,6,173.19,25,129.89,4
4,0,244.80,20,195.84,3
...,...,...,...,...,...
3655,0,486.79,0,486.79,4
3656,6,212.87,15,180.94,0
3657,4,389.76,0,389.76,3
3658,3,447.66,30,313.36,4


**Data Modelling**

In [3]:
# Target is the 'Discount' column; every remaining column is used as a feature.
# NOTE(review): in the rows displayed when the data was loaded, Final_Price equals
# Price * (1 - Discount / 100), so the features appear to determine the target
# exactly -- confirm this is intended before trusting the R2 scores below.
y = df['Discount']
X = df.drop(columns=['Discount'])

**Train-Test Split**

In [4]:
from sklearn.model_selection import train_test_split

# 80% of the rows go to training, the remainder to test; the fixed seed keeps
# the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=10,
)

**Modelling (Polynomial Regression)**

In [6]:
from sklearn.preprocessing import PolynomialFeatures
import pandas as pd

# Degree-2 expansion without the constant column (LinearRegression fits its own intercept).
polynomial_converter = PolynomialFeatures(degree=2, include_bias=False)

# The converter is fitted on the training split only and then reused for the test split.
train_expanded = polynomial_converter.fit_transform(X_train)
test_expanded = polynomial_converter.transform(X_test)

# NOTE: X_train / X_test are overwritten with the expanded features, so this cell is
# not idempotent -- re-running it without re-running the split expands the features again.
X_train = pd.DataFrame(train_expanded)
X_test = pd.DataFrame(test_expanded)


In [7]:
from sklearn.linear_model import LinearRegression

# Create and train the regressor in one step; fit() returns the estimator itself.
model = LinearRegression().fit(X_train, y_train)

# Report the learned parameters (one coefficient per column of X_train)
print("Intercept:", model.intercept_)
print("Coefficients:", model.coef_)

Intercept: 19.663480251639626
Coefficients: [-6.70529714e-02  6.43084200e-01 -7.95227117e-01 -4.54529227e-01
  1.30989593e-02 -4.07423920e-04  1.65182637e-04  4.91940278e-02
 -1.03884901e-03  1.25210885e-03 -2.91122909e-04  3.89514018e-05
  9.13389077e-04  2.85026619e-02]


**Prediction**

In [8]:
# Predictions for both splits: training data first, then test data
train_pred, test_pred = model.predict(X_train), model.predict(X_test)

**Evaluation**

In [9]:
from sklearn.model_selection import cross_val_score

# Score the fitted model on each split, then average the 5-fold
# cross-validation scores obtained on the training data.
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
cv_mean = cross_val_score(model, X_train, y_train, cv=5).mean()

print("Train R2:", train_score)
print("Test R2:", test_score)
print("Cross Validation Score:", cv_mean)

Train R2: 0.9013776693185755
Test R2: 0.9077828061760278
Cross Validation Score: 0.899901209590834


**Hyperparameter Tuning**

In [17]:
# Imports are done once per cell run instead of once per loop iteration.
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures

# Train / test R2 for each candidate polynomial degree.
# Position k in each list holds the score for degree k + 1 (degrees 1..9).
train_r2 = []
test_r2 = []

for degree in range(1, 10):
    # Expand the raw feature matrix X to the candidate degree.
    degree_converter = PolynomialFeatures(degree=degree, include_bias=False)
    X_poly_deg = pd.DataFrame(degree_converter.fit_transform(X))

    # Identical split settings for every degree so the scores are comparable.
    # Loop-local names are used on purpose: the notebook-level X_train / X_test /
    # y_train / y_test / model are left untouched by the search.
    X_tr, X_te, y_tr, y_te = train_test_split(X_poly_deg, y, test_size=0.2, random_state=10)

    degree_model = LinearRegression().fit(X_tr, y_tr)

    train_r2.append(degree_model.score(X_tr, y_tr))
    test_r2.append(degree_model.score(X_te, y_te))

# NOTE(review): the degree is picked by comparing test_r2, so the test split also
# acts as the validation set; consider cross-validating on the training data instead.

In [18]:
# Train R2 per polynomial degree, in order of degree (1 through 9)
train_r2

[0.7631505249434279,
 0.9013776693185755,
 0.9497170571385872,
 0.974022578877891,
 0.9853386737607939,
 0.9380500105913366,
 0.8364842657351766,
 0.5942925880035542,
 0.6875673522302659]

In [19]:
# Test R2 per polynomial degree, in order of degree (1 through 9)
test_r2

[0.7643534445694036,
 0.9077828061760278,
 0.9538135853782796,
 0.9772399023888922,
 0.9853890422236335,
 0.9338753274315209,
 0.8465196058899829,
 0.45830356888317614,
 0.6145350750701217]

* Degree 5 gives the highest R² (R-squared) on both the train (0.9853) and test (0.9854) splits; from degree 6 onward both scores are lower.

**Rebuilding the Model with best parameters**

In [24]:
from sklearn.model_selection import cross_val_score

# preprocessing: expand the full feature matrix with the selected degree (5)
final_poly_converter = PolynomialFeatures(degree=5, include_bias=False)
X_poly = final_poly_converter.fit_transform(X)

# train-test-split (same test_size and random_state as the tuning loop)
X_train, X_test, y_train, y_test = train_test_split(X_poly, y, test_size=0.2, random_state=10)

# build the final model
final_model = LinearRegression()
final_model.fit(X_train, y_train)

# prediction
train_pred = final_model.predict(X_train)
test_pred = final_model.predict(X_test)

# evaluation
print("Train R2:", final_model.score(X_train, y_train))  # Train R2
print("Test R2:", final_model.score(X_test, y_test))    # Test R2

# CV Score: cross-validate the final estimator itself rather than the `model`
# variable left over from an earlier cell. cross_val_score clones the estimator
# and refits it on each of the 5 folds, so final_model stays fitted on X_train.
print("Cross Validation Score:", cross_val_score(final_model, X_train, y_train, cv=5).mean())

Train R2: 0.9853386737607939
Test R2: 0.9853890422236335
Cross Validation Score: 0.9840185971470836


**Prediction on New Data**

In [25]:
# Four hand-written records to score, using the same feature columns (and column
# order) that the model was trained on.
new_records = {
    'Category': [5, 1, 2, 3],
    'Price': [36.50, 365.90, 109.26, 890.56],
    'Final_Price': [35.20, 228.6, 89.50, 459.23],
    'Payment_Method': [0, 2, 1, 4],
}
input_data = pd.DataFrame(new_records)
input_data

Unnamed: 0,Category,Price,Final_Price,Payment_Method
0,5,36.5,35.2,0
1,1,365.9,228.6,2
2,2,109.26,89.5,1
3,3,890.56,459.23,4


In [26]:
# Apply the already-fitted degree-5 expansion to the new records, then predict.
transformed_data = final_poly_converter.transform(input_data)
new_discount_pred = final_model.predict(transformed_data)

# NOTE(review): the recorded output for the fourth record is ~6451, far outside any
# plausible discount value. Presumably its Price (890.56) lies outside the range seen
# in training and the degree-5 polynomial extrapolates badly -- confirm against df['Price'].
new_discount_pred

array([4.78542255e+00, 3.86066476e+01, 1.83685569e+01, 6.45166885e+03])