In [1]:
import pandas as pd
import numpy as np

In [2]:
INPUT_FILE = '../dataset/flipkart_products_preprocessed_2.csv'

In [3]:
df = pd.read_csv(INPUT_FILE)

In [4]:
df["text"] = (
    df["title"].fillna("") + " " +
    df["description"].fillna("") + " " +
    df["level_combined"].fillna("")
)

In [5]:
features = [
    "price",
    "rating_value",
    "level_2",
    "level_3", 
    "text"
]

X = df[features]
y = df["discount"]

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from category_encoders.target_encoder import TargetEncoder
from sklearn.metrics import mean_squared_error, r2_score

In [7]:
numeric_features = [
    "price", 
    "rating_value"
]

categorical_features = [
    "level_2",
    "level_3"
]

text_features = [
    "text"
]

In [8]:
numeric_transformer = StandardScaler()
categorical_transformer = TargetEncoder()

text_tfidf = TfidfVectorizer(
    stop_words="english",
    max_features=5000
)

In [9]:
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
        ("text_tfidf", text_tfidf, "text")
    ],
    remainder="drop"
)

In [10]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [11]:
models = { 
    "LinearRegression": LinearRegression(), 
    "Ridge": Ridge(alpha=1.0), 
    "Lasso": Lasso(alpha=0.001, max_iter=5000), 
    "ElasticNet": ElasticNet(alpha=0.001, l1_ratio=0.5, max_iter=5000) 
}

In [12]:
for name, model in models.items(): 
    print(f"\n===== {name} =====") 
    
    pipeline = Pipeline(
        steps = [ 
                ("preprocess", preprocessor),
                ("model", model)
            ]
    ) 
    
    pipeline.fit(X_train, y_train)
    
    y_pred = pipeline.predict(X_test)
    
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    
    print("RMSE:", rmse)
    print("R^2:", r2)


===== LinearRegression =====
RMSE: 13.764648710682224
R^2: 0.49154022273551257

===== Ridge =====
RMSE: 13.737962730565199
R^2: 0.4935098471990681

===== Lasso =====
RMSE: 14.079291289500897
R^2: 0.4680290376345433

===== ElasticNet =====
RMSE: 14.636836224883824
R^2: 0.425062329004012


In [13]:
from sklearn.model_selection import RandomizedSearchCV

In [14]:
ridge_pipeline = Pipeline([
    ("preprocess", preprocessor),
    ("model", Ridge())
])

In [None]:
param_grid = {
    "model__alpha": [0.01, 0.1, 1.0, 10, 50, 100],
    "model__fit_intercept": [True, False],
    "model__solver": ["auto", "svd", "lsqr", "sag"],
    "preprocess__text_tfidf__max_features": [3000, 5000, 8000],
    "preprocess__text_tfidf__ngram_range": [(1,1), (1,2)],
    "preprocess__text_tfidf__min_df": [1, 2, 5],
}

In [16]:
grid = RandomizedSearchCV(
    ridge_pipeline,
    param_distributions=param_grid,
    n_iter=50,
    scoring="neg_root_mean_squared_error",
    cv=2,
    verbose=2,
    n_jobs=-1,
    random_state=42
)

In [17]:
grid.fit(X_train, y_train)

print("Best Params:", grid.best_params_)
print("Best RMSE:", -grid.best_score_)

In [18]:
best_ridge = grid.best_estimator_

y_pred = best_ridge.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("RMSE:", rmse)
print("R^2:", r2)

RMSE: 13.34083811693004
R^2: 0.522368933176386
