In [1]:
import pandas as pd
import numpy as np

In [2]:
# Relative path of the CSV that the load cell reads.
INPUT_FILE = "../dataset/flipkart_products_preprocessed_2.csv"

In [3]:
# Load the full table; the text feature, inputs and target are all derived from it.
df = pd.read_csv(filepath_or_buffer=INPUT_FILE)

In [4]:
# Build one free-text field per row from title, description and the combined
# category string. Missing values become empty strings so the concatenation
# never produces NaN.
text_source_columns = ["title", "description", "level_combined"]
filled_parts = [df[column].fillna("") for column in text_source_columns]

df["text"] = filled_parts[0] + " " + filled_parts[1] + " " + filled_parts[2]

In [5]:
# Columns fed to the model: numeric, category-level and combined-text inputs.
features = ["price", "rating_value", "level_2", "level_3", "text"]

# The regression target is the "discount" column; X holds only the listed inputs.
y = df.loc[:, "discount"]
X = df.loc[:, features]

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from category_encoders.target_encoder import TargetEncoder
from sklearn.metrics import mean_squared_error, r2_score

In [7]:
# Column groups by feature type. The numeric and categorical lists are handed
# to the ColumnTransformer; the text column is referenced there by name.
numeric_features = ["price", "rating_value"]
categorical_features = ["level_2", "level_3"]
text_features = ["text"]

In [8]:
# TF-IDF over the combined text: English stop words are removed and the
# vocabulary is capped at 5000 terms.
text_tfidf = TfidfVectorizer(max_features=5000, stop_words="english")

# Default-configured encoders for the category and numeric column groups.
categorical_transformer = TargetEncoder()
numeric_transformer = StandardScaler()

In [9]:
# "text" is given as a bare string (not a list) so the vectorizer receives a
# 1-D column of documents instead of a single-column frame.
column_steps = [
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
    ("text_tfidf", text_tfidf, "text"),
]

# Columns not claimed by any step above are discarded.
preprocessor = ColumnTransformer(transformers=column_steps, remainder="drop")

In [10]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [11]:
# Baseline forest with fixed hyperparameters; the randomized search further
# down explores alternatives to these values.
baseline_forest = RandomForestRegressor(
    n_estimators=120,
    max_depth=150,
    min_samples_split=5,
    min_samples_leaf=2,
    max_features="sqrt",
    n_jobs=-1,
    random_state=42,
)

# Preprocessing and model are chained so both are fitted on training rows only.
pipeline = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("model", baseline_forest),
    ]
)

In [12]:
# Fit preprocessing and the baseline forest on the training split.
pipeline.fit(X=X_train, y=y_train)

0,1,2
,steps,"[('preprocess', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,verbose,0
,cols,
,drop_invariant,False
,return_df,True
,handle_missing,'value'
,handle_unknown,'value'
,min_samples_leaf,20
,smoothing,10
,hierarchy,

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,'english'
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,n_estimators,120
,criterion,'squared_error'
,max_depth,150
,min_samples_split,5
,min_samples_leaf,2
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [13]:
# Predict discounts for the held-out rows with the fitted baseline pipeline.
y_pred = pipeline.predict(X=X_test)

In [14]:
# Score the held-out predictions: RMSE is the square root of the mean squared
# error, R^2 comes straight from r2_score.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

for label, value in (("RMSE:", rmse), ("R^2:", r2)):
    print(label, value)

RMSE: 11.899358033624896
R^2: 0.6200089195947398


In [15]:
from sklearn.model_selection import RandomizedSearchCV

In [16]:
# Candidate values for the randomized search. The "model__" prefix routes each
# setting to the pipeline step named "model".
param_dist = dict(
    model__n_estimators=[200, 400, 600],
    model__max_depth=[10, 20, 40, None],
    model__min_samples_split=[2, 5, 10],
    model__min_samples_leaf=[1, 2, 4],
    model__max_features=["sqrt", "log2", 0.3, 0.5, 0.7],
    model__bootstrap=[True],
    model__max_samples=[0.6, 0.8, None],
)

In [17]:
# Pipeline used as the search template: same preprocessing, forest left at its
# defaults apart from the seed and parallelism.
search_forest = RandomForestRegressor(random_state=42, n_jobs=-1)
pipeline = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("model", search_forest),
    ]
)

In [None]:
# 20 sampled configurations, each scored by 2-fold CV on negated RMSE.
# The search itself runs one candidate at a time (n_jobs=1).
search_settings = dict(
    param_distributions=param_dist,
    n_iter=20,
    scoring="neg_root_mean_squared_error",
    cv=2,
    verbose=2,
    n_jobs=1,
    random_state=42,
)
search = RandomizedSearchCV(pipeline, **search_settings)

In [None]:
# Run the randomized search on the training split only.
search.fit(X_train, y_train)

# The scorer is a negated RMSE, so flip the sign back before reporting it.
best_rmse = -search.best_score_
print("Best Params:", search.best_params_)
print("Best Score:", best_rmse)

In [None]:
# Keep a handle on the winning pipeline found by the search.
best_rf = search.best_estimator_

In [22]:
# Show the full parameter set of the winning pipeline, nested steps included.
best_rf.get_params(deep=True)

{'memory': None,
 'steps': [('preprocess',
   ColumnTransformer(transformers=[('num', StandardScaler(),
                                    ['price', 'rating_value']),
                                   ('cat', TargetEncoder(),
                                    ['level_2', 'level_3']),
                                   ('text_tfidf',
                                    TfidfVectorizer(max_features=5000,
                                                    stop_words='english'),
                                    'text')])),
  ('model',
   RandomForestRegressor(max_features=0.7, max_samples=0.8, min_samples_split=10,
                         n_estimators=200, n_jobs=-1, random_state=42))],
 'transform_input': None,
 'verbose': False,
 'preprocess': ColumnTransformer(transformers=[('num', StandardScaler(),
                                  ['price', 'rating_value']),
                                 ('cat', TargetEncoder(),
                                  ['level_2', 'level_3']),
  

In [23]:
# Forest settings copied by hand from the best estimator displayed above
# (n_estimators=200, max_features=0.7, max_samples=0.8, min_samples_split=10).
tuned_forest_params = {
    "n_estimators": 200,
    "max_features": 0.7,
    "max_samples": 0.8,
    "min_samples_split": 10,
    "n_jobs": -1,
    "random_state": 42,
}
rf = RandomForestRegressor(**tuned_forest_params)

In [24]:
# Final pipeline: the shared preprocessing followed by the tuned forest.
pipeline = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("model", rf),
    ]
)

In [25]:
# Fit preprocessing and the tuned forest on the training split.
pipeline.fit(X=X_train, y=y_train)

0,1,2
,steps,"[('preprocess', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,verbose,0
,cols,
,drop_invariant,False
,return_df,True
,handle_missing,'value'
,handle_unknown,'value'
,min_samples_leaf,20
,smoothing,10
,hierarchy,

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,'english'
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,n_estimators,200
,criterion,'squared_error'
,max_depth,
,min_samples_split,10
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,0.7
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [26]:
# Score the tuned pipeline on the same held-out rows used for the baseline.
y_pred = pipeline.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

for label, value in (("RMSE:", rmse), ("R^2:", r2)):
    print(label, value)

RMSE: 10.738239885151772
R^2: 0.6905485359892047
