In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, r2_score
import joblib

# Conversion factor applied to the price columns below.
usd_to_inr_rate = 83.05

data = pd.read_csv('../data/resale_luxury_data.csv')
print(data.columns)

# Restrict the frame to the modelling columns and discard rows that have a
# missing value in any of them.
data = data[['brand_name', 'product_name', 'product_condition', 'price_usd', 'seller_price']].dropna()

# Scale both price columns by the conversion rate.
# NOTE(review): assumes seller_price is quoted in the same currency as
# price_usd -- confirm against the dataset description.
data['price_inr'] = usd_to_inr_rate * data['price_usd']
data['seller_price'] = usd_to_inr_rate * data['seller_price']

# Threshold passed to filter_rare_categories and to the one-hot encoder.
MIN_FREQUENCY = 10

# Columns treated as categorical inputs.
categorical_features = ['brand_name', 'product_name', 'product_condition']

def filter_rare_categories(data, categorical_columns, min_frequency):
    """Replace infrequent values in the given columns with ``'Other'``.

    For every column in ``categorical_columns``, a value is kept only if
    ``value_counts()`` reports it at least ``min_frequency`` times; every
    other row in that column is overwritten with the string ``'Other'``.
    Values that ``value_counts()`` does not report at all (missing values,
    under its default settings) are therefore replaced as well.

    The frame is modified in place and the same object is returned.
    """
    for column in categorical_columns:
        counts = data[column].value_counts()
        # Labels that occur often enough to keep their own identity.
        common_values = counts.index[(counts >= min_frequency).to_numpy()]
        is_rare = ~data[column].isin(common_values)
        data.loc[is_rare, column] = 'Other'
    return data

# Collapse rarely seen labels into 'Other' before building the design matrix.
data = filter_rare_categories(data, categorical_features, MIN_FREQUENCY)

# Target and predictors.
y = data['seller_price']
X = data[['brand_name', 'product_name', 'product_condition', 'price_inr']]

numeric_features = ['price_inr']

# One-hot encode the categorical columns into a sparse matrix; categories
# unseen during fit are ignored at transform time instead of raising.
categorical_transformer = OneHotEncoder(
    min_frequency=MIN_FREQUENCY,
    sparse_output=True,
    handle_unknown='ignore',
)

# Standardise the single numeric predictor.
numeric_transformer = StandardScaler()

# Route each group of columns to its own transformer.
preprocessor = ColumnTransformer([
    ('cat', categorical_transformer, categorical_features),
    ('num', numeric_transformer, numeric_features),
])

# Preprocessing followed by ridge regression; the step names are referenced
# by the hyperparameter grid ('regressor__alpha').
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', Ridge(solver='sparse_cg')),
])


# Hold out the test rows BEFORE tuning. Searching over the full data set and
# splitting afterwards lets the held-out rows influence the choice of alpha,
# which makes the reported test metrics optimistically biased.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Candidate regularisation strengths for the Ridge step of the pipeline.
param_grid = {
    'regressor__alpha': [0.1, 1.0, 10.0],
}

# 5-fold cross-validation on the training portion only. With the default
# refit=True, GridSearchCV retrains the best configuration on all of
# X_train, so best_estimator_ is ready to use without a manual second fit.
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_

# Evaluate once on rows that took no part in fitting or model selection.
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Model training completed.")
print("Best hyperparameters:", grid_search.best_params_)
print("Training Score:", best_model.score(X_train, y_train))
print("Test Score:", best_model.score(X_test, y_test))
print("Mean Squared Error:", mse)
print("R-squared:", r2)

# Use a forward slash as the separator: a backslash before 'm' forms an
# invalid escape sequence (SyntaxWarning) and, on POSIX systems, ends up as
# part of the file name instead of separating the directory from the file.
joblib.dump(best_model, '../backend/model.pkl')

  joblib.dump(best_model, '../backend\model.pkl')


Index(['product_id', 'product_type', 'product_name', 'product_description',
       'product_keywords', 'product_gender_target', 'product_category',
       'product_season', 'product_condition', 'product_like_count', 'sold',
       'reserved', 'available', 'in_stock', 'should_be_gone', 'brand_id',
       'brand_name', 'brand_url', 'product_material', 'product_color',
       'price_usd', 'seller_price', 'seller_earning', 'seller_badge',
       'has_cross_border_fees', 'buyers_fees', 'warehouse_name', 'seller_id',
       'seller_username', 'usually_ships_within', 'seller_country',
       'seller_products_sold', 'seller_num_products_listed',
       'seller_community_rank', 'seller_num_followers', 'seller_pass_rate'],
      dtype='object')
Model training completed.
Best hyperparameters: {'regressor__alpha': 10.0}
Training Score: 0.9966900280105463
Test Score: 0.9965713209022286
Mean Squared Error: 74143469.4976534
R-squared: 0.9965713209022286


['../backend\\model.pkl']