In [44]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np

# Load the dataset
file_path = "combined_product_data_Amazon_1001_2000_cricketJersey.csv"
data = pd.read_csv(file_path)

# Display the first few rows of the dataset
print(data.head())


                                        ProductTitle  \
0      MI Jersey Rohit Sharma 45 2024 for Men & Boys   
1     Men Destroyer Football Jersey Set Full Sleeves   
2  India 50 Over World Cup Jersey 2023 (Half Slee...   
3  Cricket Jersey for Men CSK Cricket Jersey 2024...   
4   Sports CSK & RCB Jersey 2023/2024 Cricket Jersey   

                                      LandingPageURL  \
0  https://www.amazon.in/Jersey-Sharma-2023-2024-...   
1  https://www.amazon.in/Nivia-Destroyer-Football...   
2  https://www.amazon.in/India-Jersey-Sleeves-NO-...   
3  https://www.amazon.in/Cricket-Jersey-Tshirt-7-...   
4  https://www.amazon.in/Sports-Jersey-Cricket-X-...   

                                      MasterCategory  \
0  ['Clothing & Accessories', 'Men', 'T-shirts, P...   
1                                                NaN   
2                                                NaN   
3                                                NaN   
4                                             

In [45]:
# Check if the target column 'EstimatedSales' exists; if not, create it with NaNs
if 'EstimatedSales' not in data.columns:
    data['EstimatedSales'] = pd.NA

# Exploratory Data Analysis (EDA)
# Check for missing values
missing_values = data.isnull().sum()
print("Missing values in each column:\n", missing_values)


Missing values in each column:
 ProductTitle                0
LandingPageURL              0
MasterCategory            793
SubCategory               793
ASIN                      793
Product_description       799
BulletPoints              793
NumberOfBulletPoints      793
NumberOfVariants          793
ImageLinks                793
NumberofImages            793
AdditionalInformation     793
Price                     793
MRP                       793
Discount                  793
Rating                    793
Review                    793
AllStartCounts            793
Brand                     793
ProductDetails            793
EstimatedSales           1000
dtype: int64


In [46]:
# Feature selection: Select relevant features for training the model
features = ['Price', 'NumberOfVariants', 'NumberOfBulletPoints', 'NumberofImages', 'Rating', 'Review', 'Discount']

# Remove commas from numeric columns to convert them to float
numeric_columns = ['Price', 'NumberOfVariants', 'NumberOfBulletPoints', 'NumberofImages', 'Rating', 'Review', 'Discount']
data[numeric_columns] = data[numeric_columns].replace(',', '', regex=True)

# Identify and handle non-numeric values in numeric columns
for col in numeric_columns:
    data[col] = pd.to_numeric(data[col], errors='coerce')


In [48]:
data.sample(3)

Unnamed: 0,ProductTitle,LandingPageURL,MasterCategory,SubCategory,ASIN,Product_description,BulletPoints,NumberOfBulletPoints,NumberOfVariants,ImageLinks,...,AdditionalInformation,Price,MRP,Discount,Rating,Review,AllStartCounts,Brand,ProductDetails,EstimatedSales
541,IPL Jersey Gujrat 2024 with Customized Name & ...,https://www.amazon.in/YOURJERSEY-Jersey-Gujrat...,,,,,,,,,...,,,,,,,,,,
466,Cricket Blue Blue Jersey 2023-2024 Cricket (Ki...,https://www.amazon.in/JUSBALL-Cricket-Blue-Jer...,,,,,,,,,...,,,,,,,,,,
181,Mumbai Halfsleeve Cricket Jersey 2024 for Boys...,https://www.amazon.in/Mumbai-Halfsleeve-Cricke...,,,,,,,,,...,,,,,,,,,,


In [49]:
# Fill missing feature values with the median of the respective columns
data[features] = data[features].fillna(data[features].median())

# Create a mock target variable based on available features
np.random.seed(42)
data['EstimatedSales'] = (
    data['Price'] * (1 - data['Discount'] / 100) * data['Rating'] * data['Review']
).fillna(0) + np.random.normal(0, 10, len(data))

# Check the distribution of the mock target variable
print(data['EstimatedSales'].describe())


count    1000.000000
mean        0.193321
std         9.792159
min       -32.412673
25%        -6.475903
50%         0.253006
75%         6.479439
max        38.527315
Name: EstimatedSales, dtype: float64


In [50]:
# Separate the data to predict
predict_data = data[data['EstimatedSales'].isna()]

# Separate the features (X) and the target (y) for the training data
X = data[features]
y = data['EstimatedSales']

# Check if there are any non-null values in the target column for training
if y.notnull().sum() == 0:
    print("Warning: No non-null values in the target column 'EstimatedSales'. Training the model may not be effective.")

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [55]:
# # Initialize models
# models = {
#     "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
#     "Linear Regression": LinearRegression(),
#     "Gradient Boosting": GradientBoostingRegressor(n_estimators=100, random_state=42),
#     "Support Vector Regressor": SVR()
# }

# # Dictionary to store the results
# results = {}

# # Train and evaluate each model
# for model_name, model in models.items():
#     print(f"Training {model_name}...")
#     model.fit(X_train, y_train)
#     y_pred = model.predict(X_test)
#     mse = mean_squared_error(y_test, y_pred)
#     r2 = r2_score(y_test, y_pred)
#     mae = mean_absolute_error(y_test, y_pred)
#     results[model_name] = {
#         "Mean Squared Error": mse,
#         "R2 Score": r2,
#         "Mean Absolute Error": mae
#     }
#     print(f"{model_name} Results:")
#     print(f"Mean Squared Error: {mse}")
#     print(f"R2 Score: {r2}")
#     print(f"Mean Absolute Error: {mae}\n")


In [56]:
# # Initialize models
# models = {
#     "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
#     "Linear Regression": LinearRegression(),
#     "Gradient Boosting": GradientBoostingRegressor(n_estimators=100, random_state=42),
#     "Support Vector Regressor": SVR()
# }

# # Dictionary to store the results
# results = {}

# # Filter out rows with NaN values in the features or target
# non_nan_data = data.dropna(subset=features + ['EstimatedSales'])

# # Separate the features (X) and the target (y)
# X_non_nan = non_nan_data[features]
# y_non_nan = non_nan_data['EstimatedSales']

# # Split the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X_non_nan, y_non_nan, test_size=0.2, random_state=42)

# # Train and evaluate each model
# for model_name, model in models.items():
#     print(f"Training {model_name}...")
#     model.fit(X_train, y_train)
#     y_pred = model.predict(X_test)
#     mse = mean_squared_error(y_test, y_pred)
#     r2 = r2_score(y_test, y_pred)
#     mae = mean_absolute_error(y_test, y_pred)
#     results[model_name] = {
#         "Mean Squared Error": mse,
#         "R2 Score": r2,
#         "Mean Absolute Error": mae
#     }
#     print(f"{model_name} Results:")
#     print(f"Mean Squared Error: {mse}")
#     print(f"R2 Score: {r2}")
#     print(f"Mean Absolute Error: {mae}\n")


In [57]:
# Initialize models
models = {
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "Linear Regression": LinearRegression(),
    "Gradient Boosting": GradientBoostingRegressor(n_estimators=100, random_state=42),
    "Support Vector Regressor": SVR()
}

# Dictionary to store the results
results = {}

# Filter out rows with NaN values in the features or target
non_nan_data = data.dropna(subset=features + ['EstimatedSales'])

# Check if there are any rows left after dropping NaN values
if non_nan_data.empty:
    raise ValueError("No samples available after filtering out NaN values. Check your dataset.")

# Separate the features (X) and the target (y)
X_non_nan = non_nan_data[features]
y_non_nan = non_nan_data['EstimatedSales']

# Check if there are any non-null target values for training
if X_non_nan.empty or y_non_nan.empty:
    raise ValueError("No non-null values in the features or target column 'EstimatedSales'. Cannot train the model.")

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_non_nan, y_non_nan, test_size=0.2, random_state=42)

# Train and evaluate each model
for model_name, model in models.items():
    print(f"Training {model_name}...")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    results[model_name] = {
        "Mean Squared Error": mse,
        "R2 Score": r2,
        "Mean Absolute Error": mae
    }
    print(f"{model_name} Results:")
    print(f"Mean Squared Error: {mse}")
    print(f"R2 Score: {r2}")
    print(f"Mean Absolute Error: {mae}\n")


ValueError: No samples available after filtering out NaN values. Check your dataset.

In [58]:
# Check for NaN values in the dataset
print(data.isna().sum())

# Check for NaN values specifically in the features and target columns
print(data[features + ['EstimatedSales']].isna().sum())


ProductTitle                0
LandingPageURL              0
MasterCategory            793
SubCategory               793
ASIN                      793
Product_description       799
BulletPoints              793
NumberOfBulletPoints        0
NumberOfVariants         1000
ImageLinks                793
NumberofImages              0
AdditionalInformation     793
Price                       0
MRP                       793
Discount                 1000
Rating                      0
Review                      0
AllStartCounts            793
Brand                     793
ProductDetails            793
EstimatedSales              0
dtype: int64
Price                      0
NumberOfVariants        1000
NumberOfBulletPoints       0
NumberofImages             0
Rating                     0
Review                     0
Discount                1000
EstimatedSales             0
dtype: int64


In [59]:
# Display a sample of the dataset
print(data.sample(10))


                                          ProductTitle  \
541  IPL Jersey Gujrat 2024 with Customized Name & ...   
466  Cricket Blue Blue Jersey 2023-2024 Cricket (Ki...   
181  Mumbai Halfsleeve Cricket Jersey 2024 for Boys...   
332  Killer Blue New Cricket Jersey 2023-2024 Crick...   
40   T-Shirt for Men - 100% Cotton Casual T-Shirt |...   
890  Prokick Elite Half Sleeves Cricket T-Shirt Off...   
80   Slim Fit Short Sleeve Red Color Mens T Shirt-K...   
198            Techfit Men’s Round Neck Sports T-Shirt   
505  Soccer Sports RCB Virat Kohli 18 Red Cricket T...   
586  Lucknow Blue Cricket Team Sports Mens Half Sle...   

                                        LandingPageURL MasterCategory  \
541  https://www.amazon.in/YOURJERSEY-Jersey-Gujrat...            NaN   
466  https://www.amazon.in/JUSBALL-Cricket-Blue-Jer...            NaN   
181  https://www.amazon.in/Mumbai-Halfsleeve-Cricke...            NaN   
332  https://www.amazon.in/JUSBALL-Killer-Cricket-J...            NaN

In [60]:
# Fill missing feature values with the median of the respective columns
data[features] = data[features].fillna(data[features].median())

# Fill missing target values if you have a valid strategy for it
# For example, you could drop rows with missing target values
data = data.dropna(subset=['EstimatedSales'])


In [61]:
# Drop rows with missing values in the features or target column
data = data.dropna(subset=features + ['EstimatedSales'])


In [43]:
# Predict the missing values using the best model (choose the best model based on the evaluation metrics)
best_model_name = min(results, key=lambda k: results[k]["Mean Squared Error"])
best_model = models[best_model_name]
X_predict = predict_data[features]
predicted_sales = best_model.predict(X_predict)

# Add the predicted values back to the original dataframe
data.loc[data['EstimatedSales'].isna(), 'EstimatedSales'] = predicted_sales

# Market sizing calculation
market_sizing = data['EstimatedSales'].sum()
print("Estimated Market Sizing:", market_sizing)

# Save the updated dataset with estimated sales
data.to_csv("updated_product_data.csv", index=False)


Series([], Name: EstimatedSales, dtype: object)