In [1]:
import re

import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split


In [2]:
# Load the data from the CSV file into a pandas DataFrame
# (the relative path is resolved against the notebook's working directory)
df = pd.read_csv('Data_Pull.csv')

In [5]:
# Inspect column dtypes and non-null counts before any cleaning
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 581 entries, 0 to 580
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Product Name    581 non-null    object 
 1   MRP             572 non-null    float64
 2   Price           580 non-null    object 
 3   Star Ratings    535 non-null    float64
 4   No. of Ratings  535 non-null    float64
 5   Sales           89 non-null     object 
 6   Brand           581 non-null    object 
dtypes: float64(3), object(4)
memory usage: 31.9+ KB


In [6]:
# Summary statistics, transposed so that each described column becomes a row
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
MRP,572.0,1054.810804,561.714856,199.0,645.5,999.0,1299.0,5649.0
Star Ratings,535.0,4.094579,0.419189,2.7,3.9,4.1,4.3,5.0
No. of Ratings,535.0,519.573832,1445.131463,1.0,12.0,59.0,310.0,12555.0


In [7]:
# Number of missing values in each column of the raw data
df.isnull().sum()

Product Name        0
MRP                 9
Price               1
Star Ratings       46
No. of Ratings     46
Sales             492
Brand               0
dtype: int64

In [8]:
# Cleaning numerical columns: remove commas and convert to float
def clean_numeric_column(column):
    """Convert a Series of numbers, possibly stored as text, to float.

    Parameters
    ----------
    column : pandas.Series
        Either an already-numeric Series, or a Series of strings such as
        ``'1,299'`` that use commas as thousands separators.

    Returns
    -------
    pandas.Series
        The values as floats.

    Raises
    ------
    ValueError
        If a string value is still not a valid number once commas are removed.
    """
    # Numeric input has no separators to strip, and the `.str` accessor below
    # would raise AttributeError on it, so convert it directly.
    if pd.api.types.is_numeric_dtype(column):
        return column.astype(float)
    # regex=False: the comma is a literal character, not a pattern. Being
    # explicit avoids depending on the default, which changed between
    # pandas versions.
    return column.str.replace(',', '', regex=False).astype(float)

# Parse every numeric column. Each column is cast to str first so that
# clean_numeric_column can strip thousands separators through `.str`.
numeric_columns = ['MRP', 'Price', 'Star Ratings', 'No. of Ratings']
for column_name in numeric_columns:
    df[column_name] = clean_numeric_column(df[column_name].astype(str))

# Handling missing values: median for the price columns, mean for the rating columns.
# The result is assigned back to the frame instead of calling
# `df[col].fillna(..., inplace=True)`: that form mutates a temporary Series,
# triggers a chained-assignment warning in pandas 2.2 and leaves `df`
# unchanged under Copy-on-Write (the default from pandas 3).
df['MRP'] = df['MRP'].fillna(df['MRP'].median())
df['Price'] = df['Price'].fillna(df['Price'].median())
df['Star Ratings'] = df['Star Ratings'].fillna(df['Star Ratings'].mean())
df['No. of Ratings'] = df['No. of Ratings'].fillna(df['No. of Ratings'].mean())

In [12]:
# Dataset after cleaning: re-count the missing values in each column
df.isnull().sum()

Product Name        0
MRP                 0
Price               0
Star Ratings        0
No. of Ratings      0
Sales             492
Brand               0
dtype: int64

In [16]:
# Task 1: Extract the brand name
def extract_brand(product_name):
    """Return the first whitespace-delimited word of a product name.

    Parameters
    ----------
    product_name : str
        Full product title.

    Returns
    -------
    str or None
        The first word of ``product_name``, or ``None`` when the value is
        not a string (e.g. NaN) or contains no words.

    Notes
    -----
    Only the first word is kept, so a multi-word brand is truncated
    (for example 'The White Willow ...' yields 'The').
    """
    # Missing names arrive as float NaN, which has no .split()
    if not isinstance(product_name, str):
        return None
    words = product_name.split()
    # An empty or whitespace-only title has no first word to index
    return words[0] if words else None

# Derive the brand from each product name and overwrite the existing Brand column
brand_names = df['Product Name'].map(extract_brand)
df['Brand'] = brand_names
df

Unnamed: 0,Product Name,MRP,Price,Star Ratings,No. of Ratings,Sales,Brand
0,STITCHNEST Unique Cute Elephant Cartoon Blue P...,999.0,379.0,4.400000,4284.000000,1000,STITCHNEST
1,ACN Kohinoor | Macrame Cushion Cover 16 X 16 I...,1199.0,349.0,4.100000,149.000000,300+ bought in past month,ACN
2,Kaahira Handmade Cotton Macrame Cushion Pillow...,1289.0,349.0,4.000000,410.000000,400+ bought in past month,Kaahira
3,AEROHAVEN Premium Set of 5 Geometric Cotton Di...,999.0,486.0,4.200000,239.000000,200+ bought in past month,AEROHAVEN
4,Brick Home Floral Printed Poly Cotton Cushion ...,999.0,549.0,4.200000,902.000000,200+ bought in past month,Brick
...,...,...,...,...,...,...,...
576,The White Willow Memory Foam Decorative Square...,999.0,299.0,5.000000,7.000000,,The
577,SEEVO 400 TC Luxurious Satin Silk Pillow Cover...,898.0,449.0,4.100000,284.000000,,SEEVO
578,VOMZER Premium 300 TC Cotton Printed Pillow Co...,1299.0,229.0,5.000000,1.000000,,VOMZER
579,VAS COLLECTIONS® 105 TC Premium Cotton King Si...,999.0,379.0,4.094579,519.573832,,VAS


In [20]:
# Task 2: Predict monthly sales
# Assuming "bought in past month" can be used as a rough estimate for monthly sales
# Leading count, optional K suffix, then either nothing or "+<anything>".
_SALES_COUNT_PATTERN = re.compile(r'\s*(\d[\d,]*(?:\.\d+)?)\s*([kK]?)\s*(?:\+.*)?$')


def estimate_monthly_sales(sales_info):
    """Turn a raw sales value into an integer monthly-sales estimate.

    Parameters
    ----------
    sales_info : str, number or NaN
        Text such as ``'300+ bought in past month'``, ``'1K+ bought in past
        month'``, ``'1,000'`` or ``'1000'``; a plain number; or a missing value.

    Returns
    -------
    int
        The leading count, with thousands separators removed and a ``K``
        suffix expanded to thousands. Missing values give 0.

    Raises
    ------
    ValueError
        If a string does not start with a recognisable count.
    """
    if pd.isna(sales_info):
        return 0  # Assigning 0 if sales info is NaN
    if isinstance(sales_info, str):
        match = _SALES_COUNT_PATTERN.match(sales_info)
        if match is None:
            raise ValueError(f"Unrecognised sales value: {sales_info!r}")
        digits, suffix = match.groups()
        count = float(digits.replace(',', ''))
        if suffix:
            # '1K+' / '1.5K+' style abbreviations
            count *= 1000
        return int(round(count))
    return int(sales_info)
# Convert the free-text Sales column into a numeric estimate, row by row
df['Monthly Sales Estimate'] = df['Sales'].apply(estimate_monthly_sales)

df

Unnamed: 0,Product Name,MRP,Price,Star Ratings,No. of Ratings,Sales,Brand,Monthly Sales Estimate
0,STITCHNEST Unique Cute Elephant Cartoon Blue P...,999.0,379.0,4.400000,4284.000000,1000,STITCHNEST,1000
1,ACN Kohinoor | Macrame Cushion Cover 16 X 16 I...,1199.0,349.0,4.100000,149.000000,300+ bought in past month,ACN,300
2,Kaahira Handmade Cotton Macrame Cushion Pillow...,1289.0,349.0,4.000000,410.000000,400+ bought in past month,Kaahira,400
3,AEROHAVEN Premium Set of 5 Geometric Cotton Di...,999.0,486.0,4.200000,239.000000,200+ bought in past month,AEROHAVEN,200
4,Brick Home Floral Printed Poly Cotton Cushion ...,999.0,549.0,4.200000,902.000000,200+ bought in past month,Brick,200
...,...,...,...,...,...,...,...,...
576,The White Willow Memory Foam Decorative Square...,999.0,299.0,5.000000,7.000000,,The,0
577,SEEVO 400 TC Luxurious Satin Silk Pillow Cover...,898.0,449.0,4.100000,284.000000,,SEEVO,0
578,VOMZER Premium 300 TC Cotton Printed Pillow Co...,1299.0,229.0,5.000000,1.000000,,VOMZER,0
579,VAS COLLECTIONS® 105 TC Premium Cotton King Si...,999.0,379.0,4.094579,519.573832,,VAS,0


In [22]:
# Predictors: the four numeric product attributes.
# Response: the sales estimate derived from the Sales column.
feature_columns = ['MRP', 'Price', 'Star Ratings', 'No. of Ratings']
features = df[feature_columns]
target = df['Monthly Sales Estimate']

In [23]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

# Fit a linear regression on the training portion
model = LinearRegression()
model.fit(X_train, y_train)


In [24]:
# Make predictions on the held-out test features
# (the evaluation cell that follows repeats this same call)
y_pred = model.predict(X_test)

In [29]:

# Make predictions on the held-out test set
y_pred = model.predict(X_test)

# Evaluate the model.
# RMSE is taken as the square root of MSE rather than via
# mean_squared_error(..., squared=False): that argument was deprecated in
# scikit-learn 1.4 and removed in 1.6, where the call raises TypeError.
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5

print(f'Mean Squared Error: {mse}')
print(f'Root Mean Squared Error: {rmse}')

# Adding predictions back to the dataframe (all rows, train and test alike)
df['Predicted Monthly Sales'] = model.predict(features)

Mean Squared Error: 2240.8798005052945
Root Mean Squared Error: 47.33793194157614




In [30]:
# Score every row with the fitted model and store the result on the frame
all_row_predictions = model.predict(features)
df['Predicted Monthly Sales'] = all_row_predictions

df

Unnamed: 0,Product Name,MRP,Price,Star Ratings,No. of Ratings,Sales,Brand,Monthly Sales Estimate,Predicted Monthly Sales
0,STITCHNEST Unique Cute Elephant Cartoon Blue P...,999.0,379.0,4.400000,4284.000000,1000,STITCHNEST,1000,166.023326
1,ACN Kohinoor | Macrame Cushion Cover 16 X 16 I...,1199.0,349.0,4.100000,149.000000,300+ bought in past month,ACN,300,33.800779
2,Kaahira Handmade Cotton Macrame Cushion Pillow...,1289.0,349.0,4.000000,410.000000,400+ bought in past month,Kaahira,400,43.111406
3,AEROHAVEN Premium Set of 5 Geometric Cotton Di...,999.0,486.0,4.200000,239.000000,200+ bought in past month,AEROHAVEN,200,20.500169
4,Brick Home Floral Printed Poly Cotton Cushion ...,999.0,549.0,4.200000,902.000000,200+ bought in past month,Brick,200,36.291997
...,...,...,...,...,...,...,...,...,...
576,The White Willow Memory Foam Decorative Square...,999.0,299.0,5.000000,7.000000,,The,0,41.237345
577,SEEVO 400 TC Luxurious Satin Silk Pillow Cover...,898.0,449.0,4.100000,284.000000,,SEEVO,0,21.817258
578,VOMZER Premium 300 TC Cotton Printed Pillow Co...,1299.0,229.0,5.000000,1.000000,,VOMZER,0,54.590882
579,VAS COLLECTIONS® 105 TC Premium Cotton King Si...,999.0,379.0,4.094579,519.573832,,VAS,0,38.432708


In [31]:
# Persist the enriched frame as CSV, leaving the row index out of the file
output_csv = 'enhanced_product_data.csv'
df.to_csv(output_csv, index=False)

### Comparing regression models

To compare different machine learning models, we'll train a few common regression algorithms on the same train/test split and evaluate their performance using the same metrics. The models we'll use are:

- Linear Regression
- Decision Tree Regressor
- Random Forest Regressor
- Gradient Boosting Regressor

In [49]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [50]:
# Define the models (fixed seeds keep the tree-based models reproducible)
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree Regressor': DecisionTreeRegressor(random_state=42),
    'Random Forest Regressor': RandomForestRegressor(random_state=42),
    'Gradient Boosting Regressor': GradientBoostingRegressor(random_state=42)
}

# Train and evaluate each model on the same train/test split
results = {}

# The loop variables are deliberately not named `model`: that name already
# holds the LinearRegression fitted in the earlier cells, and rebinding it
# here would make a re-run of those cells silently use a different estimator.
for candidate_name, candidate in models.items():
    # Train the model
    candidate.fit(X_train, y_train)
    # Make predictions
    y_pred = candidate.predict(X_test)
    # Evaluate the model
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    # Square root of MSE instead of mean_squared_error(..., squared=False):
    # the `squared` argument was deprecated in scikit-learn 1.4 and removed in 1.6.
    rmse = mse ** 0.5
    r2 = r2_score(y_test, y_pred)

    # Store the results
    results[candidate_name] = {
        'Mean Squared Error': mse,
        'Mean Absolute Error': mae,
        'Root Mean Squared Error': rmse,
        'R-squared': r2
    }

    print(f"Model: {candidate_name}")
    print(f"Mean Squared Error: {mse}")
    print(f"Mean Absolute Error: {mae}")
    print(f"Root Mean Squared Error: {rmse}")
    print(f"R-squared: {r2}\n")

# Adding predictions of the best model back to the dataframe.
# "Best" means the highest R-squared on the test split.
best_model_name = max(results, key=lambda k: results[k]['R-squared'])
best_model = models[best_model_name]
df['Predicted Monthly Sales'] = best_model.predict(features)

# Save the dataframe to a CSV file
df.to_csv('enhanced_product_data.csv', index=False)

# Display a sample of the dataframe
print(df.head())

# Print the best model
print(f"The best model is: {best_model_name} with R-squared: {results[best_model_name]['R-squared']}")



Model: Linear Regression
Mean Squared Error: 2240.8798005052945
Mean Absolute Error: 28.73546214865584
Root Mean Squared Error: 47.33793194157614
R-squared: 0.2400494589590738

Model: Decision Tree Regressor
Mean Squared Error: 3231.837606837607
Mean Absolute Error: 20.299145299145298
Root Mean Squared Error: 56.84925335338721
R-squared: -0.0960144927536235

Model: Random Forest Regressor
Mean Squared Error: 5541.069948955365
Mean Absolute Error: 28.17663817663818
Root Mean Squared Error: 74.4383634220646
R-squared: -0.8791454609500811

Model: Gradient Boosting Regressor
Mean Squared Error: 4307.360601035125
Mean Absolute Error: 27.58287941159558
Root Mean Squared Error: 65.63048530245015
R-squared: -0.460757073394521

                                        Product Name     MRP  Price  \
0  STITCHNEST Unique Cute Elephant Cartoon Blue P...   999.0  379.0   
1  ACN Kohinoor | Macrame Cushion Cover 16 X 16 I...  1199.0  349.0   
2  Kaahira Handmade Cotton Macrame Cushion Pillow...  1289

