In [28]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [29]:
# Load the raw product/pricing table from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='product_data.csv')

In [30]:
# Show the loaded DataFrame as this cell's output.
df

Unnamed: 0,product_id,product_type,timestamp,actual_price,promotional_price,competitor1,competitor2,competitor3,competitor4,competitor5,competitor6,competitor7,competitor8,competitor9,competitor10
0,product1,electronics,1.718599e+09,1413.86,1224.30,1251.19,1269.74,1310.96,1337.50,1205.94,1266.77,1263.79,1399.98,1284.61,1304.23
1,product2,headphones,1.718573e+09,346.85,299.38,341.17,301.71,,,,,,,,
2,product3,beverages,1.718539e+09,2.47,2.10,2.42,2.16,2.42,2.27,2.44,2.43,2.29,2.40,2.39,2.43
3,product4,beverages,1.718508e+09,1.13,0.96,1.07,1.04,1.03,1.05,1.10,,,,,
4,product5,smartphones,1.718497e+09,617.34,527.41,524.79,582.33,604.97,553.51,546.57,577.31,583.88,529.86,592.11,541.15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,product996,headphones,1.674122e+09,153.78,136.97,137.72,131.69,,,,,,,,
996,product997,smartphones,1.674039e+09,620.15,529.28,564.97,540.74,549.48,557.77,548.65,534.21,532.65,539.18,591.07,542.85
997,product998,headphones,1.674004e+09,194.34,166.98,184.06,181.48,,,,,,,,
998,product999,smartphones,1.673970e+09,1096.65,941.11,1042.84,1042.29,,,,,,,,


In [31]:
## Label Encoding


# Example: Label encoding for product_id and product_type
# label_encoder = LabelEncoder()

# df['product_id'] = label_encoder.fit_transform(df['product_id'])

# # Optionally, you can also encode product_type if it's categorical
# df['product_type'] = label_encoder.fit_transform(df['product_type'])

# # Check the encoded columns
# print(df[['product_id',  'product_type']].head())

In [32]:
# ##  One-Hot Encoding

# one_hot_encoder = OneHotEncoder(sparse=False)

# # Fit-transform the data and add new columns to the DataFrame
# product_type_encoded = one_hot_encoder.fit_transform(df[['product_type']])
# product_type_encoded_df = pd.DataFrame(product_type_encoded, columns=one_hot_encoder.get_feature_names(['product_type']))

# # Concatenate with original DataFrame
# df = pd.concat([df, product_type_encoded_df], axis=1)

# # Drop original categorical column (if needed)
# df.drop(columns=['product_type'], inplace=True)

# # Check the encoded DataFrame
# print(df.head())

In [33]:
# Impute missing competitor prices and derive the model inputs.
#
# Imputation is done per product: every gap in a row is filled with the mean
# of the competitor prices that same row does have. (Two alternatives were
# tried earlier and left disabled: a -1 placeholder, and filling with the
# column means.)
competitor_columns = df.filter(like='competitor').columns

print(competitor_columns)

# Row-wise fill: `row.mean()` skips NaN, so only the observed prices contribute.
competitor_prices_filled = df[competitor_columns].apply(
    lambda row: row.fillna(row.mean()),
    axis=1,
)
df[competitor_columns] = competitor_prices_filled

# Cheapest competitor price per product, reused for both derived columns.
lowest_competitor_price = df[competitor_columns].min(axis=1)

# Gap between our list price and the cheapest competitor, rounded to 2 decimals.
df['max_discount'] = np.round(df['actual_price'] - lowest_competitor_price, 2)

# NOTE(review): 'mmin_price' looks like a typo for 'min_price'. The name is
# kept because the model is fitted on this column and it is written to the
# exported CSV.
df['mmin_price'] = lowest_competitor_price

# Target variable: the promotional price.
target = df['promotional_price']

# Features: everything except the identifiers, the timestamp and the target.
non_feature_columns = ['product_id', 'product_type', 'timestamp', 'promotional_price']
features = df.drop(columns=non_feature_columns)

print(features)

Index(['competitor1', 'competitor2', 'competitor3', 'competitor4',
       'competitor5', 'competitor6', 'competitor7', 'competitor8',
       'competitor9', 'competitor10'],
      dtype='object')
     actual_price  competitor1  competitor2  competitor3  competitor4  \
0         1413.86      1251.19      1269.74     1310.960     1337.500   
1          346.85       341.17       301.71      321.440      321.440   
2            2.47         2.42         2.16        2.420        2.270   
3            1.13         1.07         1.04        1.030        1.050   
4          617.34       524.79       582.33      604.970      553.510   
..            ...          ...          ...          ...          ...   
995        153.78       137.72       131.69      134.705      134.705   
996        620.15       564.97       540.74      549.480      557.770   
997        194.34       184.06       181.48      182.770      182.770   
998       1096.65      1042.84      1042.29     1042.565     1042.565   
99

In [34]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

# Work on copies of each partition. No encoding is applied in this cell;
# the "_encoded" names are kept unchanged.
X_train_encoded, X_test_encoded = X_train.copy(), X_test.copy()

# Fit a 100-tree random forest (seeded) on the training partition.
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train_encoded, y_train)

# Predict the promotional price for the held-out rows.
y_pred = model.predict(X_test_encoded)

In [35]:
# Score the held-out predictions: RMSE is the square root of the mean squared error.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)
print(f"Root Mean Squared Error (RMSE): {rmse}")

Root Mean Squared Error (RMSE): 28.17815439833484


In [36]:
# Make predictions for the entire dataset and export them.
#
# Work on a copy of `df`. A plain `df_predicted = df` only creates a second
# name for the same object, so adding the prediction column would also add it
# to `df`; re-running the feature-engineering cell would then pick up
# `predicted_prime` as a model input.
df_predicted = df.copy()

# NOTE: `features` covers every row, including the rows the model was fitted
# on, so these are not purely out-of-sample predictions.
df_predicted['predicted_prime'] = np.round(model.predict(features), 2)

# Export DataFrame to CSV with the predicted promotional price
file_path_with_predictions = 'products_with_predicted_prime.csv'
df_predicted.to_csv(file_path_with_predictions, index=False)

print(f"Data with predicted prime saved to {file_path_with_predictions}")

Data with predicted prime saved to products_with_predicted_prime.csv


In [49]:
# Serialize the fitted model to 'pricing.pkl'.

import joblib

# `model` is the RandomForestRegressor trained earlier in this notebook.
joblib.dump(value=model, filename='pricing.pkl')

['pricing.pkl']