## Le même début que d'habitude

In [None]:
# Mount Google Drive under /content/drive so the CSV on Drive can be read below.
# NOTE(review): the saved output of this cell is "ValueError: mount failed" —
# re-run and authorise the mount before executing the cells that read from Drive.
from google.colab import drive
drive.mount('/content/drive')

ValueError: mount failed

In [None]:
import sqlite3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Notebook-wide pandas display settings: no row limit, up to 1000 columns,
# and a 1000-character wide text representation.
for option_name, option_value in (
    ('display.max_rows', None),
    ('display.max_columns', 1000),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

In [None]:
# Load the co-purchase CSV from the mounted Google Drive into `df`,
# the frame the cells below clean, aggregate and model.
file_path = '/content/drive/My Drive/co_purchases_version.csv'
df = pd.read_csv(file_path)

In [None]:
# Preview the first rows to check that the CSV was parsed as expected.
df.head()

In [None]:
# Remove the literal "appliances." text from category_code.
# regex=False makes the dot a literal character on every pandas version; before
# pandas 2.0 the default was regex=True, where "." matches any character and
# strings such as "appliancesX" would be stripped as well.
df['category_code'] = df['category_code'].str.replace('appliances.', '', regex=False)

# Display the first few rows to verify the changes
df.head()


In [None]:
# Same clean-up for the co_purchases column: drop every literal "appliances."
# occurrence. regex=False keeps the dot literal regardless of the pandas version
# (the default was regex=True before pandas 2.0).
df['co_purchases'] = df['co_purchases'].str.replace('appliances.', '', regex=False)
df.head(5)

In [None]:
# (rows, columns) of the cleaned frame.
df.shape

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [None]:
# Collapse the rows to one record per (product, hour, weekday, category, brand):
# total_sales is the summed price, num_purchases the count of non-null user_session values.
product_sales = (
    df.groupby(['product_id', 'hour', 'day_of_week', 'category_code', 'brand'])
    .agg(
        total_sales=('price', 'sum'),
        num_purchases=('user_session', 'count'),
    )
    .reset_index()
)


In [None]:
# Preview the aggregated table.
product_sales.head()

In [None]:
# Persist the aggregated per-product sales table to Google Drive (index column omitted).
product_sales.to_csv('/content/drive/MyDrive/product_sales_aggregated.csv', index=False)


## Sales Model Prediction

#### First model

In [None]:
# Target: summed revenue per group. Features: the grouping keys themselves.
y = product_sales['total_sales']
X = product_sales[['product_id', 'hour', 'day_of_week', 'category_code', 'brand']]

# Hold out 20% of the aggregated rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# One-hot encode category_code and brand on each split independently;
# the column sets are reconciled in the next cell.
X_train, X_test = (
    pd.get_dummies(split, columns=['category_code', 'brand'])
    for split in (X_train, X_test)
)


In [None]:
# Ensure train and test sets have the same columns after encoding.
# join='left' keeps exactly the training columns: dummy columns that only exist in
# the test split are dropped, and training columns absent from the test split are
# created there and filled with 0.
X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

In [None]:
# Baseline: a 100-tree random forest with a fixed seed, fitted on the encoded training split.
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [None]:
# Predict on the held-out split and report the mean squared error
# (in squared units of total_sales).
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)

print(f"Mean Squared Error: {mse}")

Mean Squared Error: 153799.1698056103


#### Bad results, so let's try to improve the model

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [None]:

# Ensure train and test sets have the same columns after encoding.
# NOTE(review): the frames were already aligned in an earlier cell, so when the
# notebook is run top to bottom this call should leave them unchanged.
X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

# Untuned forest with a fixed seed; used as the base estimator for the search below.
model = RandomForestRegressor(random_state=42)


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from scipy.stats import randint

In [None]:
# Search space for the forest.
# 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3; for a regressor it
# meant "use all features", which is spelled 1.0 and is accepted by old and new releases.
param_dist = {
    'n_estimators': randint(50, 150),
    'max_features': [1.0, 'sqrt', 'log2'],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': randint(2, 10),
    'min_samples_leaf': randint(1, 5),
    'bootstrap': [True, False]
}

# Sample 20 candidates and score each with 3-fold CV on negative MSE.
# random_state fixes which candidates are drawn, so a rerun reproduces the same search.
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_dist,
    n_iter=20,
    cv=3,
    n_jobs=-1,
    verbose=2,
    scoring='neg_mean_squared_error',
    random_state=42,
)
random_search.fit(X_train, y_train)

# Get the best model
best_model = random_search.best_estimator_

# Predict and evaluate on the held-out split
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)

print(f"Best Parameters: {random_search.best_params_}")
print(f"Mean Squared Error: {mse}")

Fitting 3 folds for each of 20 candidates, totalling 60 fits


  warn(


Best Parameters: {'bootstrap': True, 'max_depth': 20, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 53}
Mean Squared Error: 155580.6190473555


#### Slightly worse result (MSE 155,581 vs. 153,799 for the first model), so next try

In [None]:

# Base estimator with max_features fixed to 'sqrt'. This is a deliberate choice, not
# the RandomForestRegressor default (which uses all features), and it avoids the
# deprecated 'auto' value.
model = RandomForestRegressor(random_state=42, max_features='sqrt')

# Define hyperparameters to tune
param_dist = {
    'n_estimators': randint(50, 150),
    'max_depth': [10, 20, 30, None],
    'min_samples_split': randint(2, 10),
    'min_samples_leaf': randint(1, 5),
    'bootstrap': [True, False]
}

# Sample 20 candidates, 3-fold CV on negative MSE.
# random_state pins the sampled candidates so the reported best_params_ is reproducible.
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_dist,
    n_iter=20,
    cv=3,
    n_jobs=-1,
    verbose=2,
    scoring='neg_mean_squared_error',
    random_state=42,
)
random_search.fit(X_train, y_train)

# Get the best model
best_model = random_search.best_estimator_

# Predict and evaluate
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)

print(f"Best Parameters: {random_search.best_params_}")
print(f"Mean Squared Error: {mse}")


Fitting 3 folds for each of 20 candidates, totalling 60 fits
Best Parameters: {'bootstrap': False, 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 107}
Mean Squared Error: 190344.80028427212


#### Much worse (MSE 190,345), next one

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from scipy.stats import randint


# Create additional features (added to df in place, so they persist for later cells).
# Both are deterministic functions of hour / day_of_week.
df['hour_squared'] = df['hour'] ** 2
df['hour_day_interaction'] = df['hour'] * df['day_of_week']

# Group by product_id, hour, day_of_week, category_code, brand, and new features, and aggregate data
product_sales = df.groupby(['product_id', 'hour', 'day_of_week', 'category_code', 'brand', 'hour_squared', 'hour_day_interaction']).agg({
    'price': 'sum',
    'user_session': 'count'
}).reset_index().rename(columns={'price': 'total_sales', 'user_session': 'num_purchases'})

# Train-test split (80/20, fixed seed)
X = product_sales[['product_id', 'hour', 'day_of_week', 'category_code', 'brand', 'hour_squared', 'hour_day_interaction']]
y = product_sales['total_sales']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Encode categorical features using one-hot encoding
X_train = pd.get_dummies(X_train, columns=['category_code', 'brand'])
X_test = pd.get_dummies(X_test, columns=['category_code', 'brand'])

# Ensure train and test sets have the same columns after encoding
X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

# Base estimator with library-default max_features (nothing is set explicitly here)
model = RandomForestRegressor(random_state=42)

# Define hyperparameters to tune
param_dist = {
    'n_estimators': randint(50, 150),
    'max_depth': [10, 20, 30, None],
    'min_samples_split': randint(2, 10),
    'min_samples_leaf': randint(1, 5),
    'bootstrap': [True, False]
}

# 10 sampled candidates, 3-fold CV on negative MSE.
# random_state fixes the sampled candidates so repeated runs are comparable.
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_dist,
    n_iter=10,
    cv=3,
    n_jobs=-1,
    verbose=2,
    scoring='neg_mean_squared_error',
    random_state=42,
)
random_search.fit(X_train, y_train)

# Get the best model
best_model = random_search.best_estimator_

# Predict and evaluate
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)

print(f"Best Parameters: {random_search.best_params_}")
print(f"Mean Squared Error: {mse}")


Fitting 3 folds for each of 10 candidates, totalling 30 fits


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from scipy.stats import randint

# Derived time features (written into df in place).
df['hour_squared'] = df['hour'] ** 2
df['hour_day_interaction'] = df['hour'] * df['day_of_week']

# Group by product_id, hour, day_of_week, category_code, brand, and new features, and aggregate data
product_sales = df.groupby(['product_id', 'hour', 'day_of_week', 'category_code', 'brand', 'hour_squared', 'hour_day_interaction']).agg({
    'price': 'sum',
    'user_session': 'count'
}).reset_index().rename(columns={'price': 'total_sales', 'user_session': 'num_purchases'})

# Train-test split (80/20, fixed seed)
X = product_sales[['product_id', 'hour', 'day_of_week', 'category_code', 'brand', 'hour_squared', 'hour_day_interaction']]
y = product_sales['total_sales']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Encode categorical features using one-hot encoding
X_train = pd.get_dummies(X_train, columns=['category_code', 'brand'])
X_test = pd.get_dummies(X_test, columns=['category_code', 'brand'])

# Ensure train and test sets have the same columns after encoding
X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

# Base estimator with max_features fixed to 'sqrt' (a choice, not the regressor default)
model = RandomForestRegressor(random_state=42, max_features='sqrt')

# Define hyperparameters to tune
param_dist = {
    'n_estimators': randint(50, 150),
    'max_depth': [10, 20, 30, None],
    'min_samples_split': randint(2, 10),
    'min_samples_leaf': randint(1, 5),
    'bootstrap': [True, False]
}

# 10 sampled candidates, 3-fold CV on negative MSE; random_state makes the draw repeatable.
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_dist,
    n_iter=10,
    cv=3,
    n_jobs=-1,
    verbose=2,
    scoring='neg_mean_squared_error',
    random_state=42,
)
random_search.fit(X_train, y_train)

# fit() either raises or, with the default refit=True, sets best_params_ and
# best_estimator_, so no "was the fit successful" check is needed afterwards.
best_model = random_search.best_estimator_

# Predict and evaluate
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)

print(f"Best Parameters: {random_search.best_params_}")
print(f"Mean Squared Error: {mse}")


Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best Parameters: {'bootstrap': False, 'max_depth': 30, 'min_samples_leaf': 3, 'min_samples_split': 5, 'n_estimators': 92}
Mean Squared Error: 290531.81666190474


## Time and Day Prediction

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from scipy.stats import randint
from itertools import product
import numpy as np


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scipy.stats import randint
from itertools import product
import numpy as np

# Aggregate to one row per (product, hour, weekday, category, brand):
# total_sales is the summed price, num_purchases the count of non-null user_session values.
product_sales = df.groupby(['product_id', 'hour', 'day_of_week', 'category_code', 'brand']).agg({
    'price': 'sum',
    'user_session': 'count'
}).reset_index().rename(columns={'price': 'total_sales', 'user_session': 'num_purchases'})

# Train-test split (80/20, fixed seed)
X = product_sales[['product_id', 'hour', 'day_of_week', 'category_code', 'brand']]
y = product_sales['total_sales']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Encode categorical features using one-hot encoding
X_train = pd.get_dummies(X_train, columns=['category_code', 'brand'])
X_test = pd.get_dummies(X_test, columns=['category_code', 'brand'])

# Ensure train and test sets have the same columns after encoding
X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

# Define the model
model = RandomForestRegressor(random_state=42, max_features='sqrt')

# Define hyperparameters to tune
param_dist = {
    'n_estimators': randint(50, 150),
    'max_depth': [10, 20, 30, None],
    'min_samples_split': randint(2, 10),
    'min_samples_leaf': randint(1, 5),
    'bootstrap': [True, False]
}

# 10 sampled candidates, 3-fold CV on negative MSE.
# random_state pins the sampled candidates; without it every run of this cell
# picks different hyper-parameters and reports different metrics.
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_dist,
    n_iter=10,
    cv=3,
    n_jobs=-1,
    verbose=2,
    scoring='neg_mean_squared_error',
    random_state=42,
)
random_search.fit(X_train, y_train)

# Get the best model
best_model = random_search.best_estimator_

# Predict and evaluate
y_pred = best_model.predict(X_test)

# Calculate evaluation metrics
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Best Parameters: {random_search.best_params_}")
print(f"Mean Squared Error: {mse}")
print(f"Root Mean Squared Error: {rmse}")
print(f"Mean Absolute Error: {mae}")
print(f"R-squared: {r2}")

# Predicting the best hour and day of the week for advertising

# Generate all combinations of product_id, hour, and day_of_week
product_ids = df['product_id'].unique()
hours = range(24)
days = range(7)

# Create a DataFrame for all combinations
combinations = list(product(product_ids, hours, days))
prediction_data = pd.DataFrame(combinations, columns=['product_id', 'hour', 'day_of_week'])

# Add category_code and brand (a product with several category/brand pairs yields several rows)
prediction_data = prediction_data.merge(df[['product_id', 'category_code', 'brand']].drop_duplicates(), on='product_id', how='left')

# Encode categorical features using one-hot encoding
prediction_data_encoded = pd.get_dummies(prediction_data, columns=['category_code', 'brand'])

# Match the training matrix exactly: same columns, same order. Columns the model
# never saw are dropped and training columns missing here are added as 0.
prediction_data_encoded = prediction_data_encoded.reindex(columns=X_train.columns, fill_value=0)

# Predict total sales for each combination
prediction_data['predicted_sales'] = best_model.predict(prediction_data_encoded)

# Sum the predictions over all products per (hour, weekday) slot and rank the slots
best_times = prediction_data.groupby(['hour', 'day_of_week'])['predicted_sales'].sum().reset_index()
best_times = best_times.sort_values(by='predicted_sales', ascending=False)

print(best_times.head())  # Display top times for advertising


Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best Parameters: {'bootstrap': False, 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 7, 'n_estimators': 125}
Mean Squared Error: 166002.4465692399
Root Mean Squared Error: 407.4339781722186
Mean Absolute Error: 179.46817912237657
R-squared: 0.73737367107838
     hour  day_of_week  predicted_sales
104    14            6     1.389052e+06
90     12            6     1.381745e+06
83     11            6     1.379332e+06
76     10            6     1.373844e+06
111    15            6     1.367360e+06


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scipy.stats import randint
from itertools import product
import numpy as np

# Aggregate to one row per (product, hour, weekday, category, brand):
# total_sales is the summed price, num_purchases the count of non-null user_session values.
product_sales = df.groupby(['product_id', 'hour', 'day_of_week', 'category_code', 'brand']).agg({
    'price': 'sum',
    'user_session': 'count'
}).reset_index().rename(columns={'price': 'total_sales', 'user_session': 'num_purchases'})

# Train-test split (80/20, fixed seed)
X = product_sales[['product_id', 'hour', 'day_of_week', 'category_code', 'brand']]
y = product_sales['total_sales']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Encode categorical features using one-hot encoding
X_train = pd.get_dummies(X_train, columns=['category_code', 'brand'])
X_test = pd.get_dummies(X_test, columns=['category_code', 'brand'])

# Ensure train and test sets have the same columns after encoding
X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

# Define the model
model = RandomForestRegressor(random_state=42, max_features='sqrt')

# Define hyperparameters to tune
param_dist = {
    'n_estimators': randint(50, 150),
    'max_depth': [10, 20, 30, None],
    'min_samples_split': randint(2, 10),
    'min_samples_leaf': randint(1, 5),
    'bootstrap': [True, False]
}

# 10 sampled candidates, 3-fold CV on negative MSE.
# random_state makes the candidate draw (and therefore best_model) reproducible.
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_dist,
    n_iter=10,
    cv=3,
    n_jobs=-1,
    verbose=2,
    scoring='neg_mean_squared_error',
    random_state=42,
)
random_search.fit(X_train, y_train)

# Get the best model
best_model = random_search.best_estimator_

# Predict and evaluate
y_pred = best_model.predict(X_test)

# Calculate evaluation metrics
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Best Parameters: {random_search.best_params_}")
print(f"Mean Squared Error: {mse}")
print(f"Root Mean Squared Error: {rmse}")
print(f"Mean Absolute Error: {mae}")
print(f"R-squared: {r2}")

# Function to predict best times for a specific product
def predict_best_times(product_id, top_n=5):
    """Rank the (hour, day_of_week) slots of one product by predicted sales.

    Reads three notebook globals: ``df`` (to look up the product's category_code
    and brand), ``X_train`` (for the exact feature columns and their order) and
    ``best_model`` (the fitted estimator).

    Args:
        product_id: Identifier matched against ``df['product_id']``.
        top_n: Number of top-ranked slots to return (default 5).

    Returns:
        DataFrame with columns ``hour``, ``day_of_week`` and ``predicted_sales``,
        sorted by ``predicted_sales`` in descending order.
    """
    # Every hour (0-23) x weekday (0-6) combination for this product
    prediction_data = pd.DataFrame(
        list(product([product_id], range(24), range(7))),
        columns=['product_id', 'hour', 'day_of_week'],
    )

    # Attach category_code and brand; an id missing from df leaves both as NaN,
    # which get_dummies encodes as all-zero indicator columns.
    product_info = df.loc[
        df['product_id'] == product_id, ['product_id', 'category_code', 'brand']
    ].drop_duplicates()
    prediction_data = prediction_data.merge(product_info, on='product_id', how='left')

    # One-hot encode, then match the training matrix exactly: unseen columns are
    # dropped and training columns missing here are added as 0, in training order.
    prediction_data_encoded = pd.get_dummies(prediction_data, columns=['category_code', 'brand'])
    prediction_data_encoded = prediction_data_encoded.reindex(columns=X_train.columns, fill_value=0)

    prediction_data['predicted_sales'] = best_model.predict(prediction_data_encoded)

    # The sum only matters when the product maps to several category/brand pairs,
    # in which case the merge produced more than one row per slot.
    best_times = (
        prediction_data.groupby(['hour', 'day_of_week'])['predicted_sales']
        .sum()
        .reset_index()
        .sort_values(by='predicted_sales', ascending=False)
    )

    return best_times.head(top_n)

# Example usage: print the top predicted (hour, day_of_week) slots for one product
product_id = 2400054
best_times_for_product = predict_best_times(product_id)
print(best_times_for_product)


Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best Parameters: {'bootstrap': True, 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 7, 'n_estimators': 138}
Mean Squared Error: 167222.92562194576
Root Mean Squared Error: 408.9289982649137
Mean Absolute Error: 182.90536581056992
R-squared: 0.7354427963246505
    hour  day_of_week  predicted_sales
41     5            6       298.453741
68     9            5       292.803466
34     4            6       292.347474
61     8            5       291.707513
76    10            6       289.061389


In [None]:
# Example usage with a second product, reusing the model fitted in the previous cell
product_id = 3200361
best_times_for_product = predict_best_times(product_id)
print(best_times_for_product)

     hour  day_of_week  predicted_sales
48      6            6       413.172460
55      7            6       409.533834
104    14            6       398.943408
118    16            6       387.921330
111    15            6       386.494665


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scipy.stats import randint
from itertools import product
import numpy as np


# Aggregate to one row per (product, hour, weekday, category, brand):
# total_sales is the summed price, num_purchases the count of non-null user_session values.
product_sales = df.groupby(['product_id', 'hour', 'day_of_week', 'category_code', 'brand']).agg({
    'price': 'sum',
    'user_session': 'count'
}).reset_index().rename(columns={'price': 'total_sales', 'user_session': 'num_purchases'})

# Train-test split (80/20, fixed seed)
X = product_sales[['product_id', 'hour', 'day_of_week', 'category_code', 'brand']]
y = product_sales['total_sales']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Encode categorical features using one-hot encoding
X_train = pd.get_dummies(X_train, columns=['category_code', 'brand'])
X_test = pd.get_dummies(X_test, columns=['category_code', 'brand'])

# Ensure train and test sets have the same columns after encoding
X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

# Define the model
model = RandomForestRegressor(random_state=42, max_features='sqrt')

# Define hyperparameters to tune
param_dist = {
    'n_estimators': randint(50, 150),
    'max_depth': [10, 20, 30, None],
    'min_samples_split': randint(2, 10),
    'min_samples_leaf': randint(1, 5),
    'bootstrap': [True, False]
}

# 10 sampled candidates, 3-fold CV on negative MSE.
# Seeding the search keeps best_params_ and the metrics below stable across reruns.
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_dist,
    n_iter=10,
    cv=3,
    n_jobs=-1,
    verbose=2,
    scoring='neg_mean_squared_error',
    random_state=42,
)
random_search.fit(X_train, y_train)

# Get the best model
best_model = random_search.best_estimator_

# Predict and evaluate
y_pred = best_model.predict(X_test)

# Calculate evaluation metrics
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Best Parameters: {random_search.best_params_}")
print(f"Mean Squared Error: {mse}")
print(f"Root Mean Squared Error: {rmse}")
print(f"Mean Absolute Error: {mae}")
print(f"R-squared: {r2}")

# Function to predict the best time for a specific product
def predict_best_time(product_id):
    """Return the single (hour, day_of_week) slot with the highest predicted sales.

    Reads three notebook globals: ``df`` (to look up the product's category_code
    and brand), ``X_train`` (for the exact feature columns and their order) and
    ``best_model`` (the fitted estimator).

    Args:
        product_id: Identifier matched against ``df['product_id']``.

    Returns:
        One-row DataFrame with columns ``hour``, ``day_of_week`` and
        ``predicted_sales``.
    """
    # Every hour (0-23) x weekday (0-6) combination for this product
    prediction_data = pd.DataFrame(
        list(product([product_id], range(24), range(7))),
        columns=['product_id', 'hour', 'day_of_week'],
    )

    # Attach category_code and brand; an id missing from df leaves both as NaN,
    # which get_dummies encodes as all-zero indicator columns.
    product_info = df.loc[
        df['product_id'] == product_id, ['product_id', 'category_code', 'brand']
    ].drop_duplicates()
    prediction_data = prediction_data.merge(product_info, on='product_id', how='left')

    # One-hot encode, then match the training matrix exactly: unseen columns are
    # dropped and training columns missing here are added as 0, in training order.
    prediction_data_encoded = pd.get_dummies(prediction_data, columns=['category_code', 'brand'])
    prediction_data_encoded = prediction_data_encoded.reindex(columns=X_train.columns, fill_value=0)

    prediction_data['predicted_sales'] = best_model.predict(prediction_data_encoded)

    # Sum per slot (collapses duplicate rows created by the merge), then keep the best one
    best_time = prediction_data.groupby(['hour', 'day_of_week'])['predicted_sales'].sum().reset_index()
    best_time = best_time.sort_values(by='predicted_sales', ascending=False).head(1)

    return best_time

# Example usage: print the single best (hour, day_of_week) slot for one product
product_id = 2400054
best_time_for_product = predict_best_time(product_id)
print(best_time_for_product)


Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best Parameters: {'bootstrap': True, 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 109}
Mean Squared Error: 166650.31175487785
Root Mean Squared Error: 408.2282593781056
Mean Absolute Error: 182.97054720761324
R-squared: 0.7363487075380429
    hour  day_of_week  predicted_sales
41     5            6       287.209258


In [None]:
# Same lookup for a second product, reusing the model fitted in the previous cell
product_id = 3200361
best_time_for_product = predict_best_time(product_id)
print(best_time_for_product)

     hour  day_of_week  predicted_sales
118    16            6       673.623529


In [None]:
import pickle

# Bundle the tuned estimator together with the training column index
# and pickle the bundle into the working directory.
model_data = dict(model=best_model, columns=X_train.columns)

with open('best_time_model.pkl', mode='wb') as f:
    pickle.dump(model_data, f)


In [None]:
import pickle

# Repeats the save from the previous cell: the tuned estimator and the training
# column index are bundled into one dict and pickled to best_time_model.pkl.
model_data = {
    'model': best_model,
    'columns': X_train.columns
}

with open('best_time_model.pkl', 'wb') as f:
    pickle.dump(model_data, f)

# Copy the pickle to Google Drive so it is still available after this runtime is gone.
# drive.mount() reports "already mounted" when the Drive is mounted from an earlier cell.
from google.colab import drive
drive.mount('/content/drive')
!cp best_time_model.pkl /content/drive/MyDrive/best_time_model.pkl


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
import pickle

# Retrain a decision-tree model from a CSV uploaded through the Colab file picker.
# NOTE(review): this cell rebinds X_train, y_train and model, and overwrites
# best_time_model.pkl (saved earlier as a {'model', 'columns'} dict) with a bare
# estimator — confirm that is intended before running it.
from google.colab import files

# files.upload() opens the picker, stores the chosen files in the working directory
# and returns a dict keyed by file name. The original code took the name from
# model_data.keys(), i.e. the string 'model', which is not a CSV path.
uploaded = files.upload()
if not uploaded:
    raise ValueError("No training file was uploaded.")
train_data_path = next(iter(uploaded))  # Get the uploaded file name
train_data = pd.read_csv(train_data_path)

# Define your features and target
X_train = train_data[['day_of_week', 'hour', 'total_sales', 'num_purchases']]
y_train = train_data['target_column']  # TODO: placeholder — update with your target column name

# Train the model
model = DecisionTreeRegressor()
model.fit(X_train, y_train)

# Save the trained model
model_path = "best_time_model.pkl"
with open(model_path, 'wb') as file:
    pickle.dump(model, file)

print("Model retrained and saved successfully.")

# Download the model file
files.download(model_path)


#### Retrain the model for Power BI

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scipy.stats import randint
from itertools import product
import numpy as np
import pickle
from google.colab import files

# Requires the cleaned frame `df` from the cells above; product_sales is rebuilt
# from it here, so it does not need to exist beforehand.

# Group by product_id, hour, day_of_week, category_code, and brand, and aggregate data
product_sales = df.groupby(['product_id', 'hour', 'day_of_week', 'category_code', 'brand']).agg({
    'price': 'sum',
    'user_session': 'count'
}).reset_index().rename(columns={'price': 'total_sales', 'user_session': 'num_purchases'})

# Train-test split (80/20, fixed seed)
X = product_sales[['product_id', 'hour', 'day_of_week', 'category_code', 'brand']]
y = product_sales['total_sales']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Encode categorical features using one-hot encoding
X_train = pd.get_dummies(X_train, columns=['category_code', 'brand'])
X_test = pd.get_dummies(X_test, columns=['category_code', 'brand'])

# Ensure train and test sets have the same columns after encoding
X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

# Define the model
model = RandomForestRegressor(random_state=42, max_features='sqrt')

# Define hyperparameters to tune
param_dist = {
    'n_estimators': randint(50, 150),
    'max_depth': [10, 20, 30, None],
    'min_samples_split': randint(2, 10),
    'min_samples_leaf': randint(1, 5),
    'bootstrap': [True, False]
}

# 10 sampled candidates, 3-fold CV on negative MSE.
# The exported model should be reproducible, so the candidate draw is seeded.
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_dist,
    n_iter=10,
    cv=3,
    n_jobs=-1,
    verbose=2,
    scoring='neg_mean_squared_error',
    random_state=42,
)
random_search.fit(X_train, y_train)

# Get the best model
best_model = random_search.best_estimator_

# Predict and evaluate
y_pred = best_model.predict(X_test)

# Calculate evaluation metrics
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Best Parameters: {random_search.best_params_}")
print(f"Mean Squared Error: {mse}")
print(f"Root Mean Squared Error: {rmse}")
print(f"Mean Absolute Error: {mae}")
print(f"R-squared: {r2}")

# Function to predict the best time for a specific product
def predict_best_time(product_id):
    """Return the single (hour, day_of_week) slot with the highest predicted sales.

    Reads three notebook globals: ``product_sales`` (to look up the product's
    category_code and brand), ``X_train`` (for the exact feature columns and their
    order) and ``best_model`` (the fitted estimator).

    Args:
        product_id: Identifier matched against ``product_sales['product_id']``.

    Returns:
        One-row DataFrame with columns ``hour``, ``day_of_week`` and
        ``predicted_sales``.
    """
    # Every hour (0-23) x weekday (0-6) combination for this product
    prediction_data = pd.DataFrame(
        list(product([product_id], range(24), range(7))),
        columns=['product_id', 'hour', 'day_of_week'],
    )

    # Attach category_code and brand from the aggregated table; an unknown id leaves
    # both as NaN, which get_dummies encodes as all-zero indicator columns.
    product_info = product_sales.loc[
        product_sales['product_id'] == product_id, ['product_id', 'category_code', 'brand']
    ].drop_duplicates()
    prediction_data = prediction_data.merge(product_info, on='product_id', how='left')

    # One-hot encode, then match the training matrix exactly: unseen columns are
    # dropped and training columns missing here are added as 0, in training order.
    prediction_data_encoded = pd.get_dummies(prediction_data, columns=['category_code', 'brand'])
    prediction_data_encoded = prediction_data_encoded.reindex(columns=X_train.columns, fill_value=0)

    prediction_data['predicted_sales'] = best_model.predict(prediction_data_encoded)

    # Sum per slot (collapses duplicate rows created by the merge), then keep the best one
    best_time = prediction_data.groupby(['hour', 'day_of_week'])['predicted_sales'].sum().reset_index()
    best_time = best_time.sort_values(by='predicted_sales', ascending=False).head(1)

    return best_time

# Example usage: print the best slot for two sample products
for product_id in (2400054, 3200361):
    best_time_for_product = predict_best_time(product_id)
    print(best_time_for_product)

# Pickle the tuned estimator on its own (no column list) into the working directory
model_path = "best_time_model1.pkl"
with open(model_path, mode='wb') as file:
    pickle.dump(best_model, file)

print("Model retrained and saved successfully.")

# Trigger a browser download of the pickle from the Colab runtime
files.download(model_path)


Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best Parameters: {'bootstrap': False, 'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 8, 'n_estimators': 106}
Mean Squared Error: 218536.00303569488
Root Mean Squared Error: 467.4783449911823
Mean Absolute Error: 213.7531021960665
R-squared: 0.6542622750410506
    hour  day_of_week  predicted_sales
97    13            6       237.616315
     hour  day_of_week  predicted_sales
104    14            6       337.434297
Model retrained and saved successfully.


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scipy.stats import randint
from itertools import product
import numpy as np
import pickle

# Assuming your DataFrame is named 'df' and is already loaded
# If it's not loaded, you can uncomment the following line to load it from a CSV
# df = pd.read_csv('your_data.csv')

# Group by product_id, hour, day_of_week, category_code, and brand, and aggregate data:
# total_sales is the summed price and num_purchases the count of user_session values per group.
# NOTE(review): groupby drops rows whose key is NaN by default, so any purchase with a missing
# category_code or brand never reaches product_sales — confirm this is intended.
product_sales = df.groupby(['product_id', 'hour', 'day_of_week', 'category_code', 'brand']).agg({
    'price': 'sum',
    'user_session': 'count'
}).reset_index().rename(columns={'price': 'total_sales', 'user_session': 'num_purchases'})

# Train-test split (80/20, seeded so the split itself is reproducible)
X = product_sales[['product_id', 'hour', 'day_of_week', 'category_code', 'brand']]
y = product_sales['total_sales']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Encode categorical features using one-hot encoding
X_train = pd.get_dummies(X_train, columns=['category_code', 'brand'])
X_test = pd.get_dummies(X_test, columns=['category_code', 'brand'])

# Ensure train and test sets have the same columns after encoding:
# the left join keeps exactly the training columns and zero-fills those absent from the test split.
X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

# Define the model
model = RandomForestRegressor(random_state=42, max_features='sqrt')

# Define hyperparameters to tune
param_dist = {
    'n_estimators': randint(50, 150),
    'max_depth': [10, 20, 30, None],
    'min_samples_split': randint(2, 10),
    'min_samples_leaf': randint(1, 5),
    'bootstrap': [True, False]
}

# Use RandomizedSearchCV to find the best hyperparameters.
# random_state pins the 10 sampled candidates; without it every run of this cell draws
# different candidates and reports different "best" parameters and metrics.
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_dist,
    n_iter=10,
    cv=3,
    n_jobs=-1,
    verbose=2,
    scoring='neg_mean_squared_error',
    random_state=42,
)
random_search.fit(X_train, y_train)

# Get the best model (refit on the full training split by RandomizedSearchCV)
best_model = random_search.best_estimator_

# Predict and evaluate on the held-out split
y_pred = best_model.predict(X_test)

# Calculate evaluation metrics
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Best Parameters: {random_search.best_params_}")
print(f"Mean Squared Error: {mse}")
print(f"Root Mean Squared Error: {rmse}")
print(f"Mean Absolute Error: {mae}")
print(f"R-squared: {r2}")

# Function to predict the best time for a specific product
def predict_best_time(product_id):
    """Return the (hour, day_of_week) slot with the highest predicted sales for a product.

    Relies on three notebook-level names defined earlier in this cell:
    ``product_sales`` (aggregated sales frame), ``X_train`` (encoded training
    frame, used only for its column labels and order) and ``best_model``
    (fitted estimator).

    Parameters
    ----------
    product_id
        Identifier matched against ``product_sales['product_id']``.

    Returns
    -------
    pandas.DataFrame
        One-row frame with columns ``hour``, ``day_of_week`` and
        ``predicted_sales`` for the top-ranked slot.
    """
    # Generate all combinations of hour (0-23) and day_of_week (0-6) for the product
    combinations = list(product([product_id], range(24), range(7)))
    prediction_data = pd.DataFrame(combinations, columns=['product_id', 'hour', 'day_of_week'])

    # Add category_code and brand as recorded for this product in product_sales
    product_info = product_sales[product_sales['product_id'] == product_id][['product_id', 'category_code', 'brand']].drop_duplicates()
    prediction_data = prediction_data.merge(product_info, on='product_id', how='left')

    # Encode categorical features using one-hot encoding
    prediction_data_encoded = pd.get_dummies(prediction_data, columns=['category_code', 'brand'])

    # Match the training layout in one step: reindex adds every training column that is
    # missing here (filled with 0), drops columns the model never saw, and fixes the order.
    prediction_data_encoded = prediction_data_encoded.reindex(columns=X_train.columns, fill_value=0)

    # Predict total sales for each combination
    prediction_data['predicted_sales'] = best_model.predict(prediction_data_encoded)

    # Find the best hour and day for advertising: sum predictions per slot, keep the largest
    best_time = prediction_data.groupby(['hour', 'day_of_week'])['predicted_sales'].sum().reset_index()
    best_time = best_time.sort_values(by='predicted_sales', ascending=False).head(1)

    return best_time

# Example usage: print the top-ranked (hour, day_of_week) row for two sample products.
for product_id in (2400054, 3200361):
    best_time_for_product = predict_best_time(product_id)
    print(best_time_for_product)

# Serialize the tuned estimator to a pickle file in the working directory.
model_path = "best_time_model2.pkl"
with open(model_path, 'wb') as file:
    pickle.dump(best_model, file)

print("Model retrained and saved successfully.")


Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best Parameters: {'bootstrap': True, 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 135}
Mean Squared Error: 165441.5405769618
Root Mean Squared Error: 406.74505599572046
Mean Absolute Error: 182.6071249224832
R-squared: 0.738261059696238
    hour  day_of_week  predicted_sales
41     5            6       295.128416
     hour  day_of_week  predicted_sales
118    16            6       516.058523
Model retrained and saved successfully.


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scipy.stats import randint
from itertools import product
import numpy as np
import pickle
from google.colab import files

# Assumes the raw purchases DataFrame `df` is already loaded by an earlier cell.

# Group by product_id, hour, day_of_week, category_code, and brand, and aggregate data:
# total_sales is the summed price and num_purchases the count of user_session values per group.
# NOTE(review): groupby drops rows whose key is NaN by default, so any purchase with a missing
# category_code or brand never reaches product_sales — confirm this is intended.
product_sales = df.groupby(['product_id', 'hour', 'day_of_week', 'category_code', 'brand']).agg({
    'price': 'sum',
    'user_session': 'count'
}).reset_index().rename(columns={'price': 'total_sales', 'user_session': 'num_purchases'})

# Train-test split (80/20, seeded so the split itself is reproducible)
X = product_sales[['product_id', 'hour', 'day_of_week', 'category_code', 'brand']]
y = product_sales['total_sales']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Encode categorical features using one-hot encoding
X_train = pd.get_dummies(X_train, columns=['category_code', 'brand'])
X_test = pd.get_dummies(X_test, columns=['category_code', 'brand'])

# Ensure train and test sets have the same columns after encoding:
# the left join keeps exactly the training columns and zero-fills those absent from the test split.
X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

# Define the model
model = RandomForestRegressor(random_state=42, max_features='sqrt')

# Define hyperparameters to tune
param_dist = {
    'n_estimators': randint(50, 150),
    'max_depth': [10, 20, 30, None],
    'min_samples_split': randint(2, 10),
    'min_samples_leaf': randint(1, 5),
    'bootstrap': [True, False]
}

# Use RandomizedSearchCV to find the best hyperparameters.
# random_state pins the 10 sampled candidates; without it every run of this cell draws
# different candidates and reports different "best" parameters and metrics.
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_dist,
    n_iter=10,
    cv=3,
    n_jobs=-1,
    verbose=2,
    scoring='neg_mean_squared_error',
    random_state=42,
)
random_search.fit(X_train, y_train)

# Get the best model (refit on the full training split by RandomizedSearchCV)
best_model = random_search.best_estimator_

# Predict and evaluate on the held-out split
y_pred = best_model.predict(X_test)

# Calculate evaluation metrics
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Best Parameters: {random_search.best_params_}")
print(f"Mean Squared Error: {mse}")
print(f"Root Mean Squared Error: {rmse}")
print(f"Mean Absolute Error: {mae}")
print(f"R-squared: {r2}")

# Save the trained model
model_path = "best_time_model2.pkl"
with open(model_path, 'wb') as file:
    pickle.dump(best_model, file)

print("Model retrained and saved successfully.")

# Download the model file through the Colab `files` helper
files.download(model_path)


Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best Parameters: {'bootstrap': False, 'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 6, 'n_estimators': 76}
Mean Squared Error: 176926.16159432573
Root Mean Squared Error: 420.6259164558524
Mean Absolute Error: 192.8426684459754
R-squared: 0.7200916656952386
Model retrained and saved successfully.


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import sklearn
import numpy
# Report the installed versions of the two libraries imported above.
for _label, _package in (("scikit-learn version:", sklearn), ("numpy version:", numpy)):
    print(_label, _package.__version__)


scikit-learn version: 1.2.2
numpy version: 1.25.2


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from itertools import product
import pickle


# Collapse raw purchases into one row per product / hour / weekday / category / brand:
# the summed price becomes total_sales, the count of user_session values num_purchases.
sales_keys = ['product_id', 'hour', 'day_of_week', 'category_code', 'brand']
product_sales = (
    df.groupby(sales_keys)
    .agg({'price': 'sum', 'user_session': 'count'})
    .reset_index()
    .rename(columns={'price': 'total_sales', 'user_session': 'num_purchases'})
)

# Features are the grouping keys; the target is the aggregated revenue. 80/20 seeded split.
X = product_sales[sales_keys]
y = product_sales['total_sales']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# One-hot encode the two categorical columns on each split independently ...
dummy_cols = ['category_code', 'brand']
X_train = pd.get_dummies(X_train, columns=dummy_cols)
X_test = pd.get_dummies(X_test, columns=dummy_cols)

# ... then force the test split onto the training column layout (zero-fill what is missing).
X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

# Build and fit the forest in one expression (fit returns the estimator itself).
model = RandomForestRegressor(random_state=42, max_features='sqrt').fit(X_train, y_train)

# Persist the fitted model as a pickle in the working directory.
model_path = 'best_time_model_precomputed.pkl'
with open(model_path, 'wb') as file:
    pickle.dump(model, file)

# Function to predict the best time for a specific product
def predict_best_time(product_id, df, model, X_train):
    """Return the hour/weekday slot with the highest predicted sales for one product.

    Parameters
    ----------
    product_id
        Identifier matched against ``df['product_id']``.
    df : pandas.DataFrame
        Frame with ``product_id``, ``category_code`` and ``brand`` columns.
    model
        Fitted estimator exposing ``predict``.
    X_train : pandas.DataFrame
        Encoded training frame; only its column labels and their order are used.

    Returns
    -------
    pandas.Series
        The row of the prediction grid with the largest ``predicted_sales``
        (fields: ``product_id``, ``hour``, ``day_of_week``, ``category_code``,
        ``brand``, ``predicted_sales``).
    """
    # Filter data for the specific product
    product_info = df[df['product_id'] == product_id][['product_id', 'category_code', 'brand']].drop_duplicates()

    # Generate all combinations of hour (0-23) and day_of_week (0-6) for the specific product
    combinations = list(product([product_id], range(24), range(7)))

    # Create a DataFrame for all combinations
    prediction_data = pd.DataFrame(combinations, columns=['product_id', 'hour', 'day_of_week'])

    # Merge with product_info to get category_code and brand
    prediction_data = prediction_data.merge(product_info, on='product_id', how='left')

    # Encode categorical features using one-hot encoding
    prediction_data_encoded = pd.get_dummies(prediction_data, columns=['category_code', 'brand'])

    # Match the training layout in a single reindex: missing training columns are added
    # (filled with 0), unseen columns are dropped, and the order follows X_train.
    # Inserting the columns one at a time fragments the frame and makes pandas emit a
    # PerformanceWarning per insert, which floods the cell output.
    prediction_data_encoded = prediction_data_encoded.reindex(columns=X_train.columns, fill_value=0)

    # Predict total sales for each combination
    prediction_data['predicted_sales'] = model.predict(prediction_data_encoded)

    # Find the best hour and day for advertising (first row holding the maximum prediction)
    best_time = prediction_data.loc[prediction_data['predicted_sales'].idxmax()]

    return best_time

# Score every distinct product once and collect one summary record per product.
all_best_times = []

for product_id in pd.unique(df['product_id']):
    best_time_for_product = predict_best_time(product_id, df, model, X_train)
    all_best_times.append(dict(
        product_id=product_id,
        best_hour=best_time_for_product['hour'],
        best_day=best_time_for_product['day_of_week'],
        predicted_sales=best_time_for_product['predicted_sales'],
    ))

# Turn the records into a frame and write it to the working directory without the row index.
precomputed_results = pd.DataFrame(all_best_times)
precomputed_results.to_csv('best_times_precomputed.csv', index=False)

print("Precomputation complete. Results saved to best_times_precomputed.csv.")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  prediction_data_encoded[col] = 0
  prediction_data_encoded[col] = 0
  prediction_data_encoded[col] = 0
  prediction_data_encoded[col] = 0
  prediction_data_encoded[col] = 0
  prediction_data_encoded[col] = 0
  prediction_data_encoded[col] = 0
  prediction_data_encoded[col] = 0
  prediction_data_encoded[col] = 0
  prediction_data_encoded[col] = 0
  prediction_data_encoded[col] = 0
  prediction_data_encoded[col] = 0
  prediction_data_encoded[col] = 0
  prediction_data_encoded[col] = 0
  prediction_data_encoded[col] = 0
  prediction_data_encoded[col] = 0
  prediction_data_encoded[col] = 0
  prediction_data_encoded[col] = 0
  prediction_data_encoded[col] = 0
  prediction_data_encoded[col] = 0
  prediction_data_encoded[col] = 0
  prediction_data_encoded[col] = 0
  prediction_data_encoded[col] = 0
  prediction_data_encoded[col] = 0
  prediction_data_encoded[col] = 0
  prediction_data_encoded[col] = 0
  prediction_data_encoded

Precomputation complete. Results saved to best_times_precomputed.csv.


  prediction_data_encoded[col] = 0
  prediction_data_encoded[col] = 0
  prediction_data_encoded[col] = 0
  prediction_data_encoded[col] = 0
  prediction_data_encoded[col] = 0
  prediction_data_encoded[col] = 0
  prediction_data_encoded[col] = 0
  prediction_data_encoded[col] = 0
  prediction_data_encoded[col] = 0
  prediction_data_encoded[col] = 0
  prediction_data_encoded[col] = 0
  prediction_data_encoded[col] = 0
  prediction_data_encoded[col] = 0
  prediction_data_encoded[col] = 0
  prediction_data_encoded[col] = 0
  prediction_data_encoded[col] = 0
  prediction_data_encoded[col] = 0
  prediction_data_encoded[col] = 0
  prediction_data_encoded[col] = 0
  prediction_data_encoded[col] = 0
  prediction_data_encoded[col] = 0
  prediction_data_encoded[col] = 0
  prediction_data_encoded[col] = 0


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from itertools import product
import pickle

# Collapse raw purchases into one row per product / hour / weekday / category / brand:
# the summed price becomes total_sales, the count of user_session values num_purchases.
sales_keys = ['product_id', 'hour', 'day_of_week', 'category_code', 'brand']
product_sales = (
    df.groupby(sales_keys)
    .agg({'price': 'sum', 'user_session': 'count'})
    .reset_index()
    .rename(columns={'price': 'total_sales', 'user_session': 'num_purchases'})
)

# Features are the grouping keys; the target is the aggregated revenue. 80/20 seeded split.
X = product_sales[sales_keys]
y = product_sales['total_sales']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# One-hot encode the two categorical columns on each split independently ...
dummy_cols = ['category_code', 'brand']
X_train = pd.get_dummies(X_train, columns=dummy_cols)
X_test = pd.get_dummies(X_test, columns=dummy_cols)

# ... then force the test split onto the training column layout (zero-fill what is missing).
X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

# Build and fit the forest in one expression (fit returns the estimator itself).
model = RandomForestRegressor(random_state=42, max_features='sqrt').fit(X_train, y_train)

# Persist the fitted model as a pickle in the working directory.
model_path = "best_time_model2.pkl"
with open(model_path, 'wb') as file:
    pickle.dump(model, file)

print("Model retrained and saved successfully.")

def predict_best_time(product_id, df, model, X_train):
    """Find the hour/weekday slot with the highest predicted sales for one product.

    Args:
        product_id: Identifier matched against ``df['product_id']``.
        df: Frame holding ``product_id``, ``category_code`` and ``brand`` columns.
        model: Fitted estimator exposing ``predict``.
        X_train: Encoded training frame; only its column labels and order are used.

    Returns:
        Tuple ``(hour, day_of_week, predicted_sales)`` of the best-scoring slot.
    """
    # Distinct (product_id, category_code, brand) rows recorded for this product.
    is_target = df['product_id'] == product_id
    attributes = df.loc[is_target, ['product_id', 'category_code', 'brand']].drop_duplicates()

    # Every hour (0-23) x weekday (0-6) slot, with the hour varying slowest.
    slots = [(product_id, hour, day) for hour in range(24) for day in range(7)]
    grid = pd.DataFrame(slots, columns=['product_id', 'hour', 'day_of_week'])
    grid = pd.merge(grid, attributes, how='left', on='product_id')

    # One-hot encode the categorical attributes.
    features = pd.get_dummies(grid, columns=['category_code', 'brand'])

    # Zero-fill every training column the grid lacks, then adopt the training column order.
    present = set(features.columns)
    absent = [col for col in dict.fromkeys(X_train.columns) if col not in present]
    padding = pd.DataFrame(0, index=features.index, columns=absent)
    features = pd.concat([features, padding], axis=1)[X_train.columns]

    # Score each slot and pick the first row carrying the maximum prediction.
    grid['predicted_sales'] = model.predict(features)
    winner = grid.loc[grid['predicted_sales'].idxmax()]

    return winner['hour'], winner['day_of_week'], winner['predicted_sales']

# Read back the estimator pickled earlier in this cell.
with open("best_time_model2.pkl", 'rb') as file:
    model = pickle.load(file)

# Accumulator for one summary record per product.
results = []

# Distinct product ids, in order of first appearance in df.
product_ids = df['product_id'].unique()

# Score every product and record its best slot together with the predicted sales.
for product_id in product_ids:
    best_hour, best_day, predicted_sales = predict_best_time(product_id, df, model, X_train)
    results.append(dict(
        product_id=product_id,
        best_hour=best_hour,
        best_day=best_day,
        predicted_sales=predicted_sales,
    ))

# Assemble the records into a frame.
results_df = pd.DataFrame(results)


Model retrained and saved successfully.


In [None]:
# Preview the per-product results (at most the first 400 rows).
results_df.head(400)

Unnamed: 0,product_id,best_hour,best_day,predicted_sales
0,6200687,3,6,93.8377
1,3200361,16,6,558.4485
2,3100152,14,6,180.5777
3,3601485,6,1,7844.8751
4,3700127,11,6,3815.1787
5,3601244,10,4,3891.9521
6,2800403,5,6,511.1461
7,3900045,15,4,280.8151
8,2700920,12,5,1442.1368
9,3601405,5,6,5762.8664


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from itertools import product
import pickle

# Collapse raw purchases into one row per product / hour / weekday / category / brand:
# the summed price becomes total_sales, the count of user_session values num_purchases.
sales_keys = ['product_id', 'hour', 'day_of_week', 'category_code', 'brand']
product_sales = (
    df.groupby(sales_keys)
    .agg({'price': 'sum', 'user_session': 'count'})
    .reset_index()
    .rename(columns={'price': 'total_sales', 'user_session': 'num_purchases'})
)

# Features are the grouping keys; the target is the aggregated revenue. 80/20 seeded split.
X = product_sales[sales_keys]
y = product_sales['total_sales']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# One-hot encode the two categorical columns on each split independently ...
dummy_cols = ['category_code', 'brand']
X_train = pd.get_dummies(X_train, columns=dummy_cols)
X_test = pd.get_dummies(X_test, columns=dummy_cols)

# ... then force the test split onto the training column layout (zero-fill what is missing).
X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

# Build and fit the forest in one expression (fit returns the estimator itself).
model = RandomForestRegressor(random_state=42, max_features='sqrt').fit(X_train, y_train)

# Persist the fitted model as a pickle in the working directory.
model_path = "best_time_model2.pkl"
with open(model_path, 'wb') as file:
    pickle.dump(model, file)

print("Model retrained and saved successfully.")

def predict_best_time(product_id, df, model, X_train):
    """Pick the best-selling hour and weekday for a product according to ``model``.

    Args:
        product_id: Identifier looked up in ``df['product_id']``.
        df: Frame providing ``product_id``, ``category_code`` and ``brand``.
        model: Fitted estimator with a ``predict`` method.
        X_train: Encoded training frame whose column labels define the feature layout.

    Returns:
        ``(hour, day_of_week, predicted_sales)`` taken from the highest-scoring slot.
    """
    wanted = ['product_id', 'category_code', 'brand']
    rows_for_product = df.loc[df['product_id'] == product_id]
    product_info = rows_for_product[wanted].drop_duplicates()

    # 24 hours x 7 weekdays for this one product, merged with its category and brand.
    slot_tuples = list(product([product_id], range(24), range(7)))
    candidates = pd.DataFrame(slot_tuples, columns=['product_id', 'hour', 'day_of_week'])
    candidates = candidates.merge(product_info, how='left', on='product_id')

    encoded = pd.get_dummies(candidates, columns=['category_code', 'brand'])

    # Append an all-zero block for the training columns that encoding did not produce,
    # then select the columns in training order.
    already_there = set(encoded.columns)
    to_add = [label for label in dict.fromkeys(X_train.columns) if label not in already_there]
    zero_block = pd.DataFrame(0, index=encoded.index, columns=to_add)
    encoded = pd.concat([encoded, zero_block], axis=1)
    encoded = encoded[X_train.columns]

    candidates['predicted_sales'] = model.predict(encoded)

    # idxmax gives the first row that reaches the maximum prediction.
    top_row = candidates.loc[candidates['predicted_sales'].idxmax()]
    return top_row['hour'], top_row['day_of_week'], top_row['predicted_sales']

# Load the model written by the training step earlier in this cell.
# NOTE: pickle.load executes arbitrary code from the file; only load pickles you created yourself.
with open("best_time_model2.pkl", 'rb') as file:
    model = pickle.load(file)

# Prepare to save the results
results = []

# Get unique product_ids
product_ids = df['product_id'].unique()

# First row recorded for each product, indexed by product_id. Built once so the loop does
# a keyed lookup instead of re-filtering the whole of df three times per product.
product_lookup = df.drop_duplicates(subset='product_id').set_index('product_id')

# Compute the best time for each product_id
for product_id in product_ids:
    best_hour, best_day, predicted_sales = predict_best_time(product_id, df, model, X_train)
    category_code = product_lookup.at[product_id, 'category_code']
    brand = product_lookup.at[product_id, 'brand']
    price = product_lookup.at[product_id, 'price']  # Price from the product's first row in df
    results.append({
        'product_id': product_id,
        'best_hour': best_hour,
        'best_day': best_day,
        'predicted_sales': predicted_sales,
        'category_code': category_code,
        'brand': brand,
        'price': price
    })

# Convert to DataFrame
results_df = pd.DataFrame(results)

# Save the results DataFrame to a CSV file (working directory, no index column)
results_df.to_csv('best_time_predictions.csv', index=False)

print("Results saved to 'best_time_predictions.csv'.")


Model retrained and saved successfully.
Results saved to 'best_time_predictions.csv'.


In [None]:
# Preview the per-product results (at most the first 400 rows).
results_df.head(400)

Unnamed: 0,product_id,best_hour,best_day,predicted_sales,category_code,brand,price
0,6200687,3,6,93.8377,environment.air_heater,oasis,28.03
1,3200361,16,6,558.4485,kitchen.meat_grinder,philips,120.95
2,3100152,14,6,180.5777,kitchen.blender,panasonic,51.46
3,3601485,6,1,7844.8751,kitchen.washer,lg,308.65
4,3700127,11,6,3815.1787,environment.vacuum,lg,189.26
5,3601244,10,4,3891.9521,kitchen.washer,lg,339.75
6,2800403,5,6,511.1461,kitchen.refrigerators,,204.33
7,3900045,15,4,280.8151,environment.water_heater,thermex,84.94
8,2700920,12,5,1442.1368,kitchen.refrigerators,midea,810.81
9,3601405,5,6,5762.8664,kitchen.washer,beko,187.88


In [None]:
# Save the results DataFrame to a CSV file under the Google Drive mount point
# (/content/drive), leaving out the row index.
results_df.to_csv('/content/drive/My Drive/best_time_predictions.csv', index=False)

In [None]:
import pandas as pd

# Assuming results_df is already defined
# Map day_of_week numbers to day names.
# NOTE(review): 0 = Monday is assumed here; confirm it matches how day_of_week was derived.
day_map = {0: 'Monday', 1: 'Tuesday', 2: 'Wednesday', 3: 'Thursday', 4: 'Friday', 5: 'Saturday', 6: 'Sunday'}
# Values without an entry in day_map are kept as they are, so re-running this cell
# (when best_day already holds names) does not wipe the column to NaN.
results_df['best_day'] = results_df['best_day'].map(lambda day: day_map.get(day, day))

# Update the price to reflect the price of one product: the price on the product's first
# row in df. The lookup Series is built once instead of re-filtering df for every result row.
first_price = df.drop_duplicates(subset='product_id').set_index('product_id')['price']
results_df['price'] = results_df['product_id'].map(first_price)

# Save the results DataFrame to a CSV file
# NOTE(review): this is a placeholder Windows path; on a Linux/Colab runtime it creates a
# file literally named 'C:\Path\To\best_time_predictions.csv' in the working directory.
results_df.to_csv('C:\\Path\\To\\best_time_predictions.csv', index=False)

print("Results saved to 'best_time_predictions.csv'.")

Results saved to 'best_time_predictions.csv'.


In [None]:
# Preview the first rows of results_df (nothing is written to disk here).
results_df.head()

Unnamed: 0,product_id,best_hour,best_day,predicted_sales,category_code,brand,price
0,6200687,3,Sunday,93.8377,environment.air_heater,oasis,28.03
1,3200361,16,Sunday,558.4485,kitchen.meat_grinder,philips,120.95
2,3100152,14,Sunday,180.5777,kitchen.blender,panasonic,51.46
3,3601485,6,Tuesday,7844.8751,kitchen.washer,lg,308.65
4,3700127,11,Sunday,3815.1787,environment.vacuum,lg,189.26


In [None]:
# Save the results DataFrame to a second CSV file under the Google Drive mount point
# (/content/drive), leaving out the row index.
results_df.to_csv('/content/drive/My Drive/best_time_predictions2.csv', index=False)