In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV, TimeSeriesSplit
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, make_scorer
import matplotlib.pyplot as plt

In [None]:
# --- Data loading and feature engineering -----------------------------------
# Settings for this cell: the workbook to read, the target column to forecast,
# and how many lagged copies of the target to use as features.
excel_file = "salesdaily-data.xlsx"
column_name = "R06"
n_lags = 1

df = pd.read_excel(excel_file)
df['datetime'] = pd.to_datetime(df['datetime'])
# shift()-based lags are only meaningful when rows are in chronological order,
# so enforce it explicitly (a stable sort keeps the order of equal timestamps).
df = df.sort_values('datetime', kind='stable').reset_index(drop=True)
df['Month'] = df['datetime'].dt.month

# Take an explicit copy: adding columns to a slice of `df` triggers pandas'
# SettingWithCopyWarning and may silently write to a temporary object.
med1data = df[[column_name, 'Month']].copy()

# Lag{i} holds the target value from i rows earlier.
for i in range(1, n_lags + 1):
    med1data[f'Lag{i}'] = med1data[column_name].shift(i)

# The first `n_lags` rows have no history (NaN lags) and are dropped.
med1data = med1data.dropna()
print(med1data.tail())

# Features are everything except the target column; the target is the column itself.
x = med1data.drop(columns=column_name)
y = med1data[column_name]




In [None]:


# --- Model training and evaluation ------------------------------------------
# Split chronologically: the last 20% of rows form the test set.
# shuffle=False is essential here. The features include lagged target values,
# so a shuffled split would place rows adjacent to test rows in the training
# set (look-ahead leakage), and TimeSeriesSplit below assumes the training
# rows are still in temporal order.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, shuffle=False)

# Scale the features; the scaler is fitted on the training rows only so that
# no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Scale the target the same way (fit on train, apply to test).
y_scaler = StandardScaler()
y_train_scaled = y_scaler.fit_transform(y_train.values.reshape(-1, 1)).flatten()
y_test_scaled = y_scaler.transform(y_test.values.reshape(-1, 1)).flatten()

# Define the Random Forest model and the hyper-parameter search space
rf = RandomForestRegressor(random_state=42)

param_dist = {
    'n_estimators': [50, 100, 150, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Custom RMSE scorer. greater_is_better=False makes scikit-learn negate the
# value, so the search (which maximises the score) minimises RMSE.
rmse_scorer = make_scorer(lambda y_true, y_pred: np.sqrt(mean_squared_error(y_true, y_pred)), greater_is_better=False)

# Expanding-window cross-validation: every fold validates on rows that come
# after the rows it trained on.
tscv = TimeSeriesSplit(n_splits=10)

# Randomised search over param_dist, evaluated with the time-series folds
random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_dist, scoring=rmse_scorer, cv=tscv, n_jobs=-1, verbose=2, random_state=42, n_iter=50)

# Fit the search using the scaled training data
random_search.fit(X_train_scaled, y_train_scaled)

# Best model found by the search (refitted on the whole training set)
best_model = random_search.best_estimator_

# Predict on the held-out test set (predictions are in scaled target units)
y_pred = best_model.predict(X_test_scaled)

# Map the predictions back to the original target units
y_pred = y_scaler.inverse_transform(y_pred.reshape(-1, 1)).flatten()

# Test RMSE in original target units
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f'Test RMSE: {rmse}')

# Define a function to calculate MAPE excluding zero values
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (in percent), ignoring zero actuals.

    Positions where ``y_true`` equals zero are excluded from the average,
    because the percentage error is undefined there.

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Predicted values, aligned position-by-position with ``y_true``.

    Returns
    -------
    float
        ``mean(|y_true - y_pred| / |y_true|) * 100`` over the non-zero actuals,
        or NaN when there is no non-zero actual to average over.
    """
    y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)
    non_zero_indices = y_true != 0
    # With nothing left to average, np.mean would run on an empty array and
    # emit a RuntimeWarning; return NaN explicitly instead.
    if not np.any(non_zero_indices):
        return float('nan')
    y_true, y_pred = y_true[non_zero_indices], y_pred[non_zero_indices]
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100
# Calculate MAPE on the test set (zero actuals are excluded by the helper)
mape = mean_absolute_percentage_error(y_test, y_pred)
print(f'Test MAPE: {mape}')

# Plot the first 60 test samples: actual values against model predictions
n_shown = 60
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(y_test.values[:n_shown], label='Actual Sales')
ax.plot(y_pred[:n_shown], label='Predicted Sales')
ax.legend()
# Use the configured target column in the title so it cannot drift from the
# data actually modelled (the title previously named a different drug).
ax.set_title(f'Predictions vs Actuals for drug {column_name}')
# The x axis is the position within the test set.
# NOTE(review): the input file name suggests daily rows - confirm the unit.
ax.set_xlabel('Test sample index')
ax.set_ylabel('Value')
plt.show()
