<a href="https://colab.research.google.com/github/sapna-90/Project_6_Pharmaceuticals-sales-prediction/blob/main/Pharmaceuticals_Task_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Data Preprocessing**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split

In [2]:
Dataset = pd.read_csv(r"/content/Project_6_cleaned_data.csv")

In [3]:
Dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 382332 entries, 0 to 382331
Data columns (total 18 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   Store                      382332 non-null  int64  
 1   DayOfWeek                  382332 non-null  int64  
 2   Date                       382332 non-null  object 
 3   Sales                      382332 non-null  int64  
 4   Customers                  382332 non-null  int64  
 5   Open                       382332 non-null  int64  
 6   Promo                      382332 non-null  int64  
 7   StateHoliday               382332 non-null  object 
 8   SchoolHoliday              382332 non-null  int64  
 9   StoreType                  382332 non-null  object 
 10  Assortment                 382332 non-null  object 
 11  CompetitionDistance        382332 non-null  float64
 12  CompetitionOpenSinceMonth  382332 non-null  float64
 13  CompetitionOpenSinceYear   38

In [4]:
numerical_cols = Dataset.select_dtypes(include=['float64', 'int64']).columns

# Iterate through each numerical column and fill missing values
for col in numerical_cols:
    Dataset[col] = Dataset[col].fillna(Dataset[col].mean(0))

In [5]:
Dataset.isnull().sum().sort_values(ascending=False)

Unnamed: 0,0
Store,0
DayOfWeek,0
Promo2SinceYear,0
Promo2SinceWeek,0
Promo2,0
CompetitionOpenSinceYear,0
CompetitionOpenSinceMonth,0
CompetitionDistance,0
Assortment,0
StoreType,0


In [6]:
# Encode categorical variables
label_encoder = LabelEncoder()
Dataset['StoreType'] = label_encoder.fit_transform(Dataset['StoreType'])
Dataset['Assortment'] = label_encoder.fit_transform(Dataset['Assortment'])

# Convert 'StateHoliday' column to string type before encoding
Dataset['StateHoliday'] = Dataset['StateHoliday'].astype(str)
Dataset['StateHoliday'] = label_encoder.fit_transform(Dataset['StateHoliday'])

In [7]:
# Create date-based features
Dataset['IsWeekend'] = Dataset['DayOfWeek'].apply(lambda x: 1 if x >= 5 else 0)
Dataset['IsHoliday'] = Dataset['StateHoliday'].apply(lambda x: 1 if x != 0 else 0)

In [8]:
# Split features and target
X = Dataset.drop(['Sales', 'Customers', 'Date'], axis=1)
y = Dataset['Sales']

In [9]:
print(X.columns)

Index(['Store', 'DayOfWeek', 'Open', 'Promo', 'StateHoliday', 'SchoolHoliday',
       'StoreType', 'Assortment', 'CompetitionDistance',
       'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Promo2',
       'Promo2SinceWeek', 'Promo2SinceYear', 'PromoInterval', 'IsWeekend',
       'IsHoliday'],
      dtype='object')


In [10]:
# Create a new column 'HasPromo' with binary values indicating if a promotion is running
Dataset['HasPromo'] = Dataset['PromoInterval'].apply(lambda x: 0 if pd.isnull(x) else 1)

# One-hot encode the 'PromoInterval' column for months if a promotion is running
promo_months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
for month in promo_months:
    Dataset[f'Promo_{month}'] = Dataset['PromoInterval'].apply(lambda x: 1 if isinstance(x, str) and month in x.split(',') else 0)

# Drop the original 'PromoInterval' column as it is no longer needed
Dataset = Dataset.drop('PromoInterval', axis=1)


# Split features and target
X = Dataset.drop(['Sales', 'Customers', 'Date'], axis=1)
y = Dataset['Sales']

In [11]:
# Scale data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [12]:
# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

**Machine learning model creation with SKlearn pipelines**

In [13]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error

In [14]:
# Creating the pipeline with a RandomForest model
model_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])

In [15]:
# Training the model
model_pipeline.fit(X_train, y_train)

In [16]:
# Predict and evaluate the model

from sklearn.metrics import mean_squared_error, r2_score
predictions = model_pipeline.predict(X_test)
print(f" R2 Score {r2_score(predictions, y_test)}")
print(f" mean_squared_error {mean_squared_error(predictions, y_test)}")
print(f" Root mean_squared_error {np.sqrt(mean_squared_error(predictions, y_test))}")
print(f" mean_absolute_error {mean_absolute_error(predictions, y_test)}")

 R2 Score 0.924396563482687
 mean_squared_error 807261.1992523202
 Root mean_squared_error 898.4771556652513
 mean_absolute_error 578.2142783083791


In [17]:
# Save the model
import joblib
joblib.dump(model_pipeline, 'rf_model.pkl')

['rf_model.pkl']

**Creating a DeepLearning Model with LSTM for TimeSeries Forecasting**

In [18]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

In [19]:
# Reshape data for LSTM: [samples, timesteps, features]
X_train_lstm = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
X_test_lstm = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))

In [20]:
# Build LSTM model
model = Sequential()
model.add(LSTM(50, activation='relu', input_shape=(X_train_lstm.shape[1], X_train_lstm.shape[2])))
model.add(Dropout(0.2))
model.add(Dense(1))

In [21]:
model.compile(optimizer='adam', loss='mse')
history = model.fit(X_train_lstm, y_train, epochs=15, batch_size=32, validation_data=(X_test_lstm, y_test), verbose=1)

Epoch 1/15
[1m9559/9559[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 3ms/step - loss: 26644212.0000 - val_loss: 3691068.2500
Epoch 2/15
[1m9559/9559[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 3ms/step - loss: 4152879.2500 - val_loss: 3347001.2500
Epoch 3/15
[1m9559/9559[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 3ms/step - loss: 3929483.7500 - val_loss: 3226106.2500
Epoch 4/15
[1m9559/9559[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 3ms/step - loss: 3813503.5000 - val_loss: 3129705.7500
Epoch 5/15
[1m9559/9559[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 3ms/step - loss: 3724737.0000 - val_loss: 3028303.5000
Epoch 6/15
[1m9559/9559[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 3ms/step - loss: 3632935.0000 - val_loss: 2936380.2500
Epoch 7/15
[1m9559/9559[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 3ms/step - loss: 3557544.2500 - val_loss: 2843727.2500
Epoch 8/15
[1m9559/9559[0m [32m━━━━━━━━━━━━━━━━━━━━

In [22]:
# Predict with LSTM
lstm_predictions = model.predict(X_test_lstm)
lstm_mae = mean_absolute_error(y_test, lstm_predictions)
print(f"LSTM Mean Absolute Error: {lstm_mae}")
print(f'R2 Score {r2_score(lstm_predictions, y_test)}')

[1m2390/2390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step
LSTM Mean Absolute Error: 1081.2143629773186
R2 Score 0.7178957462310791


In [23]:
# Save the LSTM model in keras format

model.save('lstm_model.keras')