In [None]:
#2.1 Preprocessing

import pandas as pd

# Load the train and test CSVs with identical options, parsing the 'date'
# column as datetimes in both.
# NOTE(review): later cells reference df['Date'] (capital D) -- confirm the
# actual header casing in the CSV files.
train, test = (
    pd.read_csv(csv_path, parse_dates=['date'])
    for csv_path in ('train.csv', 'test.csv')
)


In [None]:
#Feature Extraction:
# Parse the date strings (month/day/year) into datetimes.
df['Date'] = pd.to_datetime(df['Date'], format='%m/%d/%Y')

# Flag the part of the month each row falls in, using the day-of-month:
#   day <= 7        -> beginning of month
#   8 <= day <= 14  -> mid month
#   day > 24        -> end of month
# Days 15-24 are False for all three flags.
df['is_beginning_of_month'] = df['Date'].dt.day.le(7)
df['is_mid_month'] = df['Date'].dt.day.ge(8) & df['Date'].dt.day.le(14)
df['is_end_of_month'] = df['Date'].dt.day.gt(24)

In [None]:
#Scaling
from sklearn.preprocessing import StandardScaler

# Columns that are standardized in place on df.
# NOTE(review): the preprocessing pipeline defined later in this notebook
# applies its own StandardScaler to these same columns, so this step looks
# redundant -- confirm it is still wanted.
features = [
    'DayOfWeek',
    'Open',
    'Promo',
    'StateHoliday',
    'SchoolHoliday',
    'is_beginning_of_month',
    'is_mid_month',
    'is_end_of_month',
]

# Learn per-column statistics, then overwrite the columns with scaled values.
scaler = StandardScaler()
scaler.fit(df[features])
df[features] = scaler.transform(df[features])


# Building Models with sklearn Pipelines

In [None]:
#Create Pipelines
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Numeric branch: fill missing values with the column mean, then standardize.
numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Columns routed through the numeric branch.
numeric_columns = [
    'DayOfWeek', 'Open', 'Promo', 'StateHoliday', 'SchoolHoliday',
    'is_beginning_of_month', 'is_mid_month', 'is_end_of_month',
]

# Preprocessor: a single 'num' transformer over the columns above.
preprocessor = ColumnTransformer(
    transformers=[('num', numeric_pipeline, numeric_columns)]
)

# Full model: preprocessing followed by a random-forest regressor with
# default hyperparameters.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor()),
])


In [None]:
#Train the Model
# Target vector. Assumes the full dataset has a 'Sales' column.
y = df['Sales']

# Feature matrix: exclude the non-feature columns ('Id', 'Date') AND the
# target itself. Previously 'Sales' stayed in X; it was only kept out of the
# model because the ColumnTransformer drops columns it is not told about,
# which is fragile and leaves the target in X.columns.
X = df.drop(['Id', 'Date', 'Sales'], axis=1)

# Fit the preprocessing + regressor pipeline on the full dataset.
model.fit(X, y)

# Post-Prediction Analysis

In [None]:
#Feature Importance
# Importances are reported per column that reaches the regressor, i.e. per
# output column of the fitted preprocessor -- not per column of X. Using
# X.columns here gives a length mismatch, because X holds more columns than
# the preprocessor passes on.
importances = model.named_steps['regressor'].feature_importances_

# Ask the fitted preprocessor for its output column names. With the default
# ColumnTransformer settings they are prefixed by the transformer name,
# e.g. 'num__Promo'.
feature_names = model.named_steps['preprocessor'].get_feature_names_out()

importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
print(importance_df.sort_values(by='Importance', ascending=False))


In [None]:
#Confidence Intervals: use bootstrapping to estimate confidence intervals for predictions
import numpy as np

def bootstrap_predictions(model, X, y, n_iterations=1000, random_state=None):
    """Estimate 95% bootstrap intervals for the model's predictions on X.

    A copy of ``model`` is refit ``n_iterations`` times, each time on a
    sample of the rows of ``X``/``y`` drawn with replacement (same size as
    the original data). After each refit the copy predicts on the full ``X``.
    The 2.5th and 97.5th percentiles of those predictions are returned.

    The caller's ``model`` is never refit, so a model fitted on the full data
    stays intact for later use (e.g. saving it to disk).

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; must be deep-copyable.
    X : pandas.DataFrame
        Feature rows; indexed positionally via ``.iloc``.
    y : pandas.Series
        Target values aligned positionally with ``X``.
    n_iterations : int, default 1000
        Number of bootstrap refits.
    random_state : int or None, default None
        Seed for the resampling. ``None`` keeps the previous behavior of
        drawing from NumPy's global random state.

    Returns
    -------
    numpy.ndarray
        Percentiles computed along the iteration axis; first entry is the
        2.5th percentile (lower bound), second is the 97.5th (upper bound).

    Raises
    ------
    ValueError
        If ``n_iterations`` is less than 1.
    """
    import copy  # local import keeps this cell self-contained

    if n_iterations < 1:
        raise ValueError("n_iterations must be at least 1")

    # Legacy global RNG unless a seed is supplied; both expose `.choice`.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # Refit a private copy so the caller's fitted model is left untouched.
    working_model = copy.deepcopy(model)

    n_rows = len(X)
    predictions = []
    for _ in range(n_iterations):
        indices = rng.choice(n_rows, n_rows, replace=True)
        working_model.fit(X.iloc[indices], y.iloc[indices])
        predictions.append(working_model.predict(X))

    return np.percentile(np.array(predictions), [2.5, 97.5], axis=0)  # 95% CI

# Bootstrap intervals for the training rows, using the default number of
# iterations. ci[0] is the lower bound and ci[1] the upper bound.
ci = bootstrap_predictions(model=model, X=X, y=y)



#Serialize Models

In [None]:
#Save Model
import joblib
from datetime import datetime

# Timestamp (day-month-year-hour-minute-second-microsecond) used in the file
# name for each saved model.
timestamp = format(datetime.now(), '%d-%m-%Y-%H-%M-%S-%f')

# Persist the fitted model next to the notebook.
joblib.dump(model, filename=f'model_{timestamp}.pkl')


In [None]:
#Building Model with Deep Learning
#Prepare Data for LSTM
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator

# Rescale the target into the range [-1, 1].
# Note: this rebinds `scaler`, which earlier in the notebook held the
# StandardScaler used for the feature columns.
scaler = MinMaxScaler(feature_range=(-1, 1))
scaler.fit(df[['Sales']])  # Assuming 'Sales' is the target
scaled_data = scaler.transform(df[['Sales']])

# Sliding windows of 14 consecutive scaled values, one window per batch; the
# same scaled series is passed as both input data and targets.
# NOTE(review): assumes the rows of df form one chronologically ordered
# series -- confirm.
generator = TimeseriesGenerator(
    data=scaled_data,
    targets=scaled_data,
    length=14,
    batch_size=1,
)


In [None]:
#Build LSTM Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

# Single-layer LSTM regressor. The input shape (14, 1) matches the window
# length of 14 used for the generator and the single 'Sales' feature.
# Note: this rebinds `model`, which earlier in the notebook held the
# scikit-learn pipeline.
model = Sequential([
    LSTM(50, activation='relu', input_shape=(14, 1)),
    Dense(1),
])
model.compile(optimizer='adam', loss='mse')

# Train on the windowed, scaled sales series.
model.fit(generator, epochs=10)


In [None]:
#Evaluate & Predict
# Predict on the same generator the model was trained on. The outputs are in
# the scaled [-1, 1] space produced by the MinMaxScaler, not in sales units.
predictions = model.predict(x=generator)
