In [1]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error, mean_absolute_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Lift pandas' display truncation so printed frames show every row and column
pd.options.display.max_rows = None
pd.options.display.max_columns = None



In [2]:
# Read every raw CSV from the working directory in one pass; the names on the
# left line up positionally with the file names listed below.
calendar, calendar_events, sales_test, sales_train, items = map(
    pd.read_csv,
    (
        'calendar.csv',
        'calendar_events.csv',
        'sales_test.csv',
        'sales_train.csv',
        'items_weekly_sell_prices.csv',
    ),
)

In [18]:
# Identifier columns that are carried through unchanged when the wide sales
# table is melted into long format
id_cols = [
    'id',
    'item_id',
    'dept_id',
    'cat_id',
    'store_id',
    'state_id',
]

def getdatefeatures(df):
    """Return a date-indexed copy of ``df`` with calendar feature columns added.

    The ``date`` column is parsed with ``pd.to_datetime`` and becomes the index
    of the result (it is no longer a regular column). Four columns derived from
    that index are appended: ``year``, ``month``, ``day_of_week`` and
    ``day_of_year``.

    The input frame is not modified, so the function can safely be called
    again on the same frame (for example when a notebook cell is re-run).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``date`` column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        New frame indexed by the parsed dates, with the feature columns added.

    Raises
    ------
    KeyError
        If ``df`` has no ``date`` column.
    """
    # Parse once; the parsed values become the index and feed every feature.
    dates = pd.DatetimeIndex(pd.to_datetime(df['date']), name='date')

    # drop() hands back a new frame, so the caller's frame keeps its original
    # 'date' column and index.
    out = df.drop(columns='date')
    out.index = dates

    # Calendar features taken straight from the datetime index
    out['year'] = dates.year
    out['month'] = dates.month
    out['day_of_week'] = dates.dayofweek
    out['day_of_year'] = dates.dayofyear

    return out

def makedf(data):
    """Reshape a wide sales table into a long, feature-enriched frame.

    The wide frame is melted to one row per identifier combination and day
    column, then left-joined with the module-level ``calendar`` (on ``d``),
    ``items`` (on ``store_id``, ``item_id``, ``wm_yr_wk``) and
    ``calendar_events`` (on ``date``) frames. Missing sell prices are replaced
    with 0, ``revenue`` is computed as ``qty * sell_price``, and the result is
    passed through ``getdatefeatures``.

    Parameters
    ----------
    data : pandas.DataFrame
        Wide frame holding the ``id_cols`` columns plus one quantity column per
        day key; the day keys are matched against ``calendar['d']``.

    Returns
    -------
    pandas.DataFrame
        The long frame as returned by ``getdatefeatures``.
    """
    df = pd.melt(data, id_vars=id_cols, var_name='d', value_name='qty')
    df = pd.merge(df, calendar, how='left', on='d')
    df = pd.merge(df, items, how='left', on=['store_id', 'item_id', 'wm_yr_wk'])
    # NOTE(review): if calendar_events holds several rows for one date, this
    # left join repeats the matching sales rows — confirm it is unique per date.
    df = pd.merge(df, calendar_events, how='left', on='date')

    # Assign the filled column back instead of calling fillna(inplace=True) on
    # df['sell_price']: under pandas Copy-on-Write the chained in-place call
    # only changes a temporary and leaves NaN prices (and NaN revenue) in df.
    df['sell_price'] = df['sell_price'].fillna(0)
    df['revenue'] = df['qty'] * df['sell_price']

    return getdatefeatures(df)

# Attach the identifier columns to the test sales so makedf can melt them the
# same way as the training sales. The guard keeps this cell idempotent:
# re-running it would otherwise concatenate a second copy of the identifier
# columns onto the already-extended sales_test.
# NOTE(review): the column-wise concat pairs rows by index label, so this
# assumes sales_test rows are in the same order as sales_train — confirm.
if not set(id_cols).issubset(sales_test.columns):
    sales_test = pd.concat([sales_train[id_cols], sales_test], axis=1)

train_df = makedf(sales_train)
test_df = makedf(sales_test)

In [20]:
# Model inputs: the categorical identifiers followed by the date-derived columns
features = [
    'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id',
    'year', 'month', 'day_of_week', 'day_of_year',
]
# Other columns considered but currently left out:
# 'sell_price', 'qty', 'wm_yr_wk', 'event_name', 'event_type'

# Split each frame into its feature matrix and its revenue target
X_train, y_train = train_df[features], train_df['revenue']
X_test, y_test = test_df[features], test_df['revenue']

In [10]:
def fitmodel(model, model_name, X_train=None, y_train=None, X_test=None, y_test=None):
    """Fit ``model`` inside a preprocessing pipeline, save it, and score it.

    The pipeline one-hot encodes the categorical identifier columns, passes the
    numeric date columns through unchanged, and feeds the result to ``model``.
    The fitted pipeline is written to ``{model_name}.pkl`` with ``joblib.dump``
    and then evaluated on the test split.

    Parameters
    ----------
    model : estimator
        Regressor used as the final pipeline step.
    model_name : str
        Label returned with the metrics; also the stem of the saved file.
    X_train, y_train, X_test, y_test : optional
        Train / test features and targets. Any argument left as ``None`` is
        looked up in the notebook's global namespace under the same name *at
        call time*, so the function always sees the current split rather than
        whatever existed when this cell was last executed.

    Returns
    -------
    tuple
        ``(model_name, rmse, r2, mae, mape)``.

    Raises
    ------
    KeyError
        If an omitted data argument has no global variable of the same name.
    """
    # Late-bind omitted data. Defaults written as `X_train=X_train` are frozen
    # when the `def` runs and silently go stale if the feature cell is re-run.
    if X_train is None:
        X_train = globals()['X_train']
    if y_train is None:
        y_train = globals()['y_train']
    if X_test is None:
        X_test = globals()['X_test']
    if y_test is None:
        y_test = globals()['y_test']

    # Define categorical and numeric columns for preprocessing
    categorical_cols = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
    numeric_cols = ['year', 'month', 'day_of_week', 'day_of_year']

    # Preprocessing and modeling pipeline
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', 'passthrough', numeric_cols),
            # Categories unseen during fit are encoded as all zeros instead of
            # making predict() raise.
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
        ])

    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])

    # Train the model
    pipeline.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = pipeline.predict(X_test)

    # Persist the fitted pipeline (preprocessing + regressor) for later reuse
    joblib.dump(pipeline, f'{model_name}.pkl')

    # Evaluate the model. RMSE is taken as the square root of the MSE because
    # the `squared=False` keyword is deprecated in scikit-learn 1.4 and removed
    # in 1.6; this form is equivalent for a single target and works on both.
    rmse = mean_squared_error(y_test, y_pred) ** 0.5
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    # NOTE(review): MAPE divides by the true value; if y_test contains zeros
    # the score can become extremely large — interpret with care.
    mape = mean_absolute_percentage_error(y_test, y_pred)

    return model_name, rmse, r2, mae, mape

Untitled-1.ipynb              items_weekly_sell_prices.csv
calendar.csv                  sales_test.csv
calendar_events.csv           sales_train.csv
deployment/
