# Import Libraries

In [25]:
import pandas as pd
import numpy as np
from cstm_pkg_grp_9.data.sets import pop_target
from sklearn.metrics import make_scorer, root_mean_squared_error as rmse
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import LinearRegression

# Loading Data

In [2]:
# Read the pre-processed train and test splits from disk.
# NOTE(review): the head() previews list an `Unnamed: 0` column — confirm whether
# the CSVs carry a saved row index that should be dropped before modelling.
train_df, test_df = (
    pd.read_csv("../../data/processed/train_processed_1.csv"),
    pd.read_csv("../../data/processed/test_processed_1.csv"),
)

In [3]:
# Preview the first five rows of the training frame.
train_df.head(n=5)

Unnamed: 0,item_id,dept_id,cat_id,store_id,state_id,year,month,day,sales_revenue
0,-0.328719,-0.096686,0.128218,-1.574645,-1.092484,-1.636141,-1.538818,1.516152,5.52
1,-0.454687,-0.096686,0.128218,-1.574645,-1.092484,-1.636141,-1.538818,1.516152,3.12
2,-0.392039,-0.096686,0.128218,-1.574645,-1.092484,-1.636141,-1.538818,1.516152,0.0
3,-0.244169,-0.096686,0.128218,-1.574645,-1.092484,-1.636141,-1.538818,1.516152,0.0
4,-0.158854,-0.096686,0.128218,-1.574645,-1.092484,-1.636141,-1.538818,1.516152,2.8


In [4]:
# Preview the first five rows of the test frame.
test_df.head(n=5)

Unnamed: 0,item_id,dept_id,cat_id,store_id,state_id,year,month,day,sales_revenue
0,-0.20372,-0.096686,0.128218,-1.574645,-1.092484,1.66924,-0.679833,0.378393,0.0
1,-0.534879,-0.096686,0.128218,-1.574645,-1.092484,1.66924,-0.679833,0.378393,0.0
2,-0.658027,-0.096686,0.128218,-1.574645,-1.092484,1.66924,-0.679833,0.378393,0.0
3,0.976588,-0.096686,0.128218,-1.574645,-1.092484,1.66924,-0.679833,0.378393,18.56
4,-0.332285,-0.096686,0.128218,-1.574645,-1.092484,1.66924,-0.679833,0.378393,8.64


# Sampling Data

In [5]:
# The test frame is used in full: this name is an alias of `test_df`, not a copy.
df_test_sample = test_df
# Work on a reproducible 60% random subset of the training rows (fixed seed).
df_train_sample = train_df.sample(random_state=42, frac=0.6)

In [6]:
# Separate the 'sales_revenue' target from the remaining columns of each frame.
# NOTE(review): pop_target is a project helper not shown here — confirm whether it
# mutates the frame it receives (df_test_sample is the same object as test_df).
(features_train, target_train), (X_test, y_test) = (
    pop_target(df_train_sample, 'sales_revenue'),
    pop_target(df_test_sample, 'sales_revenue'),
)

# Splitting Data

In [7]:
# Hold out 30% of the sampled training rows for validation; the seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    features_train,
    target_train,
    test_size=0.3,
    random_state=42,
)

# Baseline Model

In [8]:
# Mean of the training target; used below as the constant baseline prediction.
y_mean = y_train.mean()
y_mean  # bare expression so the notebook displays the value

np.float64(4.100991207383548)

In [9]:
# Baseline on the training split: predict the training-target mean for every row.
y_base = np.full(shape=y_train.shape, fill_value=y_mean)
print("RMSE on Training Data:", rmse(y_true=y_train, y_pred=y_base))

RMSE on Training Data: 10.485546198416937


In [10]:
# Baseline on the validation split, still using the mean computed from the training target.
y_val_base = np.full(shape=y_val.shape, fill_value=y_mean)
print("RMSE on Validation Data:", rmse(y_true=y_val, y_pred=y_val_base))

RMSE on Validation Data: 10.421423291969589


In [11]:
# Baseline on the test split, still using the mean computed from the training target.
y_test_base = np.full(shape=y_test.shape, fill_value=y_mean)
print("RMSE on Testing Data:", rmse(y_true=y_test, y_pred=y_test_base))

RMSE on Testing Data: 11.313842329627265


# Modelling

## Linear Regression Model 1

Fit a standard linear regression model on the training data.

In [12]:
# First model: a LinearRegression created with no arguments (library defaults).
linear_reg_1 = LinearRegression()

In [13]:
# Fit the first model on the training split.
linear_reg_1.fit(X=X_train, y=y_train)

In [14]:
# RMSE of the first model on the rows it was fitted on.
y_train_pred = linear_reg_1.predict(X=X_train)
train_rmse = rmse(y_true=y_train, y_pred=y_train_pred)
print(train_rmse)

8.816224522479516


In [15]:
# RMSE of the first model on the held-out validation split.
y_val_pred = linear_reg_1.predict(X=X_val)
val_rmse = rmse(y_true=y_val, y_pred=y_val_pred)
print(val_rmse)

8.755906411582133


## Linear Regression Model 2

Use grid search with cross-validation (`GridSearchCV`): split the training data into folds and evaluate each combination of parameters.

In [22]:
# Candidate settings for the grid search.
# Only `fit_intercept` varies; `n_jobs` has a single value, so it is not searched over.
param_grid = dict(
    fit_intercept=[True, False],
    n_jobs=[-1],
)

In [23]:
# Second model: a fresh LinearRegression instance, passed to GridSearchCV as the estimator.
linear_reg_2 = LinearRegression()

In [28]:
# 5-fold cross-validated search over `param_grid`.
# Candidates are ranked by negative RMSE (higher is better) so the cross-validation
# scores are on the same scale as the RMSE values this notebook reports for the
# train / validation / test splits, instead of on the squared (MSE) scale.
grid_search = GridSearchCV(
    estimator=linear_reg_2,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=5,
)

In [29]:
# Run the cross-validated search on the training split.
grid_search.fit(X=X_train, y=y_train)

In [32]:
# Show the parameter combination the grid search selected.
print("Best parameters found: ", grid_search.best_params_)

Best parameters found:  {'fit_intercept': True, 'n_jobs': -1}


In [31]:
# RMSE of the grid-searched model on the training split.
# Note: this overwrites `y_train_pred` / `train_rmse` from the first model.
y_train_pred = grid_search.predict(X=X_train)
train_rmse = rmse(y_true=y_train, y_pred=y_train_pred)
print(train_rmse)

8.816224522479516


In [33]:
# RMSE of the grid-searched model on the held-out validation split.
# Note: this overwrites `y_val_pred` / `val_rmse` from the first model.
y_val_pred = grid_search.predict(X=X_val)
val_rmse = rmse(y_true=y_val, y_pred=y_val_pred)
print(val_rmse)

8.755906411582133


## Test results

In [34]:
# RMSE of the grid-searched model on the test split.
y_test_pred = grid_search.predict(X=X_test)
test_rmse = rmse(y_true=y_test, y_pred=y_test_pred)
print(test_rmse)

9.753503945408692
