In [None]:
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
from statsmodels.tsa.deterministic import DeterministicProcess, CalendarFourier

from joblib import Parallel, delayed
from tqdm.auto import tqdm

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# EDA - Holiday

In [None]:
# Load the holidays/events table and display it.
df_holiday=pd.read_csv("../input/store-sales-time-series-forecasting/holidays_events.csv")
df_holiday

In [None]:
# Column names, dtypes and non-null counts.
df_holiday.info()

In [None]:
# (number of rows, number of columns)
df_holiday.shape

In [None]:
# Count of missing values in each column.
df_holiday.isnull().sum()

In [None]:
# Rows per date; a count above 1 means several rows share that date.
df_holiday['date'].value_counts()

In [None]:
# Frequency of each distinct value in `type`.
df_holiday['type'].value_counts()

In [None]:
# Frequency of each distinct value in `locale`.
df_holiday['locale'].value_counts()

In [None]:
# Frequency of each distinct value in `locale_name`.
df_holiday['locale_name'].value_counts()

In [None]:
# Frequency of each distinct value in `description`.
df_holiday['description'].value_counts()

In [None]:
# Frequency of each distinct value in `transferred`.
df_holiday['transferred'].value_counts()

In [None]:
# Pie chart of the share of each value in the `transferred` column.
x_df = df_holiday['transferred'].value_counts()
pie_ax = x_df.plot.pie(
    explode=[0, 0.1],  # offset the second slice; assumes exactly two categories
    autopct='%1.1f%%',
    shadow=False,
)
pie_ax.set_title("Distribution of transferred column")
pie_ax.set_ylabel('')  # hide the y-axis label
plt.show()

In [None]:
# Left: rows per `locale`, split by `transferred`. Right: rows per `type`.
fig, axes = plt.subplots(1, 2, figsize=(18, 5))
locale_ax, type_ax = axes

sns.countplot(ax=locale_ax, data=df_holiday, x='locale', hue='transferred')
locale_ax.set_title('Transferred Holidays')

sns.countplot(ax=type_ax, data=df_holiday, x='type')
type_ax.set_title('Type of day')

# Render explicitly so the cell does not echo the Text object returned by the
# last set_title call as its output.
plt.show()

# EDA - Oil

In [None]:
# Load the oil table and display it.
df_oil=pd.read_csv("../input/store-sales-time-series-forecasting/oil.csv")
df_oil

In [None]:
# Column names, dtypes and non-null counts.
df_oil.info()

In [None]:
# (number of rows, number of columns)
df_oil.shape

In [None]:
# Count of missing values in each column.
df_oil.isnull().sum()

In [None]:
# Fill each missing value with the next valid observation below it.
# `fillna(method="backfill")` is deprecated in pandas 2.1+; `bfill()` is the
# equivalent call.
df_oil = df_oil.bfill()
# Re-check the NaN counts: a backward fill cannot fill a gap at the very end
# of a column, because there is no later value to copy from.
df_oil.isnull().sum()

# EDA - Store

In [None]:
# Load the stores table and preview its first rows.
df_store=pd.read_csv("../input/store-sales-time-series-forecasting/stores.csv")
df_store.head()

In [None]:
# (number of rows, number of columns)
df_store.shape

In [None]:
# Column names, dtypes and non-null counts.
df_store.info()

In [None]:
# Count of missing values in each column.
df_store.isnull().sum()

In [None]:
# Number of rows per distinct `city`.
df_store['city'].value_counts()

In [None]:
# Number of rows per distinct `state`.
df_store['state'].value_counts()

In [None]:
# Number of rows per distinct `type`.
df_store['type'].value_counts()

In [None]:
# Number of rows per distinct `cluster`.
df_store['cluster'].value_counts()

# EDA - Transactions

In [None]:
# Load the transactions table and display it.
df_tran=pd.read_csv("../input/store-sales-time-series-forecasting/transactions.csv")
df_tran

In [None]:
# (number of rows, number of columns)
df_tran.shape

In [None]:
# Column names, dtypes and non-null counts.
df_tran.info()

In [None]:
# Count of missing values in each column.
df_tran.isnull().sum()

In [None]:
# Number of rows recorded for each date.
df_tran['date'].value_counts()

In [None]:
# Number of rows recorded for each `store_nbr`.
df_tran['store_nbr'].value_counts()

# EDA - Test, Train and Sample Submission

In [None]:
# Load the test table and display it.
df_test=pd.read_csv("../input/store-sales-time-series-forecasting/test.csv")
df_test

In [None]:
# Load the training table and display it.
df_train=pd.read_csv("../input/store-sales-time-series-forecasting/train.csv")
df_train

In [None]:
# Load the sample submission file and display it.
df_submission=pd.read_csv("../input/store-sales-time-series-forecasting/sample_submission.csv")
df_submission

# Training and Testing Data Cleaning

In [None]:
# First rows of the training table before any cleaning.
df_train.head()

In [None]:
# First rows of the test table before any cleaning.
df_test.head()

In [None]:
def _expand_date(frame):
    """Replace the ``date`` column of ``frame`` with ``year``, ``month`` and ``day``.

    Parameters
    ----------
    frame : pd.DataFrame
        Table with a ``date`` column formatted as ``YYYY-MM-DD``.

    Returns
    -------
    pd.DataFrame
        New frame with integer ``year``/``month``/``day`` columns appended (in
        that order) and ``date`` removed. A frame that no longer has a ``date``
        column is returned unchanged, so re-running this cell is harmless.
    """
    if 'date' not in frame.columns:
        return frame
    parsed = pd.to_datetime(frame['date'], format="%Y-%m-%d")
    # The vectorised `.dt` accessor replaces one Python lambda call per row.
    return frame.assign(
        year=parsed.dt.year,
        month=parsed.dt.month,
        day=parsed.dt.day,
    ).drop(columns=['date'])


# Apply the identical transformation to both tables so their columns line up.
df_train = _expand_date(df_train)
df_test = _expand_date(df_test)

In [None]:
from sklearn import preprocessing

# Encode the `family` column as integer codes. The encoder is fitted on the
# training table only and then reused for the test table, so both tables share
# one mapping.
label_encoder = preprocessing.LabelEncoder().fit(df_train['family'])
for frame in (df_train, df_test):
    frame['family'] = label_encoder.transform(frame['family'])

In [None]:
# Column names, dtypes and non-null counts of the training table after cleaning.
df_train.info()

In [None]:
# Column names, dtypes and non-null counts of the test table after cleaning.
df_test.info()

In [None]:
# Summary statistics of the numeric training columns.
df_train.describe()

In [None]:
# Summary statistics of the numeric test columns.
df_test.describe()

In [None]:
# The target is the `sales` column; the features are all remaining columns
# except the `id` column, which is dropped from both tables.
y_train = df_train.loc[:, 'sales']
X_train = df_train.drop(columns=['id', 'sales'])
X_test = df_test.drop(columns=['id'])
X_train.head()

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale the features. The per-column min and max are learned from the
# training features only and then applied as-is to the test features.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

X_train.shape, X_test.shape

# Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Baseline: ordinary least squares on the scaled features.
regressor = LinearRegression()
regressor.fit(X_train, y_train)

# A linear model is unbounded below, but a sales figure cannot be negative,
# so floor the predictions at zero before writing the submission.
# NOTE(review): negative values would presumably also break a log-based
# competition metric - confirm against the competition rules.
y_test = np.clip(regressor.predict(X_test), 0, None)
LR_CSV = pd.DataFrame({'id': df_test['id'], 'sales': y_test})
LR_CSV.to_csv('LR_CSV.csv', index=False)

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Only `max_depth` is actually searched; every other entry is a single fixed
# value. The previous grid tried every depth from 1 to 99, i.e. 99 candidates
# times GridSearchCV's default 5 folds = 495 forest fits. A coarse,
# roughly log-spaced grid covers the same range with 25 fits.
param_grid = {
    'n_estimators': [100],
    'max_features': [6],
    'max_depth': [1, 3, 10, 30, 99],
    'min_samples_leaf': [100],
    'random_state': [42],
}
rf = RandomForestRegressor(n_jobs=-1)
# NOTE(review): the default CV splitter ignores temporal order; if the rows
# are time-ordered, consider passing cv=TimeSeriesSplit(...) - confirm.
rf_cv = GridSearchCV(rf, param_grid)
rf_cv.fit(X_train, y_train)
print(rf_cv.best_params_)

# GridSearchCV refits the best candidate on all training data, so predict()
# uses that refitted forest.
y_test = rf_cv.predict(X_test)
RF_CSV = pd.DataFrame({'id': df_test['id'], 'sales': y_test})
RF_CSV.to_csv('RF_CSV.csv', index=False)

# Ridge regression

In [None]:
from sklearn.linear_model import Ridge

# Linear regression with an L2 penalty of strength alpha=0.5.
ridgeR = Ridge(alpha=0.5)
ridgeR.fit(X_train, y_train)

# The model is unbounded below, but a sales figure cannot be negative, so
# floor the predictions at zero before writing the submission.
y_test = np.clip(ridgeR.predict(X_test), 0, None)
RIDGE_CSV = pd.DataFrame({'id': df_test['id'], 'sales': y_test})
RIDGE_CSV.to_csv('RIDGE_CSV.csv', index=False)

# Lasso regression

In [None]:
from sklearn.linear_model import Lasso

# Linear regression with an L1 penalty of strength alpha=0.05.
lasso = Lasso(alpha=0.05)
lasso.fit(X_train, y_train)

# The model is unbounded below, but a sales figure cannot be negative, so
# floor the predictions at zero before writing the submission.
y_test = np.clip(lasso.predict(X_test), 0, None)
LASSO_CSV = pd.DataFrame({'id': df_test['id'], 'sales': y_test})
LASSO_CSV.to_csv('LASSO_CSV.csv', index=False)

# SGDRegressor

In [None]:
from sklearn import linear_model

# SGD shuffles the training data between epochs, so without a seed the fitted
# coefficients (and the submission file) differ from run to run. Seed it with
# the same value (42) the random-forest grid uses.
SGDr = linear_model.SGDRegressor(random_state=42)
SGDr.fit(X_train, y_train)

# The model is unbounded below, but a sales figure cannot be negative, so
# floor the predictions at zero before writing the submission.
y_test = np.clip(SGDr.predict(X_test), 0, None)
SGD_CSV = pd.DataFrame({'id': df_test['id'], 'sales': y_test})
SGD_CSV.to_csv('SGD_CSV.csv', index=False)

# Support Vector Regression

In [None]:
from sklearn.svm import SVR

# Kernel SVR with scikit-learn's default settings.
# NOTE(review): scikit-learn documents SVR fit time as more than quadratic in
# the number of samples; if the training table is large this cell may not
# finish - consider LinearSVR or fitting on a subsample.
SVRregressor = SVR()
SVRregressor.fit(X_train, y_train)

# A sales figure cannot be negative, so floor the predictions at zero before
# writing the submission.
y_test = np.clip(SVRregressor.predict(X_test), 0, None)
SVR_CSV = pd.DataFrame({'id': df_test['id'], 'sales': y_test})
SVR_CSV.to_csv('SVR_CSV.csv', index=False)