In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR as sk_SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from lightgbm import LGBMRegressor
import cuml
import cupy as cp
from cuml.linear_model import LinearRegression
from cuml.linear_model import ElasticNet
from cuml.svm import SVR as cu_SVR
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler
import time
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        file_path = os.path.join(root_dir, file_name)
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the five competition CSV files; the paths are unpacked in the same
# order as the names on the left-hand side.
sales_train, item_categories, items, shops, test = map(
    pd.read_csv,
    [
        '../input/competitive-data-science-predict-future-sales/sales_train.csv',
        '../input/competitive-data-science-predict-future-sales/item_categories.csv',
        '../input/competitive-data-science-predict-future-sales/items.csv',
        '../input/competitive-data-science-predict-future-sales/shops.csv',
        '../input/competitive-data-science-predict-future-sales/test.csv',
    ],
)


In [None]:
# Preview the first rows of the raw sales table (head() defaults to 5 rows).
sales_train.head()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
sales_train.describe()

In [None]:
# Column dtypes, non-null counts and memory usage of the sales table.
sales_train.info()

In [None]:
# Count the missing values in each column of the sales table.
sales_train.isnull().sum()

In [None]:
# Collapse the sales records to one row per (month block, shop, item):
# the price is averaged and the sold quantity is summed.
sales_train = (
    sales_train
    .groupby(['date_block_num', 'shop_id', 'item_id'])
    .agg(
        item_price=('item_price', 'mean'),
        item_cnt_day=('item_cnt_day', 'sum'),
    )
    .reset_index()
)
# Show the first rows of the aggregated table.
sales_train.head()

In [None]:
# Annotated heatmap of the pairwise correlations between the columns.
fig, ax = plt.subplots(figsize=(12, 8))
correlations = sales_train.corr()
dataplot = sns.heatmap(correlations, cmap="YlGnBu", annot=True, ax=ax)
plt.show()

In [None]:
# Give the test set the same key columns as the training data.
# NOTE(review): 34 is presumably the month block right after the last training
# month -- confirm against sales_train['date_block_num'].max().
test['date_block_num'] = 34
# Take an explicit copy: later cells add and overwrite columns on test_data, and
# doing that on a slice of `test` triggers pandas' SettingWithCopyWarning.
test_data = test[['date_block_num', 'shop_id', 'item_id']].copy()
test_data.head()

In [None]:
# Price lookup: for every item_id, the item_price of its last row in
# sales_train's current row order. to_dict() keeps the item ids as integers;
# the previous `.reset_index().values` round trip coerced them to float.
item_price = sales_train.groupby('item_id')['item_price'].last().to_dict()
# Build a new frame instead of assigning into test_data in place, so this cell
# never writes into a slice of `test`. Items missing from the lookup get NaN.
test_data = test_data.assign(item_price=test_data['item_id'].map(item_price))
print(test_data)
print(sales_train)

In [None]:
# Fill the missing test prices (items with no price in the lookup) with the
# median of the known test prices. fillna with a dict touches only the named
# column and returns a new frame, which avoids writing into a slice of `test`.
median_price = test_data['item_price'].median()
test_data = test_data.fillna({'item_price': median_price})
print(test_data)

In [None]:
# The full table is too large for the slower models (SVM in particular), so
# keep a reproducible random 30% of the rows.
sales_train = sales_train.sample(random_state=98, frac=0.3)
print(sales_train)

In [None]:
# Results table: one row per model with its RMSE and its fit+predict time.
result_columns = ['Model Name', 'RMSE', 'Time']
performance = pd.DataFrame(columns=result_columns)

# Using SkLearn

In [None]:
# Feature scaler (StandardScaler standardizes each column); it is fitted in the
# data-preparation cell that follows.
sc = StandardScaler()

In [None]:
# Features are every column except the target; the target is selected by name
# rather than by position so a column reorder cannot silently change it.
X = sales_train.drop(columns=['item_cnt_day']).to_numpy()
Y = sales_train['item_cnt_day'].to_numpy()

# Split first, then fit the scaler on the training rows only. Fitting it on the
# full matrix before the split leaks test-set statistics into training.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.25, random_state=25
)
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

## **Linear Reg**

In [None]:
%%time
# The bare name `LinearRegression` is rebound by the later cuML import at the
# top of the notebook, so import the scikit-learn class under its own alias to
# make sure this cell really measures scikit-learn.
from sklearn.linear_model import LinearRegression as sk_LinearRegression

start_time = time.time()
# Define the model
regressor = sk_LinearRegression()
# Fit the model on the training split
regressor.fit(X_train, Y_train)
# Predict the held-out split
y_pred = regressor.predict(X_test)
# RMSE = sqrt(MSE). `squared=False` was deprecated in scikit-learn 1.4 and
# removed in 1.6; taking the square root explicitly works on every version.
rmse = np.sqrt(mean_squared_error(Y_test, y_pred))
total_time = time.time() - start_time
rmse

In [None]:
# Record this model's score and timing in the results table.
newResult = {'Model Name':'linear_sklearn','RMSE':rmse,'Time':total_time}
# DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
performance = pd.concat([performance, pd.DataFrame([newResult])], ignore_index=True)
performance

## **SVR**

***scikit-learn's SVR takes too long on this data, so the cell below is left disabled; SVR is trained with cuML later in the notebook instead.***

> %%time
> #Define The Model
> svr = sk_SVR(kernel='linear',cache_size=5000,C=10,verbose=6)
> #Fitting The Data
> svr.fit(X_train,Y_train)
> #Test on Testing Data
> svr_pred = svr.predict(X_test)
> #Calculate The Error
> rmse = mean_squared_error(Y_test,svr_pred,squared=False)
> accuracies['svr_sklearn'] = rmse
> rmse

## **Decision Tree**

In [None]:
%%time
start_time = time.time()
# Define the model; a fixed seed makes the fitted tree reproducible across runs.
dt_regressor = DecisionTreeRegressor(random_state=25)
# Fit on the training split
dt_regressor.fit(X_train, Y_train)
# Predict the held-out split
dt_pred = dt_regressor.predict(X_test)
# RMSE = sqrt(MSE). `squared=False` was deprecated in scikit-learn 1.4 and
# removed in 1.6; taking the square root explicitly works on every version.
rmse = np.sqrt(mean_squared_error(Y_test, dt_pred))
total_time = time.time() - start_time
rmse

In [None]:
# Record this model's score and timing in the results table.
newResult = {'Model Name':'dt_regressor','RMSE':rmse,'Time':total_time}
# DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
performance = pd.concat([performance, pd.DataFrame([newResult])], ignore_index=True)
performance

# **Ensemble Models**

## **Random Forest**

In [None]:
%%time
start_time = time.time()
# Define the model; a fixed seed makes the bootstrap samples and feature draws,
# and therefore the reported RMSE, reproducible across runs.
RFR = RandomForestRegressor(random_state=25)
# Fit on the training split
RFR.fit(X_train, Y_train)
# Predict the held-out split
RFR_pred = RFR.predict(X_test)
# RMSE = sqrt(MSE). `squared=False` was deprecated in scikit-learn 1.4 and
# removed in 1.6; taking the square root explicitly works on every version.
rmse = np.sqrt(mean_squared_error(Y_test, RFR_pred))
total_time = time.time() - start_time
rmse

In [None]:
# Record this model's score and timing in the results table.
newResult = {'Model Name':'RandomForestRegressor','RMSE':rmse,'Time':total_time}
# DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
performance = pd.concat([performance, pd.DataFrame([newResult])], ignore_index=True)
performance

## **LightGBM** 

In [None]:
%%time
start_time = time.time()
# Define the model (100 boosting rounds)
lgbm = LGBMRegressor(n_estimators=100)
# Fit on the training split
lgbm.fit(X_train, Y_train)
# Predict the held-out split
lgbm_pred = lgbm.predict(X_test)
# RMSE = sqrt(MSE). `squared=False` was deprecated in scikit-learn 1.4 and
# removed in 1.6; taking the square root explicitly works on every version.
rmse = np.sqrt(mean_squared_error(Y_test, lgbm_pred))
total_time = time.time() - start_time
rmse

In [None]:
# Record this model's score and timing in the results table.
newResult = {'Model Name':'lgbm','RMSE':rmse,'Time':total_time}
# DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
performance = pd.concat([performance, pd.DataFrame([newResult])], ignore_index=True)
performance

## **XGBoost**

In [None]:
%%time
start_time = time.time()
# Define the model (100 boosting rounds, learning rate 0.03, 8 parallel threads)
XGB = XGBRegressor(n_estimators=100, learning_rate=0.03, n_jobs=8)
# Fit on the training split
XGB.fit(X_train, Y_train)
# Predict the held-out split
y_pred = XGB.predict(X_test)
# RMSE = sqrt(MSE). `squared=False` was deprecated in scikit-learn 1.4 and
# removed in 1.6; taking the square root explicitly works on every version.
rmse = np.sqrt(mean_squared_error(Y_test, y_pred))
total_time = time.time() - start_time
rmse

In [None]:
# Record this model's score and timing in the results table.
newResult = {'Model Name':'XGB','RMSE':rmse,'Time':total_time}
# DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
performance = pd.concat([performance, pd.DataFrame([newResult])], ignore_index=True)
performance

# **Using CUML**
scikit-learn is slow here because it runs on the CPU, so the next models are trained with cuML, which runs on the GPU.

In [None]:
# Convert the four train/test arrays to CuPy arrays for the cuML models.
X_train, X_test, Y_train, Y_test = map(
    cp.array, (X_train, X_test, Y_train, Y_test)
)

## **Linear Reg**

In [None]:
%%time
# Import the cuML class under its own alias so this cell does not depend on
# which `LinearRegression` import happened to run last at the top of the notebook.
from cuml.linear_model import LinearRegression as cu_LinearRegression

start_time = time.time()
lr = cu_LinearRegression(fit_intercept = True, normalize = False,
                         algorithm = "eig")

reg = lr.fit(X_train, Y_train)
linear_cuml_preds = lr.predict(X_test)
# .get() copies the CuPy arrays back to host NumPy arrays for the metric.
# RMSE = sqrt(MSE). `squared=False` was deprecated in scikit-learn 1.4 and
# removed in 1.6; taking the square root explicitly works on every version.
rmse = np.sqrt(mean_squared_error(Y_test.get(), linear_cuml_preds.get()))
total_time = time.time() - start_time
rmse

In [None]:
# Record this model's score and timing in the results table.
newResult = {'Model Name':'linear_cuml','RMSE':rmse,'Time':total_time}
# DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
performance = pd.concat([performance, pd.DataFrame([newResult])], ignore_index=True)
performance

## **ElasticNet**

In [None]:
%%time
start_time = time.time()
# Define the model (regularization strength 0.1, even L1/L2 mix)
enet = ElasticNet(alpha = 0.1, l1_ratio=0.5)
reg = enet.fit(X_train, Y_train)
enet_cuml_preds = enet.predict(X_test)
# .get() copies the CuPy arrays back to host NumPy arrays for the metric.
# RMSE = sqrt(MSE). `squared=False` was deprecated in scikit-learn 1.4 and
# removed in 1.6; taking the square root explicitly works on every version.
rmse = np.sqrt(mean_squared_error(Y_test.get(), enet_cuml_preds.get()))
total_time = time.time() - start_time
rmse


In [None]:
# Record this model's score and timing in the results table.
newResult = {'Model Name':'elasticNet_cuml','RMSE':rmse,'Time':total_time}
# DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
performance = pd.concat([performance, pd.DataFrame([newResult])], ignore_index=True)
performance

## **SVR**

In [None]:
%%time
start_time = time.time()
# Define the model (linear-kernel support vector regression on the GPU)
reg = cu_SVR(kernel='linear', gamma='scale', epsilon=0.1,verbose= 6)
reg = reg.fit(X_train, Y_train)
svr_cuml_preds = reg.predict(X_test)
# .get() copies the CuPy arrays back to host NumPy arrays for the metric.
# RMSE = sqrt(MSE). `squared=False` was deprecated in scikit-learn 1.4 and
# removed in 1.6; taking the square root explicitly works on every version.
rmse = np.sqrt(mean_squared_error(Y_test.get(), svr_cuml_preds.get()))
total_time = time.time() - start_time
rmse

In [None]:
# Record this model's score and timing in the results table.
newResult = {'Model Name':'SVR_cuml','RMSE':rmse,'Time':total_time}
# DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
performance = pd.concat([performance, pd.DataFrame([newResult])], ignore_index=True)
performance

# **Visualize Results**

In [None]:
# Side-by-side bar charts comparing every recorded model on RMSE and on time.
sns.set_style("darkgrid", {"grid.color": ".6", "grid.linestyle": ":"})
plt.figure(figsize=[16,6])

plt.subplot(1, 2, 1) # 1 row, 2 columns, first panel
plt.title("Models' RMSE (Less is Better)")
sns.barplot(x='Model Name',y='RMSE',data=performance)
# rotation must be a number (or 'vertical'/'horizontal'); the string "90" is
# rejected with a ValueError by recent matplotlib releases.
plt.xticks(rotation=90)

plt.subplot(1, 2, 2) # 1 row, 2 columns, second panel
plt.title("Models' Training and Evaluating Time (Less is Better)")
sns.barplot(x='Model Name',y='Time',data=performance)
plt.xticks(rotation=90)
# Render the figure and suppress the text repr of the last expression.
plt.show()
