In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns 
import matplotlib.pyplot as plt
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input mount.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the competition's train and test tables from the Kaggle input mount.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/tabular-playground-series-jan-2022/train.csv",
        "/kaggle/input/tabular-playground-series-jan-2022/test.csv",
    )
)

# **EDA/Visualization**

**Exploring dataframe**

In [None]:
# Show the column labels of the training frame.
train.columns

In [None]:
# Distinct values of each categorical column in the training data.
country_list, store_list, product_list = (
    train[column].unique() for column in ('country', 'store', 'product')
)

print(f'Country List :{country_list}')
print(f'Store List :{store_list}')
print(f'Product List :{product_list}')


In [None]:
# Row counts per category, shown as a (country, store, product) tuple.
tuple(train[column].value_counts() for column in ('country', 'store', 'product'))

In [None]:
# Summary statistics of the training frame's numeric columns.
train.describe()

In [None]:
def get_all_cols(df, target, exclude=None):
    """Split the columns of ``df`` into categorical, numeric and feature lists.

    Args:
        df: DataFrame whose columns are classified.
        target: Name of the target column; it is never reported as a feature.
        exclude: Optional list of further column names to keep out of the
            feature list. ``None`` (the default) means no extra exclusions.

    Returns:
        A tuple ``(object_cols, num_cols, feature_cols)`` of lists:
        the columns whose dtype is ``object``, the columns whose dtype is not
        ``object``, and every column that is neither ``target`` nor listed in
        ``exclude`` (object columns first, then the rest).
    """
    # Inspect the frame that was passed in, not a notebook-level global, so
    # the helper gives the right answer for any DataFrame.
    object_cols = [cname for cname in df.columns
                   if df[cname].dtype == 'object']
    num_cols = [cname for cname in df.columns
                if df[cname].dtype != 'object']

    # Build a fresh list on every call; a mutable default argument would be
    # shared between calls.
    exclude_cols = list(exclude or []) + [target]

    feature_cols = [col for col in object_cols + num_cols
                    if col not in exclude_cols]

    # Callers unpack the second item as `num_cols`, so return the numeric
    # column list rather than the exclusion list.
    return object_cols, num_cols, feature_cols

In [None]:
# Group the training columns; 'num_sold' is the target and the listed
# columns are kept out of the feature list.
object_cols, num_cols, feature_cols = get_all_cols(
    train,
    target='num_sold',
    exclude=['row_id', 'date', 'num_sold'],
)

In [None]:
# Display the three lists returned by get_all_cols.
object_cols , num_cols , feature_cols

In [None]:
def evaluate_time(df):
    """Print the smallest and largest value of the ``date`` column of ``df``."""
    date_column = df['date']
    min_date, max_date = date_column.min(), date_column.max()
    print(f'Min Date :{min_date} /Max Date :{max_date}')
    
    
    
    
# Report the date range covered by the training set, then by the test set.
for frame in (train, test):
    evaluate_time(frame)

# Plotting

**Time-Series Plot**

In [None]:
# One sub-frame per store.
km_df = train.loc[train['store'].eq("KaggleMart")]
kr_df = train.loc[train['store'].eq('KaggleRama')]

In [None]:
# Total units sold per date at KaggleMart, drawn as a line plot.
km_group_df = km_df.groupby(['date'])['num_sold'].agg('sum')
km_group_df.plot.line(figsize=(10, 5))

In [None]:
# Total units sold per date at KaggleRama, drawn as a line plot.
kr_grouped_df = kr_df.groupby(['date'])['num_sold'].agg('sum')
kr_grouped_df.plot.line(figsize=(10, 5))

In [None]:
# Units sold per product for each store. With as_index=False the result is
# already a flat DataFrame (despite the `_series` names).
km_groupby_series, kr_groupby_series = (
    store_df.groupby(by=['product'], as_index=False)['num_sold'].sum()
    for store_df in (km_df, kr_df)
)

# NOTE(review): reset_index() on an already flat frame only adds an extra
# `index` column; kept because later cells may rely on the current shape.
km_grouped_df = km_groupby_series.reset_index()
kr_grouped_df = kr_groupby_series.reset_index()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# Global plot styling (seaborn styles: darkgrid, whitegrid, dark, white, ticks).
sns.set_style('white')
colors = sns.color_palette('pastel')  # palette handle; not used by the plot below

# Font sizes: axes title / axis labels, tick labels, legend, default text.
plt.rc('axes', titlesize=18, labelsize=14)
for tick_group in ('xtick', 'ytick'):
    plt.rc(tick_group, labelsize=13)
plt.rc('legend', fontsize=13)
plt.rc('font', size=13)

# Units sold per product at KaggleMart.
sns.barplot(x='product', y='num_sold', data=km_grouped_df);

In [None]:
# Units sold per product at KaggleRama.
sns.barplot(x='product', y='num_sold', data=kr_grouped_df)

In [None]:
# Violin plot of the sales distribution for each product.
sales_sorted = train.sort_values("num_sold", ascending=False)
sns.catplot(
    data=sales_sorted,
    x="product",
    y="num_sold",
    kind="violin",
    height=4,
    aspect=3,
)
plt.show()

In [None]:
def dist_plots(df):
    """Draw a density histogram with a KDE curve for the given values.

    Args:
        df: Data handed to seaborn; the notebook passes a single numeric
            column (a Series) despite the parameter name.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    plt.figure(figsize=(10, 5))
    plt.title("Distribution Plot")
    # `sns.distplot` is deprecated and slated for removal; `histplot` with a
    # KDE overlay on a density scale is the documented replacement.
    sns.histplot(df, kde=True, stat="density", kde_kws=dict(cut=3))
    sns.despine()
    plt.show()

In [None]:
# Skewness and distribution of the target as it currently stands.
raw_sales = train['num_sold']
print(raw_sales.skew())
dist_plots(raw_sales)

In [None]:
# Replace the target with its natural log.
# NOTE: this overwrites the column in place, so running the cell twice
# applies the log twice; reload `train` before re-running.
train['num_sold'] = train['num_sold'].transform(np.log)

In [None]:
# Skewness and distribution of the target after the log transform.
log_sales = train['num_sold']
print(log_sales.skew())
dist_plots(log_sales)

In [None]:
def box_plots(df):
    """Draw a horizontal box plot of the given values.

    Args:
        df: Data handed to seaborn; the notebook passes a single numeric
            column (a Series) despite the parameter name.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    plt.figure(figsize=(10, 5))
    plt.title("Box Plot")
    # Pass the values by keyword: a positional argument is read as `x` by
    # older seaborn releases and as `data` by newer ones, which changes the
    # orientation of the plot (and warns in between).
    sns.boxplot(x=df)
    plt.show()

In [None]:
# Box plot of the target before the IQR-based outlier filter below.
box_plots(train['num_sold'])

In [None]:
# Tukey fences: quartiles of the target widened by 1.5 * IQR on each side.
perecentile25, perecentile75 = train['num_sold'].quantile([.25, .75])

iqr = perecentile75 - perecentile25

lower_limit = perecentile25 - 1.5 * iqr
upper_limit = perecentile75 + 1.5 * iqr

In [None]:
# Keep only rows whose target lies strictly inside the IQR fences.
inside_fences = (train['num_sold'] > lower_limit) & (train['num_sold'] < upper_limit)
# .copy() makes the result an independent frame, so the column assignments
# in later cells do not raise SettingWithCopyWarning.
train = train[inside_fences].copy()
train.shape

In [None]:
# Box plot of the target after the outlier filter.
box_plots(train['num_sold'])

In [None]:
# One-hot encode the categorical variables so that the regressors can
# consume them as numeric features.
def create_one_hot(df, categ_colums):
    """Return ``df`` with the given categorical columns one-hot encoded.

    Args:
        df: Input DataFrame; it is not modified.
        categ_colums: Names of the columns to expand into indicator columns.

    Returns:
        A new DataFrame in which each listed column is replaced by one
        indicator column per category (via ``pd.get_dummies``).
    """
    return pd.get_dummies(df, columns=categ_colums)

In [None]:
def add_date_features(frame):
    """Return a copy of ``frame`` with ``date`` replaced by calendar features.

    Adds ``year``, ``week`` (ISO week number), ``day`` (day of month) and
    ``weekday`` (Monday=0) columns, then drops the ``date`` column.
    """
    dates = pd.to_datetime(frame['date'])  # parse once instead of per feature
    return frame.assign(
        year=dates.dt.year,
        # `.dt.week` was deprecated in pandas 1.1 and removed in 2.0;
        # isocalendar().week is the replacement. Cast back to a plain int
        # because isocalendar() yields the nullable UInt32 dtype.
        week=dates.dt.isocalendar().week.astype(int),
        day=dates.dt.day,
        weekday=dates.dt.dayofweek,
    ).drop(columns='date')


# Apply the same transformation to both splits so their columns stay aligned.
train = add_date_features(train)
test = add_date_features(test)
train.head()

In [None]:
# Column dtypes and non-null counts after the date features were added.
train.info()

In [None]:
# Recompute the column groups now that `date` has been replaced.
object_cols, num_cols, feature_cols = get_all_cols(
    train, 'num_sold', exclude=['row_id', 'num_sold']
)

In [None]:
# Display the first two lists returned by get_all_cols.
object_cols , num_cols

In [None]:
# One-hot encode the categorical columns of both splits.
train_df = create_one_hot(train, object_cols)
test_df = create_one_hot(test, object_cols)

# The two frames are encoded independently, so a category present in only
# one split would give them different columns. Force the test frame onto the
# training layout (everything except the target): indicator columns missing
# from test become 0, and columns unknown to train are dropped.
expected_test_cols = [col for col in train_df.columns if col != 'num_sold']
test_df = test_df.reindex(columns=expected_test_cols, fill_value=0)

In [None]:
# train_df = create_one_hot(train_df, ['year'])
# test_df = create_one_hot(test_df, ['year'])

In [None]:
# Display the one-hot encoded training frame.
train_df

In [None]:
# Display the one-hot encoded test frame.
test_df

In [None]:
# Remove the row identifier from the training frame.
# Rebinding with errors='ignore' (instead of an in-place drop) lets the cell
# be re-run without a KeyError once the column is already gone.
train_df = train_df.drop(columns=['row_id'], errors='ignore')


In [None]:
# Separate the feature matrix from the target column.
xtrain, ytrain = train_df.drop(columns=['num_sold']), train_df['num_sold']


In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    xtrain,
    ytrain,
    test_size=0.3,
    random_state=42,
)

In [None]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import ElasticNet
from sklearn.metrics import r2_score
from sklearn.svm import SVR

In [None]:
# Size of the full feature matrix.
nrow = len(xtrain.index)
ncol = len(xtrain.columns)
print('No of Row: ',nrow)
print('No of Columns: ',ncol)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

def scores(i):
    """Fit one estimator on the training split and record hold-out metrics.

    Reads the notebook-level ``x_train``, ``y_train``, ``x_test`` and
    ``y_test`` and appends one value to each notebook-level result list:
    ``s`` (R2), ``s1`` (adjusted R2), ``s2`` (100 - MAPE), ``s3`` (MAPE),
    ``s4`` (sMAPE), ``s5`` (MAE), ``s6`` (MSE) and ``s7`` (RMSE).

    Args:
        i: Estimator class. It is instantiated with no arguments and must
            provide ``fit`` and ``predict``.

    Returns:
        None; results are reported only through the appended lists.
    """
    model = i()
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)

    r2 = r2_score(y_test, y_pred)
    s.append(r2)

    # Adjusted R2 must use the size of the sample R2 was computed on, i.e.
    # the hold-out split, not the row count of the full data set.
    n_eval, n_features = x_test.shape
    adj_r2_score = 1 - ((1 - r2) * (n_eval - 1)) / (n_eval - 1 - n_features)
    s1.append(adj_r2_score)

    abs_errors = np.abs(y_test - y_pred)

    # Symmetric MAPE: error relative to the mean of actual and predicted.
    smape = np.mean(abs_errors / ((y_test + y_pred) / 2) * 100)
    mape = 100 * np.mean(abs_errors / y_test)
    s2.append(100 - mape)
    s3.append(mape)
    s4.append(smape)

    s5.append(abs_errors.mean())

    mse = ((y_test - y_pred) ** 2).mean()
    s6.append(mse)
    s7.append(np.sqrt(mse))

# Candidate regressors, each evaluated with its default hyper-parameters.
algos = [
    LinearRegression,
    KNeighborsRegressor,
    RandomForestRegressor,
    Lasso,
    ElasticNet,
    DecisionTreeRegressor,
    GradientBoostingRegressor,
    SVR,
]

# Fresh result lists (one per metric) that scores() appends to.
s, s1, s2, s3, s4, s5, s6, s7 = ([] for _ in range(8))

for algo in algos:
    scores(algo)

In [None]:
# Comparison table: one row per evaluated estimator, one column per metric.
# Row labels come from `algos` itself, and the metric lists are used whole,
# so the table cannot drift out of sync with the list of models that was run.
metric_columns = {
    'r2 Scores': s,
    'Ajd r2 Score': s1,
    'Accuracy': s2,
    'MAPE': s3,
    'SMAPE': s4,
    'MAE': s5,
    'MSE': s6,
    'RMSE': s7,
}
models = pd.DataFrame({
    'Method': [algo.__name__ for algo in algos],
    **metric_columns,
})
models.sort_values(by='r2 Scores', ascending=False)

In [None]:
divider = '--------------------------------------------------------------'

# Hold-out evaluation. The evaluation model is fitted on the training split
# only: x_test is a subset of xtrain, so scoring a model fitted on xtrain
# would measure performance on rows it has already seen.
# A fixed seed makes the reported numbers reproducible between runs.
rfr_eval = RandomForestRegressor(random_state=42)
rfr_eval.fit(x_train, y_train)
y_pred = rfr_eval.predict(x_test)

# NOTE: the target was log-transformed earlier, so every metric below is
# measured on the log scale.
r2Score = r2_score(y_test, y_pred)
print('R2 Score',r2Score)
print(divider)

# Adjusted R2 uses the size of the sample R2 was computed on.
n_eval, n_features = x_test.shape
adj_r2_score = 1 - (((1 - r2Score) * (n_eval - 1)) / (n_eval - 1 - n_features))
print('Adjusted r2 Score',adj_r2_score)
print(divider)

errors = abs(y_test - y_pred)
err = (y_test + y_pred)/2
smape = np.mean((errors/err)*100)
print('sMAPE',smape)
print(divider)

mape = 100 * np.mean(errors / y_test)
accuracy = 100 - mape
print('Accuracy = {:0.2f}%.'.format(accuracy))
print(divider)
print('MAE', errors.mean())
print(divider)
print('RMSE: ', np.sqrt(((y_test - y_pred)**2).mean()))
print(divider)
print('MSE: ', ((y_test - y_pred)**2).mean())

# Final model used for the submission: refit on every available training row.
rfr_Model = RandomForestRegressor(random_state=42)
rfr_Model.fit(xtrain, ytrain)

In [None]:
# Take the identifiers from `test` (same rows and order as test_df) and
# rebind test_df without `row_id`. Unlike an in-place drop, this can be
# re-run without a KeyError once the column has been removed.
row_id = test['row_id']
test_df = test_df.drop(columns=['row_id'], errors='ignore')
df_test_preds = rfr_Model.predict(test_df)

# The model was trained on log(num_sold); exponentiate to get back to units.
num_sold = pd.DataFrame({'num_sold': np.exp(df_test_preds)})
df_result = pd.concat([row_id.reset_index(drop=True), num_sold], axis=1)

In [None]:
# Display the training feature matrix.
xtrain

In [None]:
# Display the test frame that was passed to the model.
test_df

In [None]:
# Display the assembled (row_id, num_sold) prediction table.
df_result

In [None]:
# Write the predictions to the working directory, without the index column.
# NOTE(review): the file is named 'sample_submission.csv'; confirm this is
# the name the competition expects for a submission.
df_result.to_csv('sample_submission.csv', index=False)

In [None]:
# Read the file back and preview its first rows as a sanity check.
submit = pd.read_csv('./sample_submission.csv')
submit.head()