In [None]:


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_roc_curve, accuracy_score, classification_report
from datetime import datetime

# Silence all library warnings and remove pandas' row/column display truncation
import warnings
warnings.filterwarnings(action='ignore')
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [None]:
# Load the competition training data from the Kaggle input directory.
df= pd.read_csv("../input/tabular-playground-series-jan-2022/train.csv")

In [None]:
# Preview the first rows of the training frame.
df.head()

In [None]:
# (rows, columns) of the training frame.
df.shape

In [None]:
# Column dtypes and non-null counts.
df.info(verbose=True)

In [None]:
# describe() summary, transposed so each described column becomes a row.
df.describe().T

In [None]:
# Percentage of missing entries per column, rounded to 2 decimals, largest first
null_values = (
    df.isnull()
    .mean()
    .mul(100)
    .round(2)
    .sort_values(ascending=False)
)
null_values

In [None]:
# Names of the columns whose nunique() is exactly 1
unique_val_list = [name for name in df.columns if df[name].nunique() == 1]

print("List of Features having unique value and NAN :\n\n",unique_val_list)

In [None]:
# Keep the column index of the training frame under a short name and display it.
col = df.columns
col

In [None]:

# Box plots of the target against each categorical feature.
# All three panels are drawn on ONE figure (1 row x 3 columns) with a shared
# y-axis so they can be compared directly. Previously every plot opened its
# own 20x12 figure and occupied a single cell of a 2x2 grid, which left three
# quarters of each figure empty.
fig, axes = plt.subplots(1, 3, figsize=(20, 6), sharey=True)
for ax, feature in zip(axes, ['country', 'store', 'product']):
    sns.boxplot(x=feature, y='num_sold', data=df, ax=ax)
    ax.set_title(f"num_sold by {feature}")
fig.tight_layout()
plt.show()

In [None]:
def dist_plot(col, title="Distribution of target"):
    """Plot a density histogram with a KDE overlay for one column of the global ``df``.

    Parameters
    ----------
    col : str
        Name of the column of ``df`` to plot.
    title : str, optional
        Figure title. Defaults to the heading that was previously hard-coded.
    """
    fig, ax = plt.subplots(figsize=(10, 7), facecolor='white')
    # sns.distplot is deprecated; histplot with stat="density" and a KDE
    # overlay (cut=3) is the replacement seaborn documents for it.
    sns.histplot(df[col], bins=20, kde=True, stat="density", kde_kws={"cut": 3}, ax=ax)
    fig.suptitle(title, fontsize=20)
    plt.show()

dist_plot("num_sold")

In [None]:
# Replace the target with its natural log, in place. Everything fitted,
# scored or predicted from df["num_sold"] after this cell is in log space.
# NOTE: the cell is not idempotent - running it again applies the log to the
# already-transformed column.
df["num_sold"] = np.log(df["num_sold"])

In [None]:
# One-hot encode the three categorical columns (first level of each dropped)
# and append the indicator columns to the training frame.
df1, df2, df3 = (
    pd.get_dummies(df[feature], drop_first=True)
    for feature in ('country', 'store', 'product')
)
df = pd.concat([df, df1, df2, df3], axis=1)

In [None]:
# Preview the frame after the indicator columns were concatenated.
df.head()

In [None]:
def create_time_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with calendar features derived from its ``date`` column.

    The ``date`` column is parsed with ``pd.to_datetime`` and the following
    columns are added: ``year``, ``quarter``, ``month``, ``day``,
    ``dayofweek``, ``dayofmonth``, ``dayofyear``, ``weekofyear``, ``weekday``
    and ``is_weekend``.

    Notes
    -----
    * ``dayofmonth`` holds ``days_in_month`` (the length of the month), not
      the day number; the column name is kept for compatibility.
    * ``weekday`` carries the same values as ``dayofweek`` (Monday=0).
    * ``weekofyear`` is the ISO week number. ``Series.dt.weekofyear`` no longer
      exists in pandas 2.x, so ``dt.isocalendar().week`` is used instead.
    * The input frame is not modified; callers must use the returned frame.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing a ``date`` column parseable by ``pd.to_datetime``.

    Returns
    -------
    pd.DataFrame
        A new frame with the parsed ``date`` and the added feature columns.
    """
    out = df.copy()  # do not mutate the caller's frame

    out['date'] = pd.to_datetime(out['date'])  # Convert the date to datetime.
    dates = out['date'].dt

    # Start the feature creation process.
    out['year'] = dates.year
    out['quarter'] = dates.quarter
    out['month'] = dates.month
    out['day'] = dates.day
    out['dayofweek'] = dates.dayofweek
    out['dayofmonth'] = dates.days_in_month
    out['dayofyear'] = dates.dayofyear

    # isocalendar() yields a nullable UInt32 column; cast to a plain numpy
    # dtype (float only when missing dates make that necessary) so the column
    # has the same kind of dtype as the other features.
    iso_week = dates.isocalendar().week
    out['weekofyear'] = iso_week.astype('float64' if iso_week.isna().any() else 'int64')

    out['weekday'] = dates.weekday
    out['is_weekend'] = np.where(out['weekday'].isin([5, 6]), 1, 0)

    return out
# Add the calendar features to the training frame and preview the result.
df = create_time_features(df)
df.head()

In [None]:
# Remove the raw categorical and date columns (already encoded above) and the row identifier
df = df.drop(columns=["country", "store", "product", "date", "row_id"])

In [None]:
# Separate the (log-transformed) target from the feature matrix
Y = df["num_sold"]
X = df.drop(columns="num_sold")

# Hold out 30% of the rows for evaluation, with a fixed seed for a repeatable split
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, random_state=42
)

print("Shape of Train DataSet:",X_train.shape, Y_train.shape)
print("Shape of Test DataSet:",X_test.shape, Y_test.shape)

In [None]:
def dist_plots(df):
    """Plot a density histogram with a KDE overlay for the values passed in.

    Parameters
    ----------
    df : array-like
        Values to plot (called below with a single column). The name shadows
        the global frame inside this function only.
    """
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.set_title("Distribution Plot")
    # sns.distplot is deprecated; histplot with stat="density" and a KDE
    # overlay (cut=3) is the replacement seaborn documents for it.
    sns.histplot(df, kde=True, stat="density", kde_kws={"cut": 3}, ax=ax)
    sns.despine(fig=fig)
    plt.show()
# Skewness of the target column as currently stored in the frame
print(df['num_sold'].skew())

In [None]:

# Plot the distribution of the target column as currently stored in df.
dist_plots(df['num_sold'])

In [None]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import ElasticNet
from sklearn.metrics import r2_score
from sklearn.svm import SVR

In [None]:
# Row and column counts of the training feature matrix
nrow = X_train.shape[0]
ncol = X_train.shape[1]
print('No of Row: ',nrow)
print('No of Columns: ',ncol)

In [None]:
def scores(i, random_state=42):
    """Fit one regressor class and append its hold-out metrics to the global score lists.

    Reads the module-level ``X_train``, ``Y_train``, ``X_test`` and ``Y_test``
    and appends one value to each of: ``s`` (R²), ``s1`` (adjusted R²),
    ``s2`` (100 - MAPE), ``s3`` (MAPE), ``s4`` (SMAPE), ``s5`` (MAE),
    ``s6`` (MSE) and ``s7`` (RMSE). All metrics are computed on the target
    exactly as stored in ``Y_test``.

    Parameters
    ----------
    i : type
        Estimator class that can be constructed without arguments and exposes
        ``get_params``/``set_params``/``fit``/``predict``.
    random_state : int, optional
        Seed set on estimators that expose a ``random_state`` parameter, so
        repeated runs of the comparison give the same numbers.
    """
    estimator = i()
    if "random_state" in estimator.get_params():
        estimator.set_params(random_state=random_state)
    estimator.fit(X_train, Y_train)
    y_pred = estimator.predict(X_test)

    r2 = r2_score(Y_test, y_pred)
    s.append(r2)

    # R² above is measured on the test split, so the adjustment must use the
    # test split's sample count (not the training row count).
    n_obs, n_features = X_test.shape
    adj_r2_score = 1 - ((1 - r2) * (n_obs - 1)) / (n_obs - 1 - n_features)
    s1.append(adj_r2_score)

    residuals = Y_test - y_pred
    abs_errors = np.abs(residuals)

    # SMAPE/MAPE denominators use absolute values so that non-positive
    # targets or predictions cannot flip the sign of a term.
    smape = np.mean(abs_errors / ((np.abs(Y_test) + np.abs(y_pred)) / 2) * 100)
    mape = 100 * np.mean(abs_errors / np.abs(Y_test))
    accuracy = 100 - mape
    s2.append(accuracy)
    s3.append(mape)
    s4.append(smape)

    MAE = abs_errors.mean()
    s5.append(MAE)

    MSE = (residuals ** 2).mean()
    s6.append(MSE)

    RMSE = np.sqrt(MSE)
    s7.append(RMSE)

# Regressor classes to benchmark, in the order their scores are collected
algos = [
    LinearRegression,
    KNeighborsRegressor,
    RandomForestRegressor,
    Lasso,
    ElasticNet,
    XGBRegressor,
    DecisionTreeRegressor,
    GradientBoostingRegressor,
    SVR,
]

# One accumulator list per metric (s8 is created here but scores() never appends to it)
s, s1, s2, s3, s4, s5, s6, s7, s8 = ([] for _ in range(9))

for algo in algos:
    scores(algo)

In [None]:
# Comparison table: one row per benchmarked regressor, one column per metric.
# Method names are taken from `algos` itself and the accumulator lists are
# used whole, so the labels cannot drift out of step with the order or number
# of models that were actually scored (a length mismatch now raises instead
# of silently mislabelling rows).
# 'Accuracy' is 100 - MAPE as computed in scores().
models = pd.DataFrame({
    'Method': [algo.__name__ for algo in algos],
    'r2 Scores': list(s),
    'Ajd r2 Score': list(s1),
    'Accuracy': list(s2),
    'MAPE': list(s3),
    'SMAPE': list(s4),
    'MAE': list(s5),
    'MSE': list(s6),
    'RMSE': list(s7),
})
models.sort_values(by='r2 Scores', ascending=False)

In [None]:
# Fit a default-parameter XGBoost regressor and score it on the hold-out split
model = XGBRegressor()
model.fit(X_train, Y_train)

y_pred_final = model.predict(X_test)
lin_r = r2_score(y_true=Y_test, y_pred=y_pred_final)


In [None]:
# Predicted versus actual target values on the hold-out split
axes = plt.gca()
axes.scatter(Y_test, y_pred_final, alpha=0.5)
axes.set_xlabel("y_test")
axes.set_ylabel("y_pred")
plt.show()

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Candidate hyper-parameter values sampled by the randomized XGBoost search

params = dict(
    learning_rate=[0.05, 0.10, 0.15, 0.20, 0.25, 0.30],
    max_depth=[3, 4, 5, 6, 8, 10, 12, 15],
    min_child_weight=[1, 3, 5, 7],
    gamma=[0.0, 0.1, 0.2, 0.3, 0.4],
    colsample_bytree=[0.3, 0.4, 0.5, 0.7],
)

In [None]:
# Randomized search over `params`: 50 sampled settings, 5-fold CV, scored by negative MSE
xgboost_model = XGBRegressor()
xgb_model_tuned = RandomizedSearchCV(
    xgboost_model,
    params,
    n_iter=50,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=1,
)

In [None]:
# Run the randomized hyper-parameter search on the training split.
xgb_model_tuned.fit(X_train,Y_train)

In [None]:
# Parameter combination selected by the search.
xgb_model_tuned.best_params_

In [None]:
# Predict the hold-out split with the fitted search object (target is on the log scale).
prediction = xgb_model_tuned.predict(X_test)

In [None]:
# R² of the tuned model on the hold-out split.
tun_score = r2_score(Y_test, prediction)

In [None]:
# Display the tuned R².
tun_score

In [None]:
# Load the competition test data from the Kaggle input directory and preview it.
test = pd.read_csv("../input/tabular-playground-series-jan-2022/test.csv")
test.head()

In [None]:
# One-hot encode the test categoricals against the TRAINING indicator columns
# (df1/df2/df3). Encoding test on its own with drop_first=True drops whichever
# level sorts first in the test data, so a level that is missing from test (or
# only present in it) would shift or add columns relative to what the model
# was fitted on. Reindexing to the training columns keeps names, order and
# meaning identical; levels absent from test become all-zero columns.
d1 = pd.get_dummies(test['country']).reindex(columns=df1.columns, fill_value=0)
d2 = pd.get_dummies(test['store']).reindex(columns=df2.columns, fill_value=0)
d3 = pd.get_dummies(test['product']).reindex(columns=df3.columns, fill_value=0)
test = pd.concat([test, d1, d2, d3], axis=1)

In [None]:
# Add the calendar features to the test frame with the same helper used for training, then preview.
test = create_time_features(test)
test.head()

In [None]:
# Remove the same raw columns and row identifier that were dropped from the training frame
test = test.drop(columns=["country", "store", "product", "date", "row_id"])

In [None]:
# Predict the test set with the fitted search object. The model was trained
# on log(num_sold), so these values are on the log scale.
test_prediction = xgb_model_tuned.predict(test)

In [None]:
# Build the submission frame.
# * The model was fitted on log(num_sold), so predictions are mapped back to
#   the original scale with exp before being written out.
# * `row_id` was dropped from `test` before predicting, and `test.index` is
#   only the positional index assigned by read_csv, which need not equal the
#   file's identifiers. The real row_id values are therefore re-read from the
#   test file (same file, same row order as the predictions).
test_row_ids = pd.read_csv(
    "../input/tabular-playground-series-jan-2022/test.csv", usecols=["row_id"]
)["row_id"]
predication_output = pd.DataFrame({
    'row_id': test_row_ids.to_numpy(),
    'num_sold': np.exp(test_prediction),
})


In [None]:
# Preview the first rows of the submission frame.
predication_output.head()