In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder

In [None]:
# Training rows; later cells read a `kfold` column from this file to split folds.
df_train = pd.read_csv('../input/train-foldscsv/train_folds.csv')
# Rows to predict for the submission.
df_test = pd.read_csv('../input/tabular-playground-series-jan-2022/test.csv')
# Submission template; its `num_sold` column is overwritten with predictions at the end.
sample_sub = pd.read_csv('../input/tabular-playground-series-jan-2022/sample_submission.csv')

In [None]:
# Peek at the last rows of the test set.
df_test.tail()

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() only adds a stray "None" line.
df_train.info()
df_test.info()

In [None]:
# Per-column count of missing values, training frame first and then test frame.
for frame in (df_train, df_test):
    print(frame.isnull().sum())

In [None]:
# Correlate only the numeric columns. The frame also holds text columns
# (country, store, product, and the not-yet-parsed date), and DataFrame.corr()
# in pandas >= 2.0 raises on non-numeric data instead of silently dropping it.
sns.heatmap(df_train.select_dtypes(include='number').corr(), cmap='viridis')

In [None]:
# DataFrame.info() prints its own report and returns None; calling it without
# print() avoids the extra "None" line after each summary.
df_train.info()
df_test.info()

In [None]:
# Both frames in one list so the same date-derived columns can be added to each in one loop.
dataset = [df_train, df_test]

In [None]:
# Add calendar features to both frames in place.
for data in dataset:
    data['date'] = pd.to_datetime(data['date'], format="%Y-%m-%d")
    # Monday=0 ... Sunday=6, read straight from the datetime accessor instead of
    # converting to day names and mapping the names back to integers.
    data['dayofweek'] = data['date'].dt.dayofweek
    # 1 for Saturday/Sunday, 0 otherwise; vectorized rather than a Python-level loop.
    data['weekend'] = data['dayofweek'].isin([5, 6]).astype(int)

In [None]:
# Mean num_sold per date, averaged over every training row that shares that date.
daily_average_sale = df_train.groupby('date')['num_sold'].mean()

In [None]:
# Raw array of the per-date averages.
daily_average_sale.values

In [None]:
# The dates the averages are indexed by.
daily_average_sale.index

In [None]:
# Wide bar chart of the average number of sales on each date.
fig, ax = plt.subplots(figsize=(50, 10))
ax.bar(
    daily_average_sale.index,
    daily_average_sale.values,
    color=(0.1, 0.1, 0.1, 0.1),
    edgecolor='blue',
)
ax.set_title('Daily Average Sales')
ax.set_ylabel('Number of Sales')

In [None]:
objects = ['country', 'store', 'product']
# Distinct values of each categorical column in the training data ...
for i in objects:
    print(df_train[i].unique())
print('\n')
# ... and in the test data, so the two sets of categories can be compared.
# (The second loop previously repeated df_train, printing the same output twice.)
for i in objects:
    print(df_test[i].unique())

In [None]:
# Training rows for the KaggleMart store in Finland.
is_finland = df_train['country'] == 'Finland'
is_kagglemart = df_train['store'] == 'KaggleMart'
Finland_Kagglemart = df_train[is_finland & is_kagglemart]
Finland_Kagglemart

In [None]:
def sales_graph(country, store, product):
    """Draw a bar chart of `num_sold` by date for one country/store/product.

    Reads the module-level `df_train` frame, keeps the rows matching all three
    arguments, prints a short caption, and plots the result on a new figure.

    Parameters
    ----------
    country, store, product : str
        Values compared for equality against the `country`, `store` and
        `product` columns of `df_train`.

    Returns
    -------
    None
    """
    selection = df_train[
        (df_train['country'] == country)
        & (df_train['store'] == store)
        & (df_train['product'] == product)
    ]
    fig, ax = plt.subplots(figsize=(50, 10))
    print('IN STORE {} for PRODUCT {}'.format(store, product))
    ax.bar(selection['date'], selection['num_sold'])
    # The printed caption omits the country, so label the figure itself;
    # otherwise charts produced in a loop cannot be told apart.
    ax.set_title('{} - {} - {}'.format(country, store, product))
    ax.set_xlabel('Date')
    ax.set_ylabel('Number of Sales')

In [None]:
# Single example: Kaggle Mug sales at KaggleMart in Finland.
sales_graph('Finland', 'KaggleMart', 'Kaggle Mug')

In [None]:
# Category values used by the plotting loops that follow.
# NOTE: `countries` is not iterated by those loops; each one names its country explicitly.
countries = ['Finland', 'Norway', 'Sweden']
stores = ['KaggleMart', 'KaggleRama']
products = ['Kaggle Hat', 'Kaggle Mug', 'Kaggle Sticker']

In [None]:
# One chart per store/product combination for Finland.
for store_name in stores:
    for product_name in products:
        sales_graph('Finland', store_name, product_name)

In [None]:
# One chart per store/product combination for Norway.
for store_name in stores:
    for product_name in products:
        sales_graph('Norway', store_name, product_name)

In [None]:
# One chart per store/product combination for Sweden.
for store_name in stores:
    for product_name in products:
        sales_graph('Sweden', store_name, product_name)

In [None]:
# Preview the first rows of the training frame.
df_train.head()

In [None]:
def create_time_features(df):
    """Add calendar columns derived from the datetime column `date`.

    The frame is modified in place and also returned. Columns written:
    `Day` (day of the month), `month`, `year`, and `dayofmonth`, which despite
    its name holds the number of days in that month (`dt.days_in_month`).
    """
    stamps = df['date'].dt
    # (output column, datetime accessor attribute), in the order the columns are added.
    column_sources = (
        ('Day', 'day'),
        ('month', 'month'),
        ('year', 'year'),
        ('dayofmonth', 'days_in_month'),
    )
    for column, attribute in column_sources:
        df[column] = getattr(stamps, attribute)
    return df

In [None]:
# Add the calendar columns to the training frame and then the test frame.
df_train, df_test = (create_time_features(frame) for frame in (df_train, df_test))

## Preprocessing the dataset for training the model

In [None]:
# Show the training frame with the time features added.
df_train

In [None]:
from xgboost import XGBRegressor

In [None]:
# Model inputs: every training column except the fold id, row id, raw date and target.
excluded_columns = ('kfold', 'row_id', 'date', 'num_sold')
useful_features = [
    column for column in df_train.columns if column not in excluded_columns
]
# Text columns that need encoding before they can be fed to the model.
cat_cols = ['store', 'product', 'country']
print(useful_features)
print(cat_cols)
# Keep only the model inputs in the test frame.
df_test = df_test[useful_features]
df_test

In [None]:
# The training columns that will be used as model inputs.
df_train[useful_features]

In [None]:
# Cross-validated training: for each fold, fit on the other folds, score on the
# held-out fold, and predict the test set. Per-fold predictions are collected
# in `valid_preds` (validation) and `final_predslist` (test).
valid_preds = []
final_predslist = []
print(cat_cols)
# NOTE(review): 'gpu_hist' needs a GPU-enabled XGBoost build, and XGBoost 2.x
# deprecates it in favour of tree_method='hist' with device='cuda' - confirm
# against the installed version before changing.
xg_boost_params = {'max_depth': 7, 'alpha':0.08, 'eval_metric' : 'rmse',
                  'tree_method': 'gpu_hist'}
for fold in range(5):
    # Rows whose kfold equals `fold` are held out; all other rows are used to fit.
    train_rows = df_train[df_train['kfold'] != fold].reset_index(drop=True)
    valid_rows = df_train[df_train['kfold'] == fold].reset_index(drop=True)

    y_train = train_rows['num_sold']
    y_valid = valid_rows['num_sold']

    # Explicit copies: the encoded columns are written into these frames below,
    # and assigning into a bare column selection triggers SettingWithCopyWarning.
    x_train = train_rows[useful_features].copy()
    x_valid = valid_rows[useful_features].copy()
    x_test = df_test.copy()

    # Fit the encoder on this fold's training rows only, then apply the same
    # mapping to the validation and test rows.
    encoder = OrdinalEncoder()
    x_train[cat_cols] = encoder.fit_transform(x_train[cat_cols])
    x_valid[cat_cols] = encoder.transform(x_valid[cat_cols])
    x_test[cat_cols] = encoder.transform(x_test[cat_cols])

    mod = XGBRegressor(**xg_boost_params)
    mod.fit(x_train, y_train)
    preds = mod.predict(x_valid)
    final_preds = mod.predict(x_test)
    valid_preds.append(preds)
    final_predslist.append(final_preds)
    # RMSE as sqrt(MSE): the `squared=False` argument of mean_squared_error is
    # deprecated and removed in recent scikit-learn releases.
    print(fold, np.sqrt(mean_squared_error(y_valid, preds)))

In [None]:
# Show the submission template read from sample_submission.csv.
sample_sub

In [None]:
# Average the test predictions across folds: one column per fold, mean over each row.
fold_matrix = np.column_stack(final_predslist)
preds = fold_matrix.mean(axis=1)

In [None]:
# Put the fold-averaged predictions into the template's num_sold column.
sample_sub['num_sold'] = preds

In [None]:
# Write the submission file; index=False keeps the DataFrame index out of the CSV.
sample_sub.to_csv("submission.csv", index=False)