# KaggleMart vs KaggleRama

## Import the required libraries 

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## Import the train and test datasets

In [None]:
# Load the competition's train and test CSV files from the Kaggle input directory.
sales = pd.read_csv('../input/tabular-playground-series-jan-2022/train.csv')
sales_test = pd.read_csv('../input/tabular-playground-series-jan-2022/test.csv')

## Understanding the data

In [None]:
# Preview the first rows of the training data.
sales.head()

In [None]:
# Summary statistics for the training data.
sales.describe()

In [None]:
# (rows, columns) of the training data.
sales.shape

In [None]:
# Count the missing values in each column.
sales.isna().sum()

## Get unique values for categorical features 

In [None]:
# Distinct values in the store column.
sales['store'].unique()

In [None]:
# Distinct values in the country column.
sales['country'].unique()

In [None]:
# Distinct values in the product column.
sales['product'].unique()

## Store-wise total units sold

In [None]:
# Sum the numeric columns per store.
# numeric_only=True keeps text columns (e.g. country, product) out of the
# aggregation; without it, recent pandas versions "sum" strings by
# concatenating them, which is slow and meaningless here.
total_sales = sales.groupby('store').sum(numeric_only=True)
total_sales

In [None]:
# A summed row_id carries no meaning, so remove that column in place.
total_sales.drop(columns=['row_id'], inplace=True)

In [None]:
# Display the per-store totals after dropping row_id.
total_sales

## Total units sold across all categorical features

In [None]:
# Total units sold for every (store, country, product) combination.
# numeric_only=True restricts the sum to numeric columns so that text columns
# are not string-concatenated on recent pandas versions; the summed row_id is
# meaningless and is dropped straight away.
total_units = (
    sales.groupby(['store', 'country', 'product'])
    .sum(numeric_only=True)
    .drop(columns=['row_id'])
)
total_units

In [None]:
# unstack() pivots the innermost index level (product) into columns, so the
# x-axis is left with the (store, country) pairs.
ax = total_units.unstack().plot()
ax.set_title('Comparison of units sold')
ax.set_xlabel('Store, Country')
ax.set_ylabel('Units Sold')
plt.xticks(rotation=45)
plt.show()

## Data Visualization

In [None]:
def countplot_features(df, feature, title, color):
    """Draw a count plot showing how often each value of one column occurs.

    Args:
        df: DataFrame holding the column to plot.
        feature: Name of the column whose values are counted.
        title: Text used as the figure title.
        color: Bar colour passed through to seaborn.

    Returns:
        None. The figure is displayed via ``plt.show()``.
    """
    plt.figure(figsize=(10, 5))

    # Pass the column by keyword: newer seaborn releases accept only ``data``
    # positionally, so a positional Series is no longer read as the x variable.
    sns.countplot(x=feature, data=df, color=color)

    plt.title(title, fontsize=15)
    plt.xticks(rotation=90)
    plt.show()

In [None]:
# Plot the value frequencies of each categorical column.
cat_features = ['country', 'store', 'product']
for feature in cat_features:
    # countplot_features shows the figure itself and returns nothing, so
    # there is no result worth binding to a name.
    countplot_features(sales, feature=feature, title=f"Frequency of {feature}", color='red')

In [None]:
# Point plot of num_sold against each categorical column.
# catplot is a figure-level function that creates its own figure, so no
# plt.figure() call is made beforehand (that would only leave an empty
# figure behind for every plot).
for col in ['country', 'store', 'product']:
    sns.catplot(x=col, y='num_sold', data=sales, kind='point', aspect=3)
    plt.xticks(rotation=90)

## Format the date feature 

In [None]:
# Check how pandas typed the date column before any conversion.
sales['date'].dtype

In [None]:
# Parse the date column into datetimes so the .dt accessor can be used on it.
sales['date'] = pd.to_datetime(sales['date'])

In [None]:
# Confirm the dtype after the datetime conversion.
sales['date'].dtype

In [None]:
# Break the parsed date into separate calendar-component columns.
date_parts = sales['date'].dt
sales['year'] = date_parts.year
sales['month'] = date_parts.month
sales['day'] = date_parts.day

In [None]:
# Check that the year/month/day columns were added.
sales.head()

In [None]:
# The date is now represented by year/month/day, so drop the original column in place.
sales.drop(columns='date', inplace=True)

In [None]:
# Inspect the last rows of the training frame after dropping date.
sales.tail()

In [None]:
# Apply the same datetime parsing to the test set.
sales_test['date'] = pd.to_datetime(sales_test['date'])

In [None]:
# Break the parsed test date into the same year/month/day columns as the training set.
test_date_parts = sales_test['date'].dt
sales_test['year'] = test_date_parts.year
sales_test['month'] = test_date_parts.month
sales_test['day'] = test_date_parts.day

In [None]:
# Preview the test frame to confirm the year/month/day columns were added.
sales_test.head()

In [None]:
# Drop the original date column from the test set in place, as was done for train.
sales_test.drop(columns=['date'], inplace=True)

In [None]:
# Inspect the last rows of the test frame after dropping date.
sales_test.tail()

# Modelling

In [None]:
# One-hot encode the categorical columns of both frames.
# NOTE(review): train and test are encoded independently, so their dummy
# columns only line up if both frames contain the same category values — confirm.
sales = pd.get_dummies(sales)
sales_test = pd.get_dummies(sales_test)

In [None]:
# Separate the target (num_sold) from the feature matrix.
# NOTE(review): row_id is never dropped from `sales`, so it remains a feature
# here — confirm that is intended.
y_train = sales['num_sold']
X_train = sales.drop(columns=['num_sold'])

X_train

In [None]:
from xgboost import XGBRegressor

# Define the gradient-boosted regressor.
model = XGBRegressor(
    colsample_bytree=0.7,
    learning_rate=0.07,
    max_depth=10,
    min_child_weight=4,
    n_estimators=700,
    subsample=0.7,
)

# Fit on the training features and target.
model.fit(X_train, y_train)

# Train and test were one-hot encoded separately, so align the test columns
# to the exact training feature set (same names, same order). A dummy column
# absent from the test data is filled with 0.
X_test = sales_test.reindex(columns=X_train.columns, fill_value=0)

# Predict num_sold for the test rows.
y_pred = model.predict(X_test)

y_pred