In [None]:
import numpy as np
import pandas as pd
import os
# Walk the Kaggle input directory and print the full path of every file found,
# so the dataset location is easy to spot.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        input_path = os.path.join(dirname, filename)
        print(input_path)

# Importing the libraries:

In [None]:
# Importing tools for preprocessing and feature engineering of the data:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from category_encoders import CatBoostEncoder
from sklearn.preprocessing import LabelEncoder

# Importing plotting tools to plot the data:
import seaborn as sns
import matplotlib.pyplot as plt

# Importing the algorithms used to train models on the data:
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor

# Importing a loss function to check how well our algorithm is doing:
from keras.losses import mean_absolute_percentage_error


Reading the data:


In [None]:
# Read the training CSV into a DataFrame and preview its first five rows.
train_csv = "../input/predict-demand/train.csv"
data = pd.read_csv(train_csv)
data.head(n=5)

Checking if our data has NaN values:

In [None]:
# Number of missing values in each column (isna is the canonical spelling of isnull).
data.isna().sum()

Yep it does :(

Dropping the rows with NaN values:

In [None]:
# Keep only the rows in which no column is missing; the result is bound to a
# new name so the raw `data` frame stays available.
dropdata = data.dropna(axis=0, how='any')

Defining the labels to predict on:

In [None]:
# Derive both the target and the feature frame from `data` instead of mutating
# `dropdata` in place, so re-running this cell does not fail with a KeyError
# on an already-removed 'quantity' column.
complete_rows = data.dropna()

# Target to predict.
y = complete_rows['quantity']

# Features: every column except the target and 'id'.
dropdata = complete_rows.drop(columns=['quantity', 'id'])

# Data Visualisation:


Let's plot how many records there are for each city:

In [None]:
# Styling first, then the figure, then the plot.
sns.set_context("poster", font_scale=.7)
sns.set_palette('RdYlBu')
plt.figure(figsize=(7,7))
# Pass the column by keyword: recent seaborn releases treat the first
# positional argument as `data`, not `x`, so a positional Series no longer
# yields one bar per category.
sns.countplot(x=dropdata['city'])
plt.title('Number of records per city');

Let's take a look at the distribution of sales records across the various shops:

In [None]:
# Styling first, then the figure, then the plot.
sns.set_context("poster", font_scale=0.7)
sns.set_palette('PiYG')
plt.figure(figsize=(10,10))
# Pass the column by keyword: recent seaborn releases treat the first
# positional argument as `data`, not `x`, so a positional Series no longer
# yields one bar per category.
sns.countplot(x=dropdata['shop'])
plt.title('Number of records per shop');

Let's see the relationship between the quantity sold and the price:

In [None]:
# Scatter plot of price (vertical axis) against the target series `y`
# (horizontal axis), with points coloured by the 'capacity' column.
sns.set_context("poster", font_scale=0.7)
sns.set_palette('RdPu')
plt.figure(figsize=(10, 10))
sns.scatterplot(x=y, y='price', hue='capacity', data=dropdata)

Here is the countplot for the brands:

In [None]:
# Styling first, then the figure, then the plot.
sns.set_context("poster", font_scale=0.7)
sns.set_palette('YlOrRd')
plt.figure(figsize=(10,10))
# Pass the column by keyword: recent seaborn releases treat the first
# positional argument as `data`, not `x`, so a positional Series no longer
# yields one bar per category.
sns.countplot(x=dropdata['brand'])
plt.title('Number of records per brand');

# Preprocessing:

Finding the names of the categorical (object-dtype) columns in our dataset:

In [None]:
# Inspect the dtypes of the feature frame that is actually encoded afterwards
# (`dropdata`), not the raw `data` frame: the raw frame still contains the
# 'quantity' and 'id' columns, and any object-dtype column that exists only
# there would make `dropdata[categorical_col]` fail with a KeyError.
c = (dropdata.dtypes == 'object')

# Names of the columns flagged as object dtype.
categorical_col = list(c[c].index)

Replacing the categorical values with numeric target statistics (CatBoost encoding):


In [None]:
# Target-encode the categorical feature columns with CatBoostEncoder and write
# the encoded values back into `dropdata`.
# NOTE(review): the encoder is fitted on every row, before the train/test split
# performed later in the notebook, so target information from the held-out rows
# reaches the encoded features. Consider fitting on the training rows only.
categorical_features = dropdata[categorical_col]
enc = CatBoostEncoder()
enc.fit(categorical_features, y)
dropdata[categorical_col] = enc.transform(categorical_features)

Finally splitting the data:

In [None]:
# Hold out 10% of the rows for evaluation. The fixed random_state makes the
# shuffle deterministic, so the split (and the score computed from it) is
# reproducible on every Restart & Run All.
xtrain, xtest, ytrain, ytest = train_test_split(
    dropdata, y, train_size=0.9, test_size=0.1, random_state=42
)

# Predicting:

Training, fitting and predicting on the data using XGBoost:

In [None]:
# Build an XGBoost regressor with n_estimators=1000 and fit it on the training split.
xgmodel = XGBRegressor(n_estimators=1000)
xgmodel.fit(X=xtrain, y=ytrain)

# Predict the target for the held-out rows.
xgPreds = xgmodel.predict(X=xtest)



In [None]:
# The keras loss returns the MAPE (already expressed in percent) as a tensor;
# convert it to a plain float so that a number is printed rather than a tensor repr.
mape = float(mean_absolute_percentage_error(ytest, xgPreds))

# 100 - MAPE is only a rough "accuracy" proxy: it becomes negative whenever the
# mean absolute percentage error exceeds 100%.
print('The Mean Accuracy for XGBoost Regressor model is,', round(100 - mape, 2), '%')

Thank you for going through this notebook. I am glad you made it all the way down here!