In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [2]:
df = pd.read_csv("./SalesFINAL12312016.csv")
df.head()

Unnamed: 0,InventoryId,Store,Brand,Description,Size,SalesQuantity,SalesDollars,SalesPrice,SalesDate,Volume,Classification,ExciseTax,VendorNo,VendorName
0,1_HARDERSFIELD_1004,1,1004,Jim Beam w/2 Rocks Glasses,750mL,1,16.49,16.49,1/1/2016,750,1,0.79,12546,JIM BEAM BRANDS COMPANY
1,1_HARDERSFIELD_1004,1,1004,Jim Beam w/2 Rocks Glasses,750mL,2,32.98,16.49,1/2/2016,750,1,1.57,12546,JIM BEAM BRANDS COMPANY
2,1_HARDERSFIELD_1004,1,1004,Jim Beam w/2 Rocks Glasses,750mL,1,16.49,16.49,1/3/2016,750,1,0.79,12546,JIM BEAM BRANDS COMPANY
3,1_HARDERSFIELD_1004,1,1004,Jim Beam w/2 Rocks Glasses,750mL,1,14.49,14.49,1/8/2016,750,1,0.79,12546,JIM BEAM BRANDS COMPANY
4,1_HARDERSFIELD_1005,1,1005,Maker's Mark Combo Pack,375mL 2 Pk,2,69.98,34.99,1/9/2016,375,1,0.79,12546,JIM BEAM BRANDS COMPANY


In [3]:
#Determine the size of the dataframe
df.shape

(1048575, 14)

This means the dataframe has 1,048,575 rows (just over a million) and 14 columns (features).

To perform **Demand Forecasting**, we will take a random sample from the dataframe, of the size = 40,000

In [4]:
# Draw a fixed-size random sample to keep the rest of the notebook fast.
# random_state pins the sample so Restart & Run All reproduces the same rows
# (and therefore the same statistics, outlier filter and model score).
df = df.sample(n=40000, random_state=42)

## DATA ANALYSIS

In [5]:
print("\nSales Columns:")
print(df.columns.tolist())


Sales Columns:
['InventoryId', 'Store', 'Brand', 'Description', 'Size', 'SalesQuantity', 'SalesDollars', 'SalesPrice', 'SalesDate', 'Volume', 'Classification', 'ExciseTax', 'VendorNo', 'VendorName']


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 40000 entries, 280017 to 6999
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   InventoryId     40000 non-null  object 
 1   Store           40000 non-null  int64  
 2   Brand           40000 non-null  int64  
 3   Description     40000 non-null  object 
 4   Size            40000 non-null  object 
 5   SalesQuantity   40000 non-null  int64  
 6   SalesDollars    40000 non-null  float64
 7   SalesPrice      40000 non-null  float64
 8   SalesDate       40000 non-null  object 
 9   Volume          40000 non-null  int64  
 10  Classification  40000 non-null  int64  
 11  ExciseTax       40000 non-null  float64
 12  VendorNo        40000 non-null  int64  
 13  VendorName      40000 non-null  object 
dtypes: float64(3), int64(6), object(5)
memory usage: 4.6+ MB


In [7]:
df.dtypes

InventoryId        object
Store               int64
Brand               int64
Description        object
Size               object
SalesQuantity       int64
SalesDollars      float64
SalesPrice        float64
SalesDate          object
Volume              int64
Classification      int64
ExciseTax         float64
VendorNo            int64
VendorName         object
dtype: object

**Observation**: The SalesDate feature is of type "object". We need SalesDate later to predict the SalesQuantity, so we need to convert its data type.

In [8]:
df.describe()

Unnamed: 0,Store,Brand,SalesQuantity,SalesDollars,SalesPrice,Volume,Classification,ExciseTax,VendorNo
count,40000.0,40000.0,40000.0,40000.0,40000.0,40000.0,40000.0,40000.0,40000.0
mean,40.261975,12173.010725,2.32285,31.560647,15.441432,946.23645,1.41335,1.311487,7021.5255
std,24.392342,12452.446682,3.539171,89.768378,12.906787,709.150813,0.492441,3.044562,8461.482082
min,1.0,58.0,1.0,0.99,0.99,50.0,1.0,0.01,105.0
25%,16.0,3662.0,1.0,10.99,8.99,750.0,1.0,0.16,3252.0
50%,39.0,6281.0,1.0,17.99,12.99,750.0,1.0,0.68,4425.0
75%,64.0,17981.0,2.0,31.99,18.99,1000.0,2.0,1.57,9552.0
max,79.0,90089.0,253.0,13279.97,449.99,18000.0,2.0,132.3,98450.0


**Observation**: The minimum SalesQuantity is 1.0 while the maximum goes up to 253.0. This huge deviation will also affect our predictions. We will handle these extreme values (*outliers*) later in this notebook.

In [9]:
df.isnull().sum()

InventoryId       0
Store             0
Brand             0
Description       0
Size              0
SalesQuantity     0
SalesDollars      0
SalesPrice        0
SalesDate         0
Volume            0
Classification    0
ExciseTax         0
VendorNo          0
VendorName        0
dtype: int64

**Observation**: We don't have any null values in our DataFrame. So far good to GO!

In [10]:
df.Description.value_counts()

Description
Smirnoff 80 Proof               283
Jim Beam                        268
Capt Morgan Spiced Rum          264
Jagermeister Liqueur            259
Jack Daniels No 7 Black         253
                               ... 
Zonin Prosecco Brut               1
Ch de Pierreux Brouilly Rsv       1
Kendall Jackson Vt RSV Syrah      1
Corazon de Agave Blanco           1
Double Canyon Heaven H Cab S      1
Name: count, Length: 3892, dtype: int64

**Observation**: In our dataframe, some products (Description values) have enough rows to predict their sales, but others have only a single row, which may impact our predictions.

## DATA PREPROCESSING

In [11]:
df['VendorName'] = df['VendorName'].str.strip()
df['Description'] = df['Description'].str.strip()

Convert the SalesDate to Date time Object

In [12]:
df['SalesDate'] = pd.to_datetime(df['SalesDate'])

In [13]:
df.dtypes

InventoryId               object
Store                      int64
Brand                      int64
Description               object
Size                      object
SalesQuantity              int64
SalesDollars             float64
SalesPrice               float64
SalesDate         datetime64[ns]
Volume                     int64
Classification             int64
ExciseTax                float64
VendorNo                   int64
VendorName                object
dtype: object

Extract the year, month and day from the SalesDate

In [14]:
df['year'] = df['SalesDate'].dt.year
df['month'] = df['SalesDate'].dt.month
df['day'] = df['SalesDate'].dt.day

In [15]:
df

Unnamed: 0,InventoryId,Store,Brand,Description,Size,SalesQuantity,SalesDollars,SalesPrice,SalesDate,Volume,Classification,ExciseTax,VendorNo,VendorName,year,month,day
280017,34_PITMERDEN_25339,34,25339,Le Haut Medoc de Lascombes,750mL,1,34.99,34.99,2016-01-23,750,2,0.11,4425,MARTIGNETTI COMPANIES,2016,1,23
976274,11_CARDEND_26949,11,26949,Sonoma-Cutrer Chard RRV,750mL,2,45.98,22.99,2016-02-26,750,2,0.22,1128,BROWN-FORMAN CORP,2016,2,26
59970,13_TARMSWORTH_5401,13,5401,Jagermeister Liqueur,200mL,1,5.99,5.99,2016-01-04,200,1,0.21,3089,SIDNEY FRANK IMPORTING CO,2016,1,4
815281,73_DONCASTER_36989,73,36989,Duck Pond Chard Wash,750mL,1,13.99,13.99,2016-01-30,750,2,0.11,10754,PERFECTA WINES,2016,1,30
707907,67_EANVERNESS_8218,67,8218,Camarena Reposado Tequila,1.75L,1,31.99,31.99,2016-01-09,1750,1,1.84,3252,E & J GALLO WINERY,2016,1,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
603811,60_IRRAGIN_5735,60,5735,Malibu Rum,750mL,1,12.99,12.99,2016-01-11,750,1,0.79,17035,PERNOD RICARD USA,2016,1,11
861252,76_DONCASTER_22566,76,22566,LaBelle Winery 2013 Dry Rsl,750mL,1,13.99,13.99,2016-01-14,750,2,0.11,90032,LABELLE VYDS AND WINERY,2016,1,14
279641,34_PITMERDEN_24863,34,24863,Double Canyon Heaven H Cab S,750mL,2,47.98,23.99,2016-01-09,750,2,0.22,4425,MARTIGNETTI COMPANIES,2016,1,9
591585,60_IRRAGIN_23870,60,23870,Col-di-Sasso Sangiovese/Cab,750mL,1,8.99,8.99,2016-01-18,750,2,0.11,516,BANFI PRODUCTS CORP,2016,1,18


Drop the following Columns:
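1) InventoryId; because it is a composite identifier (e.g. `1_HARDERSFIELD_1004` combines the Store number, a location name and the Brand number), so it adds little beyond the Store and Brand columns we keep.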
2) SalesDate; because we've already extracted the required info from this feature in year, month, and day columns.
3) Size; because it has some ambiguity in it.

In [16]:
df.drop(['InventoryId','SalesDate', 'Size'], inplace=True, axis=1)

In [17]:
df.head()

Unnamed: 0,Store,Brand,Description,SalesQuantity,SalesDollars,SalesPrice,Volume,Classification,ExciseTax,VendorNo,VendorName,year,month,day
280017,34,25339,Le Haut Medoc de Lascombes,1,34.99,34.99,750,2,0.11,4425,MARTIGNETTI COMPANIES,2016,1,23
976274,11,26949,Sonoma-Cutrer Chard RRV,2,45.98,22.99,750,2,0.22,1128,BROWN-FORMAN CORP,2016,2,26
59970,13,5401,Jagermeister Liqueur,1,5.99,5.99,200,1,0.21,3089,SIDNEY FRANK IMPORTING CO,2016,1,4
815281,73,36989,Duck Pond Chard Wash,1,13.99,13.99,750,2,0.11,10754,PERFECTA WINES,2016,1,30
707907,67,8218,Camarena Reposado Tequila,1,31.99,31.99,1750,1,1.84,3252,E & J GALLO WINERY,2016,1,9


### Removing the Outliers

In [18]:
# Columns screened for extreme values.
outlier_columns = ['SalesQuantity', 'SalesDollars', 'SalesPrice', 'Volume', 'ExciseTax']
screened = df[outlier_columns]

# Standardise each screened column: (value - column mean) / column std.
z_scores = (screened - screened.mean()) / screened.std()
threshold = 3
abs_z = z_scores.abs()

# A row is an outlier if ANY screened column exceeds the threshold;
# it is kept only if ALL screened columns are within it.
outliers = df[(abs_z > threshold).any(axis=1)]
df_cleaned = df[(abs_z <= threshold).all(axis=1)]

In [19]:
df_cleaned

Unnamed: 0,Store,Brand,Description,SalesQuantity,SalesDollars,SalesPrice,Volume,Classification,ExciseTax,VendorNo,VendorName,year,month,day
280017,34,25339,Le Haut Medoc de Lascombes,1,34.99,34.99,750,2,0.11,4425,MARTIGNETTI COMPANIES,2016,1,23
976274,11,26949,Sonoma-Cutrer Chard RRV,2,45.98,22.99,750,2,0.22,1128,BROWN-FORMAN CORP,2016,2,26
59970,13,5401,Jagermeister Liqueur,1,5.99,5.99,200,1,0.21,3089,SIDNEY FRANK IMPORTING CO,2016,1,4
815281,73,36989,Duck Pond Chard Wash,1,13.99,13.99,750,2,0.11,10754,PERFECTA WINES,2016,1,30
707907,67,8218,Camarena Reposado Tequila,1,31.99,31.99,1750,1,1.84,3252,E & J GALLO WINERY,2016,1,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
603811,60,5735,Malibu Rum,1,12.99,12.99,750,1,0.79,17035,PERNOD RICARD USA,2016,1,11
861252,76,22566,LaBelle Winery 2013 Dry Rsl,1,13.99,13.99,750,2,0.11,90032,LABELLE VYDS AND WINERY,2016,1,14
279641,34,24863,Double Canyon Heaven H Cab S,2,47.98,23.99,750,2,0.22,4425,MARTIGNETTI COMPANIES,2016,1,9
591585,60,23870,Col-di-Sasso Sangiovese/Cab,1,8.99,8.99,750,2,0.11,516,BANFI PRODUCTS CORP,2016,1,18


In [20]:
df_cleaned.shape

(37698, 14)

**Observation**: Earlier we had 40,000 rows; after removing the outliers, we have 37,698 rows.

In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 40000 entries, 280017 to 6999
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Store           40000 non-null  int64  
 1   Brand           40000 non-null  int64  
 2   Description     40000 non-null  object 
 3   SalesQuantity   40000 non-null  int64  
 4   SalesDollars    40000 non-null  float64
 5   SalesPrice      40000 non-null  float64
 6   Volume          40000 non-null  int64  
 7   Classification  40000 non-null  int64  
 8   ExciseTax       40000 non-null  float64
 9   VendorNo        40000 non-null  int64  
 10  VendorName      40000 non-null  object 
 11  year            40000 non-null  int32  
 12  month           40000 non-null  int32  
 13  day             40000 non-null  int32  
dtypes: float64(3), int32(3), int64(6), object(2)
memory usage: 4.1+ MB


### Applying One-Hot Encoding

In [22]:
# One indicator column per distinct product description and per vendor name.
description_dummies = pd.get_dummies(df_cleaned['Description'])
vendorname_dummies = pd.get_dummies(df_cleaned['VendorName'])

# Append the indicator columns to the cleaned frame, then remove the two
# text columns they replace.
dfx = pd.concat(
    [df_cleaned, description_dummies, vendorname_dummies],
    axis=1,
).drop(columns=['Description', 'VendorName'])
dfx.head()

Unnamed: 0,Store,Brand,SalesQuantity,SalesDollars,SalesPrice,Volume,Classification,ExciseTax,VendorNo,year,...,VINEXTRA INC,VINEYARD BRANDS INC,VINILANDIA USA,VRANKEN AMERICA,WALPOLE MTN VIEW WINERY,WEIN BAUER INC,WESTERN SPIRITS BEVERAGE CO,WILLIAM GRANT & SONS INC,WINE GROUP INC,ZORVINO VINEYARDS
280017,34,25339,1,34.99,34.99,750,2,0.11,4425,2016,...,False,False,False,False,False,False,False,False,False,False
976274,11,26949,2,45.98,22.99,750,2,0.22,1128,2016,...,False,False,False,False,False,False,False,False,False,False
59970,13,5401,1,5.99,5.99,200,1,0.21,3089,2016,...,False,False,False,False,False,False,False,False,False,False
815281,73,36989,1,13.99,13.99,750,2,0.11,10754,2016,...,False,False,False,False,False,False,False,False,False,False
707907,67,8218,1,31.99,31.99,1750,1,1.84,3252,2016,...,False,False,False,False,False,False,False,False,False,False


In [23]:
dfx

Unnamed: 0,Store,Brand,SalesQuantity,SalesDollars,SalesPrice,Volume,Classification,ExciseTax,VendorNo,year,...,VINEXTRA INC,VINEYARD BRANDS INC,VINILANDIA USA,VRANKEN AMERICA,WALPOLE MTN VIEW WINERY,WEIN BAUER INC,WESTERN SPIRITS BEVERAGE CO,WILLIAM GRANT & SONS INC,WINE GROUP INC,ZORVINO VINEYARDS
280017,34,25339,1,34.99,34.99,750,2,0.11,4425,2016,...,False,False,False,False,False,False,False,False,False,False
976274,11,26949,2,45.98,22.99,750,2,0.22,1128,2016,...,False,False,False,False,False,False,False,False,False,False
59970,13,5401,1,5.99,5.99,200,1,0.21,3089,2016,...,False,False,False,False,False,False,False,False,False,False
815281,73,36989,1,13.99,13.99,750,2,0.11,10754,2016,...,False,False,False,False,False,False,False,False,False,False
707907,67,8218,1,31.99,31.99,1750,1,1.84,3252,2016,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
603811,60,5735,1,12.99,12.99,750,1,0.79,17035,2016,...,False,False,False,False,False,False,False,False,False,False
861252,76,22566,1,13.99,13.99,750,2,0.11,90032,2016,...,False,False,False,False,False,False,False,False,False,False
279641,34,24863,2,47.98,23.99,750,2,0.22,4425,2016,...,False,False,False,False,False,False,False,False,False,False
591585,60,23870,1,8.99,8.99,750,2,0.11,516,2016,...,False,False,False,False,False,False,False,False,False,False


### Applying Min-Max Scaling

In [24]:
from sklearn.preprocessing import MinMaxScaler

# Fit a single scaler on both monetary columns at once. MinMaxScaler scales
# each feature independently, so the resulting values are the same as fitting
# the columns one at a time, but the fitted `scaler` now remembers the range of
# BOTH columns (refitting one scaler per column discarded the SalesDollars fit)
# and can be reused to transform new inputs.
scaled_columns = ["SalesDollars", "SalesPrice"]
scaler = MinMaxScaler()
dfx.loc[:, scaled_columns] = scaler.fit_transform(df_cleaned[scaled_columns])

In [25]:
dfx

Unnamed: 0,Store,Brand,SalesQuantity,SalesDollars,SalesPrice,Volume,Classification,ExciseTax,VendorNo,year,...,VINEXTRA INC,VINEYARD BRANDS INC,VINILANDIA USA,VRANKEN AMERICA,WALPOLE MTN VIEW WINERY,WEIN BAUER INC,WESTERN SPIRITS BEVERAGE CO,WILLIAM GRANT & SONS INC,WINE GROUP INC,ZORVINO VINEYARDS
280017,34,25339,1,0.118512,0.641509,750,2,0.11,4425,2016,...,False,False,False,False,False,False,False,False,False,False
976274,11,26949,2,0.156820,0.415094,750,2,0.22,1128,2016,...,False,False,False,False,False,False,False,False,False,False
59970,13,5401,1,0.017428,0.094340,200,1,0.21,3089,2016,...,False,False,False,False,False,False,False,False,False,False
815281,73,36989,1,0.045314,0.245283,750,2,0.11,10754,2016,...,False,False,False,False,False,False,False,False,False,False
707907,67,8218,1,0.108055,0.584906,1750,1,1.84,3252,2016,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
603811,60,5735,1,0.041828,0.226415,750,1,0.79,17035,2016,...,False,False,False,False,False,False,False,False,False,False
861252,76,22566,1,0.045314,0.245283,750,2,0.11,90032,2016,...,False,False,False,False,False,False,False,False,False,False
279641,34,24863,2,0.163791,0.433962,750,2,0.22,4425,2016,...,False,False,False,False,False,False,False,False,False,False
591585,60,23870,1,0.027885,0.150943,750,2,0.11,516,2016,...,False,False,False,False,False,False,False,False,False,False


## PREDICTING SALES DEMAND

In [26]:
# Target: the SalesQuantity column of the encoded/scaled frame.
y=dfx['SalesQuantity']
# Features: every other column of dfx (numeric fields plus the one-hot dummies).
# NOTE(review): in the preview rows SalesDollars equals SalesQuantity * SalesPrice,
# so keeping both SalesDollars and SalesPrice in x may leak the target into the
# features — TODO confirm and consider dropping SalesDollars.
x=dfx.drop(['SalesQuantity'],axis=1)

In [27]:
from sklearn.model_selection import train_test_split

# Hold out a test set with the default split ratio. random_state fixes the
# shuffle so the reported score is reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)

In [28]:
from sklearn.ensemble import RandomForestClassifier

# Random forests bootstrap rows and sub-sample features at random; seeding the
# estimator makes the fitted trees (and the score below) reproducible.
model = RandomForestClassifier(random_state=42)

In [29]:
model.fit(x_train,y_train)

In [30]:
model.score(x_test,y_test)

0.9248806366047745

In [34]:
import warnings
warnings.filterwarnings('ignore')

In [35]:
def get_loc_index(VendorName, Description):
    """Return the position of the first row of ``df`` matching both arguments.

    Rows are matched on exact equality of the ``VendorName`` and
    ``Description`` columns of the module-level ``df``.

    Returns:
        The 0-based row position (not the index label, and not a feature-column
        index) of the first match, or -1 when no row matches.
    """
    is_match = (df['VendorName'] == VendorName) & (df['Description'] == Description)
    hits = is_match.to_numpy().nonzero()[0]

    # No matching row: signal with the -1 sentinel.
    if hits.size == 0:
        return -1
    return hits[0]
    

In [36]:
def predict_sales_quantity(VendorName, Description, Store, Brand, Volume, SalesDollars, SalesPrice, Classification, ExciseTax, VendorNo,year,month,day):
    """Predict SalesQuantity for a single sale using the trained ``model``.

    The feature row is built by column NAME against ``x.columns`` so that each
    value lands in the column the model was trained on, regardless of column
    order. ``SalesDollars`` and ``SalesPrice`` are expected as raw amounts and
    are min-max scaled with the ranges of ``df_cleaned`` — the same data the
    training features were scaled on.

    Args:
        VendorName, Description: labels whose one-hot columns are set to 1.
            They are only set when the pair occurs in ``df`` (as reported by
            ``get_loc_index``) and the corresponding columns exist in ``x``.
        Store, Brand, Volume, Classification, ExciseTax, VendorNo, year,
        month, day: numeric feature values, used as given.
        SalesDollars, SalesPrice: raw (unscaled) monetary values.

    Returns:
        The model's predicted SalesQuantity for the constructed row.
    """
    def _min_max(value, column):
        # Same formula MinMaxScaler applies: (v - min) / (max - min).
        lo = df_cleaned[column].min()
        hi = df_cleaned[column].max()
        return (value - lo) / (hi - lo) if hi > lo else 0.0

    named_values = {
        'Store': Store,
        'Brand': Brand,
        'SalesDollars': _min_max(SalesDollars, 'SalesDollars'),
        'SalesPrice': _min_max(SalesPrice, 'SalesPrice'),
        'Volume': Volume,
        'Classification': Classification,
        'ExciseTax': ExciseTax,
        'VendorNo': VendorNo,
        'year': year,
        'month': month,
        'day': day,
    }

    columns = x.columns
    X = np.zeros(len(columns))
    for name, value in named_values.items():
        X[columns.get_loc(name)] = value

    # get_loc_index reports a ROW position in df, so it is used only as a
    # "this vendor/description pair is known" check; the one-hot columns
    # themselves are located by name. Labels absent from x are left at 0.
    if get_loc_index(VendorName, Description) >= 0:
        for label in (VendorName, Description):
            X[np.flatnonzero(columns == label)] = 1

    # Predict on a named frame so the input matches the training feature names.
    features = pd.DataFrame([X], columns=columns)
    predicted_sales_quantity = model.predict(features)[0]

    return predicted_sales_quantity

In [37]:
predict_sales_quantity('ULTRA BEVERAGE COMPANY LLP','Cecchi Sangiovese',1,18013,750,34.95,6.99,2,0.56,9165,2017,1,21)

2