## Importing Testing Dataset

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from joblib import dump,load
import sys

In [2]:
dataset=pd.read_csv("Datasets//Test.csv")

In [3]:
dataset.head(10)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type
0,FDW58,20.75,Low Fat,0.007565,Snack Foods,107.8622,OUT049,1999,Medium,Tier 1,Supermarket Type1
1,FDW14,8.3,reg,0.038428,Dairy,87.3198,OUT017,2007,,Tier 2,Supermarket Type1
2,NCN55,14.6,Low Fat,0.099575,Others,241.7538,OUT010,1998,,Tier 3,Grocery Store
3,FDQ58,7.315,Low Fat,0.015388,Snack Foods,155.034,OUT017,2007,,Tier 2,Supermarket Type1
4,FDY38,,Regular,0.118599,Dairy,234.23,OUT027,1985,Medium,Tier 3,Supermarket Type3
5,FDH56,9.8,Regular,0.063817,Fruits and Vegetables,117.1492,OUT046,1997,Small,Tier 1,Supermarket Type1
6,FDL48,19.35,Regular,0.082602,Baking Goods,50.1034,OUT018,2009,Medium,Tier 3,Supermarket Type2
7,FDC48,,Low Fat,0.015782,Baking Goods,81.0592,OUT027,1985,Medium,Tier 3,Supermarket Type3
8,FDN33,6.305,Regular,0.123365,Snack Foods,95.7436,OUT045,2002,,Tier 2,Supermarket Type1
9,FDA36,5.985,Low Fat,0.005698,Baking Goods,186.8924,OUT017,2007,,Tier 2,Supermarket Type1


In [4]:
dataset.shape

(5681, 11)

## Trying out attribute combinations

In [5]:
#Getting the first two characters of ID to separate them into different Categories

dataset['New_Item_Type'] = dataset['Item_Identifier'].apply(lambda x: x[:2])
dataset['New_Item_Type']
dataset['New_Item_Type'] = dataset['New_Item_Type'].map({'FD':'Food', 'NC':'Non-Consumable', 'DR':'Drinks'})
dataset['New_Item_Type'].value_counts()

Food              4076
Non-Consumable    1087
Drinks             518
Name: New_Item_Type, dtype: int64

In [6]:
#Combing reg,Regular and Low Fat,low fat and LF and add Non-Consumable/Non-Edible item 

dataset.loc[dataset['New_Item_Type']=='Non-Consumable', 'Item_Fat_Content'] = 'Non-Edible'
dataset['Item_Fat_Content'].value_counts()
dataset['Item_Fat_Content'] = dataset['Item_Fat_Content'].replace({'LF':'Low Fat', 'reg':'Regular', 'low fat':'Low Fat'})
dataset['Item_Fat_Content'].value_counts()

Low Fat       2581
Regular       2013
Non-Edible    1087
Name: Item_Fat_Content, dtype: int64

In [7]:
#Determining the operation period of a time

dataset['Outlet_Establishment_Year'] = 2013 - dataset['Outlet_Establishment_Year']

In [8]:
#Removing unnecassary columns from the dataset

dataset.drop("Item_Identifier",axis=1,inplace=True)

## Data cleaning

In [9]:
#checking all columns of dataset 

dataset.columns

Index(['Item_Weight', 'Item_Fat_Content', 'Item_Visibility', 'Item_Type',
       'Item_MRP', 'Outlet_Identifier', 'Outlet_Establishment_Year',
       'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type', 'New_Item_Type'],
      dtype='object')

In [10]:
#hot encoding

dataset = pd.get_dummies(dataset, columns=['Item_Fat_Content', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type', 'New_Item_Type'])

In [11]:
#label encoding
encode= LabelEncoder()
cat_col = ['Outlet_Identifier', 'Item_Type']
for col in cat_col:
     dataset[col] = encode.fit_transform(dataset[col])

In [12]:
#checking missing value
dataset.isnull().sum()

Item_Weight                      976
Item_Visibility                    0
Item_Type                          0
Item_MRP                           0
Outlet_Identifier                  0
Outlet_Establishment_Year          0
Item_Fat_Content_Low Fat           0
Item_Fat_Content_Non-Edible        0
Item_Fat_Content_Regular           0
Outlet_Size_High                   0
Outlet_Size_Medium                 0
Outlet_Size_Small                  0
Outlet_Location_Type_Tier 1        0
Outlet_Location_Type_Tier 2        0
Outlet_Location_Type_Tier 3        0
Outlet_Type_Grocery Store          0
Outlet_Type_Supermarket Type1      0
Outlet_Type_Supermarket Type2      0
Outlet_Type_Supermarket Type3      0
New_Item_Type_Drinks               0
New_Item_Type_Food                 0
New_Item_Type_Non-Consumable       0
dtype: int64

In [13]:
#adding missing values
imputer=SimpleImputer(strategy="median")
imputer.fit(dataset)

SimpleImputer(strategy='median')

In [14]:
imputer.statistics_

array([1.2500000e+01, 5.4154252e-02, 6.0000000e+00, 1.4141540e+02,
       5.0000000e+00, 1.4000000e+01, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       1.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       1.0000000e+00, 0.0000000e+00])

In [15]:
Imputer=imputer.transform(dataset)
dataset=pd.DataFrame(Imputer,columns=dataset.columns)

In [16]:
scaler=MinMaxScaler()
scaler.fit(dataset)

MinMaxScaler()

In [17]:
scaling=scaler.transform(dataset)
dataset=pd.DataFrame(scaling,columns=dataset.columns)

In [18]:
dataset.isnull().sum()

Item_Weight                      0
Item_Visibility                  0
Item_Type                        0
Item_MRP                         0
Outlet_Identifier                0
Outlet_Establishment_Year        0
Item_Fat_Content_Low Fat         0
Item_Fat_Content_Non-Edible      0
Item_Fat_Content_Regular         0
Outlet_Size_High                 0
Outlet_Size_Medium               0
Outlet_Size_Small                0
Outlet_Location_Type_Tier 1      0
Outlet_Location_Type_Tier 2      0
Outlet_Location_Type_Tier 3      0
Outlet_Type_Grocery Store        0
Outlet_Type_Supermarket Type1    0
Outlet_Type_Supermarket Type2    0
Outlet_Type_Supermarket Type3    0
New_Item_Type_Drinks             0
New_Item_Type_Food               0
New_Item_Type_Non-Consumable     0
dtype: int64

In [19]:
#covert all the dataframe values in Matrix 
final=np.array(dataset)


In [20]:
#checking the value

final[0]

array([0.96427508, 0.02337443, 0.86666667, 0.32341312, 1.        ,
       0.41666667, 1.        , 0.        , 0.        , 0.        ,
       1.        , 0.        , 1.        , 0.        , 0.        ,
       0.        , 1.        , 0.        , 0.        , 0.        ,
       1.        , 0.        ])

## Test The model

<b>import saving model and load 

In [21]:

model=load("project.joblib")

In [22]:
while True:
    column=dataset.count()
    print("\nTotal number of rows in a test dataset : ",len(dataset))
    print("Total number of column in a test dataset",len(column))
    test=int(input("Enter the row of a test dataset you want to predict: "))

    if test <= len(dataset):
        data=final[test]
        row=np.array([data])
        pred=model.predict(row)
        print("\nItem outlet sales: ",str(pred))
    else:
        print("Invalid Number ")
        sys.exit()


Total number of rows in a test dataset :  5681
Total number of column in a test dataset 22
Enter the row of a test dataset you want to predict: 67

Item outlet sales:  [4140.49494139]

Total number of rows in a test dataset :  5681
Total number of column in a test dataset 22
Enter the row of a test dataset you want to predict: 756

Item outlet sales:  [2104.08875594]

Total number of rows in a test dataset :  5681
Total number of column in a test dataset 22
Enter the row of a test dataset you want to predict: 4565

Item outlet sales:  [3641.67422842]

Total number of rows in a test dataset :  5681
Total number of column in a test dataset 22
Enter the row of a test dataset you want to predict: 65754
Invalid Number 


SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
