In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the test CSV into `df`, the frame the cells below clean and score.
df = pd.read_csv(filepath_or_buffer='test_AbJTz2l.csv')

In [3]:
# Preview the first five rows to check how the columns were parsed.
df.head(n=5)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type
0,FDW58,20.75,Low Fat,0.007565,Snack Foods,107.8622,OUT049,1999,Medium,Tier 1,Supermarket Type1
1,FDW14,8.3,reg,0.038428,Dairy,87.3198,OUT017,2007,,Tier 2,Supermarket Type1
2,NCN55,14.6,Low Fat,0.099575,Others,241.7538,OUT010,1998,,Tier 3,Grocery Store
3,FDQ58,7.315,Low Fat,0.015388,Snack Foods,155.034,OUT017,2007,,Tier 2,Supermarket Type1
4,FDY38,,Regular,0.118599,Dairy,234.23,OUT027,1985,Medium,Tier 3,Supermarket Type3


## Preprocessing

In [4]:
# Count the missing values in each column.
df.isnull().sum()

Item_Identifier                 0
Item_Weight                   976
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  1606
Outlet_Location_Type            0
Outlet_Type                     0
dtype: int64

In [5]:
# Which outlets account for the rows with a missing Outlet_Size?
df.loc[df['Outlet_Size'].isna(), 'Outlet_Identifier'].unique()

array(['OUT017', 'OUT010', 'OUT045'], dtype=object)

In [6]:
# Outlet_Size is missing only for these three outlets; impute 'Small' for each.
# A single mask with .loc on both sides replaces three copy-pasted assignments
# that used chained indexing on the right-hand side.
outlets_missing_size = ['OUT010', 'OUT045', 'OUT017']
size_mask = df['Outlet_Identifier'].isin(outlets_missing_size)
df.loc[size_mask, 'Outlet_Size'] = df.loc[size_mask, 'Outlet_Size'].fillna('Small')

In [7]:
# Which item types have at least one row with a missing Item_Weight?
df.loc[df['Item_Weight'].isna(), 'Item_Type'].unique()

array(['Dairy', 'Baking Goods', 'Health and Hygiene', 'Household',
       'Others', 'Fruits and Vegetables', 'Meat', 'Canned', 'Snack Foods',
       'Starchy Foods', 'Hard Drinks', 'Frozen Foods', 'Soft Drinks',
       'Breads', 'Breakfast', 'Seafood'], dtype=object)

In [8]:
# Impute each missing Item_Weight with the mean weight of that row's Item_Type.
# groupby computes every per-type mean in one pass (NaN weights are skipped),
# and map() lines each mean up with its row, so one fillna covers all types.
means = df.groupby('Item_Type')['Item_Weight'].mean().to_dict()
df['Item_Weight'] = df['Item_Weight'].fillna(df['Item_Type'].map(means))

In [9]:
# Re-count missing values per column after the imputation cells.
df.isnull().sum()

Item_Identifier              0
Item_Weight                  0
Item_Fat_Content             0
Item_Visibility              0
Item_Type                    0
Item_MRP                     0
Outlet_Identifier            0
Outlet_Establishment_Year    0
Outlet_Size                  0
Outlet_Location_Type         0
Outlet_Type                  0
dtype: int64

In [10]:
# Collapse the alternate spellings onto the two canonical fat-content labels,
# then show the resulting label counts.
fat_label_fixes = {'low fat': 'Low Fat', 'LF': 'Low Fat', 'reg': 'Regular'}
df['Item_Fat_Content'] = df['Item_Fat_Content'].replace(fat_label_fixes)
df['Item_Fat_Content'].value_counts()

Low Fat    3668
Regular    2013
Name: Item_Fat_Content, dtype: int64

In [11]:
# Encode a copy so `df` keeps the original string identifiers.
X = df.copy(deep=True)

In [12]:
# Snapshot of the cleaned, un-encoded frame, kept for later use.
Xclean = df.copy(deep=True)

### Encoding

In [13]:
# Encode the categorical columns of X as numbers.
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder

# NOTE(review): every encoder below is fit on this frame, so the integer codes
# and the set of dummy columns depend on the values present in this CSV.
# Confirm they match the encoding used when the saved model was trained.

# Item_Identifier -> one integer code per distinct identifier.
le = LabelEncoder()
X['Item_Identifier'] = le.fit_transform(X['Item_Identifier'])

# Outlet_Location_Type / Outlet_Size -> numeric codes, fit one column at a time.
oe = OrdinalEncoder()
for f in ['Outlet_Location_Type', 'Outlet_Size']:
    X[f] = oe.fit_transform(X[[f]])

# Remaining categorical columns -> one indicator column per category.
X = pd.get_dummies(X, columns=['Outlet_Identifier', 'Item_Type', 'Outlet_Type', 'Item_Fat_Content'])

## Model Application

In [14]:
# load model and predict
drop = ['Item_Fat_Content_Low Fat', 'Item_Fat_Content_Regular','Item_Visibility']
Xr =  X.drop(drop,axis =1)
import joblib
loaded_model = joblib.load('rf3drop.sav')
pred = loaded_model.predict(Xr)
new_test=df[['Item_Identifier','Outlet_Identifier']]
new_test.reset_index(drop=True,inplace=True)
new_test

Unnamed: 0,Item_Identifier,Outlet_Identifier
0,FDW58,OUT049
1,FDW14,OUT017
2,NCN55,OUT010
3,FDQ58,OUT017
4,FDY38,OUT027
...,...,...
5676,FDB58,OUT046
5677,FDD47,OUT018
5678,NCO17,OUT045
5679,FDJ26,OUT017


In [15]:
# Attach the predictions to the identifier columns as 'Item_Outlet_Sales'.
# The column is named when the frame is built rather than by editing
# `columns.values` afterwards: an Index is immutable, and writing into its
# backing array is unsupported and can leave label lookups stale.
prediction = pd.DataFrame(pred, columns=['Item_Outlet_Sales'])
new = pd.concat([new_test, prediction], axis=1).reset_index(drop=True)

In [16]:
# Wrap `new` in a DataFrame again and display it.
# NOTE(review): `new` is already a DataFrame at this point, so the wrap looks
# redundant — confirm nothing relies on the fresh object before removing it.
new=pd.DataFrame(new)
new

Unnamed: 0,Item_Identifier,Outlet_Identifier,Item_Outlet_Sales
0,FDW58,OUT049,1621.143276
1,FDW14,OUT017,1405.962398
2,NCN55,OUT010,569.564788
3,FDQ58,OUT017,2352.062097
4,FDY38,OUT027,5862.925203
...,...,...,...
5676,FDB58,OUT046,2094.164604
5677,FDD47,OUT018,2732.049810
5678,NCO17,OUT045,1933.909082
5679,FDJ26,OUT017,3542.239902


In [17]:
# Sanity check: list any rows whose predicted sales value is negative.
new.loc[new['Item_Outlet_Sales'].lt(0)]

Unnamed: 0,Item_Identifier,Outlet_Identifier,Item_Outlet_Sales


In [18]:
# Write the predictions to CSV without the index column.
new.to_csv(path_or_buf='solutionRf3drop.csv', index=False)