In [15]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn import metrics

# loading the data from csv file to Pandas DataFrame
salesdata = pd.read_csv('/content/Train.csv')

# first 5 rows of the dataframe
print(salesdata.head())

# number of data points & number of features
salesdata.shape

# getting some information about thye dataset
salesdata.info()

# checking for missing values
salesdata.isnull().sum()

# mean value of "Item_Weight" column
salesdata['Item_Weight'].mean()

# filling the missing values in "Item_weight column" with "Mean" value
salesdata['Item_Weight'].fillna(salesdata['Item_Weight'].mean(), inplace=True)

# mode of "Outlet_Size" column
salesdata['Outlet_Size'].mode()

# filling the missing values in "Outlet_Size" column with Mode
outlet_size_mode = salesdata.pivot_table(values='Outlet_Size', columns='Outlet_Type', aggfunc=(lambda x: x.mode()[0]))
print(outlet_size_mode)

missing_values = salesdata['Outlet_Size'].isnull()
print(missing_values)

salesdata.loc[missing_values, 'Outlet_Size'] = salesdata.loc[missing_values,'Outlet_Type'].apply(lambda x: outlet_size_mode[x])

# checking for missing values
salesdata.isnull().sum()

salesdata.describe()

salesdata.head()

salesdata['Item_Fat_Content'].value_counts()
salesdata.replace({'Item_Fat_Content': {'low fat':'Low Fat','LF':'Low Fat', 'reg':'Regular'}}, inplace=True)
salesdata['Item_Fat_Content'].value_counts()

#label encoding
encoder = LabelEncoder()
salesdata['Item_Identifier'] = encoder.fit_transform(salesdata['Item_Identifier'])

salesdata['Item_Fat_Content'] = encoder.fit_transform(salesdata['Item_Fat_Content'])

salesdata['Item_Type'] = encoder.fit_transform(salesdata['Item_Type'])

salesdata['Outlet_Identifier'] = encoder.fit_transform(salesdata['Outlet_Identifier'])

salesdata['Outlet_Size'] = encoder.fit_transform(salesdata['Outlet_Size'])

salesdata['Outlet_Location_Type'] = encoder.fit_transform(salesdata['Outlet_Location_Type'])

salesdata['Outlet_Type'] = encoder.fit_transform(salesdata['Outlet_Type'])

print(salesdata.head())

#Splitting features and Target
X = salesdata.drop(columns='Item_Outlet_Sales', axis=1)
Y = salesdata['Item_Outlet_Sales']

#Splitting the data into Training data & Testing Data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=2)
print(X.shape, X_train.shape, X_test.shape)

#model training
regressor = XGBRegressor()
regressor.fit(X_train, Y_train)

# prediction on training data
training_data_prediction = regressor.predict(X_train)
# R squared Value
r2_train = metrics.r2_score(Y_train, training_data_prediction)
print('R Squared value = ', r2_train)

# prediction on test data
test_data_prediction = regressor.predict(X_test)
print(test_data_prediction)



  Item_Identifier  Item_Weight  ...        Outlet_Type  Item_Outlet_Sales
0           FDA15         9.30  ...  Supermarket Type1          3735.1380
1           DRC01         5.92  ...  Supermarket Type2           443.4228
2           FDN15        17.50  ...  Supermarket Type1          2097.2700
3           FDX07        19.20  ...      Grocery Store           732.3800
4           NCD19         8.93  ...  Supermarket Type1           994.7052

[5 rows x 12 columns]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                7060 non-null   float64
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   85

In [8]:

import pickle
pickle.dump(regressor,open('/content/model.pkl','wb'))

In [64]:
import pickle
import numpy as pd
import pandas as pd

model = pickle.load(open('/content/model.pkl','rb'))
df = pd.read_csv('/content/Train.csv')


def predict(df):
    cols_when_model_builds = model.get_booster().feature_names
    salesdata = df[cols_when_model_builds]

    salesdata=df.iloc[:,0:-1]
    print(salesdata)
    salesdata.isnull().sum()

    outlet_size_mode = salesdata.pivot_table(values='Outlet_Size', columns='Outlet_Type', aggfunc=(lambda x: x.mode()[0]))

    missing_values = salesdata['Outlet_Size'].isnull()

    salesdata.loc[missing_values, 'Outlet_Size'] = salesdata.loc[missing_values,'Outlet_Type'].apply(lambda x: outlet_size_mode[x])
    salesdata['Item_Fat_Content'].value_counts()
    salesdata.replace({'Item_Fat_Content': {'low fat':'Low Fat','LF':'Low Fat', 'reg':'Regular'}}, inplace=True)
    salesdata['Item_Fat_Content'].value_counts()
    #label encoding
    encoder = LabelEncoder()
    salesdata['Item_Identifier'] = encoder.fit_transform(salesdata['Item_Identifier'])

    salesdata['Item_Fat_Content'] = encoder.fit_transform(salesdata['Item_Fat_Content'])

    salesdata['Item_Type'] = encoder.fit_transform(salesdata['Item_Type'])

    salesdata['Outlet_Identifier'] = encoder.fit_transform(salesdata['Outlet_Identifier'])

    salesdata['Outlet_Size'] = encoder.fit_transform(salesdata['Outlet_Size'])

    salesdata['Outlet_Location_Type'] = encoder.fit_transform(salesdata['Outlet_Location_Type'])

    salesdata['Outlet_Type'] = encoder.fit_transform(salesdata['Outlet_Type'])
    salesdata=salesdata.iloc[1:,:];
    predictions = model.predict(np.reshape(salesdata.values.ravel(), (-1, 2)))
    return list(predictions)

print(predict(df))

     Item_Identifier  Item_Weight  ... Outlet_Location_Type        Outlet_Type
0              FDA15        9.300  ...               Tier 1  Supermarket Type1
1              DRC01        5.920  ...               Tier 3  Supermarket Type2
2              FDN15       17.500  ...               Tier 1  Supermarket Type1
3              FDX07       19.200  ...               Tier 3      Grocery Store
4              NCD19        8.930  ...               Tier 3  Supermarket Type1
...              ...          ...  ...                  ...                ...
8518           FDF22        6.865  ...               Tier 3  Supermarket Type1
8519           FDS36        8.380  ...               Tier 2  Supermarket Type1
8520           NCJ29       10.600  ...               Tier 2  Supermarket Type1
8521           FDN46        7.210  ...               Tier 3  Supermarket Type2
8522           DRG01       14.800  ...               Tier 1  Supermarket Type1

[8523 rows x 11 columns]


ValueError: ignored