In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [32]:
train_df = pd.read_csv('data/train_cleaned.csv')
test_df = pd.read_csv('data/test_cleaned.csv')

In [59]:
train_df.head()

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,Item_Type_Combined
0,9.3,3,0.016047,249.8092,1999,3,1,1,3735.138,3
1,5.92,2,0.019278,48.2692,2009,3,3,4,443.4228,1
2,17.5,3,0.01676,141.618,1999,3,1,1,2097.27,3
3,19.2,2,0.070482,182.095,1998,3,3,2,732.38,3
4,8.93,1,0.070482,53.8614,1987,1,3,1,994.7052,2


In [33]:
# lets split the target data from the train data

y = train_df['Item_Outlet_Sales']
X = train_df.drop(['Item_Outlet_Sales'], axis = 1)
x_test = test_df

# lets print the shapes of these newly formed data sets
print("Shape of the x :", X.shape)
print("Shape of the y :", y.shape)
print("Shape of the test data :", x_test.shape)

Shape of the x : (8523, 9)
Shape of the y : (8523,)
Shape of the test data : (5681, 9)


## Scaling The Data

In [34]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

In [35]:
X

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Type_Combined
0,9.300,3,0.016047,249.8092,1999,3,1,1,3
1,5.920,2,0.019278,48.2692,2009,3,3,4,1
2,17.500,3,0.016760,141.6180,1999,3,1,1,3
3,19.200,2,0.070482,182.0950,1998,3,3,2,3
4,8.930,1,0.070482,53.8614,1987,1,3,1,2
...,...,...,...,...,...,...,...,...,...
8518,6.865,3,0.056783,214.5218,1987,1,3,1,3
8519,8.380,2,0.046982,108.1570,2002,3,2,1,3
8520,10.600,1,0.035186,85.1224,2004,2,2,1,2
8521,7.210,2,0.145221,103.1332,2009,3,3,4,3


In [36]:
cols = [ 
    'Item_Weight',
    'Item_Visibility',
    'Item_MRP',
]
X[cols]

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP
0,9.300,0.016047,249.8092
1,5.920,0.019278,48.2692
2,17.500,0.016760,141.6180
3,19.200,0.070482,182.0950
4,8.930,0.070482,53.8614
...,...,...,...
8518,6.865,0.056783,214.5218
8519,8.380,0.046982,108.1570
8520,10.600,0.035186,85.1224
8521,7.210,0.145221,103.1332


In [37]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X[cols] =  sc.fit_transform(X[cols])
X.head()

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Type_Combined
0,-0.831187,3,-1.117123,1.747454,1999,3,1,1,3
1,-1.63081,2,-1.050818,-1.489023,2009,3,3,4,1
2,1.108727,3,-1.102496,0.01004,1999,3,1,1,3
3,1.510904,2,3.702459e-15,0.66005,1998,3,3,2,3
4,-0.918719,1,3.702459e-15,-1.39922,1987,1,3,1,2


In [38]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [39]:
# lets print the shapes again 
print("Shape of the X Train :", X_train.shape)
print("Shape of the y Train :", y_train.shape)
print("Shape of the X test :", X_test.shape)
print("Shape of the y test :", y_test.shape)
print("Shape of the test data :", x_test.shape)

Shape of the X Train : (6818, 9)
Shape of the y Train : (6818,)
Shape of the X test : (1705, 9)
Shape of the y test : (1705,)
Shape of the test data : (5681, 9)


## Train The Model With XGBOOST

In [40]:
from sklearn.metrics import  r2_score,mean_squared_error
from math import sqrt

In [41]:
from xgboost import XGBRegressor
xgb = XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.4, gamma=0.0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.25, max_delta_step=0, max_depth=15,
             min_child_weight=1, monotone_constraints='()',
             n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)


    
xgb.fit(X,y)

#Predicting the Test set results
y_pred_xgb = xgb.predict(X_train)
score = r2_score(y_train,y_pred_xgb)
print("Score of Training:",score)
print("RMSE : ",np.sqrt(mean_squared_error(y_train,y_pred_xgb)))

y_test_pred_xgb = xgb.predict(X_test)
score = r2_score(y_test,y_test_pred_xgb)
print("Score of Testing:",score)
print("RMSE :",np.sqrt(mean_squared_error(y_test,y_test_pred_xgb)))

Score of Training: 0.9831032089018564
RMSE :  223.56555390702675
Score of Testing: 0.9834213528159617
RMSE : 212.27403701634336


In [45]:
pd.DataFrame(np.c_[y_test,y_test_pred_xgb], columns = ["actualpredict","machinepredict"])

Unnamed: 0,actualpredict,machinepredict
0,1743.0644,1516.435059
1,356.8688,425.586761
2,377.5086,400.347321
3,5778.4782,5514.655762
4,2356.9320,2331.483643
...,...,...
1700,3004.0896,2702.736084
1701,890.8404,924.547363
1702,629.1810,549.862915
1703,253.0040,258.671417


## Test the Model

In [46]:
#dumb file
import pickle
filename = 'big-mart-sale-model_xgb.pkl'
pickle.dump(xgb, open(filename, 'wb'))

In [47]:
#open file
model_xgb = open("big-mart-sale-model_xgb.pkl","rb")
ml_model = pickle.load(model_xgb)

In [48]:
print(ml_model)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.4, gamma=0.0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.25, max_delta_step=0, max_depth=15,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)


In [67]:
pred_args=[5.92,2,0.019278,48.2692,2009,3,3,4,1]
features_value = [np.array(pred_args)]

In [68]:
features_name = ['Item_Weight','Item_Fat_Content',
                 'Item_Visibility','Item_MRP','Outlet_Establishment_Year',
                 'Outlet_Size','Outlet_Location_Type','Outlet_Type',
                 'Item_Type_Combined']

In [69]:
df = pd.DataFrame(features_value, columns=features_name)
df

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Type_Combined
0,5.92,2.0,0.019278,48.2692,2009.0,3.0,3.0,4.0,1.0


In [70]:
output = ml_model.predict(df)

In [71]:
output[0]

4771.345

In [60]:
train_df.head()

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,Item_Type_Combined
0,9.3,3,0.016047,249.8092,1999,3,1,1,3735.138,3
1,5.92,2,0.019278,48.2692,2009,3,3,4,443.4228,1
2,17.5,3,0.01676,141.618,1999,3,1,1,2097.27,3
3,19.2,2,0.070482,182.095,1998,3,3,2,732.38,3
4,8.93,1,0.070482,53.8614,1987,1,3,1,994.7052,2
