<a href="https://colab.research.google.com/github/rimchristian/sales-prediction/blob/main/salesprediction_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from google.colab import drive
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn import set_config
# Show scikit-learn estimators as diagrams when they are displayed in a cell.
set_config(display="diagram")

In [None]:
# Read the sales data set from its path under /content and preview the
# first five rows.
filename = '/content/sales_predictions.csv'
df = pd.read_csv(filepath_or_buffer=filename)
df.head(n=5)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [None]:
# Count fully duplicated rows; this cell only counts, it does not drop anything.
df.duplicated(keep='first').sum()
# A result of 0 means there are no duplicate rows to remove.

0

In [None]:
# Number of missing values in each column.
df.isnull().sum()

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

In [None]:
# Mean of the "Item_Weight" column, rounded to two decimal places.
mean_item_weight = np.round(df['Item_Weight'].mean(), 2)
mean_item_weight

12.86

In [None]:
# Replace missing item weights with the rounded column mean.
# The result is assigned back to the column instead of calling
# `fillna(..., inplace=True)` on `df['Item_Weight']`: that chained in-place call
# works on an intermediate object, raises a FutureWarning in pandas 2.x and
# leaves `df` unchanged when copy-on-write is active.
df['Item_Weight'] = df['Item_Weight'].fillna(mean_item_weight)
# Re-count missing values per column to confirm Item_Weight is now complete.
df.isna().sum()

Item_Identifier                 0
Item_Weight                     0
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

In [None]:
# Fill missing outlet sizes with the most common size.
# `Series.mode()` returns a Series (several values can tie), and passing a
# Series to `fillna` aligns it on index labels, so it could fill at most the
# row labelled 0. Use the first mode as a scalar so every missing entry is
# filled, and assign back to the column rather than relying on a chained
# `inplace=True` call, which does not update `df` under copy-on-write.
most_common_size = df['Outlet_Size'].mode()
df['Outlet_Size'] = df['Outlet_Size'].fillna(most_common_size.iloc[0])
# NOTE(review): the mode is computed on the full data set, before the
# train/test split — confirm this is acceptable for the analysis.


In [None]:
# Label any outlet sizes that are still missing with an explicit 'Missing'
# category. The result is assigned back to the column: the chained
# `df['Outlet_Size'].fillna(..., inplace=True)` form raises a FutureWarning in
# pandas 2.x and does not modify `df` when copy-on-write is active.
df['Outlet_Size'] = df['Outlet_Size'].fillna('Missing')

In [None]:
# Re-count missing values per column after the fills above.
df.isna().sum()

Item_Identifier              0
Item_Weight                  0
Item_Fat_Content             0
Item_Visibility              0
Item_Type                    0
Item_MRP                     0
Outlet_Identifier            0
Outlet_Establishment_Year    0
Outlet_Size                  0
Outlet_Location_Type         0
Outlet_Type                  0
Item_Outlet_Sales            0
dtype: int64

In [None]:
# Target is the sales column; the features are every other column.
y = df['Item_Outlet_Sales']
X = df.drop(columns='Item_Outlet_Sales')

# Hold out a test set with the default proportions and a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# Selectors that pick columns by dtype: numeric columns and object columns.
num_selector = make_column_selector(dtype_include='number')
cat_selector = make_column_selector(dtype_include='object')

In [None]:
# Instantiate the building blocks for the preprocessing pipelines.
# Imputers: most frequent value and column mean.
freq_imputer = SimpleImputer(strategy='most_frequent')
mean_imputer = SimpleImputer(strategy='mean')
# Scaler
scaler = StandardScaler()
# One-hot encoder returning a dense array; categories not seen during fit are
# ignored at transform time.
# scikit-learn 1.2 renamed the `sparse` argument to `sparse_output` and 1.4
# removed the old name, so try the current keyword first and fall back to the
# legacy one only on releases that do not know it.
try:
    ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)

In [None]:
# Numeric branch: mean imputation followed by standard scaling.
numeric_pipe = make_pipeline(
    mean_imputer,
    scaler,
)
numeric_pipe

In [None]:
# Categorical branch: most-frequent imputation followed by one-hot encoding.
categorical_pipe = make_pipeline(
    freq_imputer,
    ohe,
)
categorical_pipe

In [None]:
# (pipeline, column selector) pairs for the column transformer.
cat_tuple = (categorical_pipe, cat_selector)
num_tuple = (numeric_pipe, num_selector)

# Column transformer: the numeric pair is passed first, then the categorical one.
preprocessor = make_column_transformer(num_tuple, cat_tuple)
preprocessor

In [None]:
# Pipeline chaining the most-frequent imputer with the scaler.
# NOTE(review): none of the later cells shown here reference
# `preprocessing_pipeline`; `preprocessor` is the object that gets fitted.
# Confirm whether this cell is still needed.
preprocessing_pipeline = make_pipeline(
    freq_imputer,
    scaler,
)
preprocessing_pipeline

In [None]:
# Fit the preprocessor on the training split only.
preprocessor.fit(X=X_train)

In [None]:
# Apply the fitted preprocessor to the training and test splits, in that order.
X_train_processed, X_test_processed = (
    preprocessor.transform(split) for split in (X_train, X_test)
)

In [None]:
# Sanity checks on the processed arrays: NaN counts, dtypes and shape.

# Remaining NaN values in each split.
print(np.isnan(X_train_processed).sum(), 'missing values in training data.')
print(np.isnan(X_test_processed).sum(), 'missing values in testing data.')
print('\n')

# Dtype of each processed array.
print('All data in X_train_processed are', X_train_processed.dtype)
print('All data in X_test_processed are', X_test_processed.dtype)
print('\n')

# Shape of the processed training array (rows, columns).
print('shape of data is', X_train_processed.shape)
print('\n')

# Display the processed training array as the cell output.
X_train_processed

0 missing values in training data.
0 missing values in testing data.


All data in X_train_processed are float64
All data in X_test_processed are float64


shape of data is (6392, 1596)




array([[ 0.81906839, -0.71277507,  1.82810922, ...,  0.        ,
         1.        ,  0.        ],
       [ 0.55816129, -1.29105225,  0.60336888, ...,  0.        ,
         1.        ,  0.        ],
       [-0.12968469,  1.81331864,  0.24454056, ...,  1.        ,
         0.        ,  0.        ],
       ...,
       [ 1.11555373, -0.92052713,  1.52302674, ...,  1.        ,
         0.        ,  0.        ],
       [ 1.76782147, -0.2277552 , -0.38377708, ...,  1.        ,
         0.        ,  0.        ],
       [ 0.81906839, -0.95867683, -0.73836105, ...,  1.        ,
         0.        ,  0.        ]])