# Sales prediction system using linear regression

In [1]:
#importing libraries
import numpy as np
import pandas as pd
import  matplotlib.pyplot as plt

In [2]:
#importing datasets
dataframe = pd.read_csv(r'sales_prediction_datasets/train.csv') #raw string, treat everything as it is 

In [3]:
#complete information of the dataset
dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                7060 non-null   float64
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   int64  
 8   Outlet_Size                6113 non-null   object 
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB


In [4]:
#shape of the dataset
dataframe.shape

(8523, 12)

In [5]:
dataframe.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [6]:
#converting dataframe columns to same case - lowercase
dataframe.columns = dataframe.columns.str.lower()

In [7]:
#complete information of the dataset
dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   item_identifier            8523 non-null   object 
 1   item_weight                7060 non-null   float64
 2   item_fat_content           8523 non-null   object 
 3   item_visibility            8523 non-null   float64
 4   item_type                  8523 non-null   object 
 5   item_mrp                   8523 non-null   float64
 6   outlet_identifier          8523 non-null   object 
 7   outlet_establishment_year  8523 non-null   int64  
 8   outlet_size                6113 non-null   object 
 9   outlet_location_type       8523 non-null   object 
 10  outlet_type                8523 non-null   object 
 11  item_outlet_sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB


In [8]:
#checking unique values
#how many unique values are in column
dataframe.nunique()

item_identifier              1559
item_weight                   415
item_fat_content                5
item_visibility              7880
item_type                      16
item_mrp                     5938
outlet_identifier              10
outlet_establishment_year       9
outlet_size                     3
outlet_location_type            3
outlet_type                     4
item_outlet_sales            3493
dtype: int64

In [9]:
dataframe.head()

Unnamed: 0,item_identifier,item_weight,item_fat_content,item_visibility,item_type,item_mrp,outlet_identifier,outlet_establishment_year,outlet_size,outlet_location_type,outlet_type,item_outlet_sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [10]:
#checking null values
#how many null values are in column and in which columns
dataframe.isna().sum()

item_identifier                 0
item_weight                  1463
item_fat_content                0
item_visibility                 0
item_type                       0
item_mrp                        0
outlet_identifier               0
outlet_establishment_year       0
outlet_size                  2410
outlet_location_type            0
outlet_type                     0
item_outlet_sales               0
dtype: int64

### Handling missing values

In [11]:
#handling missing values for item_weight
dataframe['item_type'].unique()

array(['Dairy', 'Soft Drinks', 'Meat', 'Fruits and Vegetables',
       'Household', 'Baking Goods', 'Snack Foods', 'Frozen Foods',
       'Breakfast', 'Health and Hygiene', 'Hard Drinks', 'Canned',
       'Breads', 'Starchy Foods', 'Others', 'Seafood'], dtype=object)

In [12]:
#number of unique values in the column item_type
dataframe['item_type'].nunique()

16

In [13]:
#Function to calculate total null values in a column according to the unique values in the column
#Best suitable for categorical values

column_name = dataframe['item_type'] #column name

def null_value_count_column_item(dataframe, column):
    dataframe_null = {}
    uniqueItems = column.unique()
    sNo = 1
    
    for item in uniqueItems:

        dataframe_null[item] = (dataframe[column_name == item].isna()).sum()
        print(f'{sNo}. {item}')
        print(dataframe_null[item])
        print()
        sNo+=1
    

In [14]:
#calling the function
null_value_count_column_item(dataframe, column_name)

1. Dairy
item_identifier                0
item_weight                  116
item_fat_content               0
item_visibility                0
item_type                      0
item_mrp                       0
outlet_identifier              0
outlet_establishment_year      0
outlet_size                  186
outlet_location_type           0
outlet_type                    0
item_outlet_sales              0
dtype: int64

2. Soft Drinks
item_identifier                0
item_weight                   71
item_fat_content               0
item_visibility                0
item_type                      0
item_mrp                       0
outlet_identifier              0
outlet_establishment_year      0
outlet_size                  133
outlet_location_type           0
outlet_type                    0
item_outlet_sales              0
dtype: int64

3. Meat
item_identifier                0
item_weight                   88
item_fat_content               0
item_visibility                0
item_type       

In [15]:
uniqueItems = dataframe['item_type'].unique()
uniqueItems

array(['Dairy', 'Soft Drinks', 'Meat', 'Fruits and Vegetables',
       'Household', 'Baking Goods', 'Snack Foods', 'Frozen Foods',
       'Breakfast', 'Health and Hygiene', 'Hard Drinks', 'Canned',
       'Breads', 'Starchy Foods', 'Others', 'Seafood'], dtype=object)

In [16]:
dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   item_identifier            8523 non-null   object 
 1   item_weight                7060 non-null   float64
 2   item_fat_content           8523 non-null   object 
 3   item_visibility            8523 non-null   float64
 4   item_type                  8523 non-null   object 
 5   item_mrp                   8523 non-null   float64
 6   outlet_identifier          8523 non-null   object 
 7   outlet_establishment_year  8523 non-null   int64  
 8   outlet_size                6113 non-null   object 
 9   outlet_location_type       8523 non-null   object 
 10  outlet_type                8523 non-null   object 
 11  item_outlet_sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB


In [17]:
#filling the missing values of item_weight according to the unique values from item_type respectively
uniqueItems = dataframe['item_type'].unique()
selectItem = {}
print('Mean of items in item_type: (The mean of values are different when the values were null and now when filled ) ')
print('This shows the mean after the missing values have been handled')
print()
for item in uniqueItems:
    
    selectItem = (dataframe[dataframe['item_type'] == item])
    fillItemWeight = selectItem['item_weight'].mean()
    print(f"{item} : {fillItemWeight}")
    dataframe['item_weight'] = dataframe['item_weight'].fillna(value=fillItemWeight, inplace=False)


Mean of items in item_type: (The mean of values are different when the values were null and now when filled ) 
This shows the mean after the missing values have been handled

Dairy : 13.42606890459364
Soft Drinks : 12.09932784769921
Meat : 12.943386032009979
Fruits and Vegetables : 13.259571977823414
Household : 13.3915949501029
Baking Goods : 12.47569400820137
Snack Foods : 13.065293006478209
Frozen Foods : 12.957181669198507
Breakfast : 12.893794972695146
Health and Hygiene : 13.191425387333513
Hard Drinks : 11.693776336646742
Canned : 12.495597194923421
Breads : 11.736255930342235
Starchy Foods : 13.658542164072198
Others : 13.7723917452485
Seafood : 12.730217121245584


In [18]:
#missing value from item_weight has been handled
dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   item_identifier            8523 non-null   object 
 1   item_weight                8523 non-null   float64
 2   item_fat_content           8523 non-null   object 
 3   item_visibility            8523 non-null   float64
 4   item_type                  8523 non-null   object 
 5   item_mrp                   8523 non-null   float64
 6   outlet_identifier          8523 non-null   object 
 7   outlet_establishment_year  8523 non-null   int64  
 8   outlet_size                6113 non-null   object 
 9   outlet_location_type       8523 non-null   object 
 10  outlet_type                8523 non-null   object 
 11  item_outlet_sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB


In [19]:
dataframe['item_weight'].isna().sum()

0

In [20]:
# Handling the missing values in dataframe['outlet_size']

dataframe['outlet_size'].fillna(dataframe['outlet_size'].mode()[0] , inplace=True) #mode()[0]- get mode of each column

In [21]:
dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   item_identifier            8523 non-null   object 
 1   item_weight                8523 non-null   float64
 2   item_fat_content           8523 non-null   object 
 3   item_visibility            8523 non-null   float64
 4   item_type                  8523 non-null   object 
 5   item_mrp                   8523 non-null   float64
 6   outlet_identifier          8523 non-null   object 
 7   outlet_establishment_year  8523 non-null   int64  
 8   outlet_size                8523 non-null   object 
 9   outlet_location_type       8523 non-null   object 
 10  outlet_type                8523 non-null   object 
 11  item_outlet_sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB


In [22]:
#missing value from outlet_size has been handled
dataframe['outlet_size'].isna().sum()

0

In [48]:
#checking for different unique values in 
dataframe['item_fat_content'].unique()

array([nan], dtype=object)

In [47]:
#Creating a dictionary to map all records of train[['Item_Fat_Content']  to low or regular only respectively.

item_fat={
    'Low Fat': 'low',
    'LF': 'low',
    'low fat': 'low',
    'Regular': 'regular',
    'reg': 'regular', 
}

dataframe['item_fat_content'] = dataframe['item_fat_content'].map(item_fat)

dataframe['item_fat_content'].unique()

array([nan], dtype=object)

In [25]:
# #imputing grade values form the average 
# from sklearn.impute import SimpleImputer

# imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
# imputer.fit([['Item_Weight']])

# test['Item_Weight'] = imputer.transform(test[['Item_Weight']]).ravel()
# test['Item_Weight'].isna().sum()

In [26]:
#checking for different unique values in train[['Item_Fat_Content']
# train['Item_Fat_Content'].unique()
#trainItemFat = np.unique(train[['Item_Fat_Content']].values)

In [27]:
# #Creating a dictionary to map all records of train[['Item_Fat_Content']  to low or regular only respectively.
# item_fat={
#     'Low Fat': 'low',
#     'LF': 'low',
#     'low fat': 'low',
#     'Regular': 'regular',
#     'reg': 'regular', 
# }

# train['Item_Fat_Content'] = train['Item_Fat_Content'].map(item_fat)
# test['Item_Fat_Content'] = test['Item_Fat_Content'].map(item_fat)

# train['Item_Fat_Content'].unique()

In [28]:
# train['Item_Fat_Content'].unique()

In [29]:
# train['Outlet_Size'].unique()

In [30]:
# train['Outlet_Location_Type'].unique()

In [31]:
# train['Outlet_Type'].unique()

In [32]:
# train['Item_Type'].unique()

In [33]:
# train['Outlet_Identifier'].unique()

In [34]:
# train['Item_Identifier'].nunique()

In [35]:
# train['Item_Weight'].isna().sum()

In [36]:
# train['Outlet_Size'].isna().sum()

In [37]:
# train['Outlet_Size'].isna().sum()

In [38]:
# #Handling the missing values in train['Item_Weight']

# train['Item_Weight'].fillna(train['Item_Weight'].mean(), inplace=True)

In [39]:
# train['Item_Weight'].isna().sum()

In [40]:
# #Handling the missing values in train['Outlet_Size']

# train['Outlet_Size'].fillna(train['Outlet_Size'].mode()[0] , inplace=True) #mode()[0]- get mode of each column

In [41]:
# train['Outlet_Size'].isna().sum()

In [42]:
# #checking for outliers
# train.plot(kind='box', subplots=True, layout=(1,7), figsize=(20,7))

In [43]:
# #removing outliers
# train['Item_Visibility']=train[train['Item_Visibility']<0.18]['Item_Visibility']

In [44]:
# #checking for outliers
# #Outliers in Item_Outlet_Sales are neglected as sales can sometime go high suddenly in some seasons

# train.plot(kind='box', subplots=True, layout=(1,7), figsize=(20,7))

In [45]:
# train.describe()