<a href="https://colab.research.google.com/github/pylabview/sales-predicctions/blob/main/Project_1_Revisited.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Imports

In [None]:
## Our standard imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as miss

## Preprocessing tools
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

## Models & evaluation metrics
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import joblib

## Colab imports
from google.colab import drive
# Setting plot styles.
# The bare 'seaborn-*' style names are deprecated in matplotlib 3.6+ (they were
# renamed to 'seaborn-v0_8-*'), which is the warning this cell used to emit.
# Prefer the renamed styles and fall back to the legacy names when the
# installed matplotlib does not ship the new ones.
seaborn_styles = [
    f'seaborn-v0_8-{variant}'
    if f'seaborn-v0_8-{variant}' in plt.style.available
    else f'seaborn-{variant}'
    for variant in ('muted', 'poster')
]
plt.style.use(['dark_background', *seaborn_styles])
# Mount Google Drive (Colab only) so the dataset path below is reachable.
drive.mount('/content/drive')

## setting random state for reproducibility
SEED = 321
np.random.seed(SEED)
## Matplotlib style
# NOTE: these styles are applied after the ones above, so any rcParams they
# share are taken from 'ggplot' / 'tableau-colorblind10'.
fav_style = ('ggplot','tableau-colorblind10')
fav_context  ={'context':'notebook', 'font_scale':1.1}
plt.style.use(fav_style)
sns.set_context(**fav_context)
# Saved figures: opaque background, cropped tightly to the drawn content.
plt.rcParams['savefig.transparent'] = False
plt.rcParams['savefig.bbox'] = 'tight'

  plt.style.use(['dark_background','seaborn-muted', 'seaborn-poster'])
  plt.style.use(['dark_background','seaborn-muted', 'seaborn-poster'])


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Loading Data

In [None]:
# Load the sales dataset from the mounted Google Drive and preview the first rows.
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Assignments/Data/sales_predictions.csv")
# Fixed: a stray trailing "a" after head() made this cell a SyntaxError.
df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


## Cleaning Data

In [None]:
# Count fully duplicated rows to decide whether any need to be dropped.
duplicate_mask = df.duplicated()
dups = duplicate_mask.sum()
print(f"Number of duplicates: {dups}")

Number of duplicates: 0


In [None]:
# 4) Identify missing values.
# Count the NaNs in every column once and reuse the result.
total_missing_series = df.isna().sum()
total_missing_series_idx = total_missing_series.index
total_cols = len(total_missing_series_idx)

# Walk the (column label, NaN count) pairs directly. Using integer keys on this
# string-labelled Series relies on positional fallback, which pandas deprecates.
# The labels printed here are column names of df.
for col_name, n_missing in total_missing_series.items():
  if n_missing > 0:
    print(f"Row index {col_name}, number of missing {n_missing}")

Row index Item_Weight, number of missing 1463
Row index Outlet_Size, number of missing 2410


In [None]:
# Imputing missing Item_Weight values with the column mean (the column is a float).
# Assign the filled column back instead of calling fillna(inplace=True) on the
# selected column: that chained in-place form is not guaranteed to update df
# (pandas warns about it and it is a no-op under Copy-on-Write).
df['Item_Weight'] = df['Item_Weight'].fillna(df['Item_Weight'].mean())

# Dropping Outlet_Size: more than 5% of its values are missing, and the author
# judged that this column should not affect the food sales prediction.
# errors='ignore' keeps the cell re-runnable once the column is already gone.
df = df.drop(columns=['Outlet_Size'], errors='ignore')



In [None]:
# 6) Verify that no missing values remain after the imputation/drop step.
remaining_missing = df.isna().sum().sum()
print(f"Are All missing values removed? {remaining_missing==0}")

Are All missing values removed? True


In [None]:
# 7) Find and fix any inconsistent categories of data
## Item_Fat_Content: list the distinct labels to spot inconsistent spellings.

df.loc[:, 'Item_Fat_Content'].unique()

array(['Low Fat', 'Regular', 'low fat', 'LF', 'reg'], dtype=object)

In [None]:
# Replacing inconsistent values: 'low fat', 'LF', 'reg'
# The replacement is limited to Item_Fat_Content; calling replace on the whole
# DataFrame would also rewrite any matching value found in another column.
fat_content_fixes = {'low fat': 'Low Fat',
                     'LF': 'Low Fat',
                     'reg': 'Regular'}
df['Item_Fat_Content'] = df['Item_Fat_Content'].replace(fat_content_fixes)

In [None]:
# Checking that only the normalized labels remain in Item_Fat_Content.

pd.unique(df['Item_Fat_Content'])

array(['Low Fat', 'Regular'], dtype=object)

In [None]:
## Fixing Data Type for Outlet_Establishment_Year: numeric year -> datetime64
# format='%Y' makes pandas parse each value as a calendar year. Without it,
# numeric input is read as nanoseconds since the Unix epoch, so 1999 would
# become 1970-01-01 00:00:00.000001999 instead of 1999-01-01.

df['Outlet_Establishment_Year'] = pd.to_datetime(df['Outlet_Establishment_Year'], format='%Y')

In [None]:
# 8) Summary statistics (count, mean, std, min, quartiles, max) for the columns
# that describe() reports by default.
numeric_summary = df.describe()
numeric_summary

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Item_Outlet_Sales
count,8523.0,8523.0,8523.0,8523.0
mean,12.857645,0.066132,140.992782,2181.288914
std,4.226124,0.051598,62.275067,1706.499616
min,4.555,0.0,31.29,33.29
25%,9.31,0.026989,93.8265,834.2474
50%,12.857645,0.053931,143.0128,1794.331
75%,16.0,0.094585,185.6437,3101.2964
max,21.35,0.328391,266.8884,13086.9648


<a name="3"></a>
# 📊 Project 1 - Part 3