In [None]:
import numpy as np
import pandas as pd
import random

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objs as go
import plotly.subplots as sp

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn import metrics

from plotly.subplots import make_subplots

import os
# Walk the Kaggle input directory and print the full path of every file found,
# so the available datasets are visible at the top of the notebook.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

/kaggle/input/bigmart-sales-data/Train.csv
/kaggle/input/bigmart-sales-data/Test.csv


In [None]:
# Read the BigMart training set into `df` and preview its first five rows.
file_name = "/kaggle/input/bigmart-sales-data/Train.csv"
df = pd.read_csv(filepath_or_buffer=file_name)
df.head(n=5)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [None]:
# Dataset dimensions as a (number of rows, number of columns) tuple.
len(df.index), len(df.columns)

(8523, 12)

# Data Problem Statement

There are 12 different columns in the dataset:

['Item_Identifier', 'Item_Weight', 'Item_Fat_Content', 'Item_Visibility', 'Item_Type', 'Item_MRP', 'Outlet_Identifier', 'Outlet_Establishment_Year', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type', 'Item_Outlet_Sales']

Of these 12 columns, one (Item_Outlet_Sales) is the "Target" (also called "Label" or "y"): it is the value we are predicting from the other 11 columns, which are called "Inputs", "Features" or "X".

It's a regression problem as Item_Outlet_Sales is a numerical column.

In [None]:
# Preview the first five rows of the raw data before any cleaning.
df.head(n=5)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


# Data Cleaning

In [None]:
# Number of missing values in each column.
missing_mask = df.isnull()
missing_mask.sum()

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

In [None]:
# Fill the missing Item_Weight entries with the mean of the observed weights,
# then re-count the missing values per column to confirm the column is complete.
mean_item_weight = df["Item_Weight"].mean()
df["Item_Weight"] = df["Item_Weight"].fillna(mean_item_weight)
df.isnull().sum()

Item_Identifier                 0
Item_Weight                     0
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

In [None]:
# Impute the missing Outlet_Size entries with a single category drawn at random
# from the sizes that are actually observed in the column.
categories = df['Outlet_Size'].dropna().unique()

# Use a dedicated, seeded generator so the drawn category is the same after every
# "Restart & Run All", without touching the global `random` state.
# NOTE(review): the category that gets picked depends on this seed; every missing
# value receives that same category.
outlet_size_fill = random.Random(42).choice(list(categories))

# Assign the result back rather than calling fillna(..., inplace=True) on the
# column selection: in-place fills on a selected column are deprecated in
# pandas 2.x and do not update `df` when Copy-on-Write is enabled.
df['Outlet_Size'] = df['Outlet_Size'].fillna(outlet_size_fill)

# Re-count the missing values per column to confirm nothing is left to impute.
df.isna().sum()

Item_Identifier              0
Item_Weight                  0
Item_Fat_Content             0
Item_Visibility              0
Item_Type                    0
Item_MRP                     0
Outlet_Identifier            0
Outlet_Establishment_Year    0
Outlet_Size                  0
Outlet_Location_Type         0
Outlet_Type                  0
Item_Outlet_Sales            0
dtype: int64

# Data Stats & Distribution

In [None]:
# Preview the first five rows again now that the missing values have been imputed.
df.head(n=5)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,Small,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [None]:
# Partition the column names by dtype: numeric dtypes go to `numerical_columns`,
# everything else is treated as categorical.
column_types = df.dtypes

numerical_columns = [
    name for name, dtype in column_types.items()
    if pd.api.types.is_numeric_dtype(dtype)
]
categorical_columns = [
    name for name, dtype in column_types.items()
    if not pd.api.types.is_numeric_dtype(dtype)
]

print("Numerical columns = ", numerical_columns)
print("Categorical columns = ", categorical_columns)

Numerical columns =  ['Item_Weight', 'Item_Visibility', 'Item_MRP', 'Outlet_Establishment_Year', 'Item_Outlet_Sales']
Categorical columns =  ['Item_Identifier', 'Item_Fat_Content', 'Item_Type', 'Outlet_Identifier', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type']


In [None]:
# Summary statistics of the numerical columns, displayed with six decimal places.
pd.options.display.float_format = '{:.6f}'.format
selected_columns = df[numerical_columns]
n_statistics = selected_columns.describe()
n_statistics

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Item_Outlet_Sales
count,8523.0,8523.0,8523.0,8523.0,8523.0
mean,12.857645,0.066132,140.992782,1997.831867,2181.288914
std,4.226124,0.051598,62.275067,8.37176,1706.499616
min,4.555,0.0,31.29,1985.0,33.29
25%,9.31,0.026989,93.8265,1987.0,834.2474
50%,12.857645,0.053931,143.0128,1999.0,1794.331
75%,16.0,0.094585,185.6437,2004.0,3101.2964
max,21.35,0.328391,266.8884,2009.0,13086.9648


In [None]:
# Summary of the categorical columns (the count / unique / top / freq rows).
pd.options.display.float_format = '{:.6f}'.format
selected_columns = df[categorical_columns]
c_statistics = selected_columns.describe()
c_statistics

Unnamed: 0,Item_Identifier,Item_Fat_Content,Item_Type,Outlet_Identifier,Outlet_Size,Outlet_Location_Type,Outlet_Type
count,8523,8523,8523,8523,8523,8523,8523
unique,1559,5,16,10,3,3,4
top,FDW13,Low Fat,Fruits and Vegetables,OUT027,Small,Tier 3,Supermarket Type1
freq,10,5089,1232,935,4798,3350,5577


In [None]:

# Bar charts of the summary statistics of every categorical column.
# Only the numeric rows are plotted: 'count' is dropped as before, and 'top' is
# dropped as well because it holds a category label (a string), which cannot be
# drawn as a bar height. The remaining rows are cast to float for plotting.
all_column_stats = c_statistics.drop(['count', 'top']).astype(float)

# Lay the subplots out three per row, with only as many rows as are needed
# (a fixed 3x3 grid fails as soon as there are more than nine columns).
n_cols = 3
n_rows = max(1, -(-len(all_column_stats.columns) // n_cols))  # ceiling division
fig = sp.make_subplots(
    rows=n_rows,
    cols=n_cols,
    subplot_titles=[str(name) for name in all_column_stats.columns],
)

# One subplot per column, filled row by row.
for i, column in enumerate(all_column_stats.columns):
    row_num = i // n_cols + 1
    col_num = i % n_cols + 1

    # Statistics of the current column (index = statistic name).
    column_stats = all_column_stats[column]

    fig.add_trace(
        go.Bar(
            x=column_stats.index,
            y=column_stats.values,
            marker_color=px.colors.qualitative.Set3,
            texttemplate='%{y:.2s}',
            textposition='outside',
        ),
        row=row_num,
        col=col_num,
    )

    # Label the y-axis of *this* subplot. update_layout(yaxis_title=...) only
    # addresses the first subplot's axis, leaving all the others unlabeled.
    fig.update_yaxes(title_text='Values', row=row_num, col=col_num)

# Figure-wide layout; the height scales with the number of subplot rows.
fig.update_layout(showlegend=False, height=300 * n_rows, width=900, title_text="Descriptive Statistics")
fig.show()

In [None]:

# Bar charts of the summary statistics of every numerical column.
# 'count', 'min' and 'max' are excluded, leaving the mean, the standard
# deviation and the quartiles.
all_column_stats = n_statistics.drop(['count', 'min', 'max'])

# Lay the subplots out three per row, with only as many rows as are needed,
# instead of a fixed 3x3 grid that leaves empty rows or overflows.
n_cols = 3
n_rows = max(1, -(-len(all_column_stats.columns) // n_cols))  # ceiling division
fig = sp.make_subplots(
    rows=n_rows,
    cols=n_cols,
    subplot_titles=[str(name) for name in all_column_stats.columns],
)

# One subplot per column, filled row by row.
for i, column in enumerate(all_column_stats.columns):
    row_num = i // n_cols + 1
    col_num = i % n_cols + 1

    # Statistics of the current column (index = statistic name).
    column_stats = all_column_stats[column]

    fig.add_trace(
        go.Bar(
            x=column_stats.index,
            y=column_stats.values,
            marker_color=px.colors.qualitative.Set3,
            texttemplate='%{y:.2s}',
            textposition='outside',
        ),
        row=row_num,
        col=col_num,
    )

    # Label the y-axis of *this* subplot. update_layout(yaxis_title=...) only
    # addresses the first subplot's axis, leaving all the others unlabeled.
    fig.update_yaxes(title_text='Values', row=row_num, col=col_num)

# Figure-wide layout; the height scales with the number of subplot rows.
fig.update_layout(showlegend=False, height=300 * n_rows, width=900, title_text="Descriptive Statistics")
fig.show()

In [None]:
# Descriptive statistics straight from the DataFrame. With its default
# arguments, describe() on a mixed-dtype frame summarises the numeric columns only.
all_column_stats = df.describe()

# Exclude 'count', 'min' and 'max', leaving the mean, the standard deviation
# and the quartiles.
all_column_stats = all_column_stats.drop(['count', 'min', 'max'])

# Lay the subplots out three per row, with only as many rows as are needed,
# instead of a fixed 3x3 grid that leaves empty rows or overflows.
n_cols = 3
n_rows = max(1, -(-len(all_column_stats.columns) // n_cols))  # ceiling division
fig = sp.make_subplots(
    rows=n_rows,
    cols=n_cols,
    subplot_titles=[str(name) for name in all_column_stats.columns],
)

# One subplot per column, filled row by row.
for i, column in enumerate(all_column_stats.columns):
    row_num = i // n_cols + 1
    col_num = i % n_cols + 1

    # Statistics of the current column (index = statistic name).
    column_stats = all_column_stats[column]

    fig.add_trace(
        go.Bar(
            x=column_stats.index,
            y=column_stats.values,
            marker_color=px.colors.qualitative.Set3,
            texttemplate='%{y:.2s}',
            textposition='outside',
        ),
        row=row_num,
        col=col_num,
    )

    # Label the y-axis of *this* subplot. update_layout(yaxis_title=...) only
    # addresses the first subplot's axis, leaving all the others unlabeled.
    fig.update_yaxes(title_text='Values', row=row_num, col=col_num)

# Figure-wide layout; the height scales with the number of subplot rows.
fig.update_layout(showlegend=False, height=300 * n_rows, width=900, title_text="Descriptive Statistics")
fig.show()

In [None]:
# Data distribution: one histogram per column, arranged on a 3x3 grid.
# Only the first nine columns of `df` are plotted.
columns = list(df.columns)[:9]

fig = sp.make_subplots(rows=3, cols=3, subplot_titles=[str(name) for name in columns])

for position, name in enumerate(columns):
    # Fill the grid row by row; subplot rows and columns are 1-based.
    fig.add_trace(
        go.Histogram(x=df[name], name=name),
        row=position // 3 + 1,
        col=position % 3 + 1,
    )

fig.update_layout(height=900, width=950, title_text="Data Distribution")
# Tilt the x tick labels by 45 degrees on every subplot.
fig.update_xaxes(tickangle=45)

fig.show()

# Data Visualization

In [None]:
df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,Small,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [None]:
# Bubble scatter plot: Item_Weight on x, Item_Visibility on y, marker size taken
# from Outlet_Establishment_Year and marker colour from Item_Outlet_Sales.
fig = px.scatter(
    df,
    x='Item_Weight',
    y='Item_Visibility',
    size='Outlet_Establishment_Year',
    size_max=40,
    color='Item_Outlet_Sales',
    color_continuous_scale='jet',
    opacity=0.4,
    title='Sales Price Scatterplot',
    labels={'Item_Outlet_Sales': 'Item Outlet Sales'},
)

# Human-readable title for the colour bar.
fig.update_layout(coloraxis_colorbar={'title': 'Item Outlet Sales'})
fig.show()

In [None]:
#Item_Outlet_Sales_by_op = df['Item_Outlet_Sales'][:416]
#Item_Outlet_Sales_by_op


In [None]:
# Preview the first five rows of the data used for the chart below.
df.head(n=5)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,Small,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [None]:
# Bar chart of the average Item_Outlet_Sales per outlet establishment year.
mean_sales_by_year = df.groupby("Outlet_Establishment_Year")['Item_Outlet_Sales'].mean()

# The previous (misleading) variable name is kept as an alias in case other
# cells of the notebook still reference it.
mean_house_value_by_op = mean_sales_by_year

# x = establishment year, y = mean sales; bars are coloured by year as well.
# The former labels={'Value': 'Value'} mapping matched no plotted field and was dropped.
fig = px.bar(
    x=mean_sales_by_year.index,
    y=mean_sales_by_year.values,
    color=mean_sales_by_year.index,
    title="Average item outlet sales",
    template='plotly_white',
)

# Axis titles now describe what is actually plotted (both used to read 'Item Visibility').
fig.update_layout(xaxis_title='Outlet Establishment Year', yaxis_title='Average Item Outlet Sales')
# Print each bar's value above it, using two significant digits with an SI prefix.
fig.update_traces(texttemplate='%{y:.2s}', textposition='outside')
fig.show()