In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
## Walk the input directory lazily and print the full path of every file found
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Importing Libraries

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

## Importing Data File and Checking the Data

In [None]:
## Read the Big Mart sales CSV into a DataFrame and preview it
DATA_PATH = '../input/big-mart-data/bigmart_data.csv'
df = pd.read_csv(DATA_PATH)

df.head()

## Data Preprocessing

In [None]:
## Share of missing values per column, in percent
## (the mean of the boolean null mask is the fraction of missing entries)

df.isna().mean() * 100

In [None]:
## As seen above, Item_Weight and Outlet_Size are the two columns with missing values.
## Item_Weight has only about 17% of its values missing, so we can fill them in based on
## the distribution of the observed data.
## To decide how, compare the mean and the median (50%) in the summary statistics below.

df.describe()

In [None]:
## The mean and median of Item_Weight are close, so the distribution is roughly symmetric
## and the mean is a reasonable replacement for the missing values.
## The mean is computed from the column itself instead of being hard-coded (previously 12.6),
## so the fill value always matches the data that was actually loaded.

df["Item_Weight"] = df["Item_Weight"].fillna(df["Item_Weight"].mean())

In [None]:
## Re-check the share of missing values per column (in percent) after the imputation

missing_share = df.isnull().sum() / len(df)
missing_share * 100

In [None]:
## Outlet_Size has about 28% of its values missing, so the whole column is dropped

df = df.drop(columns=["Outlet_Size"])

In [None]:
## Preview the first rows after dropping Outlet_Size
df.head()

In [None]:
## Item_Identifier and Outlet_Identifier are treated as pure identifiers that add no
## value for predicting sales, so both columns are removed

identifier_columns = ["Item_Identifier", "Outlet_Identifier"]
df = df.drop(columns=identifier_columns)

In [None]:
## Preview the first rows after dropping the identifier columns
df.head()

In [None]:
## Inspect the dtype of each remaining column
df.dtypes

In [None]:
## Count how many rows fall into each distinct Outlet_Establishment_Year value
df["Outlet_Establishment_Year"].value_counts()

In [None]:
## Treat the establishment year as a categorical label rather than a number
year_column = "Outlet_Establishment_Year"
df[year_column] = df[year_column].astype("category")

In [None]:
## Confirm that Outlet_Establishment_Year is now of category dtype
df.dtypes

In [None]:
## Summary statistics of the numerical columns
df.describe()

In [None]:
## From the table above, the mean and median of Item_Visibility differ, which suggests a skewed
## distribution. A log(1 + x) transformation is applied to reduce that skew.
## Note: the column is overwritten, so re-running this cell applies the transformation again.

visibility = df["Item_Visibility"]
df["Item_Visibility"] = np.log1p(visibility)

In [None]:
## Summary statistics again, now that Item_Visibility has been log-transformed
df.describe()

In [None]:
## Preview the first rows after the Item_Visibility transformation
df.head()

In [None]:
## Column dtypes, used to tell the continuous columns from the categorical ones
df.dtypes

In [None]:
## After working with the continuous variables, check the values of the categorical variables.
## The column is passed by keyword: seaborn's countplot takes x as a keyword-only argument,
## so passing the Series positionally is deprecated or misinterpreted in recent versions.

sns.countplot(data=df, x="Item_Fat_Content")

In [None]:
## As we can see above, "Low Fat" and "Regular" are stored in the data in different ways.
## Map every alias to its canonical label so the values are uniform.
## Series.replace with a dict matches whole values, unlike .str.replace, which rewrites
## substrings (and was regex-based by default in older pandas versions).

fat_content_aliases = {"LF": "Low Fat", "low fat": "Low Fat", "reg": "Regular"}
df["Item_Fat_Content"] = df["Item_Fat_Content"].replace(fat_content_aliases)

In [None]:
## Re-plot Item_Fat_Content to check the labels after the clean-up.
## x is passed by keyword because seaborn's countplot does not accept it positionally.
sns.countplot(data=df, x="Item_Fat_Content")

In [None]:
## Checking other categorical variables
## x is passed by keyword because seaborn's countplot does not accept it positionally.
chart = sns.countplot(data=df, x="Item_Type")
## Rotate the existing tick labels in place so the category names do not overlap;
## the trailing semicolon hides the return value of plt.setp in the cell output.
plt.setp(chart.get_xticklabels(), rotation=45, horizontalalignment='right');

In [None]:
## Distribution of Outlet_Location_Type.
## x is passed by keyword because seaborn's countplot does not accept it positionally.
sns.countplot(data=df, x="Outlet_Location_Type")

In [None]:
## Distribution of Outlet_Type.
## x is passed by keyword because seaborn's countplot does not accept it positionally.
chart = sns.countplot(data=df, x="Outlet_Type")
## Rotate the existing tick labels in place so the category names do not overlap;
## the trailing semicolon hides the return value of plt.setp in the cell output.
plt.setp(chart.get_xticklabels(), rotation=45, horizontalalignment='right');

In [None]:
## Looking at the distribution of the categorical variables, we are now sure that the variables are correctly labelled
## We are also able to confirm that the data is spread across the labels and is not biased
## We can now begin the process of separating the numerical and categorical variables

df.dtypes

In [None]:
## Separate the float64 columns (numerical) from all remaining columns (categorical)
FLOAT_DTYPE = "float64"
df_num, df_cat = (
    df.select_dtypes(include=FLOAT_DTYPE),
    df.select_dtypes(exclude=FLOAT_DTYPE),
)

In [None]:
## Preview the float64 (numerical) columns
df_num.head()

In [None]:
## Preview the non-float64 (categorical) columns
df_cat.head()

In [None]:
## Remove the dependent variable from the numerical feature frame

target_column = "Item_Outlet_Sales"
df_num = df_num.drop(columns=target_column)

In [None]:
## Preview the numerical features after removing Item_Outlet_Sales
df_num.head()

In [None]:
## Preprocess the numerical dataframe with min-max scaling.
## The scaler is fitted on the training rows only, so the minimum/maximum of the held-out
## rows does not leak into preprocessing. train_test_split chooses rows purely from the number
## of samples and random_state, so splitting the row positions here with the same
## test_size/random_state as the modelling split further down (0.2 and 1) selects the same
## training rows. Keep the two calls in sync if either value changes.
## Consequence: scaled values of test rows may fall slightly outside [0, 1].

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

train_positions, _test_positions = train_test_split(
    np.arange(len(df_num)), test_size=0.2, random_state=1
)
mn = MinMaxScaler()
mn.fit(df_num.iloc[train_positions])
df_mn = mn.transform(df_num)

In [None]:
## Wrap the scaled array in a DataFrame that reuses df_num's row index and column names
df_num_df = pd.DataFrame(
    data=df_mn,
    index=df_num.index,
    columns=df_num.columns,
)

In [None]:
## Preview the scaled numerical features
df_num_df.head()

In [None]:
## One-hot encode the categorical variables; drop_first=True omits the first level of
## each variable from the dummy columns

df_cat_df = pd.get_dummies(data=df_cat, drop_first=True)

In [None]:
## Preview the one-hot encoded categorical features
df_cat_df.head()

In [None]:
## Place the scaled numerical and encoded categorical features side by side
feature_frames = [df_num_df, df_cat_df]
df_final = pd.concat(objs=feature_frames, axis="columns")

In [None]:
## Preview the combined feature matrix
df_final.head()

In [None]:
## Feature matrix (x) and target vector (y) for the regression
x, y = df_final, df["Item_Outlet_Sales"]

In [None]:
## Splitting the data into training and test sets (80% / 20%, fixed seed for reproducibility)

from sklearn.model_selection import train_test_split

split_options = {"test_size": 0.2, "random_state": 1}
X_train, X_test, y_train, y_test = train_test_split(x, y, **split_options)

## Building the Linear Regression Model

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
## Create the linear regression estimator and fit it on the training split
lr = LinearRegression()
lr.fit(X=X_train, y=y_train)

In [None]:
## Predict the target for the held-out test features
pred = lr.predict(X_test)

## Testing Linear Regression Model

In [None]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [None]:
## Coefficient of determination (R2) of the predictions on the test set
r2_score(y_true=y_test, y_pred=pred)

In [None]:
## Mean absolute error of the predictions on the test set
mean_absolute_error(y_true=y_test, y_pred=pred)

In [None]:
## Mean squared error of the predictions on the test set
mean_squared_error(y_true=y_test, y_pred=pred)

In [None]:
## The R2 score (about 0.55 in the original run) tells us that the model explains roughly 55% of the variance in the dependent variable Item_Outlet_Sales on the test set