## Importing Libraries 

In [43]:
import pandas as pd
from scipy.stats import pearsonr
from scipy.stats import spearmanr
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

## Reading Dataset


In [44]:
df = pd.read_csv("/Users/Sai Kumar/code asylum/datasets.csv")

In [45]:
df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


## Data Cleaning

In [46]:
df.isnull().sum()

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

In [47]:
df['Item_Weight'].fillna(df['Item_Weight'].mean(), inplace=True)

In [48]:
df.dtypes

Item_Identifier               object
Item_Weight                  float64
Item_Fat_Content              object
Item_Visibility              float64
Item_Type                     object
Item_MRP                     float64
Outlet_Identifier             object
Outlet_Establishment_Year      int64
Outlet_Size                   object
Outlet_Location_Type          object
Outlet_Type                   object
Item_Outlet_Sales            float64
dtype: object

In [49]:
df.select_dtypes(include=['object']).columns

Index(['Item_Identifier', 'Item_Fat_Content', 'Item_Type', 'Outlet_Identifier',
       'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type'],
      dtype='object')

In [50]:
for i in df.select_dtypes(include=['object']).columns:
    print((df[i].value_counts()/len(df))*100)

FDW13    0.117330
FDG33    0.117330
FDU12    0.105597
FDW49    0.105597
FDV38    0.105597
           ...   
FDO33    0.011733
DRF48    0.011733
FDT35    0.011733
FDN52    0.011733
FDK57    0.011733
Name: Item_Identifier, Length: 1559, dtype: float64
Low Fat    59.709023
Regular    33.896515
LF          3.707615
reg         1.372756
low fat     1.314091
Name: Item_Fat_Content, dtype: float64
Fruits and Vegetables    14.455004
Snack Foods              14.079549
Household                10.676992
Frozen Foods             10.043412
Dairy                     8.001877
Canned                    7.614690
Baking Goods              7.602957
Health and Hygiene        6.101138
Soft Drinks               5.221166
Meat                      4.986507
Breads                    2.944972
Hard Drinks               2.510853
Others                    1.982870
Starchy Foods             1.736478
Breakfast                 1.290625
Seafood                   0.750909
Name: Item_Type, dtype: float64
OUT027    10.9

In [51]:
df['Item_Fat_Content'] = df['Item_Fat_Content'].replace({'LF': 'Low Fat', 'reg': 'Regular', 'low fat':'Low Fat'})

In [52]:
df['Item_Fat_Content'].value_counts()

Low Fat    5517
Regular    3006
Name: Item_Fat_Content, dtype: int64

In [53]:
for i in df.select_dtypes(include=['object']).columns:
    print((df[i].value_counts()/len(df))*100)

FDW13    0.117330
FDG33    0.117330
FDU12    0.105597
FDW49    0.105597
FDV38    0.105597
           ...   
FDO33    0.011733
DRF48    0.011733
FDT35    0.011733
FDN52    0.011733
FDK57    0.011733
Name: Item_Identifier, Length: 1559, dtype: float64
Low Fat    64.730729
Regular    35.269271
Name: Item_Fat_Content, dtype: float64
Fruits and Vegetables    14.455004
Snack Foods              14.079549
Household                10.676992
Frozen Foods             10.043412
Dairy                     8.001877
Canned                    7.614690
Baking Goods              7.602957
Health and Hygiene        6.101138
Soft Drinks               5.221166
Meat                      4.986507
Breads                    2.944972
Hard Drinks               2.510853
Others                    1.982870
Starchy Foods             1.736478
Breakfast                 1.290625
Seafood                   0.750909
Name: Item_Type, dtype: float64
OUT027    10.970316
OUT013    10.935117
OUT035    10.911651
OUT049    10.9116

In [54]:
for i in df.select_dtypes(include=['float64', 'int64']).columns:
    print((df[i].describe()))

count    8523.000000
mean       12.857645
std         4.226124
min         4.555000
25%         9.310000
50%        12.857645
75%        16.000000
max        21.350000
Name: Item_Weight, dtype: float64
count    8523.000000
mean        0.066132
std         0.051598
min         0.000000
25%         0.026989
50%         0.053931
75%         0.094585
max         0.328391
Name: Item_Visibility, dtype: float64
count    8523.000000
mean      140.992782
std        62.275067
min        31.290000
25%        93.826500
50%       143.012800
75%       185.643700
max       266.888400
Name: Item_MRP, dtype: float64
count    8523.000000
mean     1997.831867
std         8.371760
min      1985.000000
25%      1987.000000
50%      1999.000000
75%      2004.000000
max      2009.000000
Name: Outlet_Establishment_Year, dtype: float64
count     8523.000000
mean      2181.288914
std       1706.499616
min         33.290000
25%        834.247400
50%       1794.331000
75%       3101.296400
max      13086.964800
N

## Feature Engineering

In [55]:
df['Item_Outlet_Sales'].sum()

18591125.410399996

In [56]:
df['Item_Visibility'] = df['Item_Visibility'].replace({0:df['Item_Visibility'].mean()})

In [57]:
enc = OneHotEncoder(handle_unknown='ignore')
X = pd.DataFrame(enc.fit_transform(df[['Outlet_Identifier']]).toarray())

In [58]:
X[['Item_Visibility', 'Item_MRP']] = df[['Item_Visibility', 'Item_MRP']]

In [59]:
y = pd.DataFrame(df['Item_Outlet_Sales'])

## Linear Regression 

In [60]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [61]:
reg = LinearRegression().fit(X_train, y_train)

In [62]:
y_pred = reg.predict(X_test)

In [63]:
y_pred

array([[ 2484.94615606],
       [ 2534.04394227],
       [ 2224.44697141],
       ...,
       [ 4169.87612912],
       [-1004.4149807 ],
       [ 5365.85746393]])

In [64]:
y_test

Unnamed: 0,Item_Outlet_Sales
4931,1426.1436
4148,1201.7690
7423,1836.2764
4836,2410.8618
944,1549.9824
...,...
4644,3235.7880
6179,555.2772
1861,2885.5772
3598,218.3824


In [65]:
reg.score(X_train,y_train)

0.5629507174114475