In [2]:
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup
import pandas as pd

In [3]:
# utility to print any error passed
def log_error(e):
    print(e)

In [4]:
# check if the response returned is good or not
def is_good_response(resp):
    content_type = resp.headers['Content-Type'].lower()
    return (resp.status_code == 200 and content_type is not None and content_type.find('html') > -1)

In [5]:
# utility to get html content from a given url
def simple_get(url):
    try:
        with closing(get(url, stream=True)) as resp:
            if is_good_response(resp):
                return resp.content
            else :
                return None
    except RequestException as re:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None

In [6]:
# JioMart data scraping - Dals and Pulses

raw_html_dp = simple_get('https://www.jiomart.com/category/staples/dals-pulses')

# get html parsed using Beautiful soup
html_soup = BeautifulSoup(raw_html_dp, 'html.parser')

# print(html_soup)

In [7]:
containers = html_soup.findAll("div", {"class" : "col-md-3 p-0"})
# print (BeautifulSoup.prettify(containers[0]))

In [8]:
## we can write the data to a csv or json file

#filename = "JioMart_1.csv"
#f = open(filename, "w")
#headers = "Product_Name,Product_Quantity,MRP_Price,Offer_Price,Vendor,Category\n"
#f.write(headers)

In [9]:
ProductNamesList = []
ProductQtyList = []
MRP_List = []
Offer_Price_List = []
Vendor_List = []
Category_List = []

for container in containers:
    product = container.div.img["alt"]
    ProductNamesList.append(product)
    
    price_container = container.findAll("strike", {"id" : "price"})
    price = price_container[0].text.strip()
    MRP_List.append(price)
    
    final_price_container = container.findAll("span", {"id" : "final_price"})
    final_price = final_price_container[0].text.strip()
    Offer_Price_List.append(final_price)
    
    #print ("Product:" + product)
    #print ("Price:" + price)
    #print ("Final Price:" + final_price)
    
    Vendor_List.append('JioMart')
    Category_List.append('Dals-Vegetables')
    
#     print (product + "," + price + "," + final_price + ",JioMart" + ",Dals-Vegetables" +"\n")
#     f.write(product + "," + price + "," + final_price + ",JioMart" + ",Dals-Vegetables" +"\n")

    
# print('ProductNamesList =>', ProductNamesList)
# print('MRP_List =>', MRP_List)
# print('Offer_Price_List =>', Offer_Price_List)
# print('Vendor_List =>', Vendor_List)
# print('Category_List =>', Category_List)

# f.close()

In [10]:
import pandas as pd

table_dict = { 'Product_Name' : ProductNamesList,
                'MRP_Price' : MRP_List,
                'Offer_Price' : Offer_Price_List,
                'Vendor' : Vendor_List,
                'Category' : Category_List}

df = pd.DataFrame(table_dict)

df.head()

Unnamed: 0,Product_Name,MRP_Price,Offer_Price,Vendor,Category
0,Tur / Arhar Dal 2 kg,₹ 280.00,₹ 190.00,JioMart,Dals-Vegetables
1,Raw Peanuts 1 Kg (Loose),₹ 127.00,₹ 125.00,JioMart,Dals-Vegetables
2,Good Life Tur Dal 1 kg,₹ 137.00,₹ 108.00,JioMart,Dals-Vegetables
3,Good Life Tur Dal Oily 1 kg,₹ 133.00,₹ 116.00,JioMart,Dals-Vegetables
4,Good Life Moong Dal Chilka 500 g,₹ 84.00,₹ 71.00,JioMart,Dals-Vegetables


In [11]:
df.shape

(20, 5)

In [12]:
price_qty_mixed=df.Product_Name.str.split(expand=True,)
print(price_qty_mixed)
print(type(price_qty_mixed))

        0        1        2        3           4     5       6      7     8  \
0     Tur        /    Arhar      Dal           2    kg    None   None  None   
1     Raw  Peanuts        1       Kg     (Loose)  None    None   None  None   
2    Good     Life      Tur      Dal           1    kg    None   None  None   
3    Good     Life      Tur      Dal        Oily     1      kg   None  None   
4    Good     Life    Moong      Dal      Chilka   500       g   None  None   
5   Loose     Toor      Dal    Value           1    kg    None   None  None   
6    Pick        N     Cook  Premium       Moong   Dal       1     kg  None   
7    Good     Life     Urad      Dal         500     g    None   None  None   
8    Good     Life     Urad      Dal      Chilka   500       g   None  None   
9   Loose    Chana      Dal        1          kg  None    None   None  None   
10   Pick        N     Cook     Maha        Toor   Dal       1     kg  None   
11   Good     Life    Chana      Dal         500    

In [13]:
qty_df = df.Product_Name.str.extract('(\d+)')

df['Product_Quantity'] = qty_df

In [14]:
df['Product_Name'] = df['Product_Name'].str[:16]

In [15]:
import numpy as np

newdf = df[['Product_Name', 'Product_Quantity', 'MRP_Price', 'Offer_Price', 'Vendor', 'Category']].copy()

newdf['Product_Quantity'] = pd.to_numeric(df['Product_Quantity'])

# newdf['Product_Quantity'] = newdf['Product_Quantity'].filter().div(1000)

a = np.array(newdf['Product_Quantity'].values.tolist())
# print (a)

newdf['Product_Quantity'] = np.where(a > 100, a/1000, a).tolist()

newdf['MRP_Price'] = newdf['MRP_Price'].map(lambda x: x.lstrip('₹').strip())
newdf['Offer_Price'] = newdf['Offer_Price'].map(lambda x: x.lstrip('₹').strip())

newdf.head()
newdf.describe()

newdf['MRP_Price'] = pd.to_numeric(newdf['MRP_Price'])
newdf['Offer_Price(Rs)'] = pd.to_numeric(newdf['Offer_Price'])


### Exploratory Data Analysis

In [16]:
## Observations:
## With a quick look we can observer the following:
## 1. There is one dependent variable(Y) or Response which is column 'Cost' 
## 2. There are 5 independent variables or features as Height', Width', 'Length', 'Weight', 'Weight1'
## 3. The given Regression problem is Multi-Variate Regression. 
## 4. The Linear Regression equation is of form: Y = α + β1X1 + β2X2 + β3X3
## The selection of features plays the most important role in multivariate regression

In [17]:
newdf.head()

Unnamed: 0,Product_Name,Product_Quantity,MRP_Price,Offer_Price,Vendor,Category,Offer_Price(Rs)
0,Tur / Arhar Dal,2.0,280.0,190.0,JioMart,Dals-Vegetables,190.0
1,Raw Peanuts 1 Kg,1.0,127.0,125.0,JioMart,Dals-Vegetables,125.0
2,Good Life Tur Da,1.0,137.0,108.0,JioMart,Dals-Vegetables,108.0
3,Good Life Tur Da,1.0,133.0,116.0,JioMart,Dals-Vegetables,116.0
4,Good Life Moong,0.5,84.0,71.0,JioMart,Dals-Vegetables,71.0


In [18]:
newdf.describe()

Unnamed: 0,Product_Quantity,MRP_Price,Offer_Price(Rs)
count,20.0,20.0,20.0
mean,0.885,116.35,95.55
std,0.465974,60.162302,43.648687
min,0.2,35.0,25.0
25%,0.5,76.75,66.75
50%,1.0,96.5,83.5
75%,1.0,138.75,122.75
max,2.0,280.0,190.0


In [19]:
newdf.shape

(20, 7)

In [20]:
newdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Product_Name      20 non-null     object 
 1   Product_Quantity  20 non-null     float64
 2   MRP_Price         20 non-null     float64
 3   Offer_Price       20 non-null     object 
 4   Vendor            20 non-null     object 
 5   Category          20 non-null     object 
 6   Offer_Price(Rs)   20 non-null     float64
dtypes: float64(3), object(4)
memory usage: 864.0+ bytes


In [21]:
newdf.MRP_Price.unique()

array([280., 127., 137., 133.,  84., 101., 226.,  80.,  79.,  68., 144.,
        49., 132., 149.,  70.,  35.,  91., 180.,  92.])

In [22]:
newdf.Offer_Price.unique()

array(['190.00', '125.00', '108.00', '116.00', '71.00', '95.00', '180.00',
       '70.00', '68.00', '61.00', '120.00', '40.00', '130.00', '129.00',
       '63.00', '56.00', '25.00', '122.00', '72.00'], dtype=object)

In [23]:
newdf.MRP_Price.value_counts()

70.0     2
92.0     1
79.0     1
127.0    1
137.0    1
133.0    1
84.0     1
101.0    1
226.0    1
80.0     1
68.0     1
180.0    1
144.0    1
49.0     1
132.0    1
149.0    1
35.0     1
91.0     1
280.0    1
Name: MRP_Price, dtype: int64

In [24]:

newdf.nunique()

Product_Name        17
Product_Quantity     4
MRP_Price           19
Offer_Price         19
Vendor               1
Category             1
Offer_Price(Rs)     19
dtype: int64

In [25]:
# check null or missing value

newdf.isnull().values.any()

False

In [26]:
newdf.groupby(['Category','Product_Name']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,Product_Quantity,MRP_Price,Offer_Price(Rs)
Category,Product_Name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Dals-Vegetables,Chana Dal 2 kg,2.0,180.0,122.0
Dals-Vegetables,Good Life Brown,1.0,91.0,70.0
Dals-Vegetables,Good Life Chana,0.5,49.0,40.0
Dals-Vegetables,Good Life Moong,1.0,176.0,143.0
Dals-Vegetables,Good Life Roaste,0.2,35.0,25.0
Dals-Vegetables,Good Life Toor D,0.5,70.0,56.0
Dals-Vegetables,Good Life Tur Da,2.0,270.0,224.0
Dals-Vegetables,Good Life Urad D,1.0,159.0,138.0
Dals-Vegetables,Loose Chana Dal,1.0,68.0,61.0
Dals-Vegetables,Loose Moong Dal,1.0,132.0,130.0


In [27]:

Q1 = newdf.quantile(0.25)
Q3 = newdf.quantile(0.75)
IQR = Q3 - Q1

print("IQR:")
print(IQR)

lower_bound = Q1 -(1.5 * IQR) 
upper_bound = Q3 +(1.5 * IQR) 

print()
print("Lower bound:")
print(lower_bound)
print()
print("Upper bound:")
print(upper_bound)

IQR:
Product_Quantity     0.5
MRP_Price           62.0
Offer_Price(Rs)     56.0
dtype: float64

Lower bound:
Product_Quantity    -0.25
MRP_Price          -16.25
Offer_Price(Rs)    -17.25
dtype: float64

Upper bound:
Product_Quantity      1.75
MRP_Price           231.75
Offer_Price(Rs)     206.75
dtype: float64


In [28]:
## removing Outliers using IQR
def remove_outlier(df, low, high):
    #low = .05
    #high = .95
    quant_df = df.quantile([low, high])
    for name in list(df.columns):
#         if is_numeric_dtype(lugg_df[name]):
        df = df[(df[name] > quant_df.loc[low, name]) 
        & (df[name] < quant_df.loc[high, name])]
    return df

In [29]:
# After removing outliers using Z-Score calculated above
newdf = newdf[(z < 3).all(axis=1)]
newdf.shape

NameError: name 'z' is not defined

In [None]:
## Finding Correlation and depicting using Heat Map

import seaborn as sb

correlation_matrix = np.corrcoef(newdf)
print("Correlation Matrix:")
print("Correlation of Cost with Weight:", correlation_matrix[0,1])
print("Correlation of Cost with Volume:", correlation_matrix[0,2])
sb.heatmap(data=correlation_matrix, annot=True)

In [None]:
# Finding Correlation between Cost and Weight
import matplotlib.pyplot as plt

fig, ax  = plt.subplots(figsize=(12, 8))
ax.scatter(newdf['Offer_Price'], newdf['MRP_Price'])
ax.set_xlabel('MRP_Price')
ax.set_ylabel('Offer_Price')
plt.show()

In [None]:
## Distribution Plot for Cost (the dependent variable)
sb.distplot(newdf['Offer_Price'])

In [None]:
## Distribution Plot for Cost (the dependent variable)
sb.distplot(newdf['MRP_Price'])

In [None]:
## Distribution Plot for Cost (the dependent variable)
sb.distplot(newdf['Product_Quantity'])