In [1]:
# Importing necessary packages

import requests as rqst
from bs4 import BeautifulSoup as bs
import sqlite3
import datetime as dt
import re

In [2]:
# Root URL of the site being scraped; the homepage is loaded from here.
baseUrlToBeLoaded = 'https://www.digit.in/'

In [3]:
# Creating a soup object for every HTML page loaded, so that it can be parsed

def get_soup_object(soup_link, timeout=30):
    """Download a page and parse it into a BeautifulSoup tree.

    Parameters
    ----------
    soup_link : str
        URL of the page to fetch.
    timeout : float or tuple, optional
        Seconds to wait for the server, passed straight to ``requests.get``.
        Without a timeout a stalled connection would block the whole scrape
        indefinitely.

    Returns
    -------
    bs4.BeautifulSoup
        The response body parsed with the built-in ``html.parser``.

    Raises
    ------
    requests.exceptions.RequestException
        If the request cannot be completed (including a timeout).
    """
    mobilesPage = rqst.get(soup_link, timeout=timeout)
    # The HTTP status is deliberately not checked: callers probe the parsed
    # page for the elements they need and skip pages that lack them.
    return bs(mobilesPage.text, 'html.parser')

In [None]:
# Load the homepage and collect the href of every anchor whose visible text
# is exactly 'Latest Mobiles' (the entry point to the phone listing).

homePageSoup = get_soup_object(soup_link=baseUrlToBeLoaded)
mobile_page_redirect = []
for anchor in homePageSoup.find_all('a'):
    if anchor.text.strip() == 'Latest Mobiles':
        mobile_page_redirect.append(anchor.get('href'))

In [None]:
# Getting redirect links for every mobile phone present in the website.

# Product-page URLs accumulated across every listing page that is visited.
redirects = []


def mobile_redirects(page_link):
    """Collect the product links found on one listing page.

    Every ``<div class="head-line">`` on the page is inspected and the href
    of its first anchor is appended to the module-level ``redirects`` list.

    Parameters
    ----------
    page_link : str
        URL of the listing page to load.

    Returns
    -------
    bs4.BeautifulSoup
        The parsed listing page, so the caller can read its pagination links.
    """
    pageObjects = get_soup_object(soup_link=page_link)
    for headline in pageObjects.find_all('div', class_='head-line'):
        anchor = headline.find('a')
        href = anchor.get('href') if anchor is not None else None
        # Skip headlines without a usable link: a missing anchor would raise
        # AttributeError, and a None href would break the later
        # `link + "\n"` when the list is written to disk.
        if href:
            redirects.append(href)
    return pageObjects

In [None]:
# Read the total number of phones from the listing page and derive how many
# result pages have to be visited.

RESULTS_PER_PAGE = 10  # page size the page-count calculation relies on -- TODO confirm against the site

soupObject = get_soup_object(soup_link=mobile_page_redirect[0])
no_of_results = int(soupObject.find('div', class_ = 'block-feature').find('h5').text.strip().split()[0])
# Round up: plain floor division would drop a final, partially filled page.
no_of_pages = -(-no_of_results // RESULTS_PER_PAGE)

# Walk the pagination: the link to page N is looked up in the most recently
# loaded page, that page is scraped for product links, and it then becomes
# the page in which the link to page N+1 is searched.

for page_no in range(1, no_of_pages + 1):
    page_links = [link.get('href') for link in soupObject.find_all('a', class_ = 'page-link')
                  if link.text.strip() == str(page_no)]
    if not page_links:
        # No pagination link for this page number; stop instead of failing
        # with an IndexError and losing track of what was collected.
        print("No link found for page {}; stopping after {} page(s).".format(page_no, page_no - 1))
        break
    soupObject = mobile_redirects(page_link=page_links[0])

In [None]:
# Persist the collected redirect links to a text file, one link per line.
# If the program is terminated or interrupted and the in-memory list is lost,
# the links can be reloaded from this file instead of being scraped again.

with open("redirect_links.txt", "w") as output:
    output.writelines(link + "\n" for link in redirects)

In [4]:
# Reading all redirect links back from the saved text file.

# The context manager guarantees the file handle is closed after reading.
with open("redirect_links.txt", "r") as file:
    # Each entry keeps the trailing newline exactly as stored in the file;
    # consumers are expected to strip it before using the link.
    redirect_links = list(file)

In [5]:
# Only the specifications judged important are kept as features for the model.
# The order matters: values are stored in the database in this same sequence.

specs_interested = [
    'Manufacturer',
    'Operating System',
    'Os Version',
    'Screen Size (In Inches)',
    'Display Technology',
    'Screen Resolution (In Pixels)',
    'Pixel Density (Ppi)',
    'Rear Camera Megapixel',
    'Front Camera Megapixel',
    'Battery Capacity (Mah)',
    'Support For Fast Charging',
    'Cpu Speed',
    'Processor Cores',
    'Ram',
    'Gpu',
    'Weight (In Grams)',
    'Storage',
    'Removable Storage (Maximum)',
]

In [6]:
# Open (or create) the SQLite database file and rebuild the MobileData table.
# Note: the script drops any existing MobileData table, so re-running this
# cell discards previously stored rows.

createTableScript = '''
drop table if exists MobileData;

create table MobileData(
    brandName text,
    os text,
    osVersion text,
    screenSizeInInches text,
    displayType text,
    resolutionInPixels text,
    pixelDensity text,
    rearCamera text,
    frontCamera text,
    batteryCapacity text,
    fastCharging text,
    cpuSpeed text,
    processorCores text,
    ramCapacity text,
    gpuType text,
    weight text,
    internalStorage text,
    externalStorage text,
    yearOfRelease text,
    price integer
);
    
'''

dbConnection = sqlite3.connect('MobileDeviceData.sqlite')
dbCursor = dbConnection.cursor()

dbCursor.executescript(createTableScript)

<sqlite3.Cursor at 0x1c738ce5dc0>

In [7]:
# Getting all the specifications of the mobile available on the mobile page.

def getting_spec_list(tables):
    """Flatten the specification tables of a product page into one dict.

    Parameters
    ----------
    tables : iterable
        Table elements (e.g. BeautifulSoup ``<table>`` tags). Each must
        support ``find`` and ``find_all``.

    Returns
    -------
    dict
        Maps the stripped text of a row's first cell (the spec name) to the
        stripped text of its third cell (the spec value). When a name occurs
        more than once, the last occurrence wins.
    """
    spec_list = {}
    for table in tables:
        table_body = table.find('tbody')
        if table_body is None:
            # Markup without an explicit <tbody>: read the rows straight
            # from the table instead of failing on None.
            table_body = table
        for row in table_body.find_all('tr'):
            cols = [cell.text.strip() for cell in row.find_all('td')]
            # The value is taken from the third cell (the second is
            # presumably a separator column -- verify against the page), so
            # rows with fewer cells, such as headers, carry no spec.
            if len(cols) < 3:
                continue
            spec_list[cols[0]] = cols[2]
    return spec_list

In [8]:
# Getting the data of the specifications we are interested in.

def get_spec_data(specs, fields=None):
    """Pick the wanted specification values out of a spec dictionary.

    Parameters
    ----------
    specs : dict
        Mapping of specification name to value for one phone.
    fields : iterable of str, optional
        Specification names to extract, in output order. Defaults to the
        module-level ``specs_interested`` list.

    Returns
    -------
    list
        One entry per requested field, in the same order. Fields missing
        from ``specs`` are filled with the placeholder string ``'NA'``.
    """
    if fields is None:
        fields = specs_interested
    return [specs.get(field, 'NA') for field in fields]

In [9]:
# Getting price along with the other specifications.

def _parse_price(price_text):
    """Return the first number in ``price_text`` as an int, or None if there is none."""
    # Take the whole run of digits and commas so that prices grouped with
    # several separators (e.g. "1,29,999") are not cut off after the first
    # group.
    match = re.search(r'\d[\d,]*', price_text)
    if match is None:
        return None
    return int(match.group().replace(',', ''))


def get_all_data(soupObject, year):
    """Extract price and specifications from a product page and store them.

    Nothing is stored (and None is returned) when the page has no
    ``Block-price`` section, no parsable price, or no ``Basic-info-tab``
    section.

    Parameters
    ----------
    soupObject : bs4.BeautifulSoup
        Parsed product page.
    year : int
        Release year to record for the phone.

    Returns
    -------
    None
        The result is written to the ``MobileData`` table through the
        module-level ``dbCursor`` and committed on ``dbConnection``.
    """
    price_block = soupObject.find('div', class_ = 'Block-price')
    if price_block is None:
        return
    price_tag = price_block.find('b')
    if price_tag is None:
        return
    price = _parse_price(price_tag.text.strip())
    if price is None:
        # A price section without any digits; there is nothing to record.
        return
    info_tab = soupObject.find('div', class_ = 'Basic-info-tab')
    if info_tab is None:
        return
    spec_list = getting_spec_list(tables=info_tab.find_all('table'))
    spec_data = get_spec_data(specs=spec_list)
    # NOTE(review): the MobileData table created in this notebook declares no
    # unique constraint, so 'or ignore' does not deduplicate rows.
    dbCursor.execute('''insert or ignore into MobileData (brandName,os,osVersion,screenSizeInInches,displayType,
                        resolutionInPixels,pixelDensity,rearCamera,frontCamera,batteryCapacity,fastCharging,cpuSpeed,
                        processorCores,ramCapacity,gpuType,weight,internalStorage,externalStorage,yearOfRelease,price) 
                        values (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)''',
                     # The first 18 spec values fill the 18 spec columns, in order.
                     tuple(spec_data[:18]) + (year, price))
    dbConnection.commit()

In [11]:
# Scraping only the mobile phones released in the current year or the two
# years before it. For each such phone the product page is stored, followed
# by every variant page linked from it.

# Computed once: the set of accepted years does not change per phone.
current_year = dt.datetime.now().year
years_considered = list(range(current_year, current_year - 3, -1))

for mobile in redirect_links:
    # Lines read from the links file carry a trailing newline; strip() also
    # copes with a final line that has none, and blank lines are skipped.
    product_url = mobile.strip()
    if not product_url:
        continue
    productSoup = get_soup_object(soup_link=product_url)
    status_blocks = productSoup.find_all('div', class_ = 'Block-status')
    # The release date is read from the second-to-last status block, so at
    # least two blocks must be present.
    if len(status_blocks) < 2:
        continue
    release_tag = status_blocks[-2].find('b')
    if release_tag is None:
        continue
    try:
        # The year is the last '-'-separated part of the status text.
        release_year = int(release_tag.text.strip().split('-')[-1])
    except ValueError:
        # The status text does not end in a year; skip this phone.
        continue
    if release_year not in years_considered:
        continue
    variants_link = []
    for variant_block in productSoup.find_all('div', class_ = 'Variant-size'):
        variant_anchor = variant_block.find('a')
        # Keep only variants that actually link somewhere.
        if variant_anchor is not None and variant_anchor.get('href'):
            variants_link.append(variant_anchor.get('href'))
    get_all_data(soupObject=productSoup, year=release_year)
    for variant in variants_link:
        soup_object = get_soup_object(soup_link=variant)
        get_all_data(soupObject=soup_object, year=release_year)