# Assignment: Data Collection and Exploratory Data Analysis

This notebook has 2 major parts:
1. Data Collection
2. Data Analysis

<hr>

# Part 1: Data Collection
- Data Scraped: Popular Mobile Phones 📱
- Scraped from: https://www.mysmartprice.com

## To Do Steps:
1. Downloading the required number of pages as HTML files using Selenium

2. Parsing the HTML files using BeautifulSoup

In [None]:
!apt-get update # to update ubuntu to correctly run apt install
!apt install chromium-chromedriver
!cp /usr/lib/chromium-browser/chromedriver /usr/bin

import sys
sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')

In [None]:
pip install selenium

In [None]:
import selenium
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException
from bs4 import BeautifulSoup
import pandas as pd

In [None]:
# Function to setup the web driver for selenium

def set_driver():
    """Create the Chrome WebDriver used for scraping.

    Chrome is launched with the ``--no-sandbox``, ``--disable-dev-shm-usage``
    and ``--headless`` command-line flags.

    Returns:
        The ``webdriver.Chrome`` instance built from those options.
    """
    launch_flags = ('--no-sandbox', '--disable-dev-shm-usage', '--headless')

    opts = Options()
    for flag in launch_flags:
        opts.add_argument(flag)

    return webdriver.Chrome(options=opts)

## Getting all the phone pages

In [None]:
def get_pages(driver):
    """Collect listing pages of the mobile price list as parsed HTML.

    The user is prompted for the number of pages. The listing URL is loaded,
    its source is parsed, and the pagination links are then clicked through
    JavaScript to move on. The first three snapshots are always taken (with
    the pagination links at index 2 and 5 clicked in between); the loop then
    takes ``n_pages - 3`` more snapshots, clicking the link at index 6 after
    each one. Asking for fewer than three pages therefore still returns three.

    Args:
        driver: Selenium WebDriver used to load and navigate the site.

    Returns:
        list: One ``BeautifulSoup`` object per captured page source.

    Raises:
        ValueError: If the typed page count is not an integer.
    """
    url = "https://www.mysmartprice.com/mobile/pricelist/mobile-price-list-in-india.html"
    pager_css = 'div[class*="pgntn js-prdct-pgntn"]'
    link_css = 'a[class*="pgntn__item js-pgntn__item"]'
    click_js = "arguments[0].click();"
    ignored_exceptions = (NoSuchElementException, StaleElementReferenceException,)

    pages = []
    n_pages = int(input("how many pages to scrape: "))

    # Page 1: the landing page of the price list.
    driver.get(url)
    pages.append(BeautifulSoup(driver.page_source, 'html.parser'))

    # Wait for the pagination bar, then look up its links.
    # `find_elements_by_css_selector` was removed in Selenium 4, so the
    # `find_elements(By.CSS_SELECTOR, ...)` form used below is used here too.
    pager = WebDriverWait(driver, timeout=20).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, pager_css)))
    elements = pager.find_elements(By.CSS_SELECTOR, link_css)
    driver.execute_script(click_js, elements[2])
    # NOTE(review): the source is read right after the click with no wait for
    # the new listing to load, so this snapshot may still show the previous
    # page - confirm against the live site.
    pages.append(BeautifulSoup(driver.page_source, 'html.parser'))

    WebDriverWait(driver, timeout=20).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, pager_css)))
    elements = driver.find_elements(By.CSS_SELECTOR, link_css)
    WebDriverWait(driver, timeout=20).until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, link_css)))
    driver.execute_script(click_js, elements[5])
    pages.append(BeautifulSoup(driver.page_source, 'html.parser'))

    print('page 1..')
    print('page 2..')
    print('page 3..')
    for i in range(n_pages - 3):
        # implicitly_wait only sets the element-lookup timeout; it does not pause.
        driver.implicitly_wait(10)
        driver.execute_script("window.scrollTo(0, 10000)")
        pages.append(BeautifulSoup(driver.page_source, 'html.parser'))
        driver.implicitly_wait(100)
        WebDriverWait(driver, 30, ignored_exceptions=ignored_exceptions).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, pager_css)))
        driver.implicitly_wait(100)
        elements = driver.find_elements(By.CSS_SELECTOR, link_css)

        try:
            driver.implicitly_wait(10)
            driver.execute_script(click_js, elements[6])
        except Exception:
            # Best-effort retry: look the links up again (they may have gone
            # stale or not been rendered yet) and click once more.
            # `Exception` rather than a bare `except` so that Ctrl-C and
            # interpreter exit are not swallowed.
            elements = driver.find_elements(By.CSS_SELECTOR, link_css)
            driver.execute_script(click_js, elements[6])
        print(f'page {i+4}..')

    return pages

### Required Attributes:
1. Company name
2. Model name
3. Price
4. Processor
5. Camera
6. Screen
7. RAM
8. ROM
9. OS

In [None]:
def get_df(pages):
    """Assemble the scraped phone attributes into one DataFrame.

    Each attribute is extracted from ``pages`` by its dedicated getter and
    becomes one column. The columns, in order, are: Company, Model, CPU, RAM,
    ROM, Camera, Battery, Display, Price.

    Args:
        pages: Parsed listing pages, as returned by ``get_pages``.

    Returns:
        pandas.DataFrame: One row per phone found on the pages.
    """
    model_col, company_col = get_models_companies(pages)
    price_col = get_prices(pages)
    cpu_col = get_processors(pages)
    camera_col = get_cameras(pages)
    display_col = get_screens(pages)
    ram_col = get_rams(pages)
    rom_col = get_roms(pages)
    battery_col = get_batteries(pages)

    return pd.DataFrame({
        'Company': company_col,
        'Model': model_col,
        'CPU': cpu_col,
        'RAM': ram_col,
        'ROM': rom_col,
        'Camera': camera_col,
        'Battery': battery_col,
        'Display': display_col,
        'Price': price_col,
    })

### COMPANY AND MODEL NAMES

In [None]:
def get_models_companies(pages):
    """Extract model names and company names from the listing pages.

    The model is the stripped text of each ``a.prdct-item__name`` element and
    the company is the first space-separated word of that text. A company
    that comes out as ``"I"`` is replaced by ``"I Kall"``.

    Args:
        pages: Parsed listing pages exposing ``find_all``.

    Returns:
        tuple: ``(models, companies)``, two lists of equal length.
    """
    model_names = [
        anchor.text.strip()
        for page in pages
        for anchor in page.find_all('a', class_='prdct-item__name')
    ]

    company_names = []
    for name in model_names:
        first_word = name.split(' ')[0]
        # "I Kall" is a two-word brand, so its first word alone is just "I".
        company_names.append("I Kall" if first_word == "I" else first_word)

    return model_names, company_names

### PRICES

In [None]:
def get_prices(pages):
    """Extract the phone prices from the listing pages as integers.

    The text of each ``span.prdct-item__prc-val`` element is stripped, its
    thousands separators (commas) are removed, and the rest is converted
    with ``int``.

    Args:
        pages: Parsed listing pages exposing ``find_all``.

    Returns:
        list[int]: One price per price element, in page order.
    """
    return [
        int(price_tag.text.strip().replace(',', ''))
        for page in pages
        for price_tag in page.find_all('span', class_='prdct-item__prc-val')
    ]

### PROCESSORS

In [None]:
def get_processors(pages):
    """Extract the processor description of every phone on the pages.

    For each ``div.prdct-item__spcftn-wrpr`` wrapper, only its first child is
    searched for the CPU spec item; the stripped text of the first match is
    used, or ``'NA'`` when there is none.

    Args:
        pages: Parsed listing pages exposing ``find_all``.

    Returns:
        list[str]: One processor string (or ``'NA'``) per phone.
    """
    cpu_class = 'prdct-item__spcftn kyspc__item--cpu'
    cpu_names = []
    for page in pages:
        for wrapper in page.find_all('div', class_='prdct-item__spcftn-wrpr'):
            # The search is limited to the wrapper's first child node.
            first_child = next(wrapper.children)
            matches = first_child.find_all('li', class_=cpu_class)
            cpu_names.append(matches[0].text.strip() if matches else 'NA')
    return cpu_names

### Cameras

In [None]:
def get_cameras(pages):
    """Extract the camera description of every phone on the pages.

    Each ``div.prdct-item__spcftn-wrpr`` wrapper is searched for the camera
    spec item; the stripped text of the first match is used, or ``'NA'`` when
    the wrapper has none.

    Args:
        pages: Parsed listing pages exposing ``find_all``.

    Returns:
        list[str]: One camera string (or ``'NA'``) per phone.
    """
    camera_class = 'prdct-item__spcftn kyspc__item--cmra'
    camera_specs = []
    for page in pages:
        for wrapper in page.find_all('div', class_='prdct-item__spcftn-wrpr'):
            matches = wrapper.find_all('li', class_=camera_class)
            camera_specs.append(matches[0].text.strip() if matches else 'NA')
    return camera_specs

### Screens

In [None]:
def get_screens(pages):
    """Extract the display description of every phone on the pages.

    Each ``div.prdct-item__spcftn-wrpr`` wrapper is searched for the display
    spec item. The text of the first match is cut just before the word
    ``'Screen'`` and stripped; if that word is absent the whole stripped text
    is kept. Phones without a display item get ``'NA'``.

    Args:
        pages: Parsed listing pages exposing ``find_all``.

    Returns:
        list[str]: One display string (or ``'NA'``) per phone.
    """
    screens = []
    for page in pages:
        for phone in page.find_all('div', class_='prdct-item__spcftn-wrpr'):
            items = phone.find_all('li', class_='prdct-item__spcftn kyspc__item--aspct')
            if not items:
                # No slicing here: the cut position belongs to a found item
                # only, never to the 'NA' placeholder.
                screens.append('NA')
                continue
            text = items[0].text.strip()
            value, marker, _ = text.partition('Screen')
            # Only trim when the label is actually present in the text.
            screens.append(value.strip() if marker else text)

    return screens

### RAMS

In [None]:
def get_rams(pages):
    """Extract the RAM size of every phone on the pages.

    Each ``div.prdct-item__spcftn-wrpr`` wrapper is searched for the RAM spec
    item. The text of the first match is cut just before the word ``'RAM'``
    and stripped; if that word is absent the whole stripped text is kept.
    Phones without a RAM item get ``'NA'``.

    Args:
        pages: Parsed listing pages exposing ``find_all``.

    Returns:
        list[str]: One RAM string such as ``'8 GB'`` (or ``'NA'``) per phone.
    """
    RAMS = []
    for page in pages:
        for phone in page.find_all('div', class_='prdct-item__spcftn-wrpr'):
            items = phone.find_all('li', class_='prdct-item__spcftn kyspc__item--ram')
            if not items:
                # No slicing here: the cut position belongs to a found item
                # only, never to the 'NA' placeholder.
                RAMS.append('NA')
                continue
            text = items[0].text.strip()
            value, marker, _ = text.partition('RAM')
            # Only trim when the label is actually present in the text.
            RAMS.append(value.strip() if marker else text)

    return RAMS

### INTERNAL STORAGE (ROM)

In [None]:
def get_roms(pages):
    """Extract the internal storage size of every phone on the pages.

    Each ``div.prdct-item__spcftn-wrpr`` wrapper is searched for the storage
    spec item. The text of the first match is cut just before the word
    ``'internal'`` and stripped; if that word is absent the whole stripped
    text is kept. Phones without a storage item get ``'NA'``.

    Args:
        pages: Parsed listing pages exposing ``find_all``.

    Returns:
        list[str]: One storage string such as ``'128 GB'`` (or ``'NA'``) per
        phone.
    """
    ROMS = []
    for page in pages:
        for phone in page.find_all('div', class_='prdct-item__spcftn-wrpr'):
            items = phone.find_all('li', class_='prdct-item__spcftn kyspc__item--strge')
            if not items:
                # No slicing here: the cut position belongs to a found item
                # only, never to the 'NA' placeholder.
                ROMS.append('NA')
                continue
            text = items[0].text.strip()
            value, marker, _ = text.partition('internal')
            # Only trim when the label is actually present in the text.
            ROMS.append(value.strip() if marker else text)
    return ROMS

### BATTERIES

In [None]:
def get_batteries(pages):
    """Extract the battery capacity of every phone on the pages.

    Each ``div.prdct-item__spcftn-wrpr`` wrapper is searched for the battery
    spec item. The part of the first match's text before ``'mAh'`` (or the
    whole text when that unit is absent) is stripped, cleared of commas and
    converted with ``int``. Phones without a battery item, or whose text is
    not a whole number, get ``'NA'``.

    Args:
        pages: Parsed listing pages exposing ``find_all``.

    Returns:
        list: One entry per phone, either an ``int`` capacity or ``'NA'``.
    """
    batteries = []
    for page in pages:
        for phone in page.find_all('div', class_='prdct-item__spcftn-wrpr'):
            items = phone.find_all('li', class_='prdct-item__spcftn kyspc__item--bttry')
            battery = 'NA'
            if items:
                # partition keeps every digit whether or not a space precedes
                # the unit, and yields the whole text when the unit is missing.
                number_part = items[0].text.strip().partition('mAh')[0]
                try:
                    battery = int(number_part.strip().replace(',', ''))
                except ValueError:
                    # Non-numeric capacity text: record it as missing rather
                    # than aborting the whole scrape.
                    battery = 'NA'
            batteries.append(battery)
    return batteries

### OPERATING SYSTEMS

In [None]:
def get_oss(pages):
    """Extract the operating system of every phone on the pages.

    Each ``div.prdct-item__spcftn-wrpr`` wrapper is searched for the OS spec
    item; the stripped text of the first match is used, or ``'NA'`` when the
    wrapper has none.

    Args:
        pages: Parsed listing pages exposing ``find_all``.

    Returns:
        list[str]: One OS string (or ``'NA'``) per phone.
    """
    os_class = 'prdct-item__spcftn kyspc__item--os'
    os_names = []
    for page in pages:
        for wrapper in page.find_all('div', class_='prdct-item__spcftn-wrpr'):
            matches = wrapper.find_all('li', class_=os_class)
            os_names.append(matches[0].text.strip() if matches else 'NA')
    return os_names

In [None]:
def save_csv(df, path='phones.csv'):
    """Write the DataFrame to a CSV file without the index column.

    Args:
        df: DataFrame to save.
        path: Destination file. Defaults to ``'phones.csv'``, the file the
            analysis part of the notebook reads back.
    """
    df.to_csv(path, index=False)

### Main Code:

In [None]:
print('Setting up the driver...')
driver = set_driver()
print('Done!')

In [None]:
print('Fetching all the pages...') # I have initially fetched 30 pages, so the EDA is based on that
try:
    pages = get_pages(driver)
finally:
    # The browser is not used by any later cell, so shut it down here (even
    # if fetching fails) instead of leaving a headless Chrome process running.
    # Re-run the driver setup cell before fetching again.
    driver.quit()
print('Fetched!')

In [None]:
print('Creating the dataframe...')
df = get_df(pages)
print('Created!')

In [None]:
print('Saving as CSV...')
save_csv(df)
print('Saved!')

<hr>

# Part 2: Exploratory Data Analysis

## Loading the data

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import copy

In [None]:
data = pd.read_csv('phones.csv')

In [None]:
data.columns

In [None]:
data.head()

<hr>

## Analyzing the company column
- #### Checking how many companies are present
- #### Drawing a pie chart for the top 5 companies

In [None]:
companies = data['Company'].unique()

In [None]:
company_data = data['Company'].value_counts()

In [None]:
company_data

In [None]:
len(companies)

In [None]:
company_top_5 = company_data[:5]

In [None]:
top_5 = company_top_5.index

In [None]:
top_5

In [None]:
colors = sns.color_palette('pastel')[0:5]
plt.pie(company_top_5, labels = top_5, colors = colors, autopct='%.0f%%')
plt.show()

In [None]:
data.describe()

## Summary:
##### - Top 5 Smartphones are:
    1. Samsung Galaxy F62
    2. Xiaomi Redmi 10 Prime
    3. OnePlus Nord 2
    4. Realme 8i
    5. Xiaomi Redmi Note 10S
    

##### - Top 5 companies:
  1. Samsung: 211 times
  2. Xiaomi: 129 times
  3. Nokia: 97 times
  4. OPPO: 96 times
  5. Realme: 89 times
  
    
##### - Most popular price point seems to be around 20k    
##### - There are 77 unique companies
##### - Surprisingly Honor is at the bottom zone, probably because of the google services ban
##### - Nokia still holds a share of the most popular smartphones

<hr>

## Analyzing the price column

In [None]:
prices = copy.deepcopy(data['Price'])

In [None]:
prices.describe()

In [None]:
plt.xlabel('Price')
plt.ylabel('Count')
plt.title('Price Distribution of Famous Smartphones')
plt.grid(True)
plt.hist(prices)
plt.show()

In [None]:
top_5_prices = data['Price'].value_counts().head()

In [None]:
labels = list(top_5_prices.index)

In [None]:
# Append the currency unit to every price label, in place.
for i, label in enumerate(labels):
    labels[i] = f'{label!s} Rs'

In [None]:
plt.title('Top 5 Prices')
plt.pie(top_5_prices, labels = labels, autopct='%.0f%%')

### Summary:

- The majority of the smartphones lie within the range of 0-12500 Rs, which can also be considered the budget range
- 8999 Rs price point seems to be the most popular among consumers

<hr>

## Analyzing the RAM column
- #### Checking what the most frequently used RAM sizes are.
- #### Checking how much RAM the top 3 companies typically provide in their phones.

In [None]:
RAMS = copy.deepcopy(data['RAM'])

In [None]:
# Convert string entries given in MB to a float number of GB (factor 0.001).
for i, RAM in enumerate(RAMS):
    if type(RAM) is str and 'MB' in RAM:
        RAMS[i] = float(RAM.replace('MB', '').strip()) * 0.001

In [None]:
# Convert string entries given in GB to a plain float.
for i, RAM in enumerate(RAMS):
    if type(RAM) is str and 'GB' in RAM:
        RAMS[i] = float(RAM.replace('GB', '').strip())

In [None]:
RAMS

In [None]:
RAM_data = RAMS.value_counts()

In [None]:
print(RAM_data)

In [None]:
RAM_top_5 = RAM_data[:5]

In [None]:
labels_RAM_pie = RAM_top_5.index

In [None]:
labels_RAM_pie = list(labels_RAM_pie)

In [None]:
# Append the unit to every RAM label, in place.
for i, label in enumerate(labels_RAM_pie):
    labels_RAM_pie[i] = f'{label!s} GB'

In [None]:
colors = sns.color_palette('pastel')[0:5]
plt.pie(RAM_top_5, labels = labels_RAM_pie, colors=colors, autopct='%.0f%%')
plt.show()

In [None]:
data['Company']

In [None]:
data.loc[lambda data: data['Company'] == 'Samsung']['RAM'].value_counts()

### Summary:
##### - Most used RAM variants seem to be:
   1. --> 8 GB
   2. --> 6 GB
   3. --> 4 GB

<hr>

### RAM statistics for each of the top 3 companies:

#### 1. Samsung

In [None]:
RAM_samsung_top_5 = data.loc[lambda data: data['Company'] == 'Samsung']['RAM'].value_counts()[:5]

In [None]:
RAM_samsung_top_5

In [None]:
labels = RAM_samsung_top_5.index

In [None]:
colors = sns.color_palette('pastel')[0:5]
plt.pie(RAM_samsung_top_5, labels = labels, colors=colors, autopct='%.0f%%')
plt.show()

#### - Samsung's top 3 RAM variants are the same as the overall top 3 RAM variants:
- (1) 8 GB
- (2) 6 GB
- (3) 4 GB

#### 2. Xiaomi

In [None]:
RAM_Xiaomi_top_5 = data.loc[lambda data: data['Company'] == 'Xiaomi']['RAM'].value_counts()[:5]

In [None]:
RAM_Xiaomi_top_5

In [None]:
labels = RAM_Xiaomi_top_5.index

In [None]:
colors = sns.color_palette('pastel')[0:5]
plt.pie(RAM_Xiaomi_top_5, labels = labels, colors=colors, autopct='%.0f%%')
plt.show()

#### - Xiaomi's top 3 RAM variants are: 
- (1) 6 GB
- (2) 4 GB
- (3) 3 GB

#### 3. Nokia

In [None]:
RAM_nokia_top_5 = data.loc[lambda data: data['Company'] == 'Nokia']['RAM'].value_counts()[:5]

In [None]:
RAM_nokia_top_5

In [None]:
labels = RAM_nokia_top_5.index

In [None]:
colors = sns.color_palette('pastel')[0:5]
plt.pie(RAM_nokia_top_5, labels = labels, colors=colors, autopct='%.0f%%')
plt.show()

#### - Nokia's top 3 RAM variants are: 
- (1) 6 GB
- (2) 4 MB
- (3) 4 GB

### Interesting Points:
- Nokia still seems to be making a lot of non-Android phones, since 22% of the phones have 4 MB RAM and 18% of the phones have 512 MB

<hr>

## Analyzing the battery column
- #### Checking what the most frequently used battery capacities are.
- #### Checking how much battery capacity the top 3 companies typically provide in their phones.

In [None]:
batteries = copy.deepcopy(data['Battery'])

In [None]:
batteries.value_counts()

In [None]:
top_5_batteries = batteries.value_counts().head()

In [None]:
labels = list(top_5_batteries.index)

In [None]:
top_5_batteries = list(top_5_batteries)

In [None]:
# Show each capacity as a whole number followed by its unit, in place.
for i, label in enumerate(labels):
    labels[i] = f'{int(label)} mAh'

In [None]:
plt.title('Top 5 Batteries')
plt.pie(top_5_batteries, labels = labels, autopct='%.0f%%')

### Top 5 batteries seem to be:
1. 5000 mAh
2. 4000 mAh
3. 3000 mAh
4. 6000 mAh
5. 4500 mAh

<hr>

### Analyzing for each of the top 3 companies now

#### 1. Samsung

In [None]:
battery_samsung_top_5 = data.loc[lambda data: data['Company'] == 'Samsung']['Battery'].value_counts().head()

In [None]:
battery_samsung_top_5

In [None]:
labels = battery_samsung_top_5.index

In [None]:
labels = list(labels)

In [None]:
# Show each Samsung capacity as a whole number followed by its unit, in place.
for i, label in enumerate(labels):
    labels[i] = f'{int(label)} mAh'

In [None]:
colors = sns.color_palette('pastel')[0:5]
plt.pie(battery_samsung_top_5, labels = labels, colors=colors, autopct='%.0f%%')
plt.show()

#### 2. Xiaomi

In [None]:
battery_xiaomi_top_5 = data.loc[lambda data: data['Company'] == 'Xiaomi']['Battery'].value_counts().head()

In [None]:
battery_xiaomi_top_5

In [None]:
labels = battery_xiaomi_top_5.index

In [None]:
labels = list(labels)

In [None]:
# Show each Xiaomi capacity as a whole number followed by its unit, in place.
for i, label in enumerate(labels):
    labels[i] = f'{int(label)} mAh'

In [None]:
colors = sns.color_palette('pastel')[0:5]
plt.pie(battery_xiaomi_top_5, labels = labels, colors=colors, autopct='%.0f%%')
plt.show()

#### 3. Nokia

In [None]:
battery_nokia_top_5 = data.loc[lambda data: data['Company'] == 'Nokia']['Battery'].value_counts().head()

In [None]:
battery_nokia_top_5

In [None]:
labels = battery_nokia_top_5.index

In [None]:
labels = list(labels)

In [None]:
# Show each Nokia capacity as a whole number followed by its unit, in place.
for i, label in enumerate(labels):
    labels[i] = f'{int(label)} mAh'

In [None]:
colors = sns.color_palette('pastel')[0:5]
plt.pie(battery_nokia_top_5, labels = labels, colors=colors, autopct='%.0f%%')
plt.show()

## Summary:

##### - The top 5 RAM sizes used among the most famous smartphones are:
1. -> 4 GB
2. -> 6 GB
3. -> 8 GB
4. -> 3 GB
5. -> 2 GB

<hr>

### Notes for future:
1. Using numpy arrays might be faster instead of Python lists
2. Making selenium faster
3. Implementing a delay between each driver.get(url)

### Questions:
1. How many companies are there? 
2. Which top 5 companies occur the most frequently?
3. Which phone company offers the best bang for the buck?