# Web Scrapping

In [1]:
'''
Q1. From all the iOS pages (ending with “_ios.html”) in the Data folder, extract 
    (1) number of customer ratings in the Current Version (let’s call it ios_current_ratings); and
    (2) number of customer ratings in All Versions (ios_all_ratings). 
'''
    
# For example, the extracted values should be: 4688, 106508 for the “2016-07- 21/00_00_pokemon_ios.html” file. 
# There are 2 values from iOS pages.

'\nQ1. From all the iOS pages (ending with “_ios.html”) in the Data folder, extract \n    (1) number of customer ratings in the Current Version (let’s call it ios_current_ratings); and\n    (2) number of customer ratings in All Versions (ios_all_ratings). \n'

In [2]:
import os
from bs4 import BeautifulSoup
import pandas as pd

data_folder = ["Data/2016-07-21", "Data/2016-07-22"]

file_names = []
current_ratings = []
all_ratings = []

for folder in data_folder:
    for file_name in os.listdir(folder):
        
        if file_name.endswith("_ios.html"):
            file_path = os.path.join(folder, file_name)
            
            with open(file_path, "r", encoding="utf-8") as file:
                soup = BeautifulSoup(file, "html.parser")
                
                current_version_div = soup.find("div", text="Current Version:")
                if current_version_div:
                    current_ratings_div = current_version_div.find_next("span", class_="rating-count")
                    ios_current_value = int(current_ratings_div.text.split()[0].replace(",", "")) if current_ratings_div else None
                else:
                    ios_current_value = None
                
                all_versions_div = soup.find("div", text="All Versions:")
                if all_versions_div:
                    all_ratings_div = all_versions_div.find_next("span", class_="rating-count")
                    ios_all_value = int(all_ratings_div.text.split()[0].replace(",", "")) if all_ratings_div else None
                else:
                    ios_all_value = None
                
                file_names.append(folder + file_name)
                current_ratings.append(ios_current_value)
                all_ratings.append(ios_all_value)

data = pd.DataFrame({
    "File Name": file_names,
    "iOS Current Ratings": current_ratings,
    "iOS All Ratings": all_ratings
})

print(data)


  current_version_div = soup.find("div", text="Current Version:")
  all_versions_div = soup.find("div", text="All Versions:")


                                 File Name  iOS Current Ratings  \
0    Data/2016-07-2100_00_pokemon_ios.html                 4688   
1    Data/2016-07-2100_10_pokemon_ios.html                 4688   
2    Data/2016-07-2100_20_pokemon_ios.html                 4688   
3    Data/2016-07-2100_30_pokemon_ios.html                 4688   
4    Data/2016-07-2100_40_pokemon_ios.html                 4688   
..                                     ...                  ...   
283  Data/2016-07-2223_10_pokemon_ios.html                 9686   
284  Data/2016-07-2223_20_pokemon_ios.html                10132   
285  Data/2016-07-2223_30_pokemon_ios.html                10132   
286  Data/2016-07-2223_40_pokemon_ios.html                10132   
287  Data/2016-07-2223_50_pokemon_ios.html                10132   

     iOS All Ratings  
0             106508  
1             106508  
2             106508  
3             106508  
4             106508  
..               ...  
283           111547  
284        

In [3]:
'''
Q2. From all the Android pages (ending with “_android.html”) in the Data folder, extract 
    (1) average rating (in the scale between 1.0 and 5.0) (android_avg_rating);and
    (2) number of total ratings (android_total_ratings).
'''
    
# For example, the extracted values should be: 3.9, 1281802 for the “2016-07- 21/00_00_pokemon_android.html” file. 
# There are 2 values from Android pages.

'\nQ2. From all the Android pages (ending with “_android.html”) in the Data folder, extract \n    (1) average rating (in the scale between 1.0 and 5.0) (android_avg_rating);and\n    (2) number of total ratings (android_total_ratings).\n'

In [4]:
import os
from bs4 import BeautifulSoup
import pandas as pd

data_folder = ["Data/2016-07-21", "Data/2016-07-22"]

file_names = []
android_avg_ratings = []
android_total_ratings = []

for folder in data_folder:
    for file_name in os.listdir(folder):
        if file_name.endswith("_android.html"):
            file_path = os.path.join(folder, file_name)
            
            # parse the HTML file
            with open(file_path, "r", encoding="utf-8") as file:
                
                soup = BeautifulSoup(file, "html.parser")
                
                # android_avg_rating
                avg_rating_div = soup.find("div", class_="score")
                android_avg_value = float(avg_rating_div.text.strip())
                
                # android_total_ratings
                total_ratings_div = soup.find("span", class_="reviews-num")
                android_total_value = int(total_ratings_div.text.replace(",", "").strip())
                
                # results
                file_names.append(folder + "/" + file_name)
                android_avg_ratings.append(android_avg_value)
                android_total_ratings.append(android_total_value)

data = pd.DataFrame({
    "File Name": file_names,
    "Android Avg Rating": android_avg_ratings,
    "Android Total Ratings": android_total_ratings
})

print(data)


                                      File Name  Android Avg Rating  \
0    Data/2016-07-21/00_00_pokemon_android.html                 3.9   
1    Data/2016-07-21/00_10_pokemon_android.html                 3.9   
2    Data/2016-07-21/00_20_pokemon_android.html                 3.9   
3    Data/2016-07-21/00_30_pokemon_android.html                 3.9   
4    Data/2016-07-21/00_40_pokemon_android.html                 3.9   
..                                          ...                 ...   
283  Data/2016-07-22/23_10_pokemon_android.html                 3.9   
284  Data/2016-07-22/23_20_pokemon_android.html                 3.9   
285  Data/2016-07-22/23_30_pokemon_android.html                 3.9   
286  Data/2016-07-22/23_40_pokemon_android.html                 3.9   
287  Data/2016-07-22/23_50_pokemon_android.html                 3.9   

     Android Total Ratings  
0                  1281802  
1                  1281802  
2                  1281802  
3                  1281802  
4 

# Data Organization

In [5]:
'''
Q3. Using the extracted values from the previous step, create a Python dictionary, 
    where the key is datetime object and 
    the value is a dictionary with extracted values from iOS and Android HTML files. 

'''
    
# For example, for the case of “2016- 07-21-00_00_pokemon_android.html” file and
# “2016-07- 21/00_00_pokemon_ios.html” file, 
# the key should be 
# datetime(2016, 7, 21, 0, 0, 0)
# and the value should be: 
# {‘ios_current_ratings’: 4688, ‘ios_all_ratings’: 106508, ‘android_avg_rating’: 3.9, ‘android_total_ratings’: 1281802}

'\nQ3. Using the extracted values from the previous step, create a Python dictionary, \n    where the key is datetime object and \n    the value is a dictionary with extracted values from iOS and Android HTML files. \n\n'

In [6]:
from datetime import datetime
import os
from bs4 import BeautifulSoup
import pandas as pd

data_folder = ["Data/2016-07-21", "Data/2016-07-22"]
data_dict = {}

# iOS HTML file
def parse_ios_html(file_path):
    with open(file_path, "r", encoding="utf-8") as file:
        soup = BeautifulSoup(file, "html.parser")
        
        # ios_current_ratings
        current_version_div = soup.find("div", text="Current Version:")
        current_ratings_div = current_version_div.find_next("span", class_="rating-count") if current_version_div else None
        ios_current_value = int(current_ratings_div.text.split()[0].replace(",", "")) if current_ratings_div else None
        
        # ios_all_ratings
        all_versions_div = soup.find("div", text="All Versions:")
        all_ratings_div = all_versions_div.find_next("span", class_="rating-count") if all_versions_div else None
        ios_all_value = int(all_ratings_div.text.split()[0].replace(",", "")) if all_ratings_div else None
        
        return {"ios_current_ratings": ios_current_value, "ios_all_ratings": ios_all_value}

# Android HTML file
def parse_android_html(file_path):
    with open(file_path, "r", encoding="utf-8") as file:
        soup = BeautifulSoup(file, "html.parser")
        
        # android_avg_rating
        avg_rating_div = soup.find("div", class_="score")
        android_avg_value = float(avg_rating_div.text.strip()) if avg_rating_div else None
        
        # android_total_ratings
        total_ratings_div = soup.find("span", class_="reviews-num")
        android_total_value = int(total_ratings_div.text.replace(",", "").strip()) if total_ratings_div else None
        
        return {"android_current_rating": android_avg_value, "android_all_ratings": android_total_value}

# Main loop to process files
for folder in data_folder:
    folder_date = os.path.basename(folder)
    for file_name in os.listdir(folder):
        if file_name.endswith("_ios.html") or file_name.endswith("_android.html"):
            time_str = file_name.split("_")[0:2]
            time_str = "-".join(time_str)
            
            date_time_str = f"{folder_date}-{time_str}"
            try:
                dt_obj = datetime.strptime(date_time_str, "%Y-%m-%d-%H-%M")
            except ValueError as e:
                print(f"Error parsing datetime from filename {file_name}: {e}")
                continue
            
            file_path = os.path.join(folder, file_name)
            
            if file_name.endswith("_ios.html"):
                if dt_obj not in data_dict:
                    data_dict[dt_obj] = {}
                data_dict[dt_obj].update(parse_ios_html(file_path))
            elif file_name.endswith("_android.html"):
                if dt_obj not in data_dict:
                    data_dict[dt_obj] = {}
                data_dict[dt_obj].update(parse_android_html(file_path))


print(data_dict)
data_dict[datetime(2016, 7, 21, 0, 0, 0)]

  current_version_div = soup.find("div", text="Current Version:")
  all_versions_div = soup.find("div", text="All Versions:")


{datetime.datetime(2016, 7, 21, 0, 0): {'android_current_rating': 3.9, 'android_all_ratings': 1281802, 'ios_current_ratings': 4688, 'ios_all_ratings': 106508}, datetime.datetime(2016, 7, 21, 0, 10): {'android_current_rating': 3.9, 'android_all_ratings': 1281802, 'ios_current_ratings': 4688, 'ios_all_ratings': 106508}, datetime.datetime(2016, 7, 21, 0, 20): {'android_current_rating': 3.9, 'android_all_ratings': 1281802, 'ios_current_ratings': 4688, 'ios_all_ratings': 106508}, datetime.datetime(2016, 7, 21, 0, 30): {'android_current_rating': 3.9, 'android_all_ratings': 1281802, 'ios_current_ratings': 4688, 'ios_all_ratings': 106508}, datetime.datetime(2016, 7, 21, 0, 40): {'android_current_rating': 3.9, 'android_all_ratings': 1281802, 'ios_current_ratings': 4688, 'ios_all_ratings': 106508}, datetime.datetime(2016, 7, 21, 0, 50): {'android_current_rating': 3.9, 'android_all_ratings': 1281802, 'ios_current_ratings': 4688, 'ios_all_ratings': 106508}, datetime.datetime(2016, 7, 21, 1, 0): {'

{'android_current_rating': 3.9,
 'android_all_ratings': 1281802,
 'ios_current_ratings': 4688,
 'ios_all_ratings': 106508}

In [7]:
'''
Q4. Convert the dictionary into a Pandas dataframe, pokemon_db, 
    where the index is datetime and columns are names of the extracted iOS/Android values. 

'''

'\nQ4. Convert the dictionary into a Pandas dataframe, pokemon_db, \n    where the index is datetime and columns are names of the extracted iOS/Android values. \n\n'

In [8]:

data_df = pd.DataFrame.from_dict(data_dict, orient="index")
data_df.reset_index(inplace=True)
data_df.rename(columns={"index": "datetime"}, inplace=True)
data_df.set_index("datetime", inplace=True)
data_df

Unnamed: 0_level_0,android_current_rating,android_all_ratings,ios_current_ratings,ios_all_ratings
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016-07-21 00:00:00,3.9,1281802,4688,106508
2016-07-21 00:10:00,3.9,1281802,4688,106508
2016-07-21 00:20:00,3.9,1281802,4688,106508
2016-07-21 00:30:00,3.9,1281802,4688,106508
2016-07-21 00:40:00,3.9,1281802,4688,106508
...,...,...,...,...
2016-07-22 23:10:00,3.9,1424989,9686,111547
2016-07-22 23:20:00,3.9,1424989,10132,111140
2016-07-22 23:30:00,3.9,1424989,10132,111547
2016-07-22 23:40:00,3.9,1424989,10132,111547


In [9]:
'''
Q5. Save the dataframe into two formats (CSV and Excel). The file names should be HW9.csv and HW9.xlsx.

'''

'\nQ5. Save the dataframe into two formats (CSV and Excel). The file names should be HW9.csv and HW9.xlsx.\n\n'

In [10]:
# CSV
data_df.to_csv("HW9.csv", index=True) 
# Excel
data_df.to_excel("HW9.xlsx", index=True)
