# Pokémon Analysis Final Project

#### Author: Saransh Rakshak | Course: DSA-8640 | Due: Dec. 11, 2024

In [3]:
# import libraries
import os
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from datetime import datetime

In [4]:
# Root directory that contains one sub-folder of HTML files per day.
data_path = "pokemon_data/"

# Daily folders to process: 2016-07-21 through 2016-07-31 (inclusive).
data_folders = [f"2016-07-{day:02d}" for day in range(21, 32)]

## Part 1: Web Scraping

The first step is to extract various values from the raw HTML files. You can use BeautifulSoup or other Python modules.

# 🅐 
> From all the iOS pages (ending with “_ios.html”), extract (i) number of customer ratings in the Current Version (let’s call it *ios_current_ratings*); and (ii) number of customer ratings in All Versions (*ios_all_ratings*). For example, the extracted values should be: 4688, 106508 for the *“2016-07-21/00_00_pokemon_ios.html”* file. There are 2 values from iOS pages.

In [6]:
def _ios_rating_count(soup, label):
    """Return the rating count that follows the div whose text is `label`.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed iOS page.
    label : str
        Exact text of the label div, e.g. "Current Version:" or "All Versions:".

    Returns
    -------
    int or None
        The first whitespace-separated token of the next ``rating-count`` span
        with commas removed, or None when the label div or the span is missing.
    """
    # `string=` is the current name of the deprecated `text=` argument.
    label_div = soup.find("div", string=label)
    if label_div is None:
        return None
    count_span = label_div.find_next("span", class_="rating-count")
    if count_span is None:
        return None
    return int(count_span.text.split()[0].replace(",", ""))


file_names = []
current_ratings = []
all_ratings = []

for folder in data_folders:
    # os.listdir returns entries in arbitrary order; sort so the row order is reproducible.
    for file_name in sorted(os.listdir(os.path.join(data_path, folder))):
        if not file_name.endswith("_ios.html"):
            continue

        file_path = os.path.join(data_path, folder, file_name)
        with open(file_path, "r", encoding="utf-8") as file:
            soup = BeautifulSoup(file, "html.parser")

        file_names.append(folder + "/" + file_name)
        # Both counts fall back to None when the page lacks the section.
        current_ratings.append(_ios_rating_count(soup, "Current Version:"))
        all_ratings.append(_ios_rating_count(soup, "All Versions:"))

# One row per iOS page, indexed by "<folder>/<file name>".
ios_data = pd.DataFrame(data = {
    "ios_current_ratings" : current_ratings,
    "ios_all_ratings" : all_ratings
    }, index = file_names)

  curr_ratings_div = soup.find("div", text="Current Version:")
  all_ratings_div = soup.find("div", text="All Versions:").find_next("span", class_="rating-count")


In [7]:
# Display the scraped iOS ratings table (one row per "<folder>/<file name>").
ios_data

Unnamed: 0,ios_current_ratings,ios_all_ratings
2016-07-21/00_00_pokemon_ios.html,4688.0,106508
2016-07-21/00_10_pokemon_ios.html,4688.0,106508
2016-07-21/00_20_pokemon_ios.html,4688.0,106508
2016-07-21/00_30_pokemon_ios.html,4688.0,106508
2016-07-21/00_40_pokemon_ios.html,4688.0,106508
...,...,...
2016-07-31/23_10_pokemon_ios.html,17856.0,139213
2016-07-31/23_20_pokemon_ios.html,22193.0,143350
2016-07-31/23_30_pokemon_ios.html,22193.0,143350
2016-07-31/23_40_pokemon_ios.html,22193.0,143350


In [8]:
# Summarise row count, non-null counts and dtypes of the iOS table;
# ios_current_ratings is None for pages where no "Current Version:" count was found.
ios_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1584 entries, 2016-07-21/00_00_pokemon_ios.html to 2016-07-31/23_50_pokemon_ios.html
Data columns (total 2 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ios_current_ratings  1552 non-null   float64
 1   ios_all_ratings      1584 non-null   int64  
dtypes: float64(1), int64(1)
memory usage: 37.1+ KB


In [9]:
# Sanity check: the first iOS snapshot must match the values given in the task.
ios_test_expected = [4688, 106508]
ios_first_snapshot = ios_data.loc['2016-07-21/00_00_pokemon_ios.html']
print((ios_first_snapshot.values == ios_test_expected).all())

True


# 🅑
> From all the Android pages (ending with “_android.html”), extract (i) average rating (in the scale between 1.0 and 5.0) (android_avg_rating); (ii) number of total ratings (android_total_ratings); and (iii) number of ratings for 1-5 stars (android_ratings_1, android_ratings_2, …, android_ratings_5). For example, the extracted values should be: 3.9, 1281802, 199974, 71521, 117754, 165956, 726597 for the “2016-07-21/00_00_pokemon_android.html” file. There are 7 values from Android pages.

In [11]:
file_names = []
android_avg_ratings = []
android_total_ratings = []
# Keys are the CSS class suffixes used for the 1- to 5-star histogram rows.
android_star_ratings = {'one' : [], 'two' : [], 'three' : [], 'four' : [], 'five' : []}

for folder in data_folders:
    # os.listdir returns entries in arbitrary order; sort so the row order is reproducible.
    for file_name in sorted(os.listdir(os.path.join(data_path, folder))):
        if not file_name.endswith("_android.html"):
            continue

        file_path = os.path.join(data_path, folder, file_name)
        with open(file_path, "r", encoding="utf-8") as file:
            soup = BeautifulSoup(file, "html.parser")

        file_names.append(folder + "/" + file_name)

        # android_avg_rating
        avg_rating_div = soup.find("div", class_="score")
        android_avg_ratings.append(float(avg_rating_div.text.strip()))

        # android_total_ratings
        total_ratings_div = soup.find("span", class_="reviews-num")
        android_total_ratings.append(int(total_ratings_div.text.replace(",", "").strip()))

        # android_ratings_[1 through 5]
        for star_val, star_counts in android_star_ratings.items():
            star_container = soup.find("div", class_=f"rating-bar-container {star_val}")
            # Guard the container lookup so a missing histogram row reaches the
            # 0 fallback instead of raising AttributeError on None.find_next.
            star_rating_span = (
                star_container.find_next("span", class_="bar-number")
                if star_container is not None
                else None
            )
            if star_rating_span is not None:
                star_counts.append(int(star_rating_span.text.replace(",", "").strip()))
            else:
                star_counts.append(0)

# One row per Android page, indexed by "<folder>/<file name>".
android_data = pd.DataFrame(data = {
                                "android_avg_rating": android_avg_ratings,
                                "android_total_ratings": android_total_ratings,
                                "android_ratings_1": android_star_ratings['one'],
                                "android_ratings_2": android_star_ratings['two'],
                                "android_ratings_3": android_star_ratings['three'],
                                "android_ratings_4": android_star_ratings['four'],
                                "android_ratings_5": android_star_ratings['five']
                                }, index = file_names)

In [12]:
# Display the scraped Android ratings table (one row per "<folder>/<file name>").
android_data

Unnamed: 0,android_avg_rating,android_total_ratings,android_ratings_1,android_ratings_2,android_ratings_3,android_ratings_4,android_ratings_5
2016-07-21/00_00_pokemon_android.html,3.9,1281802,199974,71521,117754,165956,726597
2016-07-21/00_10_pokemon_android.html,3.9,1281802,199974,71521,117754,165956,726597
2016-07-21/00_20_pokemon_android.html,3.9,1281802,199974,71521,117754,165956,726597
2016-07-21/00_30_pokemon_android.html,3.9,1281802,199974,71521,117754,165956,726597
2016-07-21/00_40_pokemon_android.html,3.9,1281802,199974,71521,117754,165956,726597
...,...,...,...,...,...,...,...
2016-07-31/23_10_pokemon_android.html,3.9,1954991,302864,101244,173651,259919,1117313
2016-07-31/23_20_pokemon_android.html,3.9,1954991,302864,101244,173651,259919,1117313
2016-07-31/23_30_pokemon_android.html,3.9,1954991,302864,101244,173651,259919,1117313
2016-07-31/23_40_pokemon_android.html,3.9,1954991,302864,101244,173651,259919,1117313


In [13]:
# Summarise row count, non-null counts and dtypes of the Android table.
android_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1584 entries, 2016-07-21/00_00_pokemon_android.html to 2016-07-31/23_50_pokemon_android.html
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   android_avg_rating     1584 non-null   float64
 1   android_total_ratings  1584 non-null   int64  
 2   android_ratings_1      1584 non-null   int64  
 3   android_ratings_2      1584 non-null   int64  
 4   android_ratings_3      1584 non-null   int64  
 5   android_ratings_4      1584 non-null   int64  
 6   android_ratings_5      1584 non-null   int64  
dtypes: float64(1), int64(6)
memory usage: 99.0+ KB


In [14]:
# Sanity check: the first Android snapshot must match the values given in the task.
android_test_expected = [3.9, 1281802, 199974, 71521, 117754, 165956, 726597]
android_first_snapshot = android_data.loc['2016-07-21/00_00_pokemon_android.html']
print((android_first_snapshot.values == android_test_expected).all())

True


## Part 2: Data Organization

The next step is to organize the extracted values, so that we can do some data exploration. As we have time series data, we will organize the data by *datetime* (note that *datetime* is a Python data type).

# 🅐
> Using the extracted values from the previous step, create a Python dictionary, where the key is a *datetime* object and the value is a dictionary with the extracted values from the iOS and Android HTML files. For example, for the “*2016-07-21/00_00_pokemon_android.html*” file and the “*2016-07-21/00_00_pokemon_ios.html*” file, the key should be *datetime(2016, 7, 21, 0, 0, 0)* and the value should be: { *‘ios_current_ratings’* : *4688*, *‘ios_all_ratings’* : *106508*, *‘android_avg_rating’* : *3.9*, *‘android_total_ratings’* : *1281802*, *‘android_ratings_1’* : *199974*, *‘android_ratings_2’* : *71521*, *‘android_ratings_3’* : *117754*, *‘android_ratings_4’* : *165956*, *‘android_ratings_5’* : *726597* }

In [16]:
# iOS HTML file
def parse_ios_html(file_path):
    """Extract the two iOS rating counts from one saved page.

    Parameters
    ----------
    file_path : str
        Path to an HTML file; it is opened as UTF-8 text.

    Returns
    -------
    dict
        ``{"ios_current_ratings": int | None, "ios_all_ratings": int | None}``.
        A value is None when the corresponding label div ("Current Version:" /
        "All Versions:") or the ``rating-count`` span after it is not found.
    """
    with open(file_path, "r", encoding="utf-8") as file:
        soup = BeautifulSoup(file, "html.parser")

    # Output key -> exact text of the label div that precedes the count.
    section_labels = {
        "ios_current_ratings": "Current Version:",
        "ios_all_ratings": "All Versions:",
    }

    ratings = {}
    for column, label in section_labels.items():
        # `string=` is the current name of the deprecated `text=` argument.
        label_div = soup.find("div", string=label)
        # Guard both lookups so a missing section yields None rather than AttributeError.
        count_span = (
            label_div.find_next("span", class_="rating-count")
            if label_div is not None
            else None
        )
        if count_span is None:
            ratings[column] = None
        else:
            # Keep the first whitespace-separated token and drop thousands separators.
            ratings[column] = int(count_span.text.split()[0].replace(",", ""))

    return ratings

# Android HTML file
def parse_android_html(file_path):
    """Extract the seven Android rating values from one saved page.

    Parameters
    ----------
    file_path : str
        Path to an HTML file; it is opened as UTF-8 text.

    Returns
    -------
    dict
        Keys ``android_avg_rating`` (float), ``android_total_ratings`` (int) and
        ``android_ratings_1`` ... ``android_ratings_5`` (int). A star count is 0
        when its histogram row or the ``bar-number`` span is not found.

    Raises
    ------
    AttributeError
        If the ``score`` div or the ``reviews-num`` span is missing from the page.
    """
    with open(file_path, "r", encoding="utf-8") as file:
        soup = BeautifulSoup(file, "html.parser")

    # android_avg_rating
    avg_rating_div = soup.find("div", class_="score")
    # android_total_ratings
    total_ratings_span = soup.find("span", class_="reviews-num")

    parsed = {
        "android_avg_rating": float(avg_rating_div.text.strip()),
        "android_total_ratings": int(total_ratings_span.text.replace(",", "").strip()),
    }

    # android_ratings_[1 through 5]: the histogram rows use the word form as a CSS class.
    for star_number, star_val in enumerate(("one", "two", "three", "four", "five"), start=1):
        star_container = soup.find("div", class_=f"rating-bar-container {star_val}")
        # Guard the container lookup so a missing row reaches the 0 fallback
        # instead of raising AttributeError on None.find_next.
        star_rating_span = (
            star_container.find_next("span", class_="bar-number")
            if star_container is not None
            else None
        )
        if star_rating_span is not None:
            star_count = int(star_rating_span.text.replace(",", "").strip())
        else:
            star_count = 0
        parsed[f"android_ratings_{star_number}"] = star_count

    return parsed

In [17]:
# Maps each snapshot datetime to the merged iOS + Android values for that time.
datetime_dict = {}

# Main loop to process files
for folder in data_folders:
    folder_date = os.path.basename(folder)

    # os.listdir returns entries in arbitrary order. Sorting makes the dict
    # insertion order (and so the column order of any DataFrame built from it)
    # reproducible: "*_android.html" sorts before "*_ios.html" for each timestamp.
    for file_name in sorted(os.listdir(os.path.join(data_path, folder))):
        if file_name.endswith("_ios.html"):
            parse_file = parse_ios_html
        elif file_name.endswith("_android.html"):
            parse_file = parse_android_html
        else:
            continue

        # File names start with "HH_MM_", e.g. "00_10_pokemon_ios.html" -> "00-10".
        time_str = "-".join(file_name.split("_")[0:2])
        date_time_str = f"{folder_date}-{time_str}"
        try:
            dt_obj = datetime.strptime(date_time_str, "%Y-%m-%d-%H-%M")
        except ValueError as e:
            # Report and skip files whose name does not encode a valid time.
            print(f"Error parsing datetime from filename {file_name}: {e}")
            continue

        file_path = os.path.join(data_path, folder, file_name)

        # Merge this file's values into the entry shared by both platforms.
        datetime_dict.setdefault(dt_obj, {}).update(parse_file(file_path))

  curr_ratings_div = soup.find("div", text="Current Version:")
  all_ratings_div = soup.find("div", text="All Versions:")


In [18]:
# Comparing with expected answer
key = datetime(2016, 7, 21, 0, 0, 0)
expected_values = {
    "ios_current_ratings": 4688,
    "ios_all_ratings": 106508,
    "android_avg_rating": 3.9,
    "android_total_ratings": 1281802,
    "android_ratings_1": 199974,
    "android_ratings_2": 71521,
    "android_ratings_3": 117754,
    "android_ratings_4": 165956,
    "android_ratings_5": 726597,
}
checks = [datetime_dict[key][column] == expected for column, expected in expected_values.items()]

# assert keys are of datetime type & check column values
# (print() returns None, so chaining two print calls with `and` would skip the
# second one; combine the booleans first and print once.)
keys_are_datetimes = all(isinstance(dt_key, datetime) for dt_key in datetime_dict.keys())
print(keys_are_datetimes and all(checks))

True


# 🅑
> Convert the dictionary into a Pandas *dataframe*, *pokemon_db*, where the index is *datetime* and columns are names of the extracted 9 iOS/Android values.

In [20]:
# Build the combined table: one row per datetime key, one column per extracted value.
# sort_index() puts the time series in chronological order regardless of the
# order in which the dictionary was filled.
datetime_df = pd.DataFrame.from_dict(datetime_dict, orient = "index").sort_index()

# The task asks for the dataframe to be called `pokemon_db`; expose that name
# for the same object alongside `datetime_df`.
pokemon_db = datetime_df

datetime_df.head()

Unnamed: 0,android_avg_rating,android_total_ratings,android_ratings_1,android_ratings_2,android_ratings_3,android_ratings_4,android_ratings_5,ios_current_ratings,ios_all_ratings
2016-07-21 00:00:00,3.9,1281802,199974,71521,117754,165956,726597,4688.0,106508
2016-07-21 00:10:00,3.9,1281802,199974,71521,117754,165956,726597,4688.0,106508
2016-07-21 00:20:00,3.9,1281802,199974,71521,117754,165956,726597,4688.0,106508
2016-07-21 00:30:00,3.9,1281802,199974,71521,117754,165956,726597,4688.0,106508
2016-07-21 00:40:00,3.9,1281802,199974,71521,117754,165956,726597,4688.0,106508


In [21]:
# Comparing with expected answer
# The frame must be a DataFrame with a DatetimeIndex whose columns are the
# Android columns followed by the iOS columns.
(
    isinstance(datetime_df, pd.DataFrame)
    and isinstance(datetime_df.index, pd.DatetimeIndex)
    and list(datetime_df.columns) == [*android_data.columns, *ios_data.columns]
)

True

# 🅒
> Save the dataframe into two formats (CSV and Excel). The file names should be *pokemon.csv* and *pokemon.xlsx*.

In [23]:
# Write the combined table to the working directory in both required formats.
datetime_df.to_csv("pokemon.csv")
# NOTE(review): to_excel relies on an installed Excel writer engine (e.g. openpyxl) — confirm it is available.
datetime_df.to_excel("pokemon.xlsx")