# 1) [Web Scrapping: 50 points] The first step is to extract various values from the raw HTML files. You can use BecutifulSoup or other Python modules.
### a. From all the iOS pages (ending with “_ios.html”), extract (i) number of customer ratings in the Current Version (let’s call it ios_current_ratings); and (ii) number of customer ratings in All Versions (ios_all_ratings). For example, the extracted values should be: 4688, 106508 for the “2016-07- 21/00_00_pokemon_ios.html” file. There are 2 values from iOS pages.

In [1]:
import os
from bs4 import BeautifulSoup
from datetime import datetime
import pandas as pd
os.listdir('pokemon_data')

['.DS_Store',
 '2016-07-28',
 '2016-07-21',
 '2016-07-26',
 '2016-07-27',
 '2016-07-29',
 '2016-07-30',
 '2016-07-31',
 '2016-07-25',
 '2016-07-22',
 '2016-07-23',
 '2016-07-24']

In [2]:
ios_data = {}
base_dir = 'pokemon_data'

for folder in sorted(os.listdir(base_dir)):
    if folder.startswith('.'):
        continue

    folder_path = os.path.join(base_dir, folder)
    if not os.path.isdir(folder_path):
        continue

    for file in sorted(os.listdir(folder_path)):
        if file.endswith('_ios.html'): 
            file_path = os.path.join(folder_path, file)

            with open(file_path, 'r', encoding='utf8', errors='ignore') as infile:
                contents = infile.read()
                soup = BeautifulSoup(contents, 'html.parser')

                rating_counts = soup.find_all('span', {'class': 'rating-count'})
                if len(rating_counts) == 2:
                    ios_current_ratings = int(rating_counts[0].get_text(strip=True).split()[0].replace(',', ''))
                    ios_all_ratings = int(rating_counts[1].get_text(strip=True).split()[0].replace(',', ''))
                else:
                    ios_current_ratings, ios_all_ratings = None, None

                ios_data[file] = {
                    'ios_current_ratings': ios_current_ratings,
                    'ios_all_ratings': ios_all_ratings
                }

for file, data in ios_data.items():
    print(f"File: {file}")
    for key, value in data.items():
        print(f"  {key}: {value}")

File: 00_00_pokemon_ios.html
  ios_current_ratings: 1261
  ios_all_ratings: 123267
File: 00_10_pokemon_ios.html
  ios_current_ratings: 1261
  ios_all_ratings: 123267
File: 00_20_pokemon_ios.html
  ios_current_ratings: 1261
  ios_all_ratings: 123267
File: 00_30_pokemon_ios.html
  ios_current_ratings: 1261
  ios_all_ratings: 123267
File: 00_40_pokemon_ios.html
  ios_current_ratings: 1261
  ios_all_ratings: 123267
File: 00_50_pokemon_ios.html
  ios_current_ratings: 1261
  ios_all_ratings: 123267
File: 01_00_pokemon_ios.html
  ios_current_ratings: 1261
  ios_all_ratings: 123267
File: 01_10_pokemon_ios.html
  ios_current_ratings: 1261
  ios_all_ratings: 123267
File: 01_20_pokemon_ios.html
  ios_current_ratings: 1261
  ios_all_ratings: 123267
File: 01_30_pokemon_ios.html
  ios_current_ratings: 1261
  ios_all_ratings: 123267
File: 01_40_pokemon_ios.html
  ios_current_ratings: 1261
  ios_all_ratings: 123267
File: 01_50_pokemon_ios.html
  ios_current_ratings: 1261
  ios_all_ratings: 123267
File

### b. From all the Android pages (ending with “_android.html”), extract (i) average rating (in the scale between 1.0 and 5.0) (android_avg_rating); (ii) number of total ratings (android_total_ratings); and (iii) number of ratings for 1-5 stars (android_ratings_1, android_ratings_2, …, android_ratings_5). For example, the extracted values should be: 3.9, 1281802, 199974, 71512, 117754, 165956, 726597 for the “2016-07-21/00_00_pokemon_android.html” file. There are 7 values from Android pages.

In [3]:
android_data = {}

base_dir = 'pokemon_data'

for folder in sorted(os.listdir(base_dir)):
    if folder.startswith('.'):
        continue

    folder_path = os.path.join(base_dir, folder)
    if not os.path.isdir(folder_path):
        continue

    for file in sorted(os.listdir(folder_path)):
        if file.endswith('_android.html'): 
            file_path = os.path.join(folder_path, file)

            with open(file_path, 'r', encoding='utf8', errors='ignore') as infile:
                contents = infile.read()
                soup = BeautifulSoup(contents, 'html.parser')

                avg_rating_element = soup.find('div', class_='score')
                android_avg_rating = float(avg_rating_element.get_text(strip=True)) if avg_rating_element else None

                total_ratings_element = soup.find('span', class_='reviews-num')
                android_total_ratings = int(total_ratings_element.get_text(strip=True).replace(',', '')) if total_ratings_element else None

                bar_numbers = soup.find_all('span', {'class': 'bar-number'})
                android_ratings = []
                for i in range(5):
                    if i < len(bar_numbers):
                        android_ratings.append(int(bar_numbers[i].get_text(strip=True).replace(',', '')))
                    else:
                        android_ratings.append(None)

                android_data[file] = {
                    'android_avg_rating': android_avg_rating,
                    'android_total_ratings': android_total_ratings,
                    'android_ratings_5': android_ratings[4] if len(android_ratings) >= 5 else None,
                    'android_ratings_4': android_ratings[3] if len(android_ratings) >= 4 else None,
                    'android_ratings_3': android_ratings[2] if len(android_ratings) >= 3 else None,
                    'android_ratings_2': android_ratings[1] if len(android_ratings) >= 2 else None,
                    'android_ratings_1': android_ratings[0] if len(android_ratings) >= 1 else None,
                }

for file, data in android_data.items():
    print(f"File: {file}")
    for key, value in data.items():
        print(f"  {key}: {value}")

File: 00_00_pokemon_android.html
  android_avg_rating: 4.0
  android_total_ratings: 1852512
  android_ratings_5: 267332
  android_ratings_4: 95986
  android_ratings_3: 168615
  android_ratings_2: 248481
  android_ratings_1: 1072098
File: 00_10_pokemon_android.html
  android_avg_rating: 4.0
  android_total_ratings: 1852512
  android_ratings_5: 267332
  android_ratings_4: 95986
  android_ratings_3: 168615
  android_ratings_2: 248481
  android_ratings_1: 1072098
File: 00_20_pokemon_android.html
  android_avg_rating: 4.0
  android_total_ratings: 1852512
  android_ratings_5: 267332
  android_ratings_4: 95986
  android_ratings_3: 168615
  android_ratings_2: 248481
  android_ratings_1: 1072098
File: 00_30_pokemon_android.html
  android_avg_rating: 4.0
  android_total_ratings: 1852512
  android_ratings_5: 267332
  android_ratings_4: 95986
  android_ratings_3: 168615
  android_ratings_2: 248481
  android_ratings_1: 1072098
File: 00_40_pokemon_android.html
  android_avg_rating: 4.0
  android_tot

# 2) [Data Organization: 50 points] The next step is to organize the extracted values, so that we can do some data exploration. As we have time series data, we will organize the data by datetime (note that datetime is a Python data type).
### a. Using the extracted values from the previous step, create a Python dictionary, where the key is datetime object and the value is a dictionary with extracted values from iOS and Android HTML files. For example, for the case of “2016-07-21-00_00_pokemon_android.html” file and “2016-07-21/00_00_pokemon_ios.html” file, the key should be datetime(2016, 7, 21, 0, 0, 0) and the value should be {‘ios_current_ratings’: 4688, ‘ios_all_ratings’: 106508, ‘android_avg_rating’: 3.9, ‘android_total_ratings’: 1281802, ‘android_rating_1’: 199974, ‘android_rating_2’: 71512, ‘android_rating_3’: 117754, ‘android_rating_4’: 165956, ‘android_rating_5’: 726597}

In [4]:
time_series_data = {}

base_dir = 'pokemon_data'

for folder in sorted(os.listdir(base_dir)):
    if folder.startswith('.'):
        continue

    folder_path = os.path.join(base_dir, folder)
    if not os.path.isdir(folder_path):
        continue

    for file in sorted(os.listdir(folder_path)):
        if file.startswith('.'):
            continue

        try:
            YMD = folder.split('-')
            HMS = file.split('_')
            key = datetime(int(YMD[0]), int(YMD[1]), int(YMD[2]), int(HMS[0]), int(HMS[1]), 0)
        except ValueError:
            continue  
            
        if key not in time_series_data:
            time_series_data[key] = {}

        file_path = os.path.join(folder_path, file)

        if file.endswith('_ios.html'):
            with open(file_path, 'r', encoding='utf8', errors='ignore') as infile:
                contents = infile.read()
                soup = BeautifulSoup(contents, 'html.parser')

                rating_counts = soup.find_all('span', {'class': 'rating-count'})
                if len(rating_counts) == 2:
                    ios_current_ratings = int(rating_counts[0].get_text(strip=True).split()[0].replace(',', ''))
                    ios_all_ratings = int(rating_counts[1].get_text(strip=True).split()[0].replace(',', ''))
                else:
                    ios_current_ratings, ios_all_ratings = None, None

                time_series_data[key]['ios_current_ratings'] = ios_current_ratings
                time_series_data[key]['ios_all_ratings'] = ios_all_ratings

        elif file.endswith('_android.html'):
            with open(file_path, 'r', encoding='utf8', errors='ignore') as infile:
                contents = infile.read()
                soup = BeautifulSoup(contents, 'html.parser')

                avg_rating_element = soup.find('div', class_='score')
                android_avg_rating = float(avg_rating_element.get_text(strip=True)) if avg_rating_element else None

                total_ratings_element = soup.find('span', class_='reviews-num')
                android_total_ratings = int(total_ratings_element.get_text(strip=True).replace(',', '')) if total_ratings_element else None

                bar_numbers = soup.find_all('span', {'class': 'bar-number'})
                android_ratings = []
                for i in range(5): 
                    if i < len(bar_numbers):
                        android_ratings.append(int(bar_numbers[i].get_text(strip=True).replace(',', '')))
                    else:
                        android_ratings.append(None)

                time_series_data[key]['android_avg_rating'] = android_avg_rating
                time_series_data[key]['android_total_ratings'] = android_total_ratings
                time_series_data[key]['android_rating_5'] = android_ratings[4] if len(android_ratings) >= 5 else None
                time_series_data[key]['android_rating_4'] = android_ratings[3] if len(android_ratings) >= 4 else None
                time_series_data[key]['android_rating_3'] = android_ratings[2] if len(android_ratings) >= 3 else None
                time_series_data[key]['android_rating_2'] = android_ratings[1] if len(android_ratings) >= 2 else None
                time_series_data[key]['android_rating_1'] = android_ratings[0] if len(android_ratings) >= 1 else None

for key, value in time_series_data.items():
    print(f"Datetime: {key}")
    for k, v in value.items():
        print(f"  {k}: {v}")

Datetime: 2016-07-21 00:00:00
  android_avg_rating: 3.9
  android_total_ratings: 1281802
  android_rating_5: 199974
  android_rating_4: 71521
  android_rating_3: 117754
  android_rating_2: 165956
  android_rating_1: 726597
  ios_current_ratings: 4688
  ios_all_ratings: 106508
Datetime: 2016-07-21 00:10:00
  android_avg_rating: 3.9
  android_total_ratings: 1281802
  android_rating_5: 199974
  android_rating_4: 71521
  android_rating_3: 117754
  android_rating_2: 165956
  android_rating_1: 726597
  ios_current_ratings: 4688
  ios_all_ratings: 106508
Datetime: 2016-07-21 00:20:00
  android_avg_rating: 3.9
  android_total_ratings: 1281802
  android_rating_5: 199974
  android_rating_4: 71521
  android_rating_3: 117754
  android_rating_2: 165956
  android_rating_1: 726597
  ios_current_ratings: 4688
  ios_all_ratings: 106508
Datetime: 2016-07-21 00:30:00
  android_avg_rating: 3.9
  android_total_ratings: 1281802
  android_rating_5: 199974
  android_rating_4: 71521
  android_rating_3: 117754


### b. Convert the dictionary into a Pandas dataframe, pokemon_db, where the index is datetime and columns are names of the extracted 9 iOS/Android values.

In [5]:
pokemon_db = pd.DataFrame.from_dict(time_series_data, orient='index')

In [6]:
pokemon_db.index.name = 'datetime'

### c. Save the dataframe into two formats (CSV and Excel). The file names should be pokemon.csv and pokemon.xlsx.

In [7]:
pokemon_db.to_csv('pokemon.csv')

In [8]:
pokemon_db.to_excel('pokemon.xlsx', index=True)

In [9]:
print("DataFrame saved to 'pokemon.csv' and 'pokemon.xlsx'.")

DataFrame saved to 'pokemon.csv' and 'pokemon.xlsx'.


# References used in this analysis

In [10]:
# BeautifulSoup Documentation. (n.d.). Obtained from https://www.crummy.com/software/BeautifulSoup/bs4/doc/
# Pandas Documentation. (n.d.). Obtained from https://pandas.pydata.org/docs/
# Python Documentation. (n.d.). Obtained from https://docs.python.org/3/

** I decided to do the analysis in one cell since it was easy for me to follow up the coding path.

SyntaxError: invalid syntax (102479085.py, line 5)