### Initial Setup

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import csv
import json
import re
from os import path


# Lift the row and column limits so displayed DataFrames are never truncated
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

# Silence every warning for the rest of the session
import warnings
warnings.filterwarnings(action="ignore")

# Load the two input tables: Emmy frequencies first, then the ratings table
emmys_df, scores_df = (
    pd.read_csv(csv_file)
    for csv_file in ('csv/emmys_frequencies.csv', 'csv/final.csv')
)


### Merge + Ad Hoc Cleanup

In [2]:
# Rename columns - Emmys
# Both labels are mapped in a single pass: the original "Title" column becomes "No",
# and the unnamed first column takes over the "Title" label.
emmys_df = emmys_df.rename(columns={"Title": "No", "Unnamed: 0": "Title"})

In [3]:
# Prepare the ratings table in one chain:
#   1. rename the Rotten Tomatoes column to an identifier-friendly label,
#   2. keep only the columns not listed below (the result is sorted by column name),
#   3. lowercase all titles to align with emmys_df.
scores_df = (
    scores_df
    .rename(columns={"Rotten Tomatoes": "Rotten_Tomatoes"})
    .loc[:, lambda frame: frame.columns.difference(
        ['Year', 'Age', 'type', 'Disney+', 'Index', 'Prime Video', 'Hulu', 'Netflix']
    )]
    .assign(Title=lambda frame: frame["Title"].str.lower())
)

In [4]:
# Left-join the Rotten Tomatoes/IMDb scores onto the Emmy table by title, so every
# Emmy row is kept, then discard the row labelled 85.
# NOTE(review): 85 is a hard-coded index label and the reason for removing that row is
# not recorded here - confirm it still targets the intended row if the inputs change.
df = pd.merge(emmys_df, scores_df, how='left', on='Title').drop(index=85)

In [5]:
# Rescale the Rotten Tomatoes column from percentage text to a score out of 10 for future plotting
rt_raw = df['Rotten_Tomatoes']

# Strip the percent sign and express each percentage as a fraction
rt_from_percent = rt_raw.str.replace(r'%', r'').astype('float') / 100

# Keep entries that already parse as numbers; otherwise use the percentage-derived fraction
rt_fraction = pd.to_numeric(rt_raw, errors='coerce').fillna(rt_from_percent)

df['Rotten_Tomatoes'] = rt_fraction * 10


In [6]:
# Label the row index 'Index'
df = df.rename_axis(index='Index')

In [7]:
# Write the merged table to disk, row index included
df.to_csv("csv/combined_scores.csv", index=True)


In [8]:
# Display the merged scores table
df

Unnamed: 0_level_0,Title,No,IMDb,Rotten_Tomatoes
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,game of thrones,30,,
1,veep,24,8.3,9.3
2,the marvelous mrs maisel,21,,
3,saturday night live,20,8.1,
4,handmaid's tale,20,,
5,better call saul,17,8.7,9.7
6,ozark,17,8.4,8.1
7,the crown,17,8.7,8.9
8,barry,15,,
9,fargo,14,8.9,9.6


### Convert to JSON Array for Future Plotting

In [9]:
def make_json(csvFilePath, jsonFilePath):
    """Convert a CSV file into a JSON array with one object per row.

    Each CSV row becomes a JSON object keyed by the header names. Values are
    written exactly as ``csv.DictReader`` yields them, i.e. as strings (an
    empty cell becomes an empty string). No particular column is required.

    Args:
        csvFilePath: Path of the UTF-8 CSV file to read; its first row is the header.
        jsonFilePath: Path of the JSON file to write (opened in 'w' mode, so any
            existing content is replaced).
    """
    # newline='' leaves newline handling to the csv module, so line breaks
    # embedded in quoted fields are preserved as written
    with open(csvFilePath, encoding='utf-8', newline='') as csvf:
        # Collect every row as a dictionary, in file order
        data = list(csv.DictReader(csvf))

    # Serialise the list of row dictionaries as an indented JSON array
    with open(jsonFilePath, 'w', encoding='utf-8') as jsonf:
        json.dump(data, jsonf, indent=4)
         
# Driver Code

# Input CSV and output JSON locations
csvFilePath, jsonFilePath = r'csv/combined_scores.csv', r'plot_scores/scores_array.json'

# Convert the saved CSV into the JSON array
make_json(csvFilePath=csvFilePath, jsonFilePath=jsonFilePath)

In [10]:

# Display df again; the JSON export above reads the saved CSV and does not modify df
df

Unnamed: 0_level_0,Title,No,IMDb,Rotten_Tomatoes
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,game of thrones,30,,
1,veep,24,8.3,9.3
2,the marvelous mrs maisel,21,,
3,saturday night live,20,8.1,
4,handmaid's tale,20,,
5,better call saul,17,8.7,9.7
6,ozark,17,8.4,8.1
7,the crown,17,8.7,8.9
8,barry,15,,
9,fargo,14,8.9,9.6
