# Parsing data with ast

import ast
import numpy as np
import pandas as pd

# Load the cleaned movies table; the path is resolved relative to the
# current working directory.
movies_df = pd.read_csv("../../edit_data/cleaned_data/movies.csv")

# Function to parse the string and extract 'name'
def extract_names(data_string):
    """Return the 'name' values found in a stringified list/dict cell.

    Parameters
    ----------
    data_string : str or missing value
        Text holding a Python literal: either a list of dictionaries or a
        single dictionary, each expected to carry a 'name' key.

    Returns
    -------
    list or float
        A list with one entry per dictionary (``np.nan`` where an entry is
        not a dictionary or has no 'name' key).  ``np.nan`` is returned
        instead of a list when the cell is missing, cannot be parsed as a
        literal, or parses to something other than a list or a dictionary.
    """
    if pd.isna(data_string):
        return np.nan
    try:
        # literal_eval only accepts literals, so no arbitrary code is run.
        data_object = ast.literal_eval(data_string)
    except (ValueError, SyntaxError, TypeError):
        # TypeError covers literals such as a dict with an unhashable key.
        return np.nan

    if isinstance(data_object, dict):
        # Treat a lone dictionary as a one-element list.
        data_object = [data_object]
    elif not isinstance(data_object, list):
        # Any other literal (number, string, tuple, ...) carries no names;
        # report it as NaN like every other "no usable data" case rather
        # than falling through and returning None.
        return np.nan

    return [
        item.get('name', np.nan) if isinstance(item, dict) else np.nan
        for item in data_object
    ]
    
# Build a '<column>_Parse' companion for each stringified column by running
# extract_names over every cell of that column.
for source_column in (
    'Genres',
    'Belongs_To_Collection',
    'Spoken_Languages',
    'Production_Companies',
    'Production_Countries',
):
    movies_df[source_column + '_Parse'] = movies_df[source_column].apply(extract_names)

movies_df.head()


### Parse Country Code for ['Production_Countries']

In [None]:
# Function to parse the string and extract the 'iso_3166_1' country codes
def extract_country_code(column_string):
    """Return the 'iso_3166_1' values found in a stringified list cell.

    Parameters
    ----------
    column_string : str or missing value
        Text holding a Python literal: a list of dictionaries (or a single
        dictionary), each expected to carry an 'iso_3166_1' key.

    Returns
    -------
    list or float
        A list with one code per dictionary (``np.nan`` where an entry is
        not a dictionary or has no 'iso_3166_1' key).  ``np.nan`` is
        returned instead of a list when the cell is missing, cannot be
        parsed as a literal, or parses to something other than a list or a
        dictionary.
    """
    if pd.isna(column_string):
        return np.nan
    try:
        # Safely evaluate the string to a Python object (literals only).
        column_list = ast.literal_eval(column_string)
    except (ValueError, SyntaxError, TypeError):
        return np.nan

    if isinstance(column_list, dict):
        # Treat a lone dictionary as a one-element list.
        column_list = [column_list]
    elif not isinstance(column_list, list):
        return np.nan

    return [
        entry.get('iso_3166_1', np.nan) if isinstance(entry, dict) else np.nan
        for entry in column_list
    ]

# Apply extract_country_code to every 'Production_Countries' cell and store
# the results in a new column.
movies_df['Production_Countries_Code_Parse'] = movies_df['Production_Countries'].apply(extract_country_code)


Parsing Genres
- original code using replace and json.loads

In [None]:
import json

# Convert every Genres cell from its single-quoted text form into Python
# objects via JSON.
# NOTE: Series.replace("'", '"') only swaps cells whose *entire* value is "'",
# so the quote substitution has to be done on each string with str.replace.
# Converting per cell also avoids integer look-ups (movies_df.Genres[i]),
# which only work when the index is exactly 0..n-1.
# NOTE(review): swapping quotes corrupts names that contain an apostrophe;
# ast.literal_eval could parse the original text directly instead.
genres = movies_df['Genres'].apply(
    lambda genre_text: json.loads(genre_text.replace("'", '"'))
)


# Reduce each film's decoded genre entries to just their 'name' values,
# giving one plain list of names per film.
all_genres = [[entry['name'] for entry in film_genres] for film_genres in genres]

# Overwrite the 'Genres' column with the per-film name lists.
movies_df['Genres'] = all_genres
    