In [7]:
import pandas as pd
import numpy as np

# Load the movie-quote table that the rest of the notebook cleans.
df = pd.read_csv('cleaned_movie_quotes.csv')

In [3]:
# Inspect the dtype pandas inferred for each column of the loaded frame.
df.dtypes

text              object
character_name    object
title             object
year              object
genres            object
gender            object
dtype: object

In [8]:
# Print the first ten raw `genres` values as a plain list to see how the
# column is encoded before writing a parser for it.
print(df['genres'].head(10).tolist())


["['comedy' 'romance']", "['comedy' 'romance']", "['comedy' 'romance']", "['comedy' 'romance']", "['comedy' 'romance']", "['comedy' 'romance']", "['comedy' 'romance']", "['comedy' 'romance']", "['comedy' 'romance']", "['comedy' 'romance']"]


In [10]:
import string
import ast

def parse_genres(genre_str):
    """Turn a stringified genre list into a list of genre names.

    The usual input looks like ``"['comedy' 'romance']"``: quoted names
    separated by whitespace inside square brackets. The comma-separated
    form ``"['comedy', 'romance']"`` is accepted as well.

    Args:
        genre_str: Raw cell value from the ``genres`` column. A list or
            tuple is treated as already parsed, so applying this function
            twice to the same column does not discard the data.

    Returns:
        list: The genre names in their original order, or ``['unknown']``
        when the value is missing, not a supported type, or contains no
        names.
    """
    # Already parsed (e.g. the cleaning cell was run a second time): keep
    # the existing names instead of collapsing them to 'unknown'.
    if isinstance(genre_str, (list, tuple)):
        genres = [str(g).strip() for g in genre_str if str(g).strip()]
        return genres if genres else ['unknown']

    if not isinstance(genre_str, str):
        return ['unknown']

    # Trim surrounding whitespace first so the brackets really are the
    # outermost characters when they get stripped.
    cleaned = genre_str.strip().strip("[]")
    # Drop the quotes and treat commas as plain separators.
    cleaned = cleaned.replace("'", "").replace(",", " ")
    # split() with no argument splits on any whitespace run (spaces, tabs,
    # newlines) and never yields empty strings.
    genres = cleaned.split()
    return genres if genres else ['unknown']

def clean_text(text):
    """Normalise a quote: lowercase it, drop ASCII punctuation, trim the ends.

    Args:
        text: Raw cell value from the ``text`` column.

    Returns:
        str: The normalised quote, or an empty string when ``text`` is not
        a string.
    """
    if isinstance(text, str):
        # Mapping a code point to None makes str.translate delete it.
        drop_punctuation = dict.fromkeys(map(ord, string.punctuation))
        return text.lower().translate(drop_punctuation).strip()
    return ''

def clean_field(field):
    """Trim a categorical field and map missing markers to ``'unknown'``.

    Args:
        field: Raw cell value (character name, title, gender, ...).

    Returns:
        str: The value with surrounding whitespace removed, or
        ``'unknown'`` when it is not a string, is empty after trimming, or
        is the ``'?'`` placeholder.
    """
    if isinstance(field, str):
        trimmed = field.strip()
        if trimmed not in ('', '?'):
            return trimmed
    return 'unknown'

# Apply cleaning: quotes go through clean_text, the two name-like columns
# through clean_field.
df['text'] = df['text'].apply(clean_text)
for column in ('character_name', 'title'):
    df[column] = df[column].apply(clean_field)

# Year is object type, so values may be strings, numbers or missing; reduce
# each one to a digit string or 'unknown'.
def clean_year(year):
    """Normalise a release-year cell to a digit string.

    Args:
        year: Raw cell value from the ``year`` column. Strings, ints and
            floats are understood; anything else counts as missing.

    Returns:
        str: The first standalone four-digit group found in the value
        (``"1989/I"`` -> ``"1989"``, ``"1999-2000"`` -> ``"1999"``). When
        there is no such group, all ASCII digits joined together. When the
        value is missing, is the ``'?'`` placeholder, or has no digits,
        ``'unknown'``.
    """
    import re  # local import so this block does not rely on a file-level one

    # bool is a subclass of int but is never a meaningful year.
    if isinstance(year, bool):
        return 'unknown'

    if isinstance(year, (int, float)):
        # NaN is the only value not equal to itself; it and the infinities
        # cannot be converted to an integer year.
        if year != year or year in (float('inf'), float('-inf')):
            return 'unknown'
        year = str(int(year))
    elif not isinstance(year, str):
        return 'unknown'

    year = year.strip()
    if year == '?' or year == '':
        return 'unknown'

    # Prefer a four-digit run that is not part of a longer number, so that
    # several digit groups are not glued into one bogus year.
    match = re.search(r'(?<![0-9])[0-9]{4}(?![0-9])', year)
    if match:
        return match.group(0)

    # Fallback: keep only ASCII digits (str.isdigit would also let through
    # characters such as superscript digits).
    digits = ''.join(ch for ch in year if ch in '0123456789')
    return digits or 'unknown'

# Clean the remaining columns in the same order as before: year via
# clean_year, gender via clean_field, genres via parse_genres.
for column, cleaner in (
    ('year', clean_year),
    ('gender', clean_field),
    ('genres', parse_genres),
):
    df[column] = df[column].apply(cleaner)

# Optional: check cleaned output
print(df.head(5))

          text character_name                       title  year  \
0  they do not         BIANCA  10 things i hate about you  1999   
1   they do to        CAMERON  10 things i hate about you  1999   
2    i hope so         BIANCA  10 things i hate about you  1999   
3     she okay        CAMERON  10 things i hate about you  1999   
4      lets go         BIANCA  10 things i hate about you  1999   

              genres gender  
0  [comedy, romance]      f  
1  [comedy, romance]      m  
2  [comedy, romance]      f  
3  [comedy, romance]      m  
4  [comedy, romance]      f  


In [11]:
# Write the cleaned frame to disk without the index column.
# NOTE(review): `genres` holds Python lists at this point, which CSV will
# presumably store as their string form — confirm how downstream readers
# are expected to parse that column back.
df.to_csv("cleaned_movie_quotes_final_i_think.csv", index=False)