In [19]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [20]:
# Load the spells data
spells_df = pd.read_csv('spells.csv', sep=None, engine='python', encoding='utf-8-sig')
spell_incantations = spells_df['Incantation'].dropna().str.strip().str.lower().tolist()

# List of dialogue files, with movie numbers
dialogue_files = {
    'hp1.csv': 1,
    'hp2.csv': 2,
    'hp3.csv': 3
}

# List to gather all data for a combined DataFrame
all_data = []

# Process each dialogue file
for file, movie_number in dialogue_files.items():
    hp_dialogue_df = pd.read_csv(file, sep=None, engine='python', encoding='utf-8-sig')

    # Process each line in the dialogue file
    for index, row in hp_dialogue_df.iterrows():
        line_number = index + 1  # Line number (1-indexed)
        sentence = row['Sentence'].lower()  # Convert sentence to lowercase
        caster = row['Character']  # Character speaking the line
        
        # Check if any known spell incantation is in the sentence
        incantation = next((spell for spell in spell_incantations if spell in sentence), None)
        
        if incantation:
            victim = None
            if " you " in sentence:
                victim = "unspecified target"
            elif "Hermione" in sentence:
                victim = "Hermione"
            elif "Harry" in sentence:
                victim = "Harry"
            elif "Ron" in sentence:
                victim = "Ron"
            
            # Append movie data with movie number
            all_data.append((movie_number, line_number, caster, incantation, victim))

# Create a single DataFrame with an additional 'movie_number' column
combined_df = pd.DataFrame(all_data, columns=["movie_number", "line_number", "caster", "incantation", "victim"])

# Display the combined DataFrame
print(combined_df)

# Optionally, save to CSV
combined_df.to_csv('Combined_Extracted_Spell_Data_With_Movie_Number.csv', index=False)


    movie_number  line_number        caster       incantation victim
0              1          293    Ollivander            finite   None
1              1          366  Mrs. Weasley              pack   None
2              1          405           Ron              pack   None
3              1          431      Hermione     oculus reparo   None
4              1          614         Harry           unknown   None
..           ...          ...           ...               ...    ...
61             3         1555         HARRY  expecto patronum   None
62             3         1562      HERMIONE          bombarda   None
63             3         1602         HARRY              none   None
64             3         1635         HARRY             lumos   None
65             3         1638         HARRY               nox   None

[66 rows x 5 columns]


In [21]:
#turn df to csv and download, uncomment to download what u want
#df_og_version.to_csv("og_version.csv", sep=',', encoding='utf-8')

#df_taylors_version.to_csv("taylors_version.csv", sep=',', encoding='utf-8')

#df_top_5_all_albums_sorted.to_csv("top_5.csv", sep=',', encoding='utf-8')

#df_top_5_all_songs.to_csv("top_5_general.csv", sep=',', encoding='utf-8')

combined_df.to_csv("movies1_3.csv", sep=',', encoding='utf-8')


In [23]:
import pandas as pd

# Load the spells data
spells_df = pd.read_csv('spells.csv', sep=None, engine='python', encoding='utf-8-sig')
spell_incantations = spells_df['Incantation'].dropna().str.strip().str.lower().tolist()

# Load your single dialogue file
dialogue_file_path = 'dialogue.csv'  # Replace with the correct file path
dialogue_df = pd.read_csv(dialogue_file_path, sep=None, engine='python', encoding='utf-8-sig')

# List to store all collected data
all_data = []

# Iterate over each row in the dialogue DataFrame
for index, row in dialogue_df.iterrows():
    line_number = row['Dialogue_ID']  # Get the dialogue ID
    sentence = row['Dialogue'].lower()  # Convert the dialogue to lowercase
    caster = row['Character_ID']  # The ID of the character speaking

    # Check if any known spell incantation is present in the sentence
    incantation = next((spell for spell in spell_incantations if spell in sentence), None)

    if incantation:
        # Identify potential victim based on keywords in the sentence
        victim = None
        if " you " in sentence:
            victim = "unspecified target"
        elif "hermione" in sentence:
            victim = "Hermione"
        elif "harry" in sentence:
            victim = "Harry"
        elif "ron" in sentence:
            victim = "Ron"

        # Append the collected data to the list
        all_data.append((row['Chapter_ID'], line_number, caster, incantation, victim))

# Create a DataFrame from the collected data
combined_df = pd.DataFrame(all_data, columns=["Chapter_ID", "Dialogue_ID", "Character_ID", "Incantation", "Victim"])

# Display the combined DataFrame
print(combined_df)


     Chapter_ID  Dialogue_ID  Character_ID       Incantation  \
0             7          160            43            finite   
1             9          186            26              pack   
2            10          213             2              pack   
3            10          226             3     oculus reparo   
4            14          319             1           unknown   
..          ...          ...           ...               ...   
169         223         7161             1      expelliarmus   
170         225         7273             6  expecto patronum   
171         229         7321             4              none   
172         231         7396             1         confringo   
173         232         7406             1         confringo   

                 Victim  
0                  None  
1                  None  
2    unspecified target  
3    unspecified target  
4                   Ron  
..                  ...  
169                None  
170                 Ron

In [27]:
# Load the character data
character_file_path = 'characters.csv'  # Replace with the correct file path
character_df = pd.read_csv(character_file_path, sep=None, engine='python', encoding='utf-8-sig')

print(character_df)

     Character_ID    Character_Name                Species  Gender  \
0               1      Harry Potter                  Human    Male   
1               2       Ron Weasley                  Human    Male   
2               3  Hermione Granger                  Human  Female   
3               4  Albus Dumbledore                  Human    Male   
4               5     Rubeus Hagrid  Half-Human/Half-Giant    Male   
..            ...               ...                    ...     ...   
161           162            Waiter                    NaN     NaN   
162           163             Boy 2                    NaN     NaN   
163           164             Crowd                    NaN     NaN   
164           165       Gryffindors                    NaN     NaN   
165           166        Professors                    NaN     NaN   

          House              Patronus Wand (Wood)         Wand (Core)  
0    Gryffindor                  Stag       Holly     Phoenix Feather  
1    Gryffindor

In [None]:

# Merge combined_df with character_df to add Character_Name and House
combined_df = combined_df.merge(
    character_df[['Character_ID', 'Character_Name', 'House']],
    left_on='Character_ID',
    right_on='Character_ID',
    how='left'
)

# Drop the redundant 'Character ID' column from the character data
combined_df.drop(columns=['Character_ID'], inplace=True)

# Rename columns for clarity
combined_df.rename(columns={'Character_Name': 'Caster'}, inplace=True)


# Display the updated combined DataFrame
print(combined_df)

KeyError: 'Character_ID'

In [None]:
combined_df = combined_df[combined_df['Incantation'] != 'unknown']
combined_df = combined_df[combined_df['Incantation'] != 'none']



In [31]:
# Group by Character_Name and count the number of spells cast
spell_counts = combined_df.groupby('Caster').size().reset_index(name='Spell_Count')

# Sort the results in descending order of Spell_Count
spell_counts = spell_counts.sort_values(by='Spell_Count', ascending=False)

# Display the grouped and sorted DataFrame
print(spell_counts)

                 Caster  Spell_Count
16         Harry Potter           53
17     Hermione Granger           34
29          Ron Weasley           13
32        Severus Snape            7
27          Remus Lupin            7
1      Albus Dumbledore            7
24   Neville Longbottom            6
12    Gilderoy Lockhart            4
35            Voldemort            3
6          Draco Malfoy            3
8          Fred Weasley            3
33         Sirius Black            2
30        Rubeus Hagrid            2
20        Luna Lovegood            2
18         James Potter            2
2                   All            2
10       George Weasley            2
4   Bellatrix Lestrange            2
5      Dolores Umbridge            2
7       Filius Flitwick            2
15                Guard            1
26           Professors            1
34           Tom Riddle            1
3        Arthur Weasley            1
31     Rufus Scrimgeour            1
28        Rolanda Hooch            1
2