In [2]:
import pandas as pd



In [15]:
base_path = "archive/"

# Function to print first few lines of a file
def print_file_head(file_path, n=5):
    with open(file_path, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            if i < n:
                print(f"Line {i+1}: {line.strip()}")
            else:
                break

# Inspect each file
print("movie_titles_metadata.tsv:")
print_file_head(base_path + "movie_titles_metadata.tsv")
print("\nmovie_characters_metadata.tsv:")
print_file_head(base_path + "movie_characters_metadata.tsv")
print("\nmovie_lines.tsv:")
print_file_head(base_path + "movie_lines.tsv")
print("\nmovie_conversations.tsv:")
print_file_head(base_path + "movie_conversations.tsv")

movie_titles_metadata.tsv:
Line 1: m0	10 things i hate about you	1999	6.90	62847	['comedy' 'romance']
Line 2: m1	1492: conquest of paradise	1992	6.20	10421	['adventure' 'biography' 'drama' 'history']
Line 3: m2	15 minutes	2001	6.10	25854	['action' 'crime' 'drama' 'thriller']
Line 4: m3	2001: a space odyssey	1968	8.40	163227	['adventure' 'mystery' 'sci-fi']
Line 5: m4	48 hrs.	1982	6.90	22289	['action' 'comedy' 'crime' 'drama' 'thriller']

movie_characters_metadata.tsv:
Line 1: u0	BIANCA	m0	10 things i hate about you	f	4
Line 2: u1	BRUCE	m0	10 things i hate about you	?	?
Line 3: u2	CAMERON	m0	10 things i hate about you	m	3
Line 4: u3	CHASTITY	m0	10 things i hate about you	?	?
Line 5: u4	JOEY	m0	10 things i hate about you	m	6

movie_lines.tsv:
Line 1: L1045	u0	m0	BIANCA	They do not!
Line 2: L1044	u2	m0	CAMERON	They do to!
Line 3: L985	u0	m0	BIANCA	I hope so.
Line 4: L984	u2	m0	CAMERON	She okay?
Line 5: L925	u0	m0	BIANCA	Let's go.

movie_conversations.tsv:
Line 1: u0	u2	m0	['L194' 'L195' '

In [17]:
base_path = "archive/"

# Load each file using tab as the separator
movie_titles = pd.read_csv(
    base_path + "movie_titles_metadata.tsv",
    sep="\t",
    engine="python",
    header=None,
    encoding="utf-8",
    quoting=3  # Ignore quotes to handle fields like genres
)
characters = pd.read_csv(
    base_path + "movie_characters_metadata.tsv",
    sep="\t",
    engine="python",
    header=None,
    encoding="utf-8",
    quoting=3
)
lines = pd.read_csv(
    base_path + "movie_lines.tsv",
    sep="\t",
    engine="python",
    header=None,
    encoding="utf-8",
    quoting=3
)
conversations = pd.read_csv(
    base_path + "movie_conversations.tsv",
    sep="\t",
    engine="python",
    header=None,
    encoding="utf-8",
    quoting=3
)

# Assign column names for clarity
movie_titles.columns = ["movieID", "title", "year", "rating", "votes", "genres"]
characters.columns = ["characterID", "name", "movieID", "movie_title", "gender", "position"]
lines.columns = ["lineID", "characterID", "movieID", "character_name", "text"]
conversations.columns = ["characterID1", "characterID2", "movieID", "utterances"]

# Print column counts to verify
print("movie_titles columns:", movie_titles.shape[1])
print("characters columns:", characters.shape[1])
print("lines columns:", lines.shape[1])
print("conversations columns:", conversations.shape[1])

# Print first few rows to inspect
print("\nmovie_titles head:\n", movie_titles.head())
print("\ncharacters head:\n", characters.head())
print("\nlines head:\n", lines.head())
print("\nconversations head:\n", conversations.head())

ParserError: Expected 6 fields in line 6565, saw 13

In [19]:
base_path = "archive/"

# Function to read and print specific lines
def inspect_lines(file_path, target_line, context=2):
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
        start = max(1, target_line - context)
        end = target_line + context + 1
        for i, line in enumerate(lines, 1):
            if start <= i <= end:
                print(f"Line {i}: {line.strip()}")
                if i == target_line:
                    split_fields = line.strip().split('\t')  # Move split outside f-string
                    print(f"  Split fields: {split_fields}")

# Inspect line 6565 and context
print("Inspecting movie_characters_metadata.tsv around line 6565:")
inspect_lines(base_path + "movie_characters_metadata.tsv", 6565)

Inspecting movie_characters_metadata.tsv around line 6565:
Line 6563: u6562	JIMMY	m436	memento	?	?
Line 6564: u6563	LEONARD	m436	memento	m	1
Line 6565: u6564	LEONARD							  *	m436	memento	?	?
  Split fields: ['u6564', 'LEONARD', '', '', '', '', '', '', '  *', 'm436', 'memento', '?', '?']
Line 6566: u6565	LEONARD'S WIFE	m436	memento	?	?
Line 6567: u6566	MRS. JANKIS	m436	memento	?	?
Line 6568: u6567	NATALIE	m436	memento	f	2


In [20]:
import pandas as pd
import csv

base_path = "archive/"

# Function to manually read TSV and fix malformed rows
def read_tsv_manual(file_path, expected_columns):
    data = []
    errors = []
    with open(file_path, 'r', encoding='utf-8') as f:
        reader = csv.reader(f, delimiter='\t', quoting=csv.QUOTE_NONE)
        for i, row in enumerate(reader, 1):
            if len(row) == expected_columns:
                data.append(row)
            elif len(row) > expected_columns and file_path.endswith("movie_characters_metadata.tsv"):
                # Fix for characters file: keep first 2 and last 4 fields
                fixed_row = row[:2] + row[-4:]
                if len(fixed_row) == expected_columns:
                    data.append(fixed_row)
                else:
                    errors.append((i, len(row), row))
            else:
                errors.append((i, len(row), row))
    return pd.DataFrame(data), errors

# Load files
movie_titles, errors_titles = read_tsv_manual(base_path + "movie_titles_metadata.tsv", 6)
characters, errors_characters = read_tsv_manual(base_path + "movie_characters_metadata.tsv", 6)
lines, errors_lines = read_tsv_manual(base_path + "movie_lines.tsv", 5)
conversations, errors_conversations = read_tsv_manual(base_path + "movie_conversations.tsv", 4)

# Assign column names
movie_titles.columns = ["movieID", "title", "year", "rating", "votes", "genres"]
characters.columns = ["characterID", "name", "movieID", "movie_title", "gender", "position"]
lines.columns = ["lineID", "characterID", "movieID", "character_name", "text"]
conversations.columns = ["characterID1", "characterID2", "movieID", "utterances"]

# Print column counts to verify
print("movie_titles columns:", movie_titles.shape[1])
print("characters columns:", characters.shape[1])
print("lines columns:", lines.shape[1])
print("conversations columns:", conversations.shape[1])

# Print errors
if errors_titles:
    print("\nErrors in movie_titles_metadata.tsv:")
    for line_num, field_count, row in errors_titles:
        print(f"Line {line_num}: Expected 6 fields, got {field_count}: {row}")
if errors_characters:
    print("\nErrors in movie_characters_metadata.tsv:")
    for line_num, field_count, row in errors_characters:
        print(f"Line {line_num}: Expected 6 fields, got {field_count}: {row}")
if errors_lines:
    print("\nErrors in movie_lines.tsv:")
    for line_num, field_count, row in errors_lines:
        print(f"Line {line_num}: Expected 5 fields, got {field_count}: {row}")
if errors_conversations:
    print("\nErrors in movie_conversations.tsv:")
    for line_num, field_count, row in errors_conversations:
        print(f"Line {line_num}: Expected 4 fields, got {field_count}: {row}")

# Print first few rows to inspect
print("\nmovie_titles head:\n", movie_titles.head())
print("\ncharacters head:\n", characters.head())
print("\nlines head:\n", lines.head())
print("\nconversations head:\n", conversations.head())

movie_titles columns: 6
characters columns: 6
lines columns: 5
conversations columns: 4

Errors in movie_lines.tsv:
Line 32774: Expected 5 fields, got 7: ['L229891', 'u1036', 'm68', 'ALEXANDER', "By Grabthar's Hammer this is true. 159", 'NT. LIVING ROOM - SOMEWHERE - NIGHT', '159']
Line 32837: Expected 5 fields, got 6: ['L229706', 'u1042', 'm68', 'JASON', 'BRANDON!', 'TIME TO GO!']
Line 32876: Expected 5 fields, got 6: ['L229857', 'u1041', 'm68', 'GWEN', 'All systems are working Commander. ~ -cc', "PINK) -' C -"]
Line 33069: Expected 5 fields, got 6: ['L229881', 'u1049', 'm68', 'TOMMY', 'I see them!  I see them! RD STREET', 'PASADENA 57']
Line 33071: Expected 5 fields, got 6: ['L229801', 'u1041', 'm68', 'GWEN', "What are you doing? What are thev doino? ~7C INT. SARRIS' SHIP", 'h37C']
Line 36207: Expected 5 fields, got 6: ['L237881', 'u1117', 'm73', 'REDBEARD', 'It would have been a beautiful bridge John. I never noticed before occupied with other business I', "suppose...  ...never real

In [22]:
movie_titles.columns = ['movie_id', 'title', 'year', 'imdb_rating', 'imdb_votes', 'genres']
characters.columns = ['character_id', 'character_name', 'movie_id', 'movie_title', 'gender', 'credit_position']
lines.columns = ['line_id', 'character_id', 'movie_id', 'character_name', 'text']
conversations.columns = ['char1_id', 'char2_id', 'movie_id', 'utterance_ids']


In [23]:
# Merge lines with character metadata
lines_chars = pd.merge(lines, characters[['character_id', 'gender']], on='character_id', how='left')

# Merge with movie metadata
quotes_with_metadata = pd.merge(lines_chars, movie_titles, on='movie_id', how='left')

# Final useful columns
quotes_df = quotes_with_metadata[['text', 'character_name', 'title', 'year', 'genres', 'gender']]


In [24]:
quotes_df

Unnamed: 0,text,character_name,title,year,genres,gender
0,They do not!,BIANCA,10 things i hate about you,1999,['comedy' 'romance'],f
1,They do to!,CAMERON,10 things i hate about you,1999,['comedy' 'romance'],m
2,I hope so.,BIANCA,10 things i hate about you,1999,['comedy' 'romance'],f
3,She okay?,CAMERON,10 things i hate about you,1999,['comedy' 'romance'],m
4,Let's go.,BIANCA,10 things i hate about you,1999,['comedy' 'romance'],f
...,...,...,...,...,...,...
304538,Lord Chelmsford seems to want me to stay back ...,DURNFORD,zulu dawn,1979,['action' 'adventure' 'drama' 'history' 'war'],?
304539,I'm to take the Sikali with the main column to...,VEREKER,zulu dawn,1979,['action' 'adventure' 'drama' 'history' 'war'],?
304540,Your orders Mr Vereker?,DURNFORD,zulu dawn,1979,['action' 'adventure' 'drama' 'history' 'war'],?
304541,Good ones yes Mr Vereker. Gentlemen who can ri...,DURNFORD,zulu dawn,1979,['action' 'adventure' 'drama' 'history' 'war'],?


In [25]:
def clean_quote(text):
    return text.strip().lower().replace('"', '').replace("'", "").replace("?", "").replace("!", "")

quotes_df['clean_text'] = quotes_df['text'].apply(clean_quote)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  quotes_df['clean_text'] = quotes_df['text'].apply(clean_quote)


In [26]:
quotes_df

Unnamed: 0,text,character_name,title,year,genres,gender,clean_text
0,They do not!,BIANCA,10 things i hate about you,1999,['comedy' 'romance'],f,they do not
1,They do to!,CAMERON,10 things i hate about you,1999,['comedy' 'romance'],m,they do to
2,I hope so.,BIANCA,10 things i hate about you,1999,['comedy' 'romance'],f,i hope so.
3,She okay?,CAMERON,10 things i hate about you,1999,['comedy' 'romance'],m,she okay
4,Let's go.,BIANCA,10 things i hate about you,1999,['comedy' 'romance'],f,lets go.
...,...,...,...,...,...,...,...
304538,Lord Chelmsford seems to want me to stay back ...,DURNFORD,zulu dawn,1979,['action' 'adventure' 'drama' 'history' 'war'],?,lord chelmsford seems to want me to stay back ...
304539,I'm to take the Sikali with the main column to...,VEREKER,zulu dawn,1979,['action' 'adventure' 'drama' 'history' 'war'],?,im to take the sikali with the main column to ...
304540,Your orders Mr Vereker?,DURNFORD,zulu dawn,1979,['action' 'adventure' 'drama' 'history' 'war'],?,your orders mr vereker
304541,Good ones yes Mr Vereker. Gentlemen who can ri...,DURNFORD,zulu dawn,1979,['action' 'adventure' 'drama' 'history' 'war'],?,good ones yes mr vereker. gentlemen who can ri...


In [27]:
df = quotes_df.drop('clean_text', axis=1)
df.to_csv("cleaned_movie_quotes.csv", index=False)
