# Conversational Recommender System : : Knowledge Graph

Mount your drive with redial_dataset.zip and add its location, then just run all the blocks

In [1]:
import zipfile

dataset_file_location = './data/input/redial_dataset.zip'

with zipfile.ZipFile( dataset_file_location, 'r') as z:
    z.extractall()

print("Item Loaded")

Item Loaded


# Preprocessing

In [2]:
%pip install jsonlines pandas

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
import json

# Function to read a JSONL file and return a list of dictionaries
def read_jsonl(file_path):
    data = []
    with open(file_path, 'r') as file:
        for line in file:
            data.append(json.loads(line))
    return data

# Function to write a list of dictionaries to a JSONL file
def write_jsonl(file_path, data):
    with open(file_path, 'w') as file:
        for item in data:
            file.write(json.dumps(item) + '\n')

# Input file paths
file1_path = './test_data.jsonl'
file2_path = './train_data.jsonl'

# Read data from both JSONL files
data_from_file1 = read_jsonl(file1_path)
data_from_file2 = read_jsonl(file2_path)

# Combine the data from both files
combined_data = data_from_file1 + data_from_file2

# Output file path for the combined data
output_file_path = 'true_train.jsonl'

# Write the combined data to the output JSONL file
write_jsonl(output_file_path, combined_data)


# Movie Extraction - Subjects

movies are directly given in movies_mentions.csv so no need to do much

In [4]:
import json

# Assuming extracted_data is a dictionary containing the data from the JSON Lines file
extracted_data = {}

# Read data from the JSON Lines file with explicit encoding
with open('./data/generated/movies_with_year.jsonl', 'r', encoding='utf-8') as file:
    for line in file:
        movie_info = json.loads(line)
        movie_id = len(extracted_data) + 1  # You can modify this based on your actual data structure
        extracted_data[movie_id] = movie_info

# Initialize the movie_names_data array
movie_names_data = []

# Modify the code to create the desired list
for movie_id, movie_info in extracted_data.items():
    title = movie_info['name']
    release_year = movie_info['year']
    movie_names_data.append({"title": title, "release_year": release_year})

# Print or use the modified list as needed
print("Length of movie_names_data:", len(movie_names_data))


Length of movie_names_data: 6629


# Text Extraction - Objects and relations

In [5]:
# Initialize a list to store the extracted data
messages_text = []

In [6]:
# Open the JSON lines file and read it line by line
with open('./data/generated/true_train.jsonl', 'r') as json_file:
    for line in json_file:
        # Load and parse each JSON object
        entry = json.loads(line)

        # Check if the entry contains the required fields
        if "conversationId" in entry and "messages" in entry:
            conversation_id = entry["conversationId"]
            messages = [message["text"] for message in entry["messages"]]

            # Create a dictionary for the conversation data
            conversation_data = {
                "conversationId": conversation_id,
                "messages": messages
            }

            # Append the conversation data to the list
            messages_text.append(conversation_data)

In [7]:
# Write the extracted data to an output .jsonl file
with open('./data/generated/messages_text_data.jsonl', 'w') as output_file:
    for conversation_data in messages_text:
        output_file.write(json.dumps(conversation_data) + '\n')

In [8]:
import json

# Initialize a list to store the extracted data
conversation_data_list = []

In [9]:
# Open the JSON lines file and read it line by line
with open('./data/generated/true_train.jsonl', 'r') as json_file:
    for line in json_file:
        # Load and parse each JSON object
        entry = json.loads(line)

        # Check if the entry contains the required fields
        if "conversationId" in entry and "messages" in entry:
            conversation_id = entry["conversationId"]
            messages = entry["messages"]

            # Initialize variables to store concatenated messages
            concatenated_messages = []
            current_sender_id = ""
            current_message = ""

            # Iterate through messages
            for message in messages:
                sender_id = message['senderWorkerId']
                text = message['text']

                # Check if senderWorkerID is the same as the previous message
                if sender_id == current_sender_id:
                    # Concatenate the messages
                    current_message += " " + text
                else:
                    concatenated_messages.append(current_message)
                    current_message = text


                # Start a new concatenated message
                current_sender_id = sender_id

            # Last message should be added
            concatenated_messages.append(current_message)

            # Create a dictionary for the conversation data
            conversation_data = {
                "conversationId": conversation_id,
                "messages": concatenated_messages
            }

            # Append the conversation data to the list
            conversation_data_list.append(conversation_data)

In [10]:
conversation_data_list[1]

{'conversationId': '20041',
 'messages': ['',
  'Hello!',
  'Hello!',
  'What kind of movies do you like?',
  'I am looking for a movie recommendation.   When I was younger I really enjoyed the @77161',
  'Oh, you like scary movies? I recently watched @204334',
  'I also enjoyed watching @132562',
  'It was really good for a new &quot;scary movie&quot;',
  'I do enjoy some of the newer horror movies that I have seen as well.',
  'I heard that @205430 is good. It is still in theaters though.',
  'I really liked the movie @125431',
  'Me, too! It was really creepy, but I thought it was good!',
  'Or @118338 I saw while in theaters, this was a very good movie.  It had me on the edge of my seat for the whole show.',
  "I'm not sure if I saw that one, I'll have to check into it. Sounds familiar, but not sure. Thank you for your suggestions!",
  'Are there any comedies that you would suggest?',
  'Sure! I like comedies a lot. I like movies like @175203 and @111776 , but I also like @187061 a

In [11]:
# Write the extracted data to an output .jsonl file
with open('./data/generated/output.jsonl', 'w') as output_file:
    for conversation_data in conversation_data_list:
        output_file.write(json.dumps(conversation_data) + '\n')

Can Be further Processed for movie to movie relation

## Objects

### People

In [12]:
# Function to read a JSONL file and return a list of dictionaries

relations_people_data = []

def read_jsonl(file_path):
    with open(file_path, 'r') as file:
        for line in file:
            relations_people_data.append(json.loads(line))
    return relations_people_data

read_jsonl("./bin/input/filtered_actor_names.jsonl")

[{'391': ['Alec Baldwin']},
 {'395': []},
 {'397': []},
 {'405': ['Drew Barrymore']},
 {'407': []},
 {'411': []},
 {'415': ['New Orleans']},
 {'418': ['Quentin Tarantino', 'Stephen King']},
 {'419': ['Will Smith', 'Tom Cruise', 'Jeff Bridges']},
 {'426': ['Kevin Hart']},
 {'450': []},
 {'455': []},
 {'458': ['Kevin Hart', 'Will Ferrel', 'Ice Cube', 'Mark Wahlberg']},
 {'459': ['Martin Lawrence', 'Kevin Hart', 'Adam Sandler', 'Damon Wayans']},
 {'461': []},
 {'462': ['Jack Black', 'Jim Carey']},
 {'463': ['Bill Murphy', 'Leslie Nielsen']},
 {'464': ['Tom Hardy', 'Reese Witherspoon', 'Tom Cruise', 'Chris Pine']},
 {'465': ['Bruce Willis', 'Samuel Jackson']},
 {'466': ['Mark Rylance']},
 {'467': []},
 {'468': ['Kevin Hart']},
 {'469': []},
 {'470': ['Jim Carrey']},
 {'471': ['Taylor Kitsch', 'Salma Hayek', 'Blake Lively']},
 {'474': []},
 {'475': ['Reese Witherspoon']},
 {'477': ['Michelle Rodriguez']},
 {'480': []},
 {'481': ['Alfred Hitchcock']},
 {'482': ['Queen Latifah', 'John Travolt

In [13]:
filtered_data = [item for item in relations_people_data if any(item.values())]

print(filtered_data)

[{'391': ['Alec Baldwin']}, {'405': ['Drew Barrymore']}, {'415': ['New Orleans']}, {'418': ['Quentin Tarantino', 'Stephen King']}, {'419': ['Will Smith', 'Tom Cruise', 'Jeff Bridges']}, {'426': ['Kevin Hart']}, {'458': ['Kevin Hart', 'Will Ferrel', 'Ice Cube', 'Mark Wahlberg']}, {'459': ['Martin Lawrence', 'Kevin Hart', 'Adam Sandler', 'Damon Wayans']}, {'462': ['Jack Black', 'Jim Carey']}, {'463': ['Bill Murphy', 'Leslie Nielsen']}, {'464': ['Tom Hardy', 'Reese Witherspoon', 'Tom Cruise', 'Chris Pine']}, {'465': ['Bruce Willis', 'Samuel Jackson']}, {'466': ['Mark Rylance']}, {'468': ['Kevin Hart']}, {'470': ['Jim Carrey']}, {'471': ['Taylor Kitsch', 'Salma Hayek', 'Blake Lively']}, {'475': ['Reese Witherspoon']}, {'477': ['Michelle Rodriguez']}, {'481': ['Alfred Hitchcock']}, {'482': ['Queen Latifah', 'John Travolta']}, {'484': ['Eddie Murphy']}, {'485': ['Catherine Zeta Jones', 'Queen Latifa']}, {'502': ['Kevin Hart', 'Kurt Russel', 'Will Farrell', 'Morgan Freeman']}, {'503': ['Billy

In [14]:
unique_names = set()

for entry in filtered_data:
    for key, value in entry.items():
        unique_names.update(value)

print(unique_names)

{'Julianne Hough', 'Daniel Day Lewis', 'Alan Turing', 'Barbra Streisand', 'Alice Springs', 'John Dillinger', 'The Snowman', 'Tom Hank', 'Woody Allen', 'Jim Cameron', 'Diane Lane', 'Ron Livingston', 'Nicole Kidman', 'Zoe Saldana', 'Taylor Schilling', 'Taylor Kitsch', 'James Cameron', 'Tim Robbins', 'Laura Linney', 'Andy Samberg', 'Minnie Driver', 'Goldie Hawn', 'Molly Ringwald', 'Alexander Skarsgard', 'Billy Wilder', 'Emile Hirsch', 'June Carter', 'Rebel Wilson', 'Cameron Crowe', 'Charlie Kelly', 'Judd Apatow', 'Nic Cage', 'Mark Wahlberg', 'Scarlett Johansson', 'Tom Arnold', 'Milla Jovovich', 'Jack Nicholsen', 'Denzel Washington', 'Max Steel', 'Sean Bean', 'Robert Mitchum', 'Paula Patton', 'Patrick Stewart', 'Rachael Adams', 'How Can', 'The Spy', 'The Martian', 'Tommy Lee Jones', 'Timothy Dalton', 'The Office', 'Martin Lawrence', 'Rosanne Barr', 'King George', 'Monty Python', 'Jake Gyllenhaal', 'Vince Vaughn', 'Jude Law', 'Gary Farmer', 'Billy Howle', 'Chris Rock', 'Matt Smith', 'Steve 

In [15]:
final_names = sorted(unique_names)
print(final_names)

['Aaron Swartz', 'Abbie Cornish', 'Abby Ryder Fortson', 'Abigail Breslin', 'Ace Ventura', 'Adam Driver', 'Adam Pascal', 'Adam Sander', 'Adam Sandler', 'Adam Scott', 'Adeel Akhtar', 'Adrian Scarborough', 'Adrien Brody', 'Agatha Christie', 'Al Capone', 'Al Pacino', 'Alan Alda', 'Alan Arkin', 'Alan Rickman', 'Alan Turing', 'Albert Brooks', 'Alec Baldwin', 'Alejandro Inarritu', 'Alex Kurtzman', 'Alex Roe', 'Alexander Skarsgard', 'Alexie Gilmore', 'Alfie Allen', 'Alfred Hitchcock', 'Alice Springs', 'Alicia Eve', 'All Super', 'Ally Sheedy', 'Amanda Bynes', 'Amanda Seyfried', 'Amanda Silver', 'American Beauty', 'Amy Adams', 'Amy Poehler', 'Amy Schumer', 'Andre Benjamin', 'Andrei Tarkovsky', 'Andrew Garfield', 'Andrew James Allen', 'Andrew Lincoln', 'Andy Garcia', 'Andy Kaufman', 'Andy Samberg', 'Ane Dahl Torp', 'Ang Lee', 'Angelina Jole', 'Angelina Jolie', 'Animated Movie', 'Anna Hathaway', 'Anna Kendrick', 'Anne Hathaway', 'Ansel Elgort', 'Anthony Hopkins', 'Anthony Rapp', 'Anton Yelchin', '

In [16]:
relations_people_data_tmdb = [] # from tmdb

def read_jsonl(file_path, encoding='utf-8'):
    with open(file_path, 'r', encoding=encoding) as file:
        for line in file:
            relations_people_data_tmdb.append(json.loads(line))
    return relations_people_data_tmdb

read_jsonl("./data/input/cast_directors_writers_final.jsonl")

[{'name': "Charlotte's Web",
  'year': '2006',
  'cast': ['Dakota Fanning',
   'Julia Roberts',
   'Steve Buscemi',
   'John Cleese',
   'Oprah Winfrey'],
  'director': 'Gary Winick',
  'writer': 'N/A'},
 {'name': 'Everything You Always Wanted to Know About Sex*',
  'year': '1972',
  'cast': ['Woody Allen',
   'John Carradine',
   'Lou Jacobi',
   'Louise Lasser',
   'Anthony Quayle'],
  'director': 'Woody Allen',
  'writer': 'N/A'},
 {'name': 'Rock-a-Doodle',
  'year': '1991',
  'cast': ['Glen Campbell',
   'Toby Scott Ganger',
   'Christopher Plummer',
   'Kathryn Holcomb',
   'Stan Ivar'],
  'director': 'Don Bluth',
  'writer': 'Don Bluth'},
 {'name': 'Too Late',
  'year': 'None',
  'cast': ['John Hawkes',
   'Vail Bloom',
   'Joanna Cassidy',
   'Jeff Fahey',
   'Robert Forster'],
  'director': 'Dennis Hauck',
  'writer': 'Dennis Hauck'},
 {'name': 'Law Abiding Citizen',
  'year': '2009',
  'cast': ['Jamie Foxx',
   'Gerard Butler',
   'Colm Meaney',
   'Bruce McGill',
   'Leslie B

In [17]:
# Create a set of standardized names
standard_names_set = set()
for name in final_names:
    standard_names_set.add(name.lower())  # Convert to lowercase for case-insensitive comparison

# Check cast, director, and writer fields
for movie in relations_people_data_tmdb:
    movie['cast'] = [name for name in movie['cast'] if any(standard_name in name.lower() for standard_name in standard_names_set)]
    
    # Check if 'director' is a single string or a list
    if isinstance(movie['director'], str):
        movie['director'] = [name for name in [movie['director']] if any(standard_name in name.lower() for standard_name in standard_names_set)]
    else:
        movie['director'] = [name for name in movie['director'] if any(standard_name in name.lower() for standard_name in standard_names_set)]
    
    # Check if 'writer' is a single string or a list
    if isinstance(movie['writer'], str):
        movie['writer'] = [name for name in [movie['writer']] if any(standard_name in name.lower() for standard_name in standard_names_set)]
    else:
        movie['writer'] = [name for name in movie['writer'] if any(standard_name in name.lower() for standard_name in standard_names_set)]

# Print or use the modified data as needed
print(relations_people_data_tmdb)

[{'name': "Charlotte's Web", 'year': '2006', 'cast': ['Dakota Fanning', 'Julia Roberts', 'Steve Buscemi', 'Oprah Winfrey'], 'director': [], 'writer': []}, {'name': 'Everything You Always Wanted to Know About Sex*', 'year': '1972', 'cast': ['Woody Allen'], 'director': ['Woody Allen'], 'writer': []}, {'name': 'Rock-a-Doodle', 'year': '1991', 'cast': ['Glen Campbell'], 'director': ['Don Bluth'], 'writer': ['Don Bluth']}, {'name': 'Too Late', 'year': 'None', 'cast': [], 'director': [], 'writer': []}, {'name': 'Law Abiding Citizen', 'year': '2009', 'cast': ['Jamie Foxx', 'Gerard Butler'], 'director': [], 'writer': []}, {'name': 'The Black Mask', 'year': '1935', 'cast': [], 'director': [], 'writer': []}, {'name': 'Club Dread', 'year': '2004', 'cast': [], 'director': [], 'writer': []}, {'name': 'Moana', 'year': 'None', 'cast': ['Dwayne Johnson'], 'director': [], 'writer': []}, {'name': 'Signs', 'year': '2002', 'cast': ['Mel Gibson', 'Joaquin Phoenix', 'Abigail Breslin'], 'director': ['M. Nigh

In [18]:
unique_names = set()

for movie in relations_people_data_tmdb:
    unique_names.update(movie['cast'])
    unique_names.update(movie['director'])
    unique_names.update(movie['writer'])

# Remove empty strings from the set
unique_names.discard('')

print(unique_names)

{'Julianne Hough', 'Barbra Streisand', 'Woody Allen', 'Ron Livingston', 'Diane Lane', 'Nicole Kidman', 'Taylor Schilling', 'James Cameron', 'Taylor Kitsch', 'Tim Robbins', 'Laura Linney', 'Brooke Shields', 'Andy Samberg', 'Minnie Driver', 'Goldie Hawn', 'Molly Ringwald', 'Billy Wilder', 'Emile Hirsch', 'Rebel Wilson', 'Cameron Crowe', 'Judd Apatow', 'Scarlett Johansson', 'Mark Wahlberg', 'Tom Arnold', 'Milla Jovovich', 'Denzel Washington', 'Sean Bean', 'Robert Mitchum', 'Byron Howard', 'Paula Patton', 'Patrick Stewart', 'Priscilla Lawson', 'Tommy Lee Jones', 'Timothy Dalton', 'Martin Lawrence', 'Jake Gyllenhaal', 'Vince Vaughn', 'Jude Law', 'Matt Smith', 'Gary Farmer', 'Chris Rock', 'Billy Howle', 'Sean Penn', 'Kevin Hart', 'Christina Hendricks', 'Kim Soo-ro', 'Nick Swardson', 'Donald Glover', 'Kevin Spacey', 'Sandra Bullock', 'Shane West', 'Lee Pace', 'Winston Churchill', 'Bette Davis', 'Alec Baldwin', 'Forest Whitaker', 'Amy Poehler', 'John Carpenter', 'Kim Novak', 'John Candy', 'Deb

In [19]:
print(unique_names)

{'Julianne Hough', 'Barbra Streisand', 'Woody Allen', 'Ron Livingston', 'Diane Lane', 'Nicole Kidman', 'Taylor Schilling', 'James Cameron', 'Taylor Kitsch', 'Tim Robbins', 'Laura Linney', 'Brooke Shields', 'Andy Samberg', 'Minnie Driver', 'Goldie Hawn', 'Molly Ringwald', 'Billy Wilder', 'Emile Hirsch', 'Rebel Wilson', 'Cameron Crowe', 'Judd Apatow', 'Scarlett Johansson', 'Mark Wahlberg', 'Tom Arnold', 'Milla Jovovich', 'Denzel Washington', 'Sean Bean', 'Robert Mitchum', 'Byron Howard', 'Paula Patton', 'Patrick Stewart', 'Priscilla Lawson', 'Tommy Lee Jones', 'Timothy Dalton', 'Martin Lawrence', 'Jake Gyllenhaal', 'Vince Vaughn', 'Jude Law', 'Matt Smith', 'Gary Farmer', 'Chris Rock', 'Billy Howle', 'Sean Penn', 'Kevin Hart', 'Christina Hendricks', 'Kim Soo-ro', 'Nick Swardson', 'Donald Glover', 'Kevin Spacey', 'Sandra Bullock', 'Shane West', 'Lee Pace', 'Winston Churchill', 'Bette Davis', 'Alec Baldwin', 'Forest Whitaker', 'Amy Poehler', 'John Carpenter', 'Kim Novak', 'John Candy', 'Deb

In [20]:
unique_people_list = list(unique_names)

Unique Years

In [21]:
unique_years = set()

for year in relations_people_data_tmdb:
    unique_years.add(year['year'])

# Remove empty strings from the set
unique_years.discard('')
unique_years.discard('None')

print(unique_years)

{'1932', '1916', '1968', '1920', '1915', '1987', '2014', '1988', '1967', '1981', '1911', '1971', '1996', '1929', '2016', '1993', '2012', '1927', '2009', '2019', '2010', '1977', '1948', '1947', '1958', '1978', '1952', '2015', '2003', '1931', '1910', '1970', '2008', '1936', '1955', '1992', '1990', '1939', '1950', '1934', '1951', '1912', '1933', '1902', '1938', '1979', '1989', '1937', '1991', '1964', '1945', '1930', '1976', '1984', '1980', '1974', '1921', '1935', '2011', '1972', '1956', '1999', '1946', '2004', '1975', '2001', '1940', '1941', '2002', '2006', '2018', '2000', '1994', '1959', '2013', '1969', '1986', '1925', '1963', '1961', '1913', '1957', '1919', '1998', '1966', '1960', '2017', '1928', '2007', '2005', '1942', '1926', '1954', '1924', '1982', '1983', '1953', '1944', '1965', '1973', '1949', '1985', '1923', '1922', '1943', '1962', '1997', '1995', '1914'}


### Genres

In [22]:
# Function to read a JSONL file and return a list of dictionaries
relations_data = []

def read_jsonl(file_path, encoding='utf-8'):
    with open(file_path, 'r', encoding=encoding) as file:
        for line in file:
            relations_data.append(json.loads(line))
    return relations_data

read_jsonl("./bin/generated/movies_with_year_genre.jsonl")

[{'name': "Charlotte's Web",
  'genres': ['Comedy', 'Family', 'Fantasy'],
  'year': '2006'},
 {'name': 'Everything You Always Wanted to Know About Sex*',
  'genres': ['Comedy'],
  'year': '1972'},
 {'name': 'Rock-a-Doodle',
  'genres': ['Comedy', 'Family', 'Animation', 'Fantasy', 'Music'],
  'year': '1991'},
 {'name': 'Too Late', 'genres': ['Drama', 'Crime', 'Mystery'], 'year': 'None'},
 {'name': 'Law Abiding Citizen',
  'genres': ['Drama', 'Crime', 'Thriller'],
  'year': '2009'},
 {'name': 'The Black Mask', 'genres': ['Crime'], 'year': '1935'},
 {'name': 'Club Dread', 'genres': ['Comedy', 'Horror'], 'year': '2004'},
 {'name': 'Moana',
  'genres': ['Adventure', 'Comedy', 'Family', 'Fantasy'],
  'year': 'None'},
 {'name': 'Signs',
  'genres': ['Drama', 'Thriller', 'Science Fiction', 'Mystery'],
  'year': '2002'},
 {'name': 'Beyond the Valley of the Dolls',
  'genres': ['Comedy', 'Drama', 'Music'],
  'year': '1970'},
 {'name': 'Beastly',
  'genres': ['Drama', 'Fantasy', 'Romance'],
  'ye

In [23]:
filtered_data = relations_data

print(filtered_data)

[{'name': "Charlotte's Web", 'genres': ['Comedy', 'Family', 'Fantasy'], 'year': '2006'}, {'name': 'Everything You Always Wanted to Know About Sex*', 'genres': ['Comedy'], 'year': '1972'}, {'name': 'Rock-a-Doodle', 'genres': ['Comedy', 'Family', 'Animation', 'Fantasy', 'Music'], 'year': '1991'}, {'name': 'Too Late', 'genres': ['Drama', 'Crime', 'Mystery'], 'year': 'None'}, {'name': 'Law Abiding Citizen', 'genres': ['Drama', 'Crime', 'Thriller'], 'year': '2009'}, {'name': 'The Black Mask', 'genres': ['Crime'], 'year': '1935'}, {'name': 'Club Dread', 'genres': ['Comedy', 'Horror'], 'year': '2004'}, {'name': 'Moana', 'genres': ['Adventure', 'Comedy', 'Family', 'Fantasy'], 'year': 'None'}, {'name': 'Signs', 'genres': ['Drama', 'Thriller', 'Science Fiction', 'Mystery'], 'year': '2002'}, {'name': 'Beyond the Valley of the Dolls', 'genres': ['Comedy', 'Drama', 'Music'], 'year': '1970'}, {'name': 'Beastly', 'genres': ['Drama', 'Fantasy', 'Romance'], 'year': '2011'}, {'name': 'Victoria', 'genres

In [24]:
unique_genres = set()

for movie in filtered_data:
    unique_genres.update(movie['genres'])

unique_genres_list = list(unique_genres)

print(unique_genres_list)


['Music', 'Animation', 'TV Movie', 'Mystery', 'Adventure', 'War', 'Fantasy', 'Crime', 'Drama', 'Family', 'Thriller', 'Action', 'Comedy', 'misc', 'Science Fiction', 'Documentary', 'Western', 'History', 'Horror', 'Romance']


## Relations

In [25]:
print(relations_people_data_tmdb[0])

{'name': "Charlotte's Web", 'year': '2006', 'cast': ['Dakota Fanning', 'Julia Roberts', 'Steve Buscemi', 'Oprah Winfrey'], 'director': [], 'writer': []}


In [26]:
relations_list = []

for movie_info in filtered_data:
    name = movie_info['name']
    genre = movie_info['genres']
    year = movie_info['year']

    temp_data = next((item for item in relations_people_data_tmdb if item['name'] == name and item['year']==year), {})
    cast = temp_data.get('cast', [])
    director = temp_data.get('director', [])
    writer = temp_data.get('writer', [])

    relations_list.append({
        'name': name,
        'genre': genre,
        'cast': cast,
        'director': director,
        'writer': writer,
        'year': year
    })

for movie_info in relations_list:
    if movie_info['name'] == "Insidious":
        print(movie_info)

{'name': 'Insidious', 'genre': ['Horror', 'Thriller'], 'cast': ['Patrick Wilson', 'Rose Byrne', 'Ty Simpkins'], 'director': ['James Wan'], 'writer': [], 'year': 'None'}
{'name': 'Insidious', 'genre': ['Horror', 'Thriller'], 'cast': ['Patrick Wilson', 'Rose Byrne', 'Ty Simpkins'], 'director': ['James Wan'], 'writer': [], 'year': '2011'}
{'name': 'Insidious', 'genre': ['Horror', 'Thriller'], 'cast': ['Patrick Wilson', 'Rose Byrne', 'Ty Simpkins'], 'director': ['James Wan'], 'writer': [], 'year': '2010'}


In [27]:
temp_data

{'name': 'Justice', 'year': '1914', 'cast': [], 'director': [], 'writer': []}

In [28]:
relations_list

[{'name': "Charlotte's Web",
  'genre': ['Comedy', 'Family', 'Fantasy'],
  'cast': ['Dakota Fanning',
   'Julia Roberts',
   'Steve Buscemi',
   'Oprah Winfrey'],
  'director': [],
  'writer': [],
  'year': '2006'},
 {'name': 'Everything You Always Wanted to Know About Sex*',
  'genre': ['Comedy'],
  'cast': ['Woody Allen'],
  'director': ['Woody Allen'],
  'writer': [],
  'year': '1972'},
 {'name': 'Rock-a-Doodle',
  'genre': ['Comedy', 'Family', 'Animation', 'Fantasy', 'Music'],
  'cast': ['Glen Campbell'],
  'director': ['Don Bluth'],
  'writer': ['Don Bluth'],
  'year': '1991'},
 {'name': 'Too Late',
  'genre': ['Drama', 'Crime', 'Mystery'],
  'cast': [],
  'director': [],
  'writer': [],
  'year': 'None'},
 {'name': 'Law Abiding Citizen',
  'genre': ['Drama', 'Crime', 'Thriller'],
  'cast': ['Jamie Foxx', 'Gerard Butler'],
  'director': [],
  'writer': [],
  'year': '2009'},
 {'name': 'The Black Mask',
  'genre': ['Crime'],
  'cast': [],
  'director': [],
  'writer': [],
  'year':

# Feeding to the graph

In [29]:
%pip install py2neo

Defaulting to user installation because normal site-packages is not writeableNote: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip





In [31]:
from py2neo import Graph, Node, Relationship

class MovieGraph:
    def __init__(self):
        self.graph = Graph("bolt://localhost:7687", user="neo4j", password="12345678")

    def create_movie_nodes(self, movie_data):
        for movie_info in movie_data:
            movie_node = Node("Movie", title=movie_info["title"], release_year=movie_info["release_year"])
            self.graph.create(movie_node)

    def create_genre_nodes(self, genre_data):
        for genre_name in genre_data:
            genre_node = Node("Genre", title=genre_name)
            self.graph.create(genre_node)

    def create_people_nodes(self, people_data):
        for person_name in people_data:
            person_node = Node("Person", name=person_name)
            self.graph.create(person_node)

    def create_year_nodes(self, year_data):
        for year in year_data:
            year_node = Node("Year", release_year=year)
            self.graph.create(year_node)

    def create_relationships(self, relationship_data):
        for movie_info in relationship_data:
            movie_node = self.graph.nodes.match(title=movie_info["name"], release_year=movie_info["year"]).first()

            if movie_node:
                # Create relationships for genres
                for genre in movie_info.get("genre", []):
                    genre_node = self.graph.nodes.match(title=genre).first()

                    if genre_node:
                        genre_relationship = Relationship(movie_node, "HAS_GENRE", genre_node)
                        self.graph.create(genre_relationship)
                        
                        # Bidirectional relationship
                        reverse_genre_relationship = Relationship(genre_node, "GENRE_OF", movie_node)
                        self.graph.create(reverse_genre_relationship)

                # Create relationships for cast
                for cast_name in movie_info.get("cast", []):
                    cast_node = self.graph.nodes.match(name=cast_name).first()

                    if cast_node:
                        cast_relationship = Relationship(movie_node, "HAS_CAST", cast_node)
                        self.graph.create(cast_relationship)

                        # Bidirectional relationship
                        reverse_cast_relationship = Relationship(cast_node, "CAST_IN", movie_node)
                        self.graph.create(reverse_cast_relationship)

                # Create relationships for directors
                for director_name in movie_info.get("director", []):
                    director_node = self.graph.nodes.match(name=director_name).first()

                    if director_node:
                        director_relationship = Relationship(movie_node, "HAS_DIRECTOR", director_node)
                        self.graph.create(director_relationship)

                        # Bidirectional relationship
                        reverse_director_relationship = Relationship(director_node, "DIRECTED", movie_node)
                        self.graph.create(reverse_director_relationship)

                # Create relationships for writers
                for writer_name in movie_info.get("writer", []):
                    writer_node = self.graph.nodes.match(name=writer_name).first()

                    if writer_node:
                        writer_relationship = Relationship(movie_node, "HAS_WRITER", writer_node)
                        self.graph.create(writer_relationship)

                        # Bidirectional relationship
                        reverse_writer_relationship = Relationship(writer_node, "WRITTEN_BY", movie_node)
                        self.graph.create(reverse_writer_relationship)
                
                # Create relationships for years
                for year in movie_info.get("year", []):
                    year_node = self.graph.nodes.match(release_year=year).first()

                    if year_node:
                        year_relationship = Relationship(movie_node, "RELEASED_IN_YEAR", year_node)
                        self.graph.create(year_relationship)

                        # Bidirectional relationship
                        reverse_year_relationship = Relationship(year_node, "YEAR_OF_RELEASE", movie_node)
                        self.graph.create(reverse_year_relationship)


genre_data = unique_genres_list

people_data = unique_people_list

year_data = unique_years

movie_graph = MovieGraph()
movie_graph.create_movie_nodes(movie_names_data)
movie_graph.create_genre_nodes(genre_data)
movie_graph.create_people_nodes(people_data)
movie_graph.create_year_nodes(year_data)

In [32]:
relationship_data = relations_list
movie_graph.create_relationships(relationship_data)

# Playground

In [9]:
import pandas as pd

# Assuming your CSV file has columns: ID, Movie Name, No of Mentions
file_path = './movies_with_mentions.csv'

# Read the CSV file into a pandas DataFrame
df = pd.read_csv(file_path)

# Check the actual column names in your DataFrame
print("Column names:", df.columns)

# Check for duplicate entries based on the correct column name
duplicates = df[df.duplicated('movieName', keep=False)]

# Display the duplicate entries
if not duplicates.empty:
    print("Duplicate Movies:")
    print(duplicates)
    
    # Remove the duplicate entries from the DataFrame
    df = df.drop_duplicates('movieName', keep='first')
    
    print("Duplicate movies removed.")
    
    # Save the modified DataFrame back to the CSV file
    df.to_csv(file_path, index=False)
    
    print("CSV file updated.")
else:
    print("No duplicate movies found.")


Column names: Index(['movieId', 'movieName', 'nbMentions'], dtype='object')
No duplicate movies found.


In [18]:
import json

def find_duplicates(json_file):
    # List to store duplicate entries
    duplicates = []

    # Create a dictionary to store entries based on name and year
    entries_dict = {}

    with open(json_file, 'r') as file:
        # Iterate through each line in the file
        for line in file:
            try:
                # Parse each line as JSON
                entry_data = json.loads(line)
                
                # Extract relevant information
                entry_id = entry_data.get("id")
                name = entry_data.get("name")
                year = entry_data.get("year")

                # Create a unique key based on name and year
                key = f"{name}_{year}"

                # Check if the key is already in the dictionary
                if key in entries_dict:
                    # Add the current entry and the existing entry to the duplicates list
                    duplicates.append((entry_id, entries_dict[key]))
                else:
                    # Add the entry to the dictionary
                    entries_dict[key] = entry_id

            except json.JSONDecodeError as e:
                print(f"Error decoding JSON: {e}")

    return duplicates

# Example usage
json_file_path = './movienames_year_cid.jsonl'
duplicates = find_duplicates(json_file_path)

print(duplicates)

# Print the duplicate entries
for duplicate in duplicates:
    print(f"Duplicate entries: {duplicate}")


[(None, None), (None, None), (None, None), (None, None), (None, None), (None, None), (None, None), (None, None), (None, None), (None, None), (None, None), (None, None), (None, None), (None, None), (None, None), (None, None), (None, None), (None, None), (None, None), (None, None), (None, None), (None, None), (None, None), (None, None), (None, None), (None, None), (None, None), (None, None), (None, None), (None, None), (None, None), (None, None), (None, None), (None, None), (None, None), (None, None), (None, None), (None, None), (None, None), (None, None), (None, None), (None, None), (None, None), (None, None), (None, None), (None, None), (None, None), (None, None), (None, None), (None, None), (None, None), (None, None), (None, None), (None, None), (None, None), (None, None), (None, None), (None, None), (None, None), (None, None), (None, None), (None, None), (None, None), (None, None), (None, None), (None, None), (None, None), (None, None), (None, None), (None, None), (None, None), (None

In [19]:
import json

def read_jsonl_to_dict(jsonl_file):
    data_dict = {}

    with open(jsonl_file, 'r') as file:
        for line in file:
            try:
                entry_data = json.loads(line)
                
                # Assuming each entry has a unique identifier as "id"
                entry_id = entry_data.get("id")

                # Add the entry to the dictionary using the entry_id as the key
                data_dict[entry_id] = entry_data

            except json.JSONDecodeError as e:
                print(f"Error decoding JSON: {e}")

    return data_dict

# Example usage
jsonl_file_path = './movienames_year_cid.jsonl'
entries_dict = read_jsonl_to_dict(jsonl_file_path)

# Print the dictionary
print(entries_dict)


{None: {'200018': {'name': 'The Force', 'year': None, 'conversationIds': ['19920']}}}


In [20]:
file_path = './movienames_year_cid.jsonl'

# Function to read a JSONL file and return a list of dictionaries
def read_jsonl(file_path):
    data = []
    with open(file_path, 'r') as file:
        for line in file:
            data.append(json.loads(line))
    return data

In [24]:
movieszz = read_jsonl(file_path)

{'111776': {'name': 'Super Troopers',
  'year': '2001',
  'conversationIds': ['20001',
   '20041',
   '20721',
   '20838',
   '20868',
   '21180',
   '21182',
   '21183',
   '21208',
   '21231',
   '21728',
   '21825',
   '21829',
   '21834',
   '21836',
   '21838',
   '21840',
   '21841',
   '21842',
   '21845',
   '21847',
   '21852',
   '21867',
   '21869',
   '21872',
   '21883',
   '21924',
   '21992',
   '22190',
   '22440',
   '22441',
   '22446',
   '22447',
   '22452',
   '22453',
   '22454',
   '22455',
   '22459',
   '22480',
   '22482',
   '22580',
   '22619',
   '22680',
   '22683',
   '22694',
   '22696',
   '22706',
   '22707',
   '22708',
   '22709',
   '22710',
   '471',
   '871',
   '927',
   '931',
   '1057',
   '3393',
   '3775',
   '3779',
   '3824',
   '5505',
   '5802',
   '5804',
   '5943',
   '6081',
   '6161',
   '6322',
   '7323',
   '7540',
   '8132',
   '9757',
   '10580',
   '10806',
   '11017',
   '12080',
   '12164',
   '12201',
   '12563',
   '13704',
 

In [40]:
unique_values = set()

for i in movieszz:
    for x in i:
        val = str(i[x]['name']) + "$" + str(i[x]['year'])
        unique_values.add(val)




In [48]:
jsonl_file_path = './bin/generated/movies_with_year.jsonl'

with open(jsonl_file_path, 'w') as file:
    for unique_value in unique_values:
        movie_name, movie_year = unique_value.split('$')
        movie_details = {"name": movie_name, "year": movie_year}  # Convert year to integer if needed
        json.dump(movie_details, file)
        file.write('\n')
        

//Code to get Actors Directos and Cast