<a href="https://colab.research.google.com/github/slin35/RobotProducer/blob/main/lab1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [59]:
!pip install transformers



In [74]:
from transformers import pipeline

# Question-answering pipeline shared by every get_title() call; it is created
# on first use because constructing it is by far the most expensive step.
_question_answerer = None


# predict the title of the movie
def get_title(overview: str):
  """Predict a movie title by asking a question-answering model about its overview.

  The overview is passed as the context, together with the fixed question
  'What is the name of the movie ?', to a `transformers`
  'question-answering' pipeline.

  Args:
    overview: text describing the movie; used as the context for the question.

  Returns:
    The 'answer' entry of the pipeline's result.
  """
  global _question_answerer

  # Build the pipeline once and reuse it: re-creating it on every call
  # reloads the model each time, which dominates the run time when this
  # function is applied to many overviews.
  if _question_answerer is None:
    _question_answerer = pipeline('question-answering')

  answer = _question_answerer({
    'question': 'What is the name of the movie ?',
    'context': overview
  })

  return answer['answer']


In [75]:
import pandas as pd
import json
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import numpy as np
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
import os
import collections
from datetime import datetime

In [76]:
def _extract_names(raw: str, job: str = None) -> list:
  """Return the 'name' values from a JSON-encoded list of records.

  Args:
    raw: JSON text holding a list of objects.
    job: when given, only records whose 'job' equals this value are kept.

  Returns:
    The names in their original order. Records without a 'name' key are
    skipped, so the result only ever contains names.
  """
  names = []
  for record in json.loads(raw):
    if 'name' not in record:
      continue
    if job is not None and record.get('job') != job:
      continue
    names.append(record['name'])
  return names


def get_data(movies_path: str = './tmdb_5000_movies.csv',
             credits_path: str = './tmdb_5000_credits.csv') -> pd.DataFrame:
  """Load the movies and credits CSV files and merge them into one clean table.

  Args:
    movies_path: CSV with at least the columns 'genres', 'title', 'overview'
      and 'id'.
    credits_path: CSV with at least the columns 'movie_id', 'cast' and 'crew'.

  Returns:
    A DataFrame with the columns 'id', 'title', 'overview', 'genres', 'cast'
    and 'crew'. 'genres' and 'cast' hold lists of names, and 'crew' holds only
    the names of crew members whose job is 'Director'. Rows with a missing
    value, or with an empty genres/cast/crew list, are removed. The row labels
    produced by the merge are kept (the index is not reset).
  """
  # import data
  movies = pd.read_csv(movies_path)[['genres', 'title', 'overview', 'id']]
  # the title is taken from the movies file, so the credits copy is not needed
  credit_rows = pd.read_csv(credits_path)[['movie_id', 'cast', 'crew']]

  data = pd.merge(movies, credit_rows, left_on='id', right_on='movie_id', how='inner')
  data = data[['id', 'title', 'overview', 'genres', 'cast', 'crew']]

  # get rid of rows with missing values *before* parsing, so that a missing
  # JSON cell drops the row instead of crashing json.loads
  data = data.replace("", float("NaN")).dropna()

  # cleanup genres, cast and crew: JSON text -> list of names
  # (assign() builds a new frame rather than writing into a slice)
  data = data.assign(
    genres=data['genres'].map(_extract_names),
    cast=data['cast'].map(_extract_names),
    crew=data['crew'].map(lambda raw: _extract_names(raw, job='Director')),
  )

  # get rid of rows with empty lists in genres, cast, crew
  for column in ('genres', 'cast', 'crew'):
    data = data[~data[column].str.len().eq(0)]

  return data

In [77]:
def get_unique(data: pd.DataFrame, label: str):
  """Collect the distinct items found in a column whose cells are lists.

  Args:
    data: table to read from; `data[label]` must yield one iterable per row.
    label: name of the column to scan.

  Returns:
    A list of the distinct items, in order of first appearance.
  """
  seen = set()        # hashable items already emitted (constant-time lookup)
  seen_unhashable = []  # fallback for items that cannot go in a set
  result = []

  for row in data[label]:
    for item in row:
      try:
        if item in seen:
          continue
        seen.add(item)
      except TypeError:
        # unhashable item (e.g. a list): fall back to a linear scan
        if item in seen_unhashable:
          continue
        seen_unhashable.append(item)
      result.append(item)

  return result

In [78]:
# load the cleaned, merged movie table
data = get_data()

# unique genres, unique directors (stored in the 'crew' column) and unique
# cast members, collected in that order
genres, directors, cast = (
  get_unique(data, column) for column in ('genres', 'crew', 'cast')
)

In [81]:
# preview the first rows of the cleaned table
data.head()

Unnamed: 0,id,title,overview,genres,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[Sam Worthington, Zoe Saldana, Sigourney Weave...",[James Cameron]
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]","[Johnny Depp, Orlando Bloom, Keira Knightley, ...",[Gore Verbinski]
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[Action, Adventure, Crime]","[Daniel Craig, Christoph Waltz, Léa Seydoux, R...",[Sam Mendes]
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[Action, Crime, Drama, Thriller]","[Christian Bale, Michael Caine, Gary Oldman, A...",[Christopher Nolan]
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[Action, Adventure, Science Fiction]","[Taylor Kitsch, Lynn Collins, Samantha Morton,...",[Andrew Stanton]
