In [251]:
import numpy as np
import pandas as pd

In [252]:
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

# Data Cleaning

In [253]:
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


In [254]:
credits.head(1)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [255]:
new_movies = pd.merge(movies, credits, on='title') # merge on the basis of title
new_movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [256]:
print(movies.shape, credits.shape, new_movies.shape) # since title is not added twice, number of columns reduced by 1

(4803, 20) (4803, 4) (4809, 23)


In [257]:
# Need to make content based recommender system
# Will be done by creating tags
# Remove the columns that will not help in creating tags
# Not using numeric columns as tags cannot be created??
new_movies.columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count', 'movie_id', 'cast', 'crew'],
      dtype='object')

In [258]:
# Columns to keep
# genres, id, keywords, title, overview, cast, crew
# columns to remove
# budget, original_language(skewed towards english language), original_title(names can be other than english), 

In [259]:
new_movies = new_movies[['id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]
new_movies.head()

Unnamed: 0,id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...","[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...","[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...","[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 818, ""name"": ""based on novel""}, {""id"":...","[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [260]:
# new df will be with 3 columns - movies_id, title, tags
# tags will come by merging overview, genres, keywords, cast, crew
# take overview column and then add all generes and keywords at the last of it
# similarly add top 3 cast and director from crew to overview and make a big paragraph out of it

# Data Preprocessing

### Check null values and duplicates

In [261]:
new_movies.isnull().sum() # 3 rows in overview are empty

id          0
title       0
overview    3
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [262]:
new_movies.dropna(inplace=True)
new_movies.isnull().sum()

id          0
title       0
overview    0
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [263]:
new_movies.duplicated().sum() # shows no duplicates

0

### Edit format of entries in columns

In [264]:
import json

In [265]:
new_movies['genres'][0] #list of dicts, need to extract just genres in a list

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [266]:
### Simple Code
# genres = []
# for j in new_movies.genres:
#     temp = eval(j)
#     li = []
#     for i in temp:
#         li.append(i['name'])
#     genres.append(li)
# new_movies['genres'] = genres

In [267]:
# Function to extract particular values from a list of dictionary
# only one list as input and not the entire column
def convert_and_extract(list_of_dicts, key):
    L = []
    for dictionary in eval(list_of_dicts): ## Convert to list 
        L.append(dictionary[key])
    return L

In [268]:
# apply to each column, one by one each list will be given as input to the function
new_movies['genres'] = new_movies['genres'].apply(convert_and_extract, key = 'name') # key is the second input

In [269]:
new_movies.head()

Unnamed: 0,id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[Action, Adventure, Crime]","[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...","[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[Action, Crime, Drama, Thriller]","[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...","[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[Action, Adventure, Science Fiction]","[{""id"": 818, ""name"": ""based on novel""}, {""id"":...","[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [270]:
# FOR KEYWORDS COLUMN
new_movies['keywords'] = new_movies['keywords'].apply(convert_and_extract, key = 'name')
new_movies.head()

Unnamed: 0,id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...","[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...","[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...","[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [271]:
# For CAST COLUMN
new_movies['cast'] = new_movies['cast'].apply(convert_and_extract, key = 'character')
new_movies.head()

Unnamed: 0,id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Jake Sully, Neytiri, Dr. Grace Augustine, Col...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[Captain Jack Sparrow, Will Turner, Elizabeth ...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...","[James Bond, Blofeld, Madeleine, M, Lucia, Q, ...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...","[Bruce Wayne / Batman, Alfred Pennyworth, Jame...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...","[John Carter, Dejah Thoris, Sola, Tars Tarkas,...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [272]:
# Take only first three entries
new_movies['cast'] = new_movies['cast'].apply(lambda x: x[:3]) # here each column is the input in the lamda function i.e. x will be entries in the columns and with .apply, one by one each entry is given as input

In [273]:
# For CREW COLUMN
# We want the extract director
# Extract name where job is director
# Each dictionary (in one list) will be having a key job, find the dictionary with job as director and then from that extract name

In [274]:
# For CREW COLUMN
# If the values of one key is equal to what we desire, then extract some other desired value from the dictionary
def convert_and_extract_specific(list_of_dicts, check_key, extract_key, desired_value):
    L = []
    for dictionary in eval(list_of_dicts): ## Convert to list 
        if dictionary[check_key] == desired_value:
            L.append(dictionary[extract_key])
            break # only one dirctor
    return L

In [275]:
# For CREW COLUMN
new_movies['crew'] = new_movies['crew'].apply(convert_and_extract_specific,  args=('job', 'name', 'Director')) # check_key = 'job', extract_key = 'name', desired_value = 'Director'
new_movies.head()

Unnamed: 0,id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Jake Sully, Neytiri, Dr. Grace Augustine]",[James Cameron]
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[Captain Jack Sparrow, Will Turner, Elizabeth ...",[Gore Verbinski]
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...","[James Bond, Blofeld, Madeleine]",[Sam Mendes]
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...","[Bruce Wayne / Batman, Alfred Pennyworth, Jame...",[Christopher Nolan]
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...","[John Carter, Dejah Thoris, Sola]",[Andrew Stanton]


In [276]:
# Change overview entries to list with every word as one entry in the list for concatenating other columns in it
# movies.overview[0].split()
new_movies.overview = new_movies.overview.apply(lambda x: x.split())
new_movies.head()

Unnamed: 0,id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Jake Sully, Neytiri, Dr. Grace Augustine]",[James Cameron]
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[Captain Jack Sparrow, Will Turner, Elizabeth ...",[Gore Verbinski]
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send...","[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...","[James Bond, Blofeld, Madeleine]",[Sam Mendes]
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney...","[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...","[Bruce Wayne / Batman, Alfred Pennyworth, Jame...",[Christopher Nolan]
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili...","[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...","[John Carter, Dejah Thoris, Sola]",[Andrew Stanton]


In [277]:
def collapse(Li):
    L1 = []
    for i in Li:
        L1.append(i.replace(" ",""))
    return L1

In [278]:
# Need to remove white spaces between words, e.g model might be confused when two directors have same first name as it will create it as tag
new_movies['genres'] = new_movies['genres'].apply(collapse)
new_movies['keywords'] = new_movies['keywords'].apply(collapse)
new_movies['cast'] = new_movies['cast'].apply(collapse)
new_movies['crew'] = new_movies['crew'].apply(collapse)
new_movies.head()

Unnamed: 0,id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[JakeSully, Neytiri, Dr.GraceAugustine]",[JamesCameron]
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[CaptainJackSparrow, WillTurner, ElizabethSwann]",[GoreVerbinski]
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send...","[Action, Adventure, Crime]","[spy, basedonnovel, secretagent, sequel, mi6, ...","[JamesBond, Blofeld, Madeleine]",[SamMendes]
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney...","[Action, Crime, Drama, Thriller]","[dccomics, crimefighter, terrorist, secretiden...","[BruceWayne/Batman, AlfredPennyworth, JamesGor...",[ChristopherNolan]
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili...","[Action, Adventure, ScienceFiction]","[basedonnovel, mars, medallion, spacetravel, p...","[JohnCarter, DejahThoris, Sola]",[AndrewStanton]


In [279]:
# Concat all coulmns
new_movies['tags'] = new_movies['overview'] + new_movies['genres'] + new_movies['keywords'] + new_movies['cast'] + new_movies['crew']
new_movies.head()

Unnamed: 0,id,title,overview,genres,keywords,cast,crew,tags
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[JakeSully, Neytiri, Dr.GraceAugustine]",[JamesCameron],"[In, the, 22nd, century,, a, paraplegic, Marin..."
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[CaptainJackSparrow, WillTurner, ElizabethSwann]",[GoreVerbinski],"[Captain, Barbossa,, long, believed, to, be, d..."
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send...","[Action, Adventure, Crime]","[spy, basedonnovel, secretagent, sequel, mi6, ...","[JamesBond, Blofeld, Madeleine]",[SamMendes],"[A, cryptic, message, from, Bond’s, past, send..."
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney...","[Action, Crime, Drama, Thriller]","[dccomics, crimefighter, terrorist, secretiden...","[BruceWayne/Batman, AlfredPennyworth, JamesGor...",[ChristopherNolan],"[Following, the, death, of, District, Attorney..."
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili...","[Action, Adventure, ScienceFiction]","[basedonnovel, mars, medallion, spacetravel, p...","[JohnCarter, DejahThoris, Sola]",[AndrewStanton],"[John, Carter, is, a, war-weary,, former, mili..."


In [280]:
# Get a new df with movie_id, title and tags
new_df = new_movies[['id', 'title', 'tags']]
new_df.head()

Unnamed: 0,id,title,tags
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin..."
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d..."
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send..."
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney..."
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili..."


In [281]:
# Convert tags into a paragraph i.e. list to strings and then make lowercase for ease

# def list_to_string(s):
#     str1 = " "
#     return (str1.join(s))
# new_df['tags'] = new_df['tags'].apply(list_to_string)

# OR
new_df['tags'] = new_df['tags'].apply(lambda x: ' '.join(x))
new_df['tags'] = new_df['tags'].apply(lambda x: x.lower())

new_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x: ' '.join(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x: x.lower())


Unnamed: 0,id,title,tags
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believed to be dead, ha..."
2,206647,Spectre,a cryptic message from bond’s past sends him o...
3,49026,The Dark Knight Rises,following the death of district attorney harve...
4,49529,John Carter,"john carter is a war-weary, former military ca..."


# Natural Language Processing

In [282]:
# Apply stemming to the text
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer() # create a object
ps.stem('loving')

'love'

In [283]:
# Converts every word into its root word
def stem(text):
    y = []
    # string to list for stemming
    for i in text.split():
        text = ps.stem(i)
        y.append(text)
    # List to string
    string = " ".join(y)
    
    return string
stem('dancing')

'danc'

In [284]:
# new_df['tags'] = new_df['tags'].apply(stem)
# new_df.head()

In [285]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# Text preprocessing steps
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()

    # Remove punctuation and non-alphanumeric characters
    text = "".join(c for c in text if c.isalnum() or c.isspace())

    # Tokenization
    tokens = word_tokenize(text)

    # Remove stopwords
    stop_words = set(stopwords.words("english"))
    tokens = [token for token in tokens if token not in stop_words]

    # Stemming
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]

    # Join tokens back into a single string
    preprocessed_text = " ".join(tokens)

    return preprocessed_text

In [286]:
new_df['tags'] = new_df['tags'].apply(preprocess_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(preprocess_text)


In [287]:
new_df.head()

Unnamed: 0,id,title,tags
0,19995,Avatar,22nd centuri parapleg marin dispatch moon pand...
1,285,Pirates of the Caribbean: At World's End,captain barbossa long believ dead come back li...
2,206647,Spectre,cryptic messag bond past send trail uncov sini...
3,49026,The Dark Knight Rises,follow death district attorney harvey dent bat...
4,49529,John Carter,john carter warweari former militari captain w...


### Vectorization

In [288]:
from sklearn.feature_extraction.text import CountVectorizer

#### Bag of Words

In [289]:
# Combine all tags into one CORPUS and find n most repeated words and avoid stop words in english language
cv = CountVectorizer(max_features = 10000, stop_words = 'english')

In [290]:
# From the CORPUS, create a count for each word in each tag
vectors = cv.fit_transform(new_df['tags'])

In [291]:
vectors = vectors.toarray()
vectors
 # we get a sparse matrix as many words might be unique for each tags

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

#### Tf-dif

In [292]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfd = TfidfVectorizer(use_idf = True, lowercase = True, strip_accents='ascii',stop_words='english')

In [293]:
tf_vectors = tfd.fit_transform(new_df['tags'])

In [294]:
tf_vectors.toarray()
tf_vectors

<4806x35157 sparse matrix of type '<class 'numpy.float64'>'
	with 183061 stored elements in Compressed Sparse Row format>

# Defining Similarity

In [295]:
# Find Cosine similarity (inverse of Cosine distance, between 0 - 1, more value more similar)
from sklearn.metrics.pairwise import cosine_similarity
similarity = cosine_similarity(tf_vectors) # Distance/Similarity of one movie with every other movie

In [296]:
temp = similarity[0]

In [297]:
# Sorting the distances list while preserveing the index (for recommend(movie) function)
# Use this index(indices) to go in the new_def to get movie names
sort_values = sorted((enumerate(temp)), reverse = True) # enumerate helps to iterate while tracking index and sort on the basis of key
# need to sort on the basis of 2nd value of every tuple of the list
sorted((enumerate(temp)), reverse = True, key = lambda x:x[1])[1:6]

[(2409, 0.1538078014636834),
 (3608, 0.11530626717607635),
 (778, 0.11381570718706749),
 (539, 0.10870265165981756),
 (1204, 0.10811373889792175)]

In [298]:
# Getting index (for recommend(movie) function)
new_df[new_df['title'] == 'Batman Begins'].index[0]

119

In [299]:
def recommend(movie):
    # Find index of the movie to get appropriate vector in the similarity matrix
    movie_index = new_df[new_df['title'] == movie].index[0]
    distances = similarity[movie_index]
    sort_values = sorted((enumerate(distances)), reverse = True, key = lambda x:x[1])
    
    # get top x similar (1st one is the movie itself)
    for i in sort_values[1:21]:
        index = i[0]
        print(new_df.iloc[i[0]].title)
                    

In [300]:
recommend('Avengers: Age of Ultron')

The Avengers
Captain America: Civil War
Iron Man 2
Iron Man
Iron Man 3
Thor
Captain America: The Winter Soldier
X-Men Origins: Wolverine
The Wolverine
Fantastic Four
Man of Steel
X-Men: Apocalypse
The Helix... Loaded
Guardians of the Galaxy
Ant-Man
Captain America: The First Avenger
X-Men
X-Men: Days of Future Past
Superman II
The Incredible Hulk


# Scraping Reviews

In [340]:
import requests

api_key = "8265bd1679663a7ea12ac168da84d2e8&language=en-US&page=1"
def get_movie_reviews(movie_name):
    api_key = "8265bd1679663a7ea12ac168da84d2e8&language=en-US&page=1" 
    search_url = f"https://api.themoviedb.org/3/search/movie?api_key={api_key}&query={movie_name}"

    response = requests.get(search_url)
    results = response.json()["results"]

    if not results:
        print("No movie found.")
        return []

    # Get the movie ID of the first search result
    movie_id = results[0]["id"]

    reviews_url = f"https://api.themoviedb.org/3/movie/{movie_id}/reviews?api_key={api_key}"

    # Send a GET request to retrieve the movie reviews
    response = requests.get(reviews_url)
    reviews = response.json()["results"]

    if not reviews:
        return 0

    review_texts = [review["content"] for review in reviews]
    
    new_review = []
    
    for i in review_texts:
        temp = preprocess_text(i)
        new_review.append(temp)
        
    return new_review

In [341]:
get_movie_reviews('Spiderman 3')

['success first two spiderman film spiderman 3 repres sam raimi inabl overcom biggest hurdl third movi hurdl alien 3 godfath part 3 robocop 3 list goe could studio interfer although wish would thing wrong spiderman 3 almost everi storytel decis creation sandman peter instantli fall love new costum thing never actual call venom plain stupid right get go hate peter parker studio interfer mandat jazz bar danc scene green goblin 2 storylin make absolut sens whatsoev creat stupid love triangl gwen mari jane betray everyth peter parker suppos fundament issu script direct film took issu doom film failur that movi suck butcher venom',
 'movi spidey reveng transform bad spidey dark feel found one appeal factor movi weve gotten use peter parker friendli neighborhood spiderman turn dark side time watch get whole new experi',
 'tobey maguir make great spiderman hand weve seen best movi saw worst although must say wasnt big fan cheesi look emo cut still enjoy watch beat venom',
 'much emot movi way

# Sentiment Analysis

In [342]:
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn import naive_bayes
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score,accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [343]:
# Dataset for training and testing
dataset = pd.read_csv('reviews.txt',sep = '\t', names =['Reviews','Comments'])
dataset.head()

Unnamed: 0,Reviews,Comments
0,1,The Da Vinci Code book is just awesome.
1,1,this was the first clive cussler i've ever rea...
2,1,i liked the Da Vinci Code a lot.
3,1,i liked the Da Vinci Code a lot.
4,1,I liked the Da Vinci Code but it ultimatly did...


In [344]:
# Smaller Dataset
dataset['Comments'] = dataset['Comments'].apply(preprocess_text)
dataset.head()

Unnamed: 0,Reviews,Comments
0,1,da vinci code book awesom
1,1,first clive cussler ive ever read even book li...
2,1,like da vinci code lot
3,1,like da vinci code lot
4,1,like da vinci code ultimatli didnt seem hold


In [345]:
vectorizer = TfidfVectorizer(use_idf = True, lowercase = True, strip_accents='ascii',stop_words='english')

In [346]:
# X = vectorizer.fit_transform(df.text)
# y = df.sentiment
X = vectorizer.fit_transform(dataset.Comments)
y = dataset.Reviews
# X_train, X_test, y_train, y_test = train_test_split(X, y)

In [347]:
best_accuracy = 0.0
best_seed = None

# Multiple train-test splits with different random seeds
for seed in range(5):  
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=seed)
    clf = naive_bayes.MultinomialNB()
    clf.fit(X_train, y_train)
    accuracy = accuracy_score(y_test, clf.predict(X_test)) * 100
    
    # Update the best accuracy and seed if a higher accuracy is achieved
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_seed = seed

print("Best accuracy:", best_accuracy)
print("Seed for best accuracy:", best_seed)

Best accuracy: 98.55491329479769
Seed for best accuracy: 3


In [348]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state = best_seed)
clf = naive_bayes.MultinomialNB()
clf.fit(X_train,y_train)

In [349]:
accuracy_score(y_test,clf.predict(X_test))*100

98.55491329479769

In [350]:
# Model Evaluation
y_pred = clf.predict(X_test)

# Calculate precision
precision = precision_score(y_test, y_pred, average='weighted')

# Calculate recall
recall = recall_score(y_test, y_pred, average='weighted')

# Calculate F1 score
f1 = f1_score(y_test, y_pred, average='weighted')

# Calculate confusion matrix
cm = confusion_matrix(y_test, y_pred)

print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Confusion Matrix:")
print(cm)


Precision: 0.9855530160687619
Recall: 0.9855491329479769
F1 Score: 0.9855426463530568
Confusion Matrix:
[[577  12]
 [  8 787]]


### Predict and suggest overall quality of the movie

In [351]:
# Function to predict new instances
def predict_new_instance(new_instance):
    # Preprocess the new instance and extract features
    preprocess_text(new_instance)
    new_instance_vector = vectorizer.transform([new_instance])
    
    # Predict the class of the new instance
    predicted_class = clf.predict(new_instance_vector)

    return predicted_class[0]

In [352]:
def reviews(movie_name):
    # Reviews
    reviews = get_movie_reviews(movie_name) 
    review_class = []
    for i in reviews:
        predicted_class = predict_new_instance(i) 
        review_class.append(predicted_class)

    avg_review = np.average(review_class)

    if avg_review >= 0.7:
        return 'Overall Very Good Movie'
    elif avg_review < 0.7 and avg_review >= 0.4:
        return 'Overall Good Movie'
    elif avg_review < 0.4 and avg_review >= 0.2:
        return 'Overall Average Movie'
    else:
        return'Bad Movie'  

In [353]:
reviews = get_movie_reviews('Spiderman 3')
review_count = len(reviews)
review_count

20

In [371]:
def recommend_with_reviews(movie, min_review = 2):
    # Find index of the movie to get appropriate vector in the similarity matrix
    movie_index = new_df[new_df['title'] == movie].index[0]
    distances = similarity[movie_index]
    sort_values = sorted(enumerate(distances), reverse=True, key=lambda x: x[1])
    
    recommended_movies = []
    
    # Get top 5 similar movies (excluding the movie itself)
    for i in sort_values[1:21]:
        index = i[0]
        recommended_movie = new_df.iloc[index].title
        
        # Get reviews for the recommended movie
        reviews = get_movie_reviews(recommended_movie) 
        if reviews == 0 or len(reviews) <= min_review:
            continue
        review_class = []
        for review in reviews:
            predicted_class = predict_new_instance(review)
            review_class.append(predicted_class)

        avg_review = np.average(review_class)

        if avg_review >= 0.7:
            overall_review = 'Very Good'
        elif avg_review < 0.7 and avg_review >= 0.3:
            overall_review = 'Good'
        else:
            overall_review = 'Average'

        
        recommended_movies.append({'Movie': recommended_movie, 'Overall Review': overall_review, 'Review_Score': avg_review})
        
    recommended_df = pd.DataFrame(recommended_movies)
    recommended_df = recommended_df.sort_values(by='Review_Score', ascending=False)
    recommended_df.reset_index(drop=True, inplace=True)
    return recommended_df[0:5]


In [372]:
recommend_with_reviews('Batman')

Unnamed: 0,Movie,Overall Review,Review_Score
0,Batman Returns,Very Good,1.0
1,Man of Steel,Very Good,0.833333
2,Superman,Very Good,0.75
3,The Dark Knight Rises,Very Good,0.727273
4,Batman v Superman: Dawn of Justice,Good,0.692308
