# IMDB Reviews

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# One-time download, kept commented out for provenance: it read the review text
# from the TDmatch repository, added a row_number column, and wrote the local
# CSV that is loaded below.
# imdb_reviews = pd.read_csv('https://raw.githubusercontent.com/naserahmadi/TDmatch/main/data/imdb/IMDB_reviews.csv', names = ['text'])
# imdb_reviews['row_number'] = imdb_reviews.reset_index().index
# imdb_reviews.to_csv('data/imdb_raw_reviews.csv')

# Load the locally cached raw reviews and preview the first rows.
imdb_raw_reviews = pd.read_csv('data/imdb_raw_reviews.csv')
imdb_raw_reviews.head()

Unnamed: 0,text
0,the shawshank redemption is without a doubt on...
1,the shawshank redemption is written and direct...
2,the godfather is one of the very few films tha...
3,'the godfather' is the pinnacle of flawless fi...
4,dark yes complex ambitious. christopher nol...


In [3]:
imdb_raw_reviews.shape

(500, 1)

In [4]:
# Load the review dataset and keep only the three columns used in this notebook:
# the movie name, the review permalink, and the review text.
imdb_reviews = pd.read_csv('data/imdb_reviews.csv')[['movie', 'user_review_permalink', 'user_review']]
imdb_reviews.head()

Unnamed: 0,movie,user_review_permalink,user_review
0,The Shawshank Redemption,https://www.imdb.com/review/rw3870888/,i watch this movie for minimum 5-6 times first...
1,The Shawshank Redemption,https://www.imdb.com/review/rw2284594/,The Shawshank Redemption is written and direct...
2,The Godfather,https://www.imdb.com/review/rw3205452/,The Godfather is one of the most iconic films ...
3,The Godfather,https://www.imdb.com/review/rw3038370/,'The Godfather' is the pinnacle of flawless fi...
4,The Dark Knight,https://www.imdb.com/review/rw1917099/,"Dark, yes, complex, ambitious. Christopher Nol..."


In [5]:
imdb_reviews.shape

(2000, 3)

# ChatGPT Entity Extraction Testing

In [6]:
# https://github.com/openai/openai-python
import yaml
import openai

# Read the OpenAI credentials from a local YAML file so the key itself never
# appears in the notebook.
with open("openai.yaml", "r") as stream:
    d = yaml.safe_load(stream)

# The key is looked up under openai -> api-key in that file.
openai.api_key = d['openai']['api-key']

In [8]:
# `model` and `chatgpt` are notebook-level globals; get_entity() reads both.
model = 'gpt-3.5-turbo'
prompt = 'Hi'

# NOTE(review): openai.ChatCompletion looks like the pre-1.0 openai-python
# interface — confirm the pinned package version before re-running.
chatgpt = openai.ChatCompletion()

# Smoke test: send one user message and print the text of the first choice.
completion = chatgpt.create(model=model, messages=[{"role": "user", "content": prompt}])
print(completion.choices[0].message.content)

Hello! How can I assist you today?


## Extracting a specific entity from text

In [175]:
import time

# extracts one entity from text 
def get_entity(entity, text):
    """Ask the chat model to extract a single entity from ``text``.

    Relies on the notebook-level globals ``chatgpt`` (the chat-completion
    client) and ``model`` (the model name).

    Parameters
    ----------
    entity : str
        Description of what to extract, e.g. "official title of the movie".
    text : str
        The text to extract the entity from.

    Returns
    -------
    str
        The model's reply, or ``'Unknown'`` when all 20 attempts fail.
    """
    # Pause before every request to stay under the API rate limit.
    time.sleep(1)

    max_attempts = 20
    seconds = 2  # back-off before the next attempt; grows by 1s per failure

    messages = [
        {"role": "user", "content": f"Your job is to extract the {entity} being described by the text provided."},
        {"role": "user", "content": text},
        {"role": "user", "content": f"Respond ONLY with the {entity} ONLY {entity}"},
        {"role": "user", "content": f"If {entity} is not found, respond ONLY with Unknown ONLY Unknown"}
    ]

    for retries in range(1, max_attempts + 1):
        try:
            return chatgpt.create(model=model, messages=messages).choices[0].message.content
        except Exception as err:
            # Catch Exception rather than using a bare except so that
            # KeyboardInterrupt / SystemExit still stop a long-running batch,
            # and show which error triggered the retry instead of hiding it.
            print(f'Retry #{retries}, waiting for an additional {seconds} seconds... ({type(err).__name__})')
            time.sleep(seconds)
            seconds += 1

    # Every attempt failed: fall back to the same sentinel the prompt asks for.
    return 'Unknown'

In [176]:
# Sample review used to try out get_entity(); it refers to the film only as
# "godfather 2", never by a full official title.
test_review = '''i'm not an al pacino fan but this movie has the best performance of al. i think no one in the history of world cinema has played a character better than al pacino in godfather 2. he deserved an oscar.
godfather 2 is better than the first part. it is a great sequel  it expands the themes and the world of the first part. robert de niro is stunning as usual. missed marlon brando. best part of this movie is it shows the rise of vito corleone and michael corleone contrasting their times and personalities.
you need to watch godfather 2'''

In [177]:
# Ask for the movie title of the sample review.
entity_to_extract = "official title of the movie"
get_entity(entity_to_extract, test_review)

'The Godfather: Part II'

In [178]:
# Text that names no movie: exercises the 'Unknown' fallback requested in the prompt.
entity_to_extract = "official title of the movie"
get_entity(entity_to_extract, "I really liked this movie")

'Unknown'

In [179]:
# Same review, different target entity: the lead actor.
entity_to_extract = "lead actor of the movie"
get_entity(entity_to_extract, test_review)

'Al Pacino'

In [55]:
# Open-ended request: every entity together with its type. print() is used so
# the multi-line reply is shown with real line breaks.
entity_to_extract = "all entities, and the type of entity"
print(get_entity(entity_to_extract, test_review))

Entities:
- Al Pacino (Person)
- Movie (Godfather 2) (Product)
- Oscar (Award) 
- Godfather 1 (Product)
- Sequel (Type of Movie)
- Themes (Concept)
- World (Concept)
- Robert De Niro (Person)
- Marlon Brando (Person)
- Rise of Vito Corleone (Concept)
- Michael Corleone (Person)
- Times (Concept)
- Personalities (Concept)

Note: Unknown entities may exist but not provided in the text.


# IMDB Uncertain Reviews After Flair

In [192]:
# Sorted, de-duplicated list of the `id` values found in the post-Flair "uncertain" file.
imdb_uncertain = pd.read_csv('data/imdb_uncertain_after_flair.csv')['id'].sort_values().unique().tolist()

In [193]:
# Keep only the reviews whose index label appears in the uncertain-id list,
# then renumber the surviving rows from 0.
ids_filter = imdb_reviews.index.isin(imdb_uncertain)
imdb_reviews_uncertain = imdb_reviews.loc[ids_filter].reset_index(drop=True)
imdb_reviews_uncertain.head()

Unnamed: 0,movie,user_review_permalink,user_review
0,The Shawshank Redemption,https://www.imdb.com/review/rw3870888/,i watch this movie for minimum 5-6 times first...
1,The Dark Knight,https://www.imdb.com/review/rw1917099/,"Dark, yes, complex, ambitious. Christopher Nol..."
2,The Dark Knight,https://www.imdb.com/review/rw5478826/,"Confidently directed, dark, brooding, and pack..."
3,The Godfather: Part II,https://www.imdb.com/review/rw0135607/,"Coppola's masterpiece is rivaled only by ""The ..."
4,12 Angry Men,https://www.imdb.com/review/rw0059999/,An excellent courtroom drama with a unique twi...


## Test on first 5 rows

In [182]:
# Independent copy of the first five reviews, used as a cheap trial set.
test = imdb_reviews.head().copy()

In [183]:
test

Unnamed: 0,movie,user_review_permalink,user_review
0,The Shawshank Redemption,https://www.imdb.com/review/rw3870888/,i watch this movie for minimum 5-6 times first...
1,The Shawshank Redemption,https://www.imdb.com/review/rw2284594/,The Shawshank Redemption is written and direct...
2,The Godfather,https://www.imdb.com/review/rw3205452/,The Godfather is one of the most iconic films ...
3,The Godfather,https://www.imdb.com/review/rw3038370/,'The Godfather' is the pinnacle of flawless fi...
4,The Dark Knight,https://www.imdb.com/review/rw1917099/,"Dark, yes, complex, ambitious. Christopher Nol..."


In [184]:
# Index of the last row; shown as the denominator in the progress message.
total = len(test) - 1


def get_entity_wrapper(row):
    """Row-wise adapter for ``DataFrame.apply(..., axis=1)``.

    Prints a progress line using the row's index label and the global
    ``total``, then returns the movie title extracted from the row's
    ``user_review`` text by ``get_entity``.
    """
    print(f'Running row {row.name} / {total}')
    title = get_entity("official title of the movie", row['user_review'])
    return title

In [185]:
total

4

In [186]:
# Extract a title for every trial row; the wrapper is passed directly since it already takes a row.
test['gpt_movie_title'] = test.apply(get_entity_wrapper, axis=1)

Running row 0 / 4
Running row 1 / 4
Running row 2 / 4
Running row 3 / 4
Running row 4 / 4


In [187]:
test

Unnamed: 0,movie,user_review_permalink,user_review,gpt_movie_title
0,The Shawshank Redemption,https://www.imdb.com/review/rw3870888/,i watch this movie for minimum 5-6 times first...,Unknown.
1,The Shawshank Redemption,https://www.imdb.com/review/rw2284594/,The Shawshank Redemption is written and direct...,The Shawshank Redemption
2,The Godfather,https://www.imdb.com/review/rw3205452/,The Godfather is one of the most iconic films ...,The Godfather.
3,The Godfather,https://www.imdb.com/review/rw3038370/,'The Godfather' is the pinnacle of flawless fi...,The Godfather.
4,The Dark Knight,https://www.imdb.com/review/rw1917099/,"Dark, yes, complex, ambitious. Christopher Nol...",The Dark Knight.


## Run on entire dataframe

In [194]:
# Re-point the progress denominator at the full uncertain set (last row index).
total = len(imdb_reviews_uncertain) - 1


# Same definition as in the trial section above, repeated so this section can be
# run without it.
def get_entity_wrapper(row):
    """Row-wise adapter for ``DataFrame.apply(..., axis=1)``.

    Prints a progress line using the row's index label and the global
    ``total``, then returns the movie title extracted from the row's
    ``user_review`` text by ``get_entity``.
    """
    print(f'Running row {row.name} / {total}')
    title = get_entity("official title of the movie", row['user_review'])
    return title

In [195]:
total

1381

In [196]:
# Full run: one API call per uncertain review, processed row by row.
imdb_reviews_uncertain['gpt_movie_title'] = imdb_reviews_uncertain.apply(
    get_entity_wrapper, axis=1
)

Running row 0 / 1381
Running row 1 / 1381
Running row 2 / 1381
Running row 3 / 1381
Running row 4 / 1381
Running row 5 / 1381
Running row 6 / 1381
Running row 7 / 1381
Running row 8 / 1381
Running row 9 / 1381
Running row 10 / 1381
Running row 11 / 1381
Running row 12 / 1381
Retry #1, waiting for an additional 2 seconds...
Retry #2, waiting for an additional 3 seconds...
Running row 13 / 1381
Retry #1, waiting for an additional 2 seconds...
Retry #2, waiting for an additional 3 seconds...
Retry #3, waiting for an additional 4 seconds...
Retry #4, waiting for an additional 5 seconds...
Running row 14 / 1381
Retry #1, waiting for an additional 2 seconds...
Retry #2, waiting for an additional 3 seconds...
Running row 15 / 1381
Running row 16 / 1381
Running row 17 / 1381
Retry #1, waiting for an additional 2 seconds...
Retry #2, waiting for an additional 3 seconds...
Retry #3, waiting for an additional 4 seconds...
Retry #4, waiting for an additional 5 seconds...
Retry #5, waiting for an a

## GPT Post Processing

In [199]:
# Checkpoint the GPT results to disk. With to_csv's defaults the DataFrame index
# is written as the first CSV column.
imdb_reviews_uncertain.to_csv('data/imdb_after_gpt.csv')

In [214]:
# Reload the checkpoint; .iloc[:, 1:] drops the first column, i.e. the index that to_csv saved.
imdb_reviews_uncertain = pd.read_csv('data/imdb_after_gpt.csv').iloc[:, 1:]

In [215]:
imdb_reviews_uncertain.head()

Unnamed: 0,movie,user_review_permalink,user_review,gpt_movie_title
0,The Shawshank Redemption,https://www.imdb.com/review/rw3870888/,i watch this movie for minimum 5-6 times first...,Unknown.
1,The Dark Knight,https://www.imdb.com/review/rw1917099/,"Dark, yes, complex, ambitious. Christopher Nol...",The Dark Knight
2,The Dark Knight,https://www.imdb.com/review/rw5478826/,"Confidently directed, dark, brooding, and pack...",The Dark Knight
3,The Godfather: Part II,https://www.imdb.com/review/rw0135607/,"Coppola's masterpiece is rivaled only by ""The ...","The Godfather, Part II."
4,12 Angry Men,https://www.imdb.com/review/rw0059999/,An excellent courtroom drama with a unique twi...,12 Angry Men


In [216]:
def remove_ending_period(string):
    """Strip one trailing period from ``string``.

    Parameters
    ----------
    string : str or any
        The value to clean. Non-string values (e.g. the NaN that pandas uses
        for missing cells) are returned unchanged instead of raising.

    Returns
    -------
    The input without its final ``'.'`` if it had one; otherwise the input
    itself. An empty string is returned as-is.
    """
    # Missing cells arrive as float NaN after a CSV round trip; leave them alone.
    if not isinstance(string, str):
        return string
    # endswith() is safe on '' where string[-1] would raise IndexError.
    if string.endswith('.'):
        return string[:-1]
    return string

In [217]:
# Normalise the GPT answers in one pass: drop a trailing period, then relabel
# the exact answer 'Unknown' as 'N/A'.
# NOTE(review): 'N/A' is one of the strings pandas.read_csv treats as missing by
# default, so this marker comes back as NaN after the CSV round trip below —
# confirm that is intended.
imdb_reviews_uncertain['gpt_movie_title'] = (
    imdb_reviews_uncertain['gpt_movie_title']
    .apply(remove_ending_period)
    .replace('Unknown', 'N/A')
)

In [218]:
imdb_reviews_uncertain.head()

Unnamed: 0,movie,user_review_permalink,user_review,gpt_movie_title
0,The Shawshank Redemption,https://www.imdb.com/review/rw3870888/,i watch this movie for minimum 5-6 times first...,
1,The Dark Knight,https://www.imdb.com/review/rw1917099/,"Dark, yes, complex, ambitious. Christopher Nol...",The Dark Knight
2,The Dark Knight,https://www.imdb.com/review/rw5478826/,"Confidently directed, dark, brooding, and pack...",The Dark Knight
3,The Godfather: Part II,https://www.imdb.com/review/rw0135607/,"Coppola's masterpiece is rivaled only by ""The ...","The Godfather, Part II"
4,12 Angry Men,https://www.imdb.com/review/rw0059999/,An excellent courtroom drama with a unique twi...,12 Angry Men


In [219]:
# Overwrite the checkpoint with the cleaned titles (index again saved as the first column).
imdb_reviews_uncertain.to_csv('data/imdb_after_gpt.csv')

In [220]:
# Reload the cleaned checkpoint, dropping the saved index column.
# NOTE(review): with read_csv's default NA handling, 'N/A' titles are parsed as
# NaN here — verify downstream cells expect that.
imdb_reviews_uncertain = pd.read_csv('data/imdb_after_gpt.csv').iloc[:, 1:]

# GPT Movie Title Analysis

In [227]:
# Share of rows where the GPT title equals the reference title exactly
# (case- and punctuation-sensitive), rounded to two decimals.
round(
    imdb_reviews_uncertain['movie'].eq(imdb_reviews_uncertain['gpt_movie_title']).sum()
    / len(imdb_reviews_uncertain),
    2,
)

0.53

### There are many instances where GPT got the title right but the punctuation or capitalization is slightly off; the titles need additional normalization before they can be evaluated fairly