In [None]:
#!pip install pycountry
#!pip install torch
#!pip install transformers
#!pip install xformers

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import pycountry

import torch
from transformers import pipeline
from tqdm.notebook import tqdm
from transformers import AutoTokenizer, RobertaForSequenceClassification, AutoModelForSequenceClassification
from scipy.special import softmax

import warnings
warnings.filterwarnings("ignore")

Dataset is from Kaggle: https://www.kaggle.com/datasets/dgoenrique/amazon-prime-movies-and-tv-shows

In [2]:
amazon_df = pd.read_csv("titles.csv")

In [3]:
amazon_df.head()

Unnamed: 0,id,title,type,description,release_year,age_certification,runtime,genres,production_countries,seasons,imdb_id,imdb_score,imdb_votes,tmdb_popularity,tmdb_score
0,tm87233,It's a Wonderful Life,MOVIE,A holiday favourite for generations... George...,1946,PG,130,"['drama', 'family', 'fantasy', 'romance', 'com...",['US'],,tt0038650,8.6,467766.0,27.611,8.261
1,tm143047,Duck Soup,MOVIE,Rufus T. Firefly is named president/dictator o...,1933,,69,"['comedy', 'war']",['US'],,tt0023969,7.8,60933.0,9.013,7.357
2,tm83884,His Girl Friday,MOVIE,"Hildy, the journalist former wife of newspaper...",1940,,92,"['drama', 'romance', 'comedy']",['US'],,tt0032599,7.8,60244.0,14.759,7.433
3,ts20945,The Three Stooges,SHOW,The Three Stooges were an American vaudeville ...,1934,TV-PG,19,"['comedy', 'family']",['US'],26.0,tt0850645,8.5,1149.0,15.424,7.6
4,tm5012,Red River,MOVIE,Headstrong Thomas Dunson starts a thriving Tex...,1948,,133,"['western', 'drama', 'romance', 'action']",['US'],,tt0040724,7.8,32210.0,12.4,7.4


In [4]:
amazon_df.shape

(10873, 15)

In [5]:
amazon_df.isnull().sum()

id                         0
title                      0
type                       0
description              144
release_year               0
age_certification       7185
runtime                    0
genres                     0
production_countries       0
seasons                 9322
imdb_id                  701
imdb_score              1108
imdb_votes              1120
tmdb_popularity          571
tmdb_score              2126
dtype: int64

In [6]:
# get null percentage of each column
null_rate_dict = {}

for i in amazon_df.columns:
    null_rate = amazon_df[i].isna().sum() / len(amazon_df) * 100
    null_rate_dict[i] = null_rate

In [7]:
# sort the null percentage from highest to lowest
sorted_null_rate = sorted(null_rate_dict.items(), key=lambda x:x[1], reverse = True)
print(sorted_null_rate)

[('seasons', 85.73530764278487), ('age_certification', 66.08111836659616), ('tmdb_score', 19.55302124528649), ('imdb_votes', 10.300744964591189), ('imdb_score', 10.19037983997057), ('imdb_id', 6.4471626965878785), ('tmdb_popularity', 5.251540513197829), ('description', 1.3243814954474387), ('id', 0.0), ('title', 0.0), ('type', 0.0), ('release_year', 0.0), ('runtime', 0.0), ('genres', 0.0), ('production_countries', 0.0)]


In [8]:
# only print out columns with null percentage
for i in sorted_null_rate:
    if i[1] > 0:
        print(f"{i[0]} null percentage: {round(i[1], 2)}%")

seasons null percentage: 85.74%
age_certification null percentage: 66.08%
tmdb_score null percentage: 19.55%
imdb_votes null percentage: 10.3%
imdb_score null percentage: 10.19%
imdb_id null percentage: 6.45%
tmdb_popularity null percentage: 5.25%
description null percentage: 1.32%


In [9]:
amazon_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10873 entries, 0 to 10872
Data columns (total 15 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    10873 non-null  object 
 1   title                 10873 non-null  object 
 2   type                  10873 non-null  object 
 3   description           10729 non-null  object 
 4   release_year          10873 non-null  int64  
 5   age_certification     3688 non-null   object 
 6   runtime               10873 non-null  int64  
 7   genres                10873 non-null  object 
 8   production_countries  10873 non-null  object 
 9   seasons               1551 non-null   float64
 10  imdb_id               10172 non-null  object 
 11  imdb_score            9765 non-null   float64
 12  imdb_votes            9753 non-null   float64
 13  tmdb_popularity       10302 non-null  float64
 14  tmdb_score            8747 non-null   float64
dtypes: float64(5), int6

In [10]:
round(amazon_df.describe(), 2)

Unnamed: 0,release_year,runtime,seasons,imdb_score,imdb_votes,tmdb_popularity,tmdb_score
count,10873.0,10873.0,1551.0,9765.0,9753.0,10302.0,8747.0
mean,2004.08,85.87,2.66,5.97,8973.23,7.61,5.98
std,24.88,34.16,3.72,1.36,48977.67,45.85,1.51
min,1912.0,0.0,1.0,1.1,5.0,0.0,0.5
25%,2002.0,65.0,1.0,5.1,119.0,1.33,5.07
50%,2015.0,89.0,1.0,6.1,488.0,2.66,6.0
75%,2019.0,102.0,3.0,7.0,2493.0,6.18,6.98
max,2023.0,940.0,53.0,9.9,2081757.0,3187.53,10.0


In [11]:
# all the null seasons are movie type
amazon_df[amazon_df['seasons'].isna()]['type'].unique()

array(['MOVIE'], dtype=object)

In [12]:
# fill the null values of seasons to zero
amazon_df['seasons'] = amazon_df['seasons'].fillna(0)

In [13]:
# get the first genre from genres as our primary genre
amazon_df['primary_genre'] = amazon_df['genres'].apply(lambda x: x.strip("[ ").strip("]").split(",")[0])
amazon_df['primary_genre'] = amazon_df['primary_genre'].str.replace("'", "")
amazon_df.head()

Unnamed: 0,id,title,type,description,release_year,age_certification,runtime,genres,production_countries,seasons,imdb_id,imdb_score,imdb_votes,tmdb_popularity,tmdb_score,primary_genre
0,tm87233,It's a Wonderful Life,MOVIE,A holiday favourite for generations... George...,1946,PG,130,"['drama', 'family', 'fantasy', 'romance', 'com...",['US'],0.0,tt0038650,8.6,467766.0,27.611,8.261,drama
1,tm143047,Duck Soup,MOVIE,Rufus T. Firefly is named president/dictator o...,1933,,69,"['comedy', 'war']",['US'],0.0,tt0023969,7.8,60933.0,9.013,7.357,comedy
2,tm83884,His Girl Friday,MOVIE,"Hildy, the journalist former wife of newspaper...",1940,,92,"['drama', 'romance', 'comedy']",['US'],0.0,tt0032599,7.8,60244.0,14.759,7.433,drama
3,ts20945,The Three Stooges,SHOW,The Three Stooges were an American vaudeville ...,1934,TV-PG,19,"['comedy', 'family']",['US'],26.0,tt0850645,8.5,1149.0,15.424,7.6,comedy
4,tm5012,Red River,MOVIE,Headstrong Thomas Dunson starts a thriving Tex...,1948,,133,"['western', 'drama', 'romance', 'action']",['US'],0.0,tt0040724,7.8,32210.0,12.4,7.4,western


In [14]:
amazon_df['production_countries'].unique()

array(["['US']", "['GB']", "['SU']", "['DE', 'GB']", "['DE', 'US']",
       "['MX']", "['CA', 'US']", "['US', 'CA']", "['IN']", '[]', "['SE']",
       "['IT', 'US']", "['GB', 'US', 'PA', 'ES']", "['MA', 'GB']",
       "['IT', 'FR']", "['HK']", "['FR', 'DE', 'IT']",
       "['US', 'FR', 'DE']", "['DE']", "['GB', 'US']", "['CA']",
       "['IT', 'US', 'ES']", "['JP']", "['IT', 'GB', 'US']",
       "['CA', 'GB']", "['IT', 'ES']", "['DE', 'IT']", "['IT']",
       "['US', 'NL']", "['US', 'GB']", "['PR', 'US']", "['US', 'MX']",
       "['FR']", "['US', 'DE']", "['CA', 'FR']", "['ES']", "['US', 'PR']",
       "['FR', 'IT']", "['ES', 'IT']", "['IE', 'US']", "['PH', 'US']",
       "['CH', 'DE']", "['RO']", "['US', 'FR']", "['YU', 'XC']",
       "['US', 'SU']", "['JP', 'US']", "['FR', 'US']",
       "['ES', 'GB', 'US']", "['DE', 'LI', 'IT', 'ES']", "['KR', 'US']",
       "['MX', 'IT']", "['GB', 'FR']", "['DE', 'IT', 'FR']",
       "['KR', 'JP']", "['YU', 'US']", "['US', 'IT']", "['IT', 'CA']",
 

In [15]:
# get the primary country code from the list as production country
amazon_df['production_country'] = amazon_df['production_countries'].apply(lambda x: x.strip("[ ").strip("]").split(",")[0])
amazon_df['production_country'] = amazon_df['production_country'].str.replace("'", "")
amazon_df.head()

Unnamed: 0,id,title,type,description,release_year,age_certification,runtime,genres,production_countries,seasons,imdb_id,imdb_score,imdb_votes,tmdb_popularity,tmdb_score,primary_genre,production_country
0,tm87233,It's a Wonderful Life,MOVIE,A holiday favourite for generations... George...,1946,PG,130,"['drama', 'family', 'fantasy', 'romance', 'com...",['US'],0.0,tt0038650,8.6,467766.0,27.611,8.261,drama,US
1,tm143047,Duck Soup,MOVIE,Rufus T. Firefly is named president/dictator o...,1933,,69,"['comedy', 'war']",['US'],0.0,tt0023969,7.8,60933.0,9.013,7.357,comedy,US
2,tm83884,His Girl Friday,MOVIE,"Hildy, the journalist former wife of newspaper...",1940,,92,"['drama', 'romance', 'comedy']",['US'],0.0,tt0032599,7.8,60244.0,14.759,7.433,drama,US
3,ts20945,The Three Stooges,SHOW,The Three Stooges were an American vaudeville ...,1934,TV-PG,19,"['comedy', 'family']",['US'],26.0,tt0850645,8.5,1149.0,15.424,7.6,comedy,US
4,tm5012,Red River,MOVIE,Headstrong Thomas Dunson starts a thriving Tex...,1948,,133,"['western', 'drama', 'romance', 'action']",['US'],0.0,tt0040724,7.8,32210.0,12.4,7.4,western,US


In [16]:
# print unique values of country code from the production country list
abbreviated_countries = amazon_df['production_country']
abbreviated_countries.unique()

array(['US', 'GB', 'SU', 'DE', 'MX', 'CA', 'IN', '', 'SE', 'IT', 'MA',
       'HK', 'FR', 'JP', 'PR', 'ES', 'IE', 'PH', 'CH', 'RO', 'YU', 'KR',
       'TW', 'MC', 'IL', 'EG', 'RU', 'AU', 'NZ', 'AT', 'ZA', 'LI', 'BR',
       'DK', 'IR', 'PT', 'BE', 'FI', 'AR', 'NL', 'ID', 'VE', 'XC', 'NO',
       'BG', 'CN', 'TH', 'CL', 'LU', 'DO', 'CO', 'AE', 'PL', 'SK', 'JM',
       'IS', 'RS', 'CZ', 'SG', 'GR', 'HU', 'LT', 'UA', 'KZ', 'NG', 'UY',
       'MT', 'AF', 'TR', 'VN', 'KE', 'BO', 'GE', 'TT', 'BY', 'MY', 'PK',
       'PE', 'XK', 'TC', 'LB', 'NP', 'MN', 'NI', 'SB', 'CR', 'PA', 'UZ',
       'NA', 'KH', 'QA', 'CM', 'EE', 'CU', 'LV', 'SY', 'BM', 'IO', 'PF',
       'HN', 'TN', 'EC', 'BD', 'MK'], dtype=object)

In [17]:
full_countries = []
# for each production country code, get the full country name and store in a list
for abbreviation in abbreviated_countries:
    try:
        country = pycountry.countries.get(alpha_2 = abbreviation).name
        full_countries.append(country)
    except:
        full_countries.append(abbreviation)

In [18]:
len(full_countries)

10873

In [19]:
# create a new column with the full country name
amazon_df['full_country_name'] = full_countries
amazon_df

Unnamed: 0,id,title,type,description,release_year,age_certification,runtime,genres,production_countries,seasons,imdb_id,imdb_score,imdb_votes,tmdb_popularity,tmdb_score,primary_genre,production_country,full_country_name
0,tm87233,It's a Wonderful Life,MOVIE,A holiday favourite for generations... George...,1946,PG,130,"['drama', 'family', 'fantasy', 'romance', 'com...",['US'],0.0,tt0038650,8.6,467766.0,27.611,8.261,drama,US,United States
1,tm143047,Duck Soup,MOVIE,Rufus T. Firefly is named president/dictator o...,1933,,69,"['comedy', 'war']",['US'],0.0,tt0023969,7.8,60933.0,9.013,7.357,comedy,US,United States
2,tm83884,His Girl Friday,MOVIE,"Hildy, the journalist former wife of newspaper...",1940,,92,"['drama', 'romance', 'comedy']",['US'],0.0,tt0032599,7.8,60244.0,14.759,7.433,drama,US,United States
3,ts20945,The Three Stooges,SHOW,The Three Stooges were an American vaudeville ...,1934,TV-PG,19,"['comedy', 'family']",['US'],26.0,tt0850645,8.5,1149.0,15.424,7.600,comedy,US,United States
4,tm5012,Red River,MOVIE,Headstrong Thomas Dunson starts a thriving Tex...,1948,,133,"['western', 'drama', 'romance', 'action']",['US'],0.0,tt0040724,7.8,32210.0,12.400,7.400,western,US,United States
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10868,tm1292884,Hunt,MOVIE,"Arjun is the ACP of Hyderabad, who gets involv...",2023,,129,"['action', 'thriller', 'crime']",['IN'],0.0,tt21946058,5.8,1269.0,4.403,4.300,action,IN,India
10869,tm1284753,Ennalum Ente Aliya,MOVIE,It tells the story of a teen girl from a Musli...,2023,,113,"['drama', 'comedy']",['IN'],0.0,tt23805304,5.6,812.0,3.406,8.000,drama,IN,India
10870,tm1303380,Anubhav Singh Bassi: Bas Kar Bassi,MOVIE,"Fresh out of national law university, Bassi ar...",2023,,83,['comedy'],[],0.0,tt26548127,8.3,10.0,1.960,,comedy,,
10871,tm1307408,Pinkfong! Summer in Wonderville,MOVIE,It’s a hot summer day in Wonderville! Let’s jo...,2023,,47,[],[],0.0,,,,1.336,,,,


# Sentiment Analysis on Description

In [20]:
example = amazon_df['description'][0]
example

"A holiday favourite for generations...  George Bailey has spent his entire life giving to the people of Bedford Falls.  All that prevents rich skinflint Mr. Potter from taking over the entire town is George's modest building and loan company.  But on Christmas Eve the business's $8,000 is lost and George's troubles begin."

Hugging Face, RoBERTa: https://huggingface.co/docs/transformers/main/en/model_doc/roberta#transformers.RobertaForSequenceClassification

roberta-base-go_emotions: https://huggingface.co/SamLowe/roberta-base-go_emotions

cardiffnlp/twitter-roberta-base-sentiment-latest: https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment-latest

In [21]:
# select and define pretrained roberta model
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
roberta_model = AutoModelForSequenceClassification.from_pretrained(MODEL)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


The softmax function transforms each element of a collection by computing the exponential of each element divided by the sum of the exponentials of all the elements. The softmax function is the gradient of logsumexp.

In [22]:
# run roberta model for one example
encoded_text = tokenizer(example, return_tensors = 'pt')
output = roberta_model(**encoded_text)
scores = output[0][0].detach().numpy()
scores = softmax(scores)
scores_dict = {
    'roberta_neg': scores[0],
    'roberta_neu': scores[1],
    'roberta_pos': scores[2]
}
print(scores_dict)

{'roberta_neg': 0.04181425, 'roberta_neu': 0.25599903, 'roberta_pos': 0.7021867}


In [23]:
# need to make sure that descriptions are in string data type 
amazon_df['description'] = amazon_df['description'].astype('str')
amazon_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10873 entries, 0 to 10872
Data columns (total 18 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    10873 non-null  object 
 1   title                 10873 non-null  object 
 2   type                  10873 non-null  object 
 3   description           10873 non-null  object 
 4   release_year          10873 non-null  int64  
 5   age_certification     3688 non-null   object 
 6   runtime               10873 non-null  int64  
 7   genres                10873 non-null  object 
 8   production_countries  10873 non-null  object 
 9   seasons               10873 non-null  float64
 10  imdb_id               10172 non-null  object 
 11  imdb_score            9765 non-null   float64
 12  imdb_votes            9753 non-null   float64
 13  tmdb_popularity       10302 non-null  float64
 14  tmdb_score            8747 non-null   float64
 15  primary_genre      

In [24]:
sentiments = {}

# loop over each description text to determine sentiment score
for i, row in tqdm(amazon_df.iterrows(), total = len(amazon_df)):
    try:
        text = row['description']
        row_id = row['id']
        encoded_text = tokenizer(text, return_tensors = 'pt')
        output = roberta_model(**encoded_text)
        scores = output[0][0].detach().numpy()
        scores = softmax(scores)
        scores_dict = {
            'roberta_neg': scores[0],
            'roberta_neu': scores[1],
            'roberta_pos': scores[2]
        }
        sentiments[row_id] = scores_dict
    except:
        print(f"Broke for id {row_id}")

  0%|          | 0/10873 [00:00<?, ?it/s]

Broke for id ts69050


In [27]:
sentiments

{'tm87233': {'roberta_neg': 0.04181425,
  'roberta_neu': 0.25599903,
  'roberta_pos': 0.7021867},
 'tm143047': {'roberta_neg': 0.6480922,
  'roberta_neu': 0.3373527,
  'roberta_pos': 0.014555118},
 'tm83884': {'roberta_neg': 0.19172105,
  'roberta_neu': 0.77921367,
  'roberta_pos': 0.029065171},
 'ts20945': {'roberta_neg': 0.02046961,
  'roberta_neu': 0.7627171,
  'roberta_pos': 0.21681327},
 'tm5012': {'roberta_neg': 0.014324536,
  'roberta_neu': 0.7588431,
  'roberta_pos': 0.22683235},
 'ts37076': {'roberta_neg': 0.0098324185,
  'roberta_neu': 0.28311232,
  'roberta_pos': 0.7070552},
 'tm82253': {'roberta_neg': 0.030276185,
  'roberta_neu': 0.2799291,
  'roberta_pos': 0.6897948},
 'tm88469': {'roberta_neg': 0.13761096,
  'roberta_neu': 0.6857641,
  'roberta_pos': 0.17662503},
 'tm82560': {'roberta_neg': 0.6500229,
  'roberta_neu': 0.3366609,
  'roberta_pos': 0.013316025},
 'tm160494': {'roberta_neg': 0.13265468,
  'roberta_neu': 0.8228714,
  'roberta_pos': 0.04447388},
 'tm146745': {

In [26]:
result_df = pd.DataFrame(sentiments)
result_df

Unnamed: 0,tm87233,tm143047,tm83884,ts20945,tm5012,ts37076,tm82253,tm88469,tm82560,tm160494,...,tm811070,ts360904,ts382337,tm1305701,ts199693,tm1292884,tm1284753,tm1303380,tm1307408,tm1297385
roberta_neg,0.041814,0.648092,0.191721,0.02047,0.014325,0.009832,0.030276,0.137611,0.650023,0.132655,...,0.57996,0.003586,0.074684,0.093611,0.083413,0.568865,0.06071,0.060881,0.001599,0.02873
roberta_neu,0.255999,0.337353,0.779214,0.762717,0.758843,0.283112,0.279929,0.685764,0.336661,0.822871,...,0.386873,0.065356,0.685402,0.680865,0.687293,0.412574,0.886795,0.573296,0.018359,0.589496
roberta_pos,0.702187,0.014555,0.029065,0.216813,0.226832,0.707055,0.689795,0.176625,0.013316,0.044474,...,0.033166,0.931058,0.239914,0.225524,0.229294,0.018561,0.052495,0.365823,0.980042,0.381775


In [28]:
sentiment_df = result_df.T
sentiment_df

Unnamed: 0,roberta_neg,roberta_neu,roberta_pos
tm87233,0.041814,0.255999,0.702187
tm143047,0.648092,0.337353,0.014555
tm83884,0.191721,0.779214,0.029065
ts20945,0.020470,0.762717,0.216813
tm5012,0.014325,0.758843,0.226832
...,...,...,...
tm1292884,0.568865,0.412574,0.018561
tm1284753,0.060710,0.886795,0.052495
tm1303380,0.060881,0.573296,0.365823
tm1307408,0.001599,0.018359,0.980042


In [29]:
sentiment_df = sentiment_df.reset_index().rename(columns = {'index': 'id'})
sentiment_df

Unnamed: 0,id,roberta_neg,roberta_neu,roberta_pos
0,tm87233,0.041814,0.255999,0.702187
1,tm143047,0.648092,0.337353,0.014555
2,tm83884,0.191721,0.779214,0.029065
3,ts20945,0.020470,0.762717,0.216813
4,tm5012,0.014325,0.758843,0.226832
...,...,...,...,...
10867,tm1292884,0.568865,0.412574,0.018561
10868,tm1284753,0.060710,0.886795,0.052495
10869,tm1303380,0.060881,0.573296,0.365823
10870,tm1307408,0.001599,0.018359,0.980042


In [30]:
amazon_df = pd.merge(sentiment_df, amazon_df, on = 'id', how = 'left')
amazon_df

Unnamed: 0,id,roberta_neg,roberta_neu,roberta_pos,title,type,description,release_year,age_certification,runtime,...,production_countries,seasons,imdb_id,imdb_score,imdb_votes,tmdb_popularity,tmdb_score,primary_genre,production_country,full_country_name
0,tm87233,0.041814,0.255999,0.702187,It's a Wonderful Life,MOVIE,A holiday favourite for generations... George...,1946,PG,130,...,['US'],0.0,tt0038650,8.6,467766.0,27.611,8.261,drama,US,United States
1,tm143047,0.648092,0.337353,0.014555,Duck Soup,MOVIE,Rufus T. Firefly is named president/dictator o...,1933,,69,...,['US'],0.0,tt0023969,7.8,60933.0,9.013,7.357,comedy,US,United States
2,tm83884,0.191721,0.779214,0.029065,His Girl Friday,MOVIE,"Hildy, the journalist former wife of newspaper...",1940,,92,...,['US'],0.0,tt0032599,7.8,60244.0,14.759,7.433,drama,US,United States
3,ts20945,0.020470,0.762717,0.216813,The Three Stooges,SHOW,The Three Stooges were an American vaudeville ...,1934,TV-PG,19,...,['US'],26.0,tt0850645,8.5,1149.0,15.424,7.600,comedy,US,United States
4,tm5012,0.014325,0.758843,0.226832,Red River,MOVIE,Headstrong Thomas Dunson starts a thriving Tex...,1948,,133,...,['US'],0.0,tt0040724,7.8,32210.0,12.400,7.400,western,US,United States
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10867,tm1292884,0.568865,0.412574,0.018561,Hunt,MOVIE,"Arjun is the ACP of Hyderabad, who gets involv...",2023,,129,...,['IN'],0.0,tt21946058,5.8,1269.0,4.403,4.300,action,IN,India
10868,tm1284753,0.060710,0.886795,0.052495,Ennalum Ente Aliya,MOVIE,It tells the story of a teen girl from a Musli...,2023,,113,...,['IN'],0.0,tt23805304,5.6,812.0,3.406,8.000,drama,IN,India
10869,tm1303380,0.060881,0.573296,0.365823,Anubhav Singh Bassi: Bas Kar Bassi,MOVIE,"Fresh out of national law university, Bassi ar...",2023,,83,...,[],0.0,tt26548127,8.3,10.0,1.960,,comedy,,
10870,tm1307408,0.001599,0.018359,0.980042,Pinkfong! Summer in Wonderville,MOVIE,It’s a hot summer day in Wonderville! Let’s jo...,2023,,47,...,[],0.0,,,,1.336,,,,


In [31]:
# roberta-base-go_emotions transformer model for one example
tokenizer = AutoTokenizer.from_pretrained("SamLowe/roberta-base-go_emotions")
model = RobertaForSequenceClassification.from_pretrained("SamLowe/roberta-base-go_emotions")

inputs = tokenizer(example, return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs).logits

# print the first emotion label with the highest probablity
predicted_class_id = logits.argmax().item()
print(model.config.id2label[predicted_class_id])

sadness


In [32]:
emotion_sentiments = {}

# tqdm library is used to make progress bar for the below operation
for i, row in tqdm(amazon_df.iterrows(), total = len(amazon_df)):
    try:
        # make emotion sentiment prediction for each description
        text = row['description']
        row_id = row['id']
        inputs = tokenizer(text, return_tensors="pt")

        with torch.no_grad():
            logits = model(**inputs).logits

        predicted_class_id = logits.argmax().item()
        emotion_sentiments[row_id] = model.config.id2label[predicted_class_id]
    except:
        print(row_id, "Error Occured")

  0%|          | 0/10872 [00:00<?, ?it/s]

In [33]:
print(len(emotion_sentiments))

10872


In [34]:
emotion_sentiments.keys()

dict_keys(['tm87233', 'tm143047', 'tm83884', 'ts20945', 'tm5012', 'ts37076', 'tm82253', 'tm88469', 'tm82560', 'tm160494', 'tm146745', 'tm19248', 'tm97735', 'tm116781', 'tm83723', 'tm112424', 'tm120863', 'tm100333', 'tm17025', 'tm19424', 'tm74259', 'tm155610', 'tm63937', 'tm18385', 'tm89268', 'tm2838', 'tm5096', 'tm113731', 'tm127199', 'tm154', 'tm111987', 'tm74984', 'tm64852', 'tm264908', 'tm85417', 'tm78387', 'tm16141', 'tm67012', 'tm274240', 'tm137796', 'tm1227', 'tm131025', 'tm110888', 'tm119274', 'tm118318', 'tm53452', 'tm82637', 'tm116744', 'tm164354', 'tm6408', 'tm997', 'tm112135', 'tm7905', 'tm82402', 'tm88502', 'tm116534', 'tm131498', 'tm5162', 'tm209459', 'tm165135', 'tm84154', 'tm138416', 'tm56594', 'tm75528', 'tm264738', 'tm22806', 'tm84810', 'tm113646', 'tm165975', 'tm64369', 'tm2209', 'tm53874', 'tm163136', 'tm71213', 'tm69859', 'tm106873', 'tm140761', 'tm309343', 'tm8190', 'tm5661', 'tm101000', 'tm71517', 'tm67142', 'tm79526', 'tm192453', 'tm88001', 'tm9887', 'tm166542', 

In [35]:
emotion_sentiments.values()

dict_values(['sadness', 'neutral', 'neutral', 'neutral', 'neutral', 'amusement', 'optimism', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'disappointment', 'neutral', 'disappointment', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'confusion', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'love', 'neutral', 'curiosity', 'sadness', 'joy', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'realization', 'neutral', 'surprise', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'joy', 'neutral', 'neutral', 'disappointment', 'neutral', 'neutral', 'neutral', 'sadness', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'sadness', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'realization', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neu

In [36]:
# create emotion sentiments dataframe
emotion_df = pd.DataFrame({
    'id': emotion_sentiments.keys(),
    'description_emotion': emotion_sentiments.values()
})
emotion_df.head()

Unnamed: 0,id,description_emotion
0,tm87233,sadness
1,tm143047,neutral
2,tm83884,neutral
3,ts20945,neutral
4,tm5012,neutral


In [37]:
merged_df = emotion_df.merge(amazon_df, on = 'id', how = 'left')
merged_df

Unnamed: 0,id,description_emotion,roberta_neg,roberta_neu,roberta_pos,title,type,description,release_year,age_certification,...,production_countries,seasons,imdb_id,imdb_score,imdb_votes,tmdb_popularity,tmdb_score,primary_genre,production_country,full_country_name
0,tm87233,sadness,0.041814,0.255999,0.702187,It's a Wonderful Life,MOVIE,A holiday favourite for generations... George...,1946,PG,...,['US'],0.0,tt0038650,8.6,467766.0,27.611,8.261,drama,US,United States
1,tm143047,neutral,0.648092,0.337353,0.014555,Duck Soup,MOVIE,Rufus T. Firefly is named president/dictator o...,1933,,...,['US'],0.0,tt0023969,7.8,60933.0,9.013,7.357,comedy,US,United States
2,tm83884,neutral,0.191721,0.779214,0.029065,His Girl Friday,MOVIE,"Hildy, the journalist former wife of newspaper...",1940,,...,['US'],0.0,tt0032599,7.8,60244.0,14.759,7.433,drama,US,United States
3,ts20945,neutral,0.020470,0.762717,0.216813,The Three Stooges,SHOW,The Three Stooges were an American vaudeville ...,1934,TV-PG,...,['US'],26.0,tt0850645,8.5,1149.0,15.424,7.600,comedy,US,United States
4,tm5012,neutral,0.014325,0.758843,0.226832,Red River,MOVIE,Headstrong Thomas Dunson starts a thriving Tex...,1948,,...,['US'],0.0,tt0040724,7.8,32210.0,12.400,7.400,western,US,United States
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10867,tm1292884,neutral,0.568865,0.412574,0.018561,Hunt,MOVIE,"Arjun is the ACP of Hyderabad, who gets involv...",2023,,...,['IN'],0.0,tt21946058,5.8,1269.0,4.403,4.300,action,IN,India
10868,tm1284753,neutral,0.060710,0.886795,0.052495,Ennalum Ente Aliya,MOVIE,It tells the story of a teen girl from a Musli...,2023,,...,['IN'],0.0,tt23805304,5.6,812.0,3.406,8.000,drama,IN,India
10869,tm1303380,neutral,0.060881,0.573296,0.365823,Anubhav Singh Bassi: Bas Kar Bassi,MOVIE,"Fresh out of national law university, Bassi ar...",2023,,...,[],0.0,tt26548127,8.3,10.0,1.960,,comedy,,
10870,tm1307408,excitement,0.001599,0.018359,0.980042,Pinkfong! Summer in Wonderville,MOVIE,It’s a hot summer day in Wonderville! Let’s jo...,2023,,...,[],0.0,,,,1.336,,,,


In [38]:
merged_df.isnull().sum()

id                         0
description_emotion        0
roberta_neg                0
roberta_neu                0
roberta_pos                0
title                      0
type                       0
description                0
release_year               0
age_certification       7184
runtime                    0
genres                     0
production_countries       0
seasons                    0
imdb_id                  701
imdb_score              1108
imdb_votes              1120
tmdb_popularity          571
tmdb_score              2125
primary_genre              0
production_country         0
full_country_name          0
dtype: int64

In [39]:
# replace No Data to null values
merged_df['age_certification'].replace(np.nan, 'No Data', inplace = True)
merged_df['production_country'].replace('', 'No Data', inplace = True)
merged_df['primary_genre'].replace('', 'No Data', inplace = True)
merged_df['full_country_name'].replace('', 'No Data', inplace = True)
merged_df

Unnamed: 0,id,description_emotion,roberta_neg,roberta_neu,roberta_pos,title,type,description,release_year,age_certification,...,production_countries,seasons,imdb_id,imdb_score,imdb_votes,tmdb_popularity,tmdb_score,primary_genre,production_country,full_country_name
0,tm87233,sadness,0.041814,0.255999,0.702187,It's a Wonderful Life,MOVIE,A holiday favourite for generations... George...,1946,PG,...,['US'],0.0,tt0038650,8.6,467766.0,27.611,8.261,drama,US,United States
1,tm143047,neutral,0.648092,0.337353,0.014555,Duck Soup,MOVIE,Rufus T. Firefly is named president/dictator o...,1933,No Data,...,['US'],0.0,tt0023969,7.8,60933.0,9.013,7.357,comedy,US,United States
2,tm83884,neutral,0.191721,0.779214,0.029065,His Girl Friday,MOVIE,"Hildy, the journalist former wife of newspaper...",1940,No Data,...,['US'],0.0,tt0032599,7.8,60244.0,14.759,7.433,drama,US,United States
3,ts20945,neutral,0.020470,0.762717,0.216813,The Three Stooges,SHOW,The Three Stooges were an American vaudeville ...,1934,TV-PG,...,['US'],26.0,tt0850645,8.5,1149.0,15.424,7.600,comedy,US,United States
4,tm5012,neutral,0.014325,0.758843,0.226832,Red River,MOVIE,Headstrong Thomas Dunson starts a thriving Tex...,1948,No Data,...,['US'],0.0,tt0040724,7.8,32210.0,12.400,7.400,western,US,United States
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10867,tm1292884,neutral,0.568865,0.412574,0.018561,Hunt,MOVIE,"Arjun is the ACP of Hyderabad, who gets involv...",2023,No Data,...,['IN'],0.0,tt21946058,5.8,1269.0,4.403,4.300,action,IN,India
10868,tm1284753,neutral,0.060710,0.886795,0.052495,Ennalum Ente Aliya,MOVIE,It tells the story of a teen girl from a Musli...,2023,No Data,...,['IN'],0.0,tt23805304,5.6,812.0,3.406,8.000,drama,IN,India
10869,tm1303380,neutral,0.060881,0.573296,0.365823,Anubhav Singh Bassi: Bas Kar Bassi,MOVIE,"Fresh out of national law university, Bassi ar...",2023,No Data,...,[],0.0,tt26548127,8.3,10.0,1.960,,comedy,No Data,No Data
10870,tm1307408,excitement,0.001599,0.018359,0.980042,Pinkfong! Summer in Wonderville,MOVIE,It’s a hot summer day in Wonderville! Let’s jo...,2023,No Data,...,[],0.0,,,,1.336,,No Data,No Data,No Data


# Adding Rating Ages

In [40]:
merged_df['age_certification'].unique()

array(['PG', 'No Data', 'TV-PG', 'G', 'PG-13', 'R', 'TV-14', 'TV-Y',
       'TV-G', 'TV-Y7', 'NC-17', 'TV-MA', 'TV-Y7-FV'], dtype=object)

In [41]:
rating_ages = {
    'PG': 'Older Kids',
    'TV-PG': 'Older Kids',
    'G': 'Kids',
    'PG-13': 'Teens',
    'R': 'Adults',
    'TV-14': 'Teens',
    'TV-Y': 'Kids',
    'TV-G': 'Kids',
    'TV-Y7': 'Older Kids',
    'NC-17': 'Adults',
    'TV-MA': 'Adults',
    'TV-Y7-FV': 'Older Kids',
    'No Data': 'No Data'
}
rating_ages

{'PG': 'Older Kids',
 'TV-PG': 'Older Kids',
 'G': 'Kids',
 'PG-13': 'Teens',
 'R': 'Adults',
 'TV-14': 'Teens',
 'TV-Y': 'Kids',
 'TV-G': 'Kids',
 'TV-Y7': 'Older Kids',
 'NC-17': 'Adults',
 'TV-MA': 'Adults',
 'TV-Y7-FV': 'Older Kids',
 'No Data': 'No Data'}

In [42]:
# mapping the rating ages to the corresponding age certification
merged_df['target_ages'] = merged_df['age_certification'].map(rating_ages)
merged_df

Unnamed: 0,id,description_emotion,roberta_neg,roberta_neu,roberta_pos,title,type,description,release_year,age_certification,...,seasons,imdb_id,imdb_score,imdb_votes,tmdb_popularity,tmdb_score,primary_genre,production_country,full_country_name,target_ages
0,tm87233,sadness,0.041814,0.255999,0.702187,It's a Wonderful Life,MOVIE,A holiday favourite for generations... George...,1946,PG,...,0.0,tt0038650,8.6,467766.0,27.611,8.261,drama,US,United States,Older Kids
1,tm143047,neutral,0.648092,0.337353,0.014555,Duck Soup,MOVIE,Rufus T. Firefly is named president/dictator o...,1933,No Data,...,0.0,tt0023969,7.8,60933.0,9.013,7.357,comedy,US,United States,No Data
2,tm83884,neutral,0.191721,0.779214,0.029065,His Girl Friday,MOVIE,"Hildy, the journalist former wife of newspaper...",1940,No Data,...,0.0,tt0032599,7.8,60244.0,14.759,7.433,drama,US,United States,No Data
3,ts20945,neutral,0.020470,0.762717,0.216813,The Three Stooges,SHOW,The Three Stooges were an American vaudeville ...,1934,TV-PG,...,26.0,tt0850645,8.5,1149.0,15.424,7.600,comedy,US,United States,Older Kids
4,tm5012,neutral,0.014325,0.758843,0.226832,Red River,MOVIE,Headstrong Thomas Dunson starts a thriving Tex...,1948,No Data,...,0.0,tt0040724,7.8,32210.0,12.400,7.400,western,US,United States,No Data
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10867,tm1292884,neutral,0.568865,0.412574,0.018561,Hunt,MOVIE,"Arjun is the ACP of Hyderabad, who gets involv...",2023,No Data,...,0.0,tt21946058,5.8,1269.0,4.403,4.300,action,IN,India,No Data
10868,tm1284753,neutral,0.060710,0.886795,0.052495,Ennalum Ente Aliya,MOVIE,It tells the story of a teen girl from a Musli...,2023,No Data,...,0.0,tt23805304,5.6,812.0,3.406,8.000,drama,IN,India,No Data
10869,tm1303380,neutral,0.060881,0.573296,0.365823,Anubhav Singh Bassi: Bas Kar Bassi,MOVIE,"Fresh out of national law university, Bassi ar...",2023,No Data,...,0.0,tt26548127,8.3,10.0,1.960,,comedy,No Data,No Data,No Data
10870,tm1307408,excitement,0.001599,0.018359,0.980042,Pinkfong! Summer in Wonderville,MOVIE,It’s a hot summer day in Wonderville! Let’s jo...,2023,No Data,...,0.0,,,,1.336,,No Data,No Data,No Data,No Data


In [43]:
merged_df.to_csv("Amazon_Prime_Video_with_Sentiments.csv", index = False)
merged_df.to_excel("Amazon_Prime_Video_with_Sentiments.xlsx", index = False)