# Q3

*   We will develop a NER system specific to the category of names of the top 1000 movie titles from IMDB.

*   We will evaluate the system on a collection of text likely to contain instances of these named entities.

In [None]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
import re
import csv
import math
import nltk
nltk.download('brown')
nltk.download('movie_reviews')
from nltk.corpus import brown, movie_reviews
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


In [None]:
def get_top_1000_list():
    """
    Function to extract movie titles from a IMDB-top-1000.csv file.

    Returns:
        list: A list of unique titles of the top 1000 movies
    """
    collected_titles = []

    with open('IMDB-top-1000.csv', 'r', encoding='utf-8') as file:
        reader = csv.reader(file)
        next(reader)  # Skip the header row
        for row in reader:
            title = row[1]
            collected_titles.append(title.split())

    return collected_titles

In [None]:
def label_BIO(_tokens, _NE):
    """
    Generates BIO (Beginning, Inside, Outside) tags for movie titles in the given tokens.

    Args:
        _tokens (list): List of tokens representing words in a sentence.
        _NE (list): List of named entities, where each entity is represented as a list of tokens.

    Returns:
        list: List of tuples containing tokens and their corresponding BIO tags.

    Comments:
        - This function searches for movie titles in the tokens and labels them using BIO notation.
        - A movie title is considered to be a named entity, where the first word is labeled as 'B-MOV'
          (Beginning of a movie title) and subsequent words are labeled as 'I-MOV' (Inside a movie title).
        - Non-movie title tokens are labeled as 'O' (Outside any named entity).
        - The function iterates through each token in the tokens list, searching for matches in the named entity list.
          If a match is found, the corresponding tokens are labeled accordingly in the BIO format.
        - It returns a list of tuples, each containing a token and its corresponding BIO tag.
    """
    BIO_for_samples = []

    for token in _tokens:
        # Initialize the BIO tag for the token as 'O' (Outside any named entity)
        bio_tag = 'O'

        # Check if the token is part of any named entity (NE)
        for ne in _NE:
            # If the token matches the first token of a named entity:
            if token == ne[0]:
                bio_tag = 'B-MOV'
                break
            # If the token matches any subsequent token of a named entity:
            elif token in ne:
                bio_tag = 'I-MOV'
                break

        # Append the token and its BIO tag to the output list
        BIO_for_samples.append((token, bio_tag))


    return BIO_for_samples

In [None]:
# Don't change this cell
def print_BIO_res(_BIO):
    for i in range(len(_BIO)):
        if _BIO[i][1] == 'B-MOV':
            for j in range(i - 7, i + 7):
                if _BIO[j][1] == 'O':
                    print(_BIO[j][0], end=" ")
                else:
                    print(_BIO[j], end=" ")
            print("")

In [None]:
# Don't change this cell
def get_data_from_file(_fn):
    with open(_fn, 'r') as file:
        data = file.read().replace('\n', ' ')
    return data

In [None]:
titles_top_1000 = get_top_1000_list()

# get text data from a text file
data = get_data_from_file("article-about-a-genre.txt")
# tokenize text data
tokens = word_tokenize(data)
# tag with BIO using the IMDB top 1000 movie title list
BIO = label_BIO(tokens, titles_top_1000)

print_BIO_res(BIO)

('Ten', 'I-MOV') Rings is shaping up ('to', 'I-MOV') overtake ('Black', 'B-MOV') Widow ('as', 'I-MOV') ('the', 'I-MOV') biggest film ('of', 'I-MOV') 
('the', 'I-MOV') biggest film ('of', 'I-MOV') ('the', 'I-MOV') pandemic . ('A', 'B-MOV') hit ('with', 'I-MOV') critics ('and', 'I-MOV') audience alike 
history almost ('as', 'I-MOV') long cinema itself . ('This', 'B-MOV') history is ('on', 'I-MOV') exciting display ('in', 'I-MOV') 
heroes ('with', 'I-MOV') supernatural martial arts abilities . ('Fight', 'B-MOV') scenes ('in', 'I-MOV') these early films emphasised 
rarely showcased actual martial arts skills . ('This', 'B-MOV') changed ('with', 'I-MOV') ('the', 'I-MOV') transformation ('of', 'I-MOV') Hong 
Five Deadly Venoms ( 1978 ) ('and', 'I-MOV') ('The', 'B-MOV') 36th Chamber ('of', 'I-MOV') Shaolin ( 1978 
style , ('as', 'I-MOV') shown ('in', 'I-MOV') films like ('The', 'B-MOV') ('Big', 'I-MOV') Boss ( 1971 ) ('and', 'I-MOV') 
('The', 'B-MOV') ('Big', 'I-MOV') Boss ( 1971 ) ('and', 'I

as we see the output is tokenized text from the input data, where each token is accompanied by its corresponding BIO tag. For example, the token "Black" is labeled as "B-MOV" (Beginning of a movie title), while the subsequent token "Widow" is labeled as "I-MOV" (Inside a movie title).

In [None]:
print(BIO)

[('From', 'I-MOV'), ('Bruce', 'O'), ('Lee', 'O'), ('to', 'I-MOV'), ('Shang-Chi', 'O'), (':', 'O'), ('a', 'I-MOV'), ('short', 'O'), ('history', 'O'), ('of', 'I-MOV'), ('the', 'I-MOV'), ('kung', 'O'), ('fu', 'O'), ('film', 'O'), ('in', 'I-MOV'), ('cinema', 'O'), ('With', 'I-MOV'), ('action', 'O'), ('sequences', 'O'), ('that', 'O'), ('are', 'O'), ('being', 'O'), ('hailed', 'O'), ('as', 'I-MOV'), ('some', 'O'), ('of', 'I-MOV'), ('the', 'I-MOV'), ('best', 'O'), ('in', 'I-MOV'), ('the', 'I-MOV'), ('history', 'O'), ('of', 'I-MOV'), ('the', 'I-MOV'), ('Marvel', 'O'), ('Cinematic', 'O'), ('Universe', 'O'), (',', 'O'), ('Shang-Chi', 'O'), ('and', 'I-MOV'), ('the', 'I-MOV'), ('Legend', 'I-MOV'), ('of', 'I-MOV'), ('the', 'I-MOV'), ('Ten', 'I-MOV'), ('Rings', 'O'), ('is', 'O'), ('shaping', 'O'), ('up', 'O'), ('to', 'I-MOV'), ('overtake', 'O'), ('Black', 'B-MOV'), ('Widow', 'O'), ('as', 'I-MOV'), ('the', 'I-MOV'), ('biggest', 'O'), ('film', 'O'), ('of', 'I-MOV'), ('the', 'I-MOV'), ('pandemic', 'O'),

we can infer that Tokens not belonging to any movie title are labeled as "O".
The returned BIO list indicates that the function performs as expected and can serve as a useful component in developing a NER system tailored to movie titles.