# Q3

*   We will develop an NER system for a single entity category: the titles of the top 1000 movies on IMDB.

*   We will evaluate the system on a collection of text likely to contain instances of these named entities.

In [26]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [27]:
import re
import csv
import math
import nltk
nltk.download('brown')
nltk.download('movie_reviews')
from nltk.corpus import brown, movie_reviews
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


In [29]:
def get_top_1000_list(path="./data/IMDB-top-1000.csv"):
    """
    Extract the movie titles from an IMDB top-1000 CSV file.

    The first row of the file is treated as a header and skipped. For every
    remaining row the title is read from the second column, stripped of
    surrounding whitespace, and split on whitespace into tokens.

    Args:
        path (str): Location of the CSV file. Defaults to the path this
            notebook has always read from.

    Returns:
        list: One entry per unique title, in file order. Each entry is the
            list of whitespace-separated tokens of that title.
    """
    collected_titles = []
    # Duplicates are tracked on the raw title string. The returned list holds
    # token lists, so testing the string against it would never find a match.
    seen_titles = set()

    # newline="" is what the csv module asks for when handing it a file object.
    with open(path, 'r', newline='', encoding='utf-8') as file:
        csvreader = csv.reader(file)

        # Skip the header row (a no-op on an empty file).
        next(csvreader, None)

        for row in csvreader:
            # Rows without a title column carry nothing to collect.
            if len(row) < 2:
                continue

            raw_title = row[1].strip()

            # An empty title would become an empty token list, which cannot
            # be matched against text; a repeated title adds nothing new.
            if not raw_title or raw_title in seen_titles:
                continue

            seen_titles.add(raw_title)
            collected_titles.append(raw_title.split())

    return collected_titles

In [43]:
def label_BIO(_tokens, _NE):
    """
    Generates BIO (Beginning, Inside, Outside) tags for movie titles in the given tokens.

    Args:
        _tokens (list): List of tokens representing words in a text.
        _NE (list): List of named entities, where each entity is represented as a list of tokens.

    Returns:
        list: List of (token, tag) tuples, one per input token, in input order.

    Comments:
        - MOV stands for movie title. The first token of a matched title is tagged 'B-MOV',
          every following token of that title is tagged 'I-MOV', and all other tokens are
          tagged 'O'.
        - Matching is exact, token by token, and scans the text left to right. Once a title
          is matched, scanning resumes right after its last token, so matches never overlap.
        - If several entities match at the same position, the one listed first in _NE wins.
        - A title may end on the very last token of the text.
        - Empty entities in _NE are ignored.
    """

    # Group the entities by their first token so that each text position only
    # has to examine entities that can actually start there. Appending in
    # iteration order keeps the "first listed entity wins" rule intact.
    entities_by_first_token = {}
    for entity in _NE:
        if len(entity) == 0:
            continue  # nothing to match against
        entities_by_first_token.setdefault(entity[0], []).append(entity)

    BIO_for_samples = []
    sample_size = len(_tokens)

    i = 0
    while i < sample_size:
        matched_length = 0

        for entity in entities_by_first_token.get(_tokens[i], ()):
            entity_length = len(entity)

            # The entity must fit in the remaining tokens. "<=" (not "<")
            # so that a title ending exactly at the last token still counts.
            if i + entity_length > sample_size:
                continue

            # The first token matched via the dictionary lookup; compare the rest.
            if all(entity[k] == _tokens[i + k] for k in range(1, entity_length)):
                matched_length = entity_length
                break

        if matched_length:
            BIO_for_samples.append((_tokens[i], 'B-MOV'))
            for offset in range(1, matched_length):
                BIO_for_samples.append((_tokens[i + offset], 'I-MOV'))
            # Jump past the whole title so its tokens are not tagged twice.
            i += matched_length
        else:
            BIO_for_samples.append((_tokens[i], 'O'))
            i += 1

    return BIO_for_samples

In [44]:
# Don't change this cell
def print_BIO_res(_BIO):
    """
    Print one line of surrounding context for every 'B-MOV' entry in a tagged sequence.

    Args:
        _BIO (list): Sequence of (token, tag) pairs; the tag is read from index 1
            and the token from index 0 of each entry.

    Returns:
        None. Output goes to standard output.

    Comments:
        - For an entry at index i tagged 'B-MOV', the entries at indices i - 7 through
          i + 6 are printed on one line, separated by spaces.
        - Entries tagged 'O' are printed as the bare token; every other entry is printed
          as the whole (token, tag) pair.
        - The window is not clamped to the sequence: when i < 7 the negative indices
          read entries from the end of the sequence, and when i + 6 reaches past the
          last entry the lookup raises IndexError.
    """
    for i in range(len(_BIO)):
        if _BIO[i][1] == 'B-MOV':
            # context window: 7 entries before the title start, the start itself, 6 after
            for j in range(i - 7, i + 7):
                if _BIO[j][1] == 'O':
                    print(_BIO[j][0], end=" ")
                else:
                    print(_BIO[j], end=" ")
            # end the context line
            print("")

In [45]:
# Don't change this cell
def get_data_from_file(_fn):
    """
    Read a text file and return its contents as a single line.

    Args:
        _fn (str): Path of the file to read.

    Returns:
        str: The file's full text with every newline character replaced by a space.
    """
    with open(_fn, 'r') as file:
        # flatten the text so downstream tokenization sees one continuous string
        data = file.read().replace('\n', ' ')
    return data

In [50]:
# load the gazetteer: each title is a list of whitespace-separated tokens
# NOTE(review): the article below is tokenized with word_tokenize instead, so
# titles whose words carry attached punctuation may never match - confirm
titles_top_1000 = get_top_1000_list()

# read the article as one string (newlines replaced by spaces)
data = get_data_from_file("data/article-about-a-genre.txt")
# split the article text into tokens
tokens = word_tokenize(data)
# tag every token with BIO labels using the IMDB top 1000 movie title list
BIO = label_BIO(tokens, titles_top_1000)

# Print a context window around each detected title:
# B-MOV marks the first token of a known movie title
# I-MOV marks each following token of that title
# O marks all other tokens (printed without their tag)
print_BIO_res(BIO)

Ten Rings is shaping up to overtake ('Black', 'B-MOV') Widow as the biggest film of 
. With films like Chan ’ s ('Rush', 'B-MOV') Hour ( 1998 ) and Shanghai 
to find its way into hits like ('The', 'B-MOV') ('Matrix', 'I-MOV') ( 1999 ) and Kill 
the trend . Jet Li ’ s ('Hero', 'B-MOV') ( 2002 ) and Fearless ( 
comedies Shaolin Soccer ( 2001 ) and ('Kung', 'B-MOV') ('Fu', 'I-MOV') ('Hustle', 'I-MOV') ( 2004 ) , 
) , and Donnie Yen ’ s ('Ip', 'B-MOV') ('Man', 'I-MOV') ( 2008 ) . Shang-Chi 
