# Q3

*   We will develop a NER system specific to the category of names of the top 1000 movie titles from IMDB.

*   We will evaluate the system on a collection of text likely to contain instances of these named entities.

In [1]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
import re
import csv
import math
import nltk
nltk.download('brown')
nltk.download('movie_reviews')
from nltk.corpus import brown, movie_reviews
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


In [9]:
def get_top_1000_list():
    """
    Function to extract movie titles from a IMDB-top-1000.csv file.

    Returns:
        list: A list of unique titles of the top 1000 movies
    """

    collected_titles = set()
    file_path = f"IMDB-top-1000.csv"  # Construct the complete file path
    with open(file_path, 'r') as file:
        reader = csv.reader(file)
        next(reader)
        for row in reader:
            title = row[1]
            collected_titles.add(title)

    return list(collected_titles)

In [10]:
def label_BIO(_tokens, _NE):
    """
    Generates BIO (Beginning, Inside, Outside) tags for movie titles in the given tokens.

    Args:
        _tokens (list): List of tokens representing words in a sentence.
        _NE (list): List of named entities, where each entity is represented as a list of tokens.

    Returns:
        list: List of tuples containing tokens and their corresponding BIO tags.

    Comments:
        - This function searches for movie titles in the tokens and labels them using BIO notation.
        - A movie title is considered to be a named entity, where the first word is labeled as 'B-MOV'
          (Beginning of a movie title) and subsequent words are labeled as 'I-MOV' (Inside a movie title).
        - Non-movie title tokens are labeled as 'O' (Outside any named entity).
        - The function iterates through each token in the tokens list, searching for matches in the named entity list.
          If a match is found, the corresponding tokens are labeled accordingly in the BIO format.
        - It returns a list of tuples, each containing a token and its corresponding BIO tag.
    """
    BIO_for_samples = []

    for i, token in enumerate(_tokens):
      # Check if token is present in any named entity (movie title)
      is_movie_title = any(token in entity for entity in _NE)

      if is_movie_title:
        # Check if this is the first token of the movie title
        if any(token == entity[0] for entity in _NE):
          BIO_for_samples.append((token, "B-MOV"))  # Beginning of Movie Title
        else:
          BIO_for_samples.append((token, "I-MOV"))  # Inside Movie Title
      else:
        BIO_for_samples.append((token, "O"))  # Outside any named entity

    return BIO_for_samples

In [11]:
# Don't change this cell
def print_BIO_res(_BIO):
    for i in range(len(_BIO)):
        if _BIO[i][1] == 'B-MOV':
            for j in range(i - 7, i + 7):
                if _BIO[j][1] == 'O':
                    print(_BIO[j][0], end=" ")
                else:
                    print(_BIO[j], end=" ")
            print("")

In [12]:
# Don't change this cell
def get_data_from_file(_fn):
    with open(_fn, 'r') as file:
        data = file.read().replace('\n', ' ')
    return data

In [14]:


titles_top_1000 = get_top_1000_list()
# get text data from a text file
data = get_data_from_file("article-about-a-genre.txt")
# tokenize text data
tokens = word_tokenize(data)
# tag with BIO using the IMDB top 1000 movie title list
BIO = label_BIO(tokens, titles_top_1000)

print_BIO_res(BIO)

('the', 'I-MOV') biggest film ('of', 'I-MOV') ('the', 'I-MOV') pandemic ('.', 'I-MOV') ('A', 'B-MOV') ('hit', 'I-MOV') ('with', 'I-MOV') critics ('and', 'I-MOV') audience alike 
