# Q3

*   We will develop a NER system specific to the category of names of the top 1000 movie titles from IMDB.

*   We will evaluate the system on a collection of text likely to contain instances of these named entities.

In [1]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
import re
import csv
import math
import nltk
nltk.download('brown')
nltk.download('movie_reviews')
from nltk.corpus import brown, movie_reviews
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [43]:
def get_top_1000_list(filename='/content/drive/MyDrive/data/IMDB-top-1000.csv'):
    """
    Function to extract movie titles from a IMDB-top-1000.csv file.

    Returns:
        list: A list of unique titles of the top 1000 movies
    """
    collected_titles = []
    with open(filename, 'r', encoding='utf-8') as file:
        reader = csv.reader(file)
        for row in reader:
            collected_titles.append(row[1].split())  # Assuming movie titles are in the second column
    collected_titles.pop(0)
    return collected_titles

In [44]:
get_top_1000_list()

[['The', 'Shawshank', 'Redemption'],
 ['The', 'Godfather'],
 ['The', 'Dark', 'Knight'],
 ['The', 'Lord', 'of', 'the', 'Rings:', 'The', 'Return', 'of', 'the', 'King'],
 ["Schindler's", 'List'],
 ['The', 'Godfather', 'Part', 'II'],
 ['12', 'Angry', 'Men'],
 ['Jai', 'Bhim'],
 ['Pulp', 'Fiction'],
 ['Inception'],
 ['The', 'Lord', 'of', 'the', 'Rings:', 'The', 'Two', 'Towers'],
 ['Fight', 'Club'],
 ['The',
  'Lord',
  'of',
  'the',
  'Rings:',
  'The',
  'Fellowship',
  'of',
  'the',
  'Ring'],
 ['Forrest', 'Gump'],
 ['The', 'Good,', 'the', 'Bad', 'and', 'the', 'Ugly'],
 ['Soorarai', 'Pottru'],
 ['The', 'Matrix'],
 ['Goodfellas'],
 ['Star', 'Wars:', 'Episode', 'V', '-', 'The', 'Empire', 'Strikes', 'Back'],
 ['One', 'Flew', 'Over', 'the', "Cuckoo's", 'Nest'],
 ['Top', 'Gun:', 'Maverick'],
 ['Interstellar'],
 ['City', 'of', 'God'],
 ['Spirited', 'Away'],
 ['Saving', 'Private', 'Ryan'],
 ['The', 'Green', 'Mile'],
 ['Life', 'Is', 'Beautiful'],
 ['Seven'],
 ['Terminator', '2:', 'Judgment', 'Da

In [45]:
def label_BIO(_tokens, _NE):
    BIO_for_samples = []

    for token in _tokens:
        # Initialize the tag for the current token as 'O'
        tag = 'O'

        # Check if the token is part of any named entity (NE)
        for ne in _NE:
            if token in ne:
                # If the token is part of a named entity, assign appropriate BIO tags
                start_index = ne.index(token)
                if start_index == 0:
                    tag = 'B-MOV'
                else:
                    tag = 'I-MOV'
                break

        # Append the token and its BIO tag to the result list
        BIO_for_samples.append((token, tag))

    return BIO_for_samples

# Example usage:
tokens = ["The", "Butterfly", "Effect", "is", "a", "movie", ".", "I", "Am", "Sam", "is", "another", "film", "."]
NE = [["The", "Butterfly", "Effect"], ["I", "Am", "Sam"]]
result = label_BIO(tokens, NE)
print(result)


[('The', 'B-MOV'), ('Butterfly', 'I-MOV'), ('Effect', 'I-MOV'), ('is', 'O'), ('a', 'O'), ('movie', 'O'), ('.', 'O'), ('I', 'B-MOV'), ('Am', 'I-MOV'), ('Sam', 'I-MOV'), ('is', 'O'), ('another', 'O'), ('film', 'O'), ('.', 'O')]


In [46]:
# Don't change this cell
def print_BIO_res(_BIO):
    for i in range(len(_BIO)):
        if _BIO[i][1] == 'B-MOV':
            for j in range(i - 7, i + 7):
                if _BIO[j][1] == 'O':
                    print(_BIO[j][0], end=" ")
                else:
                    print(_BIO[j], end=" ")
            print("")

In [34]:
# Don't change this cell
def get_data_from_file(_fn):
    with open(_fn, 'r') as file:
        data = file.read().replace('\n', ' ')
    return data

In [47]:
titles_top_1000 = get_top_1000_list()

# get text data from a text file
data = get_data_from_file("/content/drive/MyDrive/data/article-about-a-genre.txt")
# tokenize text data
tokens = word_tokenize(data)
# tag with BIO using the IMDB top 1000 movie title list
BIO = label_BIO(tokens, titles_top_1000)

print_BIO_res(BIO)
# BIO[:100]

('Ten', 'I-MOV') Rings is shaping up ('to', 'I-MOV') overtake ('Black', 'B-MOV') Widow ('as', 'I-MOV') ('the', 'I-MOV') biggest film ('of', 'I-MOV') 
('the', 'I-MOV') biggest film ('of', 'I-MOV') ('the', 'I-MOV') pandemic . ('A', 'B-MOV') hit ('with', 'I-MOV') critics ('and', 'I-MOV') audience alike 
history almost ('as', 'I-MOV') long cinema itself . ('This', 'B-MOV') history is ('on', 'I-MOV') exciting display ('in', 'I-MOV') 
heroes ('with', 'I-MOV') supernatural martial arts abilities . ('Fight', 'B-MOV') scenes ('in', 'I-MOV') these early films emphasised 
rarely showcased actual martial arts skills . ('This', 'B-MOV') changed ('with', 'I-MOV') ('the', 'I-MOV') transformation ('of', 'I-MOV') Hong 
Five Deadly Venoms ( 1978 ) ('and', 'I-MOV') ('The', 'B-MOV') 36th Chamber ('of', 'I-MOV') Shaolin ( 1978 
style , ('as', 'I-MOV') shown ('in', 'I-MOV') films like ('The', 'B-MOV') ('Big', 'I-MOV') Boss ( 1971 ) ('and', 'I-MOV') 
('The', 'B-MOV') ('Big', 'I-MOV') Boss ( 1971 ) ('and', 'I