In [1]:
import numpy as np
import pandas as pd
import nltk
import string
import seaborn as sb
import matplotlib.pyplot as plt

In [2]:
# Load the movie-plot CSV; the path is relative to the notebook's working directory.
df = pd.read_csv('../wiki_movie_plots_deduped.csv')
# Preview the first rows to check which columns were read.
df.head()

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr..."
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov..."
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed..."
3,1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_...",Lasting just 61 seconds and consisting of two ...
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Bea...,The earliest known adaptation of the classic f...


In [3]:
# Keep only the first row for each distinct 'Plot' text; later rows with the same plot are dropped.
df = df.drop_duplicates(subset='Plot', keep='first')

In [4]:
# Work on the plot summaries only, then report the container type and the
# number of entries (one line each).
text = df['Plot']
print(type(text), text.shape, sep='\n')

<class 'pandas.core.series.Series'>
(33869,)


## Part-of-speech (POS) tagging with NLTK

In [5]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
# English stop words from the NLTK stopwords corpus, stored as a set.
# NOTE(review): no active code in the visible cells reads stop_words — confirm it is still needed.
stop_words = set(stopwords.words('english'))

In [6]:
def tokenize(sentence):
    """Split a text into sentences and each sentence into word tokens.

    Despite the parameter name, the input is first passed to ``sent_tokenize``,
    so it may contain several sentences.

    Args:
        sentence: Text to tokenize.

    Returns:
        A list with one entry per detected sentence; each entry is the list of
        tokens that ``nltk.word_tokenize`` produced for that sentence. Stop
        words are not filtered out.
    """
    return [nltk.word_tokenize(single_sentence) for single_sentence in sent_tokenize(sentence)]

In [7]:
# Tokenize every plot: one nested list (sentences -> tokens) per entry of `text`.
token_list = list(map(tokenize, text))

In [8]:
# Inspect the tokenized sentences of the first plot.
token_list[0]

[['A',
  'bartender',
  'is',
  'working',
  'at',
  'a',
  'saloon',
  ',',
  'serving',
  'drinks',
  'to',
  'customers',
  '.'],
 ['After',
  'he',
  'fills',
  'a',
  'stereotypically',
  'Irish',
  'man',
  "'s",
  'bucket',
  'with',
  'beer',
  ',',
  'Carrie',
  'Nation',
  'and',
  'her',
  'followers',
  'burst',
  'inside',
  '.'],
 ['They',
  'assault',
  'the',
  'Irish',
  'man',
  ',',
  'pulling',
  'his',
  'hat',
  'over',
  'his',
  'eyes',
  'and',
  'then',
  'dumping',
  'the',
  'beer',
  'over',
  'his',
  'head',
  '.'],
 ['The',
  'group',
  'then',
  'begin',
  'wrecking',
  'the',
  'bar',
  ',',
  'smashing',
  'the',
  'fixtures',
  ',',
  'mirrors',
  ',',
  'and',
  'breaking',
  'the',
  'cash',
  'register',
  '.'],
 ['The',
  'bartender',
  'then',
  'sprays',
  'seltzer',
  'water',
  'in',
  'Nation',
  "'s",
  'face',
  'before',
  'a',
  'group',
  'of',
  'policemen',
  'appear',
  'and',
  'order',
  'everybody',
  'to',
  'leave',
  '.'],
 ['[

In [9]:
# POS-tag every sentence of every plot. The result keeps the nesting of
# token_list: documents -> sentences -> (token, tag) pairs.
# pos_tag_sents tags all sentences of a document in one call, which avoids the
# per-call tagger setup that nltk.pos_tag would repeat for each sentence.
tagged_list = [nltk.pos_tag_sents(doc_sentences) for doc_sentences in token_list]

In [10]:
# Inspect the per-sentence (token, tag) pairs of the first plot.
tagged_list[0]

[[('A', 'DT'),
  ('bartender', 'NN'),
  ('is', 'VBZ'),
  ('working', 'VBG'),
  ('at', 'IN'),
  ('a', 'DT'),
  ('saloon', 'NN'),
  (',', ','),
  ('serving', 'VBG'),
  ('drinks', 'NNS'),
  ('to', 'TO'),
  ('customers', 'NNS'),
  ('.', '.')],
 [('After', 'IN'),
  ('he', 'PRP'),
  ('fills', 'VBZ'),
  ('a', 'DT'),
  ('stereotypically', 'RB'),
  ('Irish', 'JJ'),
  ('man', 'NN'),
  ("'s", 'POS'),
  ('bucket', 'NN'),
  ('with', 'IN'),
  ('beer', 'NN'),
  (',', ','),
  ('Carrie', 'NNP'),
  ('Nation', 'NNP'),
  ('and', 'CC'),
  ('her', 'PRP$'),
  ('followers', 'NNS'),
  ('burst', 'JJ'),
  ('inside', 'RB'),
  ('.', '.')],
 [('They', 'PRP'),
  ('assault', 'VBP'),
  ('the', 'DT'),
  ('Irish', 'NNP'),
  ('man', 'NN'),
  (',', ','),
  ('pulling', 'VBG'),
  ('his', 'PRP$'),
  ('hat', 'NN'),
  ('over', 'IN'),
  ('his', 'PRP$'),
  ('eyes', 'NNS'),
  ('and', 'CC'),
  ('then', 'RB'),
  ('dumping', 'VBG'),
  ('the', 'DT'),
  ('beer', 'NN'),
  ('over', 'IN'),
  ('his', 'PRP$'),
  ('head', 'NN'),
  ('.', '.'

In [11]:
# Flatten each plot's per-sentence tag lists into a single list of
# (token, tag) pairs, keeping one list per plot.
pred_tags = [
    [pair for sentence_tags in doc_tags for pair in sentence_tags]
    for doc_tags in tagged_list
]

In [12]:
# Inspect the flattened (token, tag) pairs of the first plot.
pred_tags[0]

[('A', 'DT'),
 ('bartender', 'NN'),
 ('is', 'VBZ'),
 ('working', 'VBG'),
 ('at', 'IN'),
 ('a', 'DT'),
 ('saloon', 'NN'),
 (',', ','),
 ('serving', 'VBG'),
 ('drinks', 'NNS'),
 ('to', 'TO'),
 ('customers', 'NNS'),
 ('.', '.'),
 ('After', 'IN'),
 ('he', 'PRP'),
 ('fills', 'VBZ'),
 ('a', 'DT'),
 ('stereotypically', 'RB'),
 ('Irish', 'JJ'),
 ('man', 'NN'),
 ("'s", 'POS'),
 ('bucket', 'NN'),
 ('with', 'IN'),
 ('beer', 'NN'),
 (',', ','),
 ('Carrie', 'NNP'),
 ('Nation', 'NNP'),
 ('and', 'CC'),
 ('her', 'PRP$'),
 ('followers', 'NNS'),
 ('burst', 'JJ'),
 ('inside', 'RB'),
 ('.', '.'),
 ('They', 'PRP'),
 ('assault', 'VBP'),
 ('the', 'DT'),
 ('Irish', 'NNP'),
 ('man', 'NN'),
 (',', ','),
 ('pulling', 'VBG'),
 ('his', 'PRP$'),
 ('hat', 'NN'),
 ('over', 'IN'),
 ('his', 'PRP$'),
 ('eyes', 'NNS'),
 ('and', 'CC'),
 ('then', 'RB'),
 ('dumping', 'VBG'),
 ('the', 'DT'),
 ('beer', 'NN'),
 ('over', 'IN'),
 ('his', 'PRP$'),
 ('head', 'NN'),
 ('.', '.'),
 ('The', 'DT'),
 ('group', 'NN'),
 ('then', 'RB'),
 (

In [13]:
# Save pred_tags (one flat list of (token, tag) pairs per plot) to disk with pickle.
import pickle
from pathlib import Path

pred_tags_path = Path('../Output/pred_tags_nltk')
# Create the output directory if it is missing so the dump does not fail with
# FileNotFoundError after the tagging cells have already been run.
pred_tags_path.parent.mkdir(parents=True, exist_ok=True)

with pred_tags_path.open('wb') as fp:
    pickle.dump(pred_tags, fp)