# Inferring Topics from IMDB Reviews

In [1]:
import numpy as np
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
import pandas as pd
import matplotlib.pyplot as plt

## Exploring the Dataset: [Large Movie Review Dataset](https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz)

In [2]:
ROOT = '../aclImdb/train/pos/'
reviews = []
for file in os.listdir(ROOT):
    path = os.path.join(ROOT, file)
    if os.path.isfile(path):
        with open(path, 'r') as fin:
            reviews.append(fin.read())

In [3]:
len(reviews)

12500

In [4]:
for i in range(3):
    print(reviews[i])
    print('=' * 150)

For a movie that gets no respect there sure are a lot of memorable quotes listed for this gem. Imagine a movie where Joe Piscopo is actually funny! Maureen Stapleton is a scene stealer. The Moroni character is an absolute scream. Watch for Alan "The Skipper" Hale jr. as a police Sgt.
Bizarre horror movie filled with famous faces but stolen by Cristina Raines (later of TV's "Flamingo Road") as a pretty but somewhat unstable model with a gummy smile who is slated to pay for her attempted suicides by guarding the Gateway to Hell! The scenes with Raines modeling are very well captured, the mood music is perfect, Deborah Raffin is charming as Cristina's pal, but when Raines moves into a creepy Brooklyn Heights brownstone (inhabited by a blind priest on the top floor), things really start cooking. The neighbors, including a fantastically wicked Burgess Meredith and kinky couple Sylvia Miles & Beverly D'Angelo, are a diabolical lot, and Eli Wallach is great fun as a wily police detective. The

## Feature Extraction

In [5]:
vect = TfidfVectorizer(stop_words='english')
X = vect.fit_transform(reviews)

pd.DataFrame(X.toarray(), columns=vect.get_feature_names())

Unnamed: 0,00,000,000s,003830,006,007,0079,0080,0083,0093638,...,élan,émigré,émigrés,était,état,étc,êxtase,ís,østbye,über
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12495,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12496,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12497,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12498,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## NMF Decomposition

In [6]:
N_TOPICS = 15
nmf = NMF(n_components=N_TOPICS)
W = nmf.fit_transform(X)  # Document-topic matrix
H = nmf.components_       # Topic-term matrix



In [7]:
# Top 10 words per topic
words = np.array(vect.get_feature_names())
topic_words = pd.DataFrame(np.zeros((N_TOPICS, 10)), index=[f'Topic {i + 1}' for i in range(N_TOPICS)],
                           columns=[f'Word {i + 1}' for i in range(10)]).astype(str)
for i in range(N_TOPICS):
    ix = H[i].argsort()[::-1][:10]
    topic_words.iloc[i] = words[ix]

topic_words

Unnamed: 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10
Topic 1,br,10,ll,spoilers,end,simply,yes,plot,just,spoiler
Topic 2,movie,movies,watch,recommend,saw,10,definitely,enjoyed,makes,watching
Topic 3,film,films,director,scenes,characters,cinema,plot,festival,work,art
Topic 4,series,episode,episodes,season,tv,characters,trek,seasons,shows,television
Topic 5,man,role,character,performance,plays,john,does,played,father,scene
Topic 6,good,action,pretty,story,bad,acting,really,plot,scenes,nice
Topic 7,war,world,documentary,people,american,history,soldiers,men,women,hitler
Topic 8,funny,comedy,laugh,hilarious,fun,eddie,jokes,humor,funniest,murphy
Topic 9,like,really,think,just,don,people,know,say,didn,watch
Topic 10,kids,family,old,years,disney,children,saw,time,remember,little


In [8]:
# Create a topic mapping

topic_mapping = {
    'Topic 4': 'TV',
    'Topic 7': 'War',
    'Topic 8': 'Comedy',
    'Topic 12': 'Book Adaptation',
    'Topic 13': 'Horror',
    'Topic 15': 'Martial Arts / Action'
}

In [9]:
# Recall the document-topic matrix, W
W = pd.DataFrame(W, columns=[f'Topic {i + 1}' for i in range(N_TOPICS)])
print(W.shape)
W['max_topic'] = W.apply(lambda x: topic_mapping.get(x.idxmax()), axis=1)
W[pd.notnull(W['max_topic'])].head(10)

(12500, 15)


Unnamed: 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6,Topic 7,Topic 8,Topic 9,Topic 10,Topic 11,Topic 12,Topic 13,Topic 14,Topic 15,max_topic
1,0.0,0.015769,0.0,0.003017,0.010958,0.002338,0.0,0.001545,0.000668,0.000492,0.0,0.00032,0.044646,0.017596,0.001294,Horror
4,5.3e-05,0.0,0.0,0.037117,0.000832,0.0,0.0,0.0,0.005521,0.030363,0.002332,0.005324,0.0,0.022419,0.030687,TV
17,0.01203,0.0,0.001051,0.015024,0.014388,0.000491,0.004621,0.042989,0.0,0.004442,0.0,0.0,0.001843,0.012249,0.0,Comedy
18,0.011419,0.058319,0.0,0.0,0.0,0.016326,0.0,0.0,0.021421,0.011607,0.0,0.062748,0.0,0.00619,0.000166,Book Adaptation
24,0.017393,0.013487,0.001179,0.000692,0.013206,0.00247,0.020492,0.00232,0.00022,0.000756,0.016587,0.000626,0.0,0.01006,0.0,War
27,0.038921,0.022109,0.012303,1.4e-05,0.009371,0.002917,0.003613,0.003866,0.003096,0.002472,0.009002,0.0,0.065038,0.004335,0.005657,Horror
31,0.0,0.001965,0.0,0.044606,0.0296,0.012175,0.0,0.006613,0.008433,0.003966,0.0,0.0,0.0,0.00764,0.002365,TV
35,0.0,0.0,0.005251,0.0,0.010681,0.00899,0.008599,0.0,0.006195,0.0,0.007875,0.0,0.011324,0.0,0.042295,Martial Arts / Action
41,0.0,0.0,0.0,0.009448,0.0,0.006847,0.0,0.0,0.004033,0.001033,0.0,0.17799,0.0,0.0,0.048359,Book Adaptation
43,0.022421,0.007169,0.014238,0.0,0.007134,0.0,0.0,0.001342,0.0,1.9e-05,0.019565,0.022687,0.043617,0.010864,0.007921,Horror


In [10]:
reviews[58]

'I sit through movies like "Tiempo de valientes" and I want to talk about cinema for hours. The admiration this movie caused me is beyond my own limits of explanation, because I\'m watching the scenes of the film and I search inside my thoughts for film-making ideas and dialogue innovations that could emerge from something bigger than Damian Szifron\'s mind.<br /><br />Looking the environment, so uncompromised, so simple, I\'m thinking; this man is a genius. No wonder he created what is probably the best television show Argentina ever witnessed, and then a first movie full of elements some contemporary directors haven\'t still achieved. "El fondo del mar" is the name and, it awakened (a few years ago), my enthusiasm for our everyday cinema.<br /><br />Starting his journey from people\'s daily real lives, Szifron goes where Pablo Trapero never could in "El Bonaerense"; the Federal Police Department\'s life. Trapero\'s film was a journey into a man\'s mind and experiences, not into the p