# Inferring Topics from IMDB Reviews

In [1]:
import numpy as np
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
import pandas as pd
import matplotlib.pyplot as plt

## Exploring the Dataset: [Large Movie Review Dataset](https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz)

In [4]:
ROOT = './train/pos/'

In [5]:
reviews = []
for file in os.listdir(ROOT):
    path = os.path.join(ROOT, file)
    if os.path.isfile(path):
        with open(path, 'r') as fin:
            reviews.append(fin.read())

In [6]:
len(reviews)

2050

In [7]:
for i in range(3):
    print(reviews[i])
    print('=' * 150)

One of my favourite films first saw it when I was about 10, which probably tells you a lot about the type of humour. Although dated the humour definitely has a charm about it. Expect to see the usual Askey & Murdoch banter so popular in its day, with lots of interesting, quirky co-characters. The lady with the parrot, the couple due to get married and are in trouble from 'her', and my favourite, the stationmaster, "Nobody knows where it comes from ... nobody knows where it goes.." Interestingly the ghost train was written by Arnold Ridley of Dads Army fame (Private Godfrey the medic) Watch it on a rainy Sunday afternoon after your lunch and smile.
Having not seen the films before (and not being able to stand Matt Damon), I was reluctant to go see The Bourne Ultimatum when we were asked to see it for AS Film Studies. <br /><br />However, I was pleasantly surprised that even a film with Damon in it could be enjoyable. <br /><br />Fast fight scenes, crazy motorbike chases and BIG explosio

## Feature Extraction

In [9]:
vect = TfidfVectorizer(stop_words='english')
X = vect.fit_transform(reviews)

pd.DataFrame(X.toarray(), columns=vect.get_feature_names_out_out())

Unnamed: 0,00,000,007,0080,00s,01,02,03,05,06,...,álex,álvaro,ángel,äänekoski,åge,écran,émigrés,était,ís,østbye
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2045,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2046,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2047,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2048,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## NMF Decomposition

In [10]:
N_TOPICS = 15
nmf = NMF(n_components=N_TOPICS)
W = nmf.fit_transform(X)  # Document-topic matrix
H = nmf.components_       # Topic-term matrix



In [12]:
# Top 10 words per topic

words = np.array(vect.get_feature_names_out())
topic_words = pd.DataFrame(np.zeros((N_TOPICS, 10)), index=[f'Topic {i + 1}' for i in range(N_TOPICS)],
                           columns=[f'Word {i + 1}' for i in range(10)]).astype(str)
for i in range(N_TOPICS):
    ix = H[i].argsort()[::-1][:10]
    topic_words.iloc[i] = words[ix]

topic_words

Unnamed: 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10
Topic 1,br,10,scene,plot,high,ve,ll,story,just,quite
Topic 2,movie,movies,saw,watch,fun,recommend,acting,seen,thought,watching
Topic 3,film,films,characters,director,style,like,art,work,seen,character
Topic 4,good,really,just,like,think,don,people,know,say,didn
Topic 5,life,family,people,story,old,young,man,father,day,children
Topic 6,man,cast,role,john,character,performance,murder,best,mr,actor
Topic 7,great,job,really,acting,cast,does,music,plot,did,kidman
Topic 8,series,episode,tv,episodes,war,dvd,television,new,trek,sci
Topic 9,tom,jerry,cartoon,invisible,mouse,short,cat,cartoons,butch,lee
Topic 10,disney,cinderella,animation,holes,fairy,king,lion,mice,family,animated


In [13]:
# Create a topic mapping

topic_mapping = {
    'Topic 4': 'TV',
    'Topic 7': 'War',
    'Topic 8': 'Comedy',
    'Topic 12': 'Book Adaptation',
    'Topic 13': 'Horror',
    'Topic 15': 'Martial Arts / Action'
}

In [14]:
# Recall the document-topic matrix, W

W = pd.DataFrame(W, columns=[f'Topic {i + 1}' for i in range(N_TOPICS)])
W['max_topic'] = W.apply(lambda x: topic_mapping.get(x.idxmax()), axis=1)
W[pd.notnull(W['max_topic'])].head(10)

Unnamed: 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6,Topic 7,Topic 8,Topic 9,Topic 10,Topic 11,Topic 12,Topic 13,Topic 14,Topic 15,max_topic
0,0.001754,0.005079,0.009782,0.007382,0.012188,0.004189,0.0,0.000254,0.004563,0.001146,0.009166,0.01945,0.010549,0.00571,0.023677,Martial Arts / Action
5,0.0,0.029777,0.005211,0.0,0.033378,0.0,0.026178,0.016732,0.0,0.0,0.0,0.139184,0.0,0.0,0.005651,Book Adaptation
12,0.021327,0.003745,0.003796,0.027673,0.010666,0.001247,0.0,0.007088,0.004035,0.0,0.00383,0.0,0.0,0.001634,0.0,TV
21,0.011278,0.0,0.018468,0.016934,0.018807,0.013045,0.004115,0.021908,0.004862,0.005854,0.0,0.0,0.0,0.007614,0.008256,Comedy
22,0.0,0.030793,0.018726,0.001236,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.061675,Martial Arts / Action
23,0.0,0.012135,0.030849,0.090028,0.0,0.0,0.0058,0.0,0.000117,0.0,0.0,0.078798,0.0,0.0,0.0,TV
29,0.01897,0.001983,0.0,0.005131,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.35302,0.012196,0.0,Horror
31,0.0,0.018864,0.049291,0.004012,0.0,0.0,0.012509,0.0,0.0,0.0,0.0,0.173679,0.0,0.0,0.0,Book Adaptation
38,0.046063,0.021134,0.054183,0.02124,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.198766,0.010097,0.0,0.0,Book Adaptation
40,0.049831,0.021143,0.01827,0.036501,0.000763,0.026611,0.058917,0.0,0.0,0.0,0.005735,0.013233,0.009038,0.014872,0.0,War


In [15]:
reviews[58]

'James Dickey is a wonderfully descriptive author. When one reads "Deliverance", one is instantly transported into the lush backwoods of the Deep South. When one watches John Boorman\'s film version of the book, one realizes just how accurately he captures the essence of the book. The camera is as descriptive as the narration. The characters are fully realized, and the portrayals are fantastic. I first saw this movie in 1992, after my freshman year of college. I was in a phase where I was watching movies that were all released within a couple of years of my birth in 1973. Among them were "Patton", "Papillon", and "All the President\'s Men"; fine films, all of them. This one was easily the class of the group. That says a lot.'

In [16]:
# Frobenius norm

import numpy as np

print("Frobenius norm and the condition number:")
print(np.linalg.norm([[1,1,1],[3,4,1],[4,1,2]], 'fro'))
print(np.linalg.cond([[1,1,1],[3,4,1],[4,1,2]], 'fro'))


Frobenius norm and the condition number:
7.0710678118654755
13.975424859373685
