## AWS Outline for LDA

In [2]:
# Main libraries
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
import seaborn as sns
from sklearn.metrics import accuracy_score
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
#from sklearn.metrics import accuracy_score, classification_reportt

### Importing Data

In [3]:
movie_data = pd.read_csv(r"/Users/krummelha/Desktop/J2024/Project2/wiki_movie_plots_deduped.csv")
movie_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34886 entries, 0 to 34885
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Release Year      34886 non-null  int64 
 1   Title             34886 non-null  object
 2   Origin/Ethnicity  34886 non-null  object
 3   Director          34886 non-null  object
 4   Cast              33464 non-null  object
 5   Genre             34886 non-null  object
 6   Wiki Page         34886 non-null  object
 7   Plot              34886 non-null  object
dtypes: int64(1), object(7)
memory usage: 2.1+ MB


### Cleaning Data

In [4]:
# Select only films that are American in origin
us_movies = movie_data[movie_data['Origin/Ethnicity'] == 'American']

# Drop unwanted columns (Wiki.Page, Director, Cast)
us_movies = us_movies.drop(columns=['Wiki Page', 'Director', 'Cast'])

# Print the summary of the resulting DataFrame
us_movies.info()

<class 'pandas.core.frame.DataFrame'>
Index: 17377 entries, 0 to 17376
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Release Year      17377 non-null  int64 
 1   Title             17377 non-null  object
 2   Origin/Ethnicity  17377 non-null  object
 3   Genre             17377 non-null  object
 4   Plot              17377 non-null  object
dtypes: int64(1), object(4)
memory usage: 814.5+ KB


In [5]:
# Fetch Title and Plot columns
finaldata = us_movies[["Title", "Plot"]]
# Setting the movie title as index
finaldata = finaldata.set_index('Title')
finaldata.head()

Unnamed: 0_level_0,Plot
Title,Unnamed: 1_level_1
Kansas Saloon Smashers,"A bartender is working at a saloon, serving dr..."
Love by the Light of the Moon,"The moon, painted with a smiling face hangs ov..."
The Martyred Presidents,"The film, just over a minute long, is composed..."
"Terrible Teddy, the Grizzly King",Lasting just 61 seconds and consisting of two ...
Jack and the Beanstalk,The earliest known adaptation of the classic f...


### Beginning LDA

In [6]:
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
import gensim


In [7]:
# create English stop words list
en_stop = get_stop_words('en')

In [8]:
tokenizer = RegexpTokenizer(r'\w+')

In [9]:
# Create p_stemmer of class PorterStemmer
p_stemmer = PorterStemmer()

In [10]:
# Compile the plot information into a list

plot_set = []
for item in finaldata["Plot"]:
    plot_set.append(item)

In [11]:
print(plot_set[1:2])

["The moon, painted with a smiling face hangs over a park at night. A young couple walking past a fence learn on a railing and look up. The moon smiles. They embrace, and the moon's smile gets bigger. They then sit down on a bench by a tree. The moon's view is blocked, causing him to frown. In the last scene, the man fans the woman with his hat because the moon has left the sky and is perched over her shoulder to see everything better."]


In [12]:
# List for tokenized plots in a loop
texts = []

In [13]:
# loop through plot list
for item in plot_set:
    
    # clean and tokenize document string
    raw = item.lower()
    tokens = tokenizer.tokenize(raw)

    # remove stop words from tokens
    stopped_tokens = [item for item in tokens if not item in en_stop]
    
    # stem tokens
    stemmed_tokens = [p_stemmer.stem(item) for item in stopped_tokens]
    
    # add tokens to list
    texts.append(stemmed_tokens)

In [14]:
# turn our tokenized documents into a id <-> term dictionary
dictionary = corpora.Dictionary(texts)

In [15]:
# convert tokenized documents into a document-term matrix
corpus = [dictionary.doc2bow(text) for text in texts]

In [16]:
# generate LDA model
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=13, id2word = dictionary, passes=20)

In [17]:
print(ldamodel.print_topics(num_topics=13, num_words=4))

[(0, '0.020*"s" + 0.011*"film" + 0.009*"school" + 0.008*"play"'), (1, '0.027*"s" + 0.010*"tell" + 0.008*"get" + 0.008*"leav"'), (2, '0.032*"s" + 0.018*"kill" + 0.014*"polic" + 0.009*"murder"'), (3, '0.021*"s" + 0.010*"find" + 0.009*"back" + 0.007*"get"'), (4, '0.100*"tom" + 0.063*"paul" + 0.060*"charli" + 0.049*"jerri"'), (5, '0.058*"larri" + 0.040*"ted" + 0.037*"sara" + 0.036*"angela"'), (6, '0.020*"s" + 0.015*"joe" + 0.013*"ben" + 0.011*"town"'), (7, '0.023*"s" + 0.009*"kill" + 0.006*"attack" + 0.006*"use"'), (8, '0.027*"s" + 0.009*"love" + 0.008*"max" + 0.008*"king"'), (9, '0.115*"mari" + 0.093*"michael" + 0.034*"fred" + 0.032*"clair"'), (10, '0.084*"david" + 0.070*"nick" + 0.061*"sam" + 0.047*"chri"'), (11, '0.066*"alic" + 0.055*"helen" + 0.038*"edward" + 0.036*"roy"'), (12, '0.031*"s" + 0.010*"kill" + 0.008*"find" + 0.007*"dr"')]
