In [8]:
# Load libraries
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
import seaborn as sns
from sklearn.metrics import accuracy_score
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
#from sklearn.metrics import accuracy_score, classification_reportt

## Importing the Data

In [9]:
movie_data = pd.read_csv(r"/Users/alexandratorti/DS PROJECT COURSE/Project 2/wiki_movie_plots_deduped.csv")
movie_data.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34886 entries, 0 to 34885
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Release Year      34886 non-null  int64 
 1   Title             34886 non-null  object
 2   Origin/Ethnicity  34886 non-null  object
 3   Director          34886 non-null  object
 4   Cast              33464 non-null  object
 5   Genre             34886 non-null  object
 6   Wiki Page         34886 non-null  object
 7   Plot              34886 non-null  object
dtypes: int64(1), object(7)
memory usage: 2.1+ MB


## Cleaning the Data (Dropping)

In [10]:
# Select only films that are American in origin
us_movies = movie_data[movie_data['Origin/Ethnicity'] == 'American']

# Drop unwanted columns (Wiki.Page, Director, Cast)
us_movies = us_movies.drop(columns=['Wiki Page', 'Director', 'Cast'])

# Print the summary of the resulting DataFrame
us_movies.info()

<class 'pandas.core.frame.DataFrame'>
Index: 17377 entries, 0 to 17376
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Release Year      17377 non-null  int64 
 1   Title             17377 non-null  object
 2   Origin/Ethnicity  17377 non-null  object
 3   Genre             17377 non-null  object
 4   Plot              17377 non-null  object
dtypes: int64(1), object(4)
memory usage: 814.5+ KB


In [17]:
# Fetch Title and Plot columns
finaldata = us_movies[["Title", "Plot"]]
# Setting the movie title as index
finaldata = finaldata.set_index('Title')
finaldata.head()

Unnamed: 0_level_0,Plot
Title,Unnamed: 1_level_1
Kansas Saloon Smashers,"A bartender is working at a saloon, serving dr..."
Love by the Light of the Moon,"The moon, painted with a smiling face hangs ov..."
The Martyred Presidents,"The film, just over a minute long, is composed..."
"Terrible Teddy, the Grizzly King",Lasting just 61 seconds and consisting of two ...
Jack and the Beanstalk,The earliest known adaptation of the classic f...


In [11]:
#Project two, Movie recommender

## Guiding Question
- How do we create a machine learning model that offers recommendations based on movie plot? 

## Data Background
- Data Dimensions 
- ~35,000 movies (rows) x 8 columns 

### Columns in the data
- Year
- Title
- Origin/Ethnicity (dropping everything that’s not American) 
- Director (dropping)
- Cast (dropping)
- Genre
- Wiki page (dropping) 
- Plot 

### What our cleaned data set looks like (before LDA pre-processing)
- 5 x ~17,500
- Year, Title, Origin = American, Genre, Plot

## LDA Background 
- Because our recommender will be based of text recognition and synopsis, we must have a way to identify trends within each text piece.
- LDA (Latent Dirichlet Allocation) is a topic modeling technique used for extracting topics from text documents. It finds topics that the “text” belongs to on the basis of the specific words it contains. It then groups similar topics allowing to map probability distribution over latent topics

### Term Frequency-Inverse Document Frequency
- Count each words frequency thus converting the text format to a numerical one





## Importing LDA and Tokenizer Specific Packages

## Using nltk and spaCy


In [None]:
#pip install stop-words

In [18]:
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
import gensim

In [19]:
tokenizer = RegexpTokenizer(r'\w+')

# create English stop words list
en_stop = get_stop_words('en')

# Create p_stemmer of class PorterStemmer
p_stemmer = PorterStemmer()

## 1) dividing text into sentences


In [20]:
#Tokenize

# def sent_to_words(sentences):
#     for sentence in sentences:
#             sentence = sentence.lower()(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

# data_words = list(sent_to_words(data))
# #print(data_words[:1])


import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

#nltk.download('punkt')
#nltk.download('stopwords')

#we will not need to process data to take out unimportant words

# Download stopwords dataset
#nltk.download('stopwords')
#nltk.download('punkt')

def preprocess_text(text):
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text)
    filtered_tokens = [word.lower() for word in tokens if word.isalnum() and word.lower() not in stop_words]
    return ' '.join(filtered_tokens)
        

In [21]:
finaldata['Processed_Plot'] = finaldata['Plot'].apply(preprocess_text)

In [22]:
finaldata.head()

Unnamed: 0_level_0,Plot,Processed_Plot
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
Kansas Saloon Smashers,"A bartender is working at a saloon, serving dr...",bartender working saloon serving drinks custom...
Love by the Light of the Moon,"The moon, painted with a smiling face hangs ov...",moon painted smiling face hangs park night you...
The Martyred Presidents,"The film, just over a minute long, is composed...",film minute long composed two shots first girl...
"Terrible Teddy, the Grizzly King",Lasting just 61 seconds and consisting of two ...,lasting 61 seconds consisting two shots first ...
Jack and the Beanstalk,The earliest known adaptation of the classic f...,earliest known adaptation classic fairytale fi...


In [23]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [24]:
# Extract the 'Plot' column values from the DataFrame and convert to a list
finaldata = finaldata.Plot.values.tolist()
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations
data_words = list(sent_to_words(finalda # Convert the list of sentences to a list of tokenized wordsta))
print(data_words[:1])

[['bartender', 'is', 'working', 'at', 'saloon', 'serving', 'drinks', 'to', 'customers', 'after', 'he', 'fills', 'stereotypically', 'irish', 'man', 'bucket', 'with', 'beer', 'carrie', 'nation', 'and', 'her', 'followers', 'burst', 'inside', 'they', 'assault', 'the', 'irish', 'man', 'pulling', 'his', 'hat', 'over', 'his', 'eyes', 'and', 'then', 'dumping', 'the', 'beer', 'over', 'his', 'head', 'the', 'group', 'then', 'begin', 'wrecking', 'the', 'bar', 'smashing', 'the', 'fixtures', 'mirrors', 'and', 'breaking', 'the', 'cash', 'register', 'the', 'bartender', 'then', 'sprays', 'seltzer', 'water', 'in', 'nation', 'face', 'before', 'group', 'of', 'policemen', 'appear', 'and', 'order', 'everybody', 'to', 'leave']]


In [25]:
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']): #'NOUN', 'ADJ', 'VERB', 'ADV'
    texts_out = []
    for sent in texts:
        doc = nlp(" ".joinent)) 
        texts_out.append(" ".join([token.lemma_ if        # Extract lemmatized forms, excluding pronouns, and considering specified POS tags token.lemma_ not in ['-PRON-'] else '' for token in doc if token.pos_ in allowed_postags]))
    return texts_out # Initialize an empty list to store lemmatized te #returns the list of lemmatized textsxts


OSError: [E941] Can't find model 'en'. It looks like you're trying to load a model from a shortcut, which is obsolete as of spaCy v3.0. To load the model, use its full name instead:

nlp = spacy.load("en_core_web_sm")

For more details on the available models, see the models directory: https://spacy.io/models. If you want to create a blank model, use spacy.blank: nlp = spacy.blank("en")

In [27]:
## Document word matrix 

vectorizer = CountVectorizer(analyzer='word',       
                             min_df=10,
# minimum reqd occurences of a word 
                             stop_words='english',             
# remove stop words
                             lowercase=True,                   
# convert all words to lowercase
                             token_pattern='[a-zA-Z0-9]{3,}',  
# num chars > 3
                             # max_features=50000,             
# max number of uniq words    )
data_vectorized = vectorizer.fit_transform(data_lemmatized)

SyntaxError: incomplete input (356471259.py, line 14)

In [None]:
## Replace newlines with spaces

#text = .replace("\n", " ")

#doc = nlp(finaldata['Processed_Plot'])
#sentences = [sentence.text for sentence in doc.sents]

In [None]:
# Initialize the spaCy engine
nlp = spacy.load("en_core_web_sm")

# Create empty lists to store results
all_words = []
all_pos = []

# Iterate over each row in the DataFrame
for index, row in finaldata.iterrows():
    # Read in the book text from the 'processed_plot' column
    text = row['Processed_Plot']

    # Replace newlines with spaces
    #text = text.replace("\n", " ")

    # Use the spaCy engine to process the text
    doc = nlp(text)

    # Get the list of tuples with words and parts of speech tags
    words = [token.text for token in doc]
    pos = [token.pos_ for token in doc]

    # Append the results to the lists
    all_words.append(words)
    all_pos.append(pos)

# Add the lists of words and parts of speech as new columns to the DataFrame
finaldata['words'] = all_words
finaldata['pos_tags'] = all_pos

# Display the DataFrame
print(finaldata)


In [None]:
# Import the NLTK Snowball Stemmer:
from nltk.stem.snowball import SnowballStemmer

# Initialize stemmer with English:
stemmer = SnowballStemmer('english')

# Initialize a list with words to stem:
words

# Stem the words:

for word in words:
    stemmed_words = [stemmer.stem(words)]



In [None]:
## Combine parts of speech tagging and lemmatization



