In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim import models
from gensim.corpora import Dictionary
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
import statsmodels.api as sm
from scipy.stats import chisquare
from collections import Counter
from scipy.optimize import minimize_scalar
import operator
import itertools
from collections import defaultdict
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.manifold import MDS
from gensim import corpora, models
import re
import csv

## Netflix Stuff

In [2]:
# Load your data into a Pandas DataFrame
data = pd.read_csv('Netflix_All_Input_vF.csv')

In [3]:
data.dtypes

Unnamed: 0        int64
post_link        object
comment_count     int64
likes_count       int64
caption          object
is_video           bool
image_link       object
dtype: object

In [4]:
# Preprocess your text data (tokenization, remove stopwords, etc.)

data['caption'] = data['caption'].apply(lambda x: str(x) if not isinstance(x, str) else x)

# Join all 'caption' values into a single string
all_reviews_text = ' '.join(data['caption'].tolist())

# Perform text cleaning and tokenization
all_reviews_text = re.sub(r'[^\w\s]', '', all_reviews_text)
all_reviews_text = all_reviews_text.lower()
words = word_tokenize(all_reviews_text)

In [5]:
stop_words = set(stopwords.words('english'))
words = [word for word in words if word.lower() not in stop_words]

In [6]:
stemmer = PorterStemmer()
words = [stemmer.stem(word) for word in words]

In [7]:
freq_dist = FreqDist(words)
freq_dist = sorted(freq_dist.items(), key=operator.itemgetter(1), reverse = True)

In [8]:
# # Vectorize the text data using TF-IDF
# tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
# tfidf_matrix = tfidf_vectorizer.fit_transform(data['caption'])

In [9]:
dictionary = corpora.Dictionary([words])
corpus = [dictionary.doc2bow(words)]

lda_model = models.LdaModel(corpus, num_topics=3, id2word=dictionary)

In [10]:
for topic_id, topic in lda_model.print_topics():
    print(f"Topic {topic_id}: {topic}")

Topic 0: 0.009*"come" + 0.005*"season" + 0.005*"tudum" + 0.005*"one" + 0.005*"netflix" + 0.005*"new" + 0.004*"live" + 0.004*"look" + 0.003*"heart" + 0.003*"cant"
Topic 1: 0.010*"come" + 0.008*"netflix" + 0.006*"new" + 0.006*"one" + 0.006*"season" + 0.006*"love" + 0.006*"onepiecenetflix" + 0.005*"tudum" + 0.005*"look" + 0.005*"live"
Topic 2: 0.009*"come" + 0.007*"new" + 0.007*"netflix" + 0.006*"season" + 0.005*"live" + 0.005*"one" + 0.005*"onepiecenetflix" + 0.005*"tudum" + 0.004*"love" + 0.004*"heart"


In [11]:
for doc in corpus:
    print(lda_model.get_document_topics(doc))

[(1, 0.9620851), (2, 0.037680224)]


## Disney Stuff

In [22]:
# Load your data into a Pandas DataFrame
data = pd.read_csv('DisneyPlus_All_Input_vF.csv')

# Preprocess your text data (tokenization, remove stopwords, etc.)

data['caption'] = data['caption'].apply(lambda x: str(x) if not isinstance(x, str) else x)

# Join all 'caption' values into a single string
all_reviews_text = ' '.join(data['caption'].tolist())

# Perform text cleaning and tokenization
all_reviews_text = re.sub(r'[^\w\s]', '', all_reviews_text)
all_reviews_text = all_reviews_text.lower()
words = word_tokenize(all_reviews_text)

stop_words = set(stopwords.words('english'))
words = [word for word in words if word.lower() not in stop_words]

stemmer = PorterStemmer()
words = [stemmer.stem(word) for word in words]

freq_dist = FreqDist(words)
freq_dist = sorted(freq_dist.items(), key=operator.itemgetter(1), reverse = True)

dictionary = corpora.Dictionary([words])
corpus = [dictionary.doc2bow(words)]

lda_model = models.LdaModel(corpus, num_topics=2, id2word=dictionary)

for topic_id, topic in lda_model.print_topics():
    print(f"Topic {topic_id}: {topic}")

for doc in corpus:
    print(lda_model.get_document_topics(doc))

Topic 0: 0.058*"disneyplu" + 0.039*"stream" + 0.011*"studio" + 0.010*"origin" + 0.009*"new" + 0.008*"seri" + 0.008*"season" + 0.007*"check" + 0.007*"premier" + 0.007*"themandalorian"
Topic 1: 0.058*"disneyplu" + 0.053*"stream" + 0.012*"origin" + 0.010*"studio" + 0.010*"marvel" + 0.008*"seri" + 0.008*"new" + 0.008*"season" + 0.007*"2" + 0.007*"come"
[(0, 0.08641486), (1, 0.9135852)]


In [27]:
import matplotlib.pyplot as plt

# Extract the weights (probabilities) and words for each topic
topic_words = []
for topic in topic:
    topic_words.append([word.split('*')[1].strip('"') for word in topic.split(' + ')])

# Plot the bar chart for each topic
num_topics = len(topics)
num_words = len(topic_words[0])  # Assuming all topics have the same number of words
word_weights = []

for i in range(num_words):
    word_weights.append([float(topic.split('*')[i].split('"')[0]) for topic in topics])

# Set the x-axis labels (words)
words = [word for word in word_weights[0]]

# Plot the bar chart
plt.figure(figsize=(10, 6))
for i in range(num_topics):
    plt.barh(words, word_weights[i], label=f"Topic {i}")

plt.xlabel("Word Weight")
plt.ylabel("Words")
plt.title("Word Distribution in LDA Topics")
plt.legend()
plt.tight_layout()

plt.show()


IndexError: list index out of range