In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob
import matplotlib.pyplot as plt
import regex as re
import nltk
import string
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer
from langdetect import detect, DetectorFactory, LangDetectException
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\stuti\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
df = pd.read_csv('polarity_subjectivity_data_without_LemmStemm.csv')
df.head(1)

Unnamed: 0.1,Unnamed: 0,URL,transcript,Comedian Name,word_count,Unique ID,neg_polarity,neu_polarity,pos_polarity,compound,subjectivity
0,0,https://scrapsfromtheloft.com/comedy/pete-holm...,\r\n \r\n [audience cheering and applauding] \...,Pete Holmes,10068,0,0.056,0.688,0.256,1.0,0.504545


In [3]:
# Removing punctuations from transcript
df['transcript'] = df['transcript'].str.replace(f'[{string.punctuation}]', '', regex = True)

# Removing special characters
df['transcript'] = df['transcript'].apply(lambda x: re.sub('[^\w\s]', '', x))

# Decapitalizing text
df['transcript'] = df['transcript'].apply(lambda x: x.lower())

In [4]:
df.head(1)

Unnamed: 0.1,Unnamed: 0,URL,transcript,Comedian Name,word_count,Unique ID,neg_polarity,neu_polarity,pos_polarity,compound,subjectivity
0,0,https://scrapsfromtheloft.com/comedy/pete-holm...,\r\n \r\n audience cheering and applauding \r\...,Pete Holmes,10068,0,0.056,0.688,0.256,1.0,0.504545


In [5]:
# Eliminating stop words
stop_words = stopwords.words('english')

df['tokens_lst'] = df['transcript'].apply(lambda x: [w for w in word_tokenize(x) if not w.lower() in stop_words])

In [6]:
# Counting most frequently ocurring words in the corpus
txt = df['tokens_lst'].apply(lambda x: ' '.join(x))
tokens = [w for lst in txt.apply(word_tokenize) for w in lst]
word_cnt = FreqDist(tokens)

In [7]:
# Map each word in list to adjective, nouns, verbs, adverbs
#allowed_tag = ['NN', 'NNS', 'NNP', 'NNPS',     
#               'RB', 'RBR', 'RBS', 
#               'JJ', 'JJR', 'JJS',
#               'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']

# Nouns tag list
allowed_tag_nouns = ['NN', 'NNS', 'NNP', 'NNPS']

token_tag_lst = df['tokens_lst'].apply(lambda x: pos_tag(x))
df['all_tokens'] = token_tag_lst

df['select_tokens'] = df['all_tokens'].apply(lambda x: [word for word, tag in x if tag in allowed_tag_nouns])

In [8]:
df['diff'] = df.apply(lambda r: len(r['tokens_lst']) - len(r['select_tokens']), axis=1)
df.head(3)

Unnamed: 0.1,Unnamed: 0,URL,transcript,Comedian Name,word_count,Unique ID,neg_polarity,neu_polarity,pos_polarity,compound,subjectivity,tokens_lst,all_tokens,select_tokens,diff
0,0,https://scrapsfromtheloft.com/comedy/pete-holm...,\r\n \r\n audience cheering and applauding \r\...,Pete Holmes,10068,0,0.056,0.688,0.256,1.0,0.504545,"[audience, cheering, applauding, hello, hello,...","[(audience, NN), (cheering, VBG), (applauding,...","[audience, hello, cheering, applause, thank, g...",3196
1,1,https://scrapsfromtheloft.com/comedy/jeff-dunh...,\r\n \r\n im funnier than he is but they told ...,Jeff Dunham,4943,1,0.062,0.673,0.264,1.0,0.535705,"[im, funnier, told, introduce, heres, jeff, du...","[(im, NN), (funnier, NN), (told, VBD), (introd...","[im, funnier, introduce, heres, upbeat, rock, ...",1485
2,2,https://scrapsfromtheloft.com/comedy/taylor-to...,\r\n \r\n in her 2024 netflix standup comedy s...,Taylor Tomlinson,11197,2,0.077,0.717,0.206,1.0,0.510567,"[2024, netflix, standup, comedy, special, tayl...","[(2024, CD), (netflix, JJ), (standup, NN), (co...","[standup, comedy, taylor, tomlinson, offers, e...",3451


In [9]:
#df.to_csv('preprocess_final.csv')

In [10]:
# Lemmatizing the final tokens column

#lem = WordNetLemmatizer()

#def lem_tokens(tokens):
#    return [lem.lemmatize(token) for token in tokens]

#df['lem_tokens'] = df['select_tokens'].apply(lambda x: lem_tokens(x))

In [11]:
DetectorFactory.seed = 0
DetectorFactory.min_num_of_characters = 50 

def lang_detect(transcripts):
    try:
        return detect(transcripts)
    except LangDetectException:
        return None

df['language'] = df['transcript'].apply(lang_detect)

df = df[df['language'] == 'en']

#### Latent Dirichlet Allocation

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

# Text to Numerical Representation
tf_vect = TfidfVectorizer(tokenizer= lambda doc: doc, lowercase = False)
cv_vect = CountVectorizer(tokenizer = lambda doc:doc, lowercase = False)

In [13]:
# Array from vectorizers

tf_arr = tf_vect.fit_transform(df['select_tokens'])
cv_arr = cv_vect.fit_transform(df['select_tokens'])
tf_arr



<464x34404 sparse matrix of type '<class 'numpy.float64'>'
	with 328680 stored elements in Compressed Sparse Row format>

In [14]:
vocab_tf = tf_vect.get_feature_names_out()
vocab_tf

array(['11deniers', '60s', '70s', ..., 'és', 'órale', 'über'],
      dtype=object)

In [15]:
lda = LatentDirichletAllocation(n_components = 10, max_iter = 200, random_state = 0)
topics = lda.fit_transform(tf_arr)
topic_words = lda.components_

In [16]:
import numpy as np

n_words = 11

for i, topic in enumerate(topic_words):
    sorted_topic = np.argsort(topic)

    topic_words = np.array(vocab_tf)[sorted_topic]
    topic_words = topic_words[:-n_words:-1]
    print('Topic', str(i+1), topic_words)

Topic 1 ['drinken' 'sie' 'halfcaste' 'fo' 'mavericks' 'madman' 'kannst' 'vater'
 'mixedbreed' 'helfen']
Topic 2 ['bolsonaro' 'deforestation' 'rainforest' 'guajajara' 'jair' 'brazils'
 'jbs' 'sônia' 'bolsonaros' 'ox']
Topic 3 ['josep' 'filmstar' 'eff' 'loirn' 'sergeant' 'moccasins' 'righthand'
 'birch' 'feather' 'horselaugh']
Topic 4 ['dani' 'jax' 'karam' 'enaam' 'trainees' 'aida' 'abo' 'rocco' 'audette'
 'crotchet']
Topic 5 ['zionism' 'kroeger' 'tambor' 'liberation' 'ceasefire' 'eanna' 'preaches'
 'rath' 'pixels' 'exposes']
Topic 6 ['klause' 'wilhelm' 'propriety' 'arrgggghh' 'eletricity' 'insaaaaane'
 'whoooooo' 'wayyyyy' 'felicity' 'deniro']
Topic 7 ['im' 'thats' 'people' 'dont' 'youre' 'audience' 'laughter' 'time' 'man'
 'gon']
Topic 8 ['gil' 'fatih' 'lailailai' 'halamaha' 'ravi' 'beez' 'bebeez' 'yannis'
 'lailailailai' 'tokhm']
Topic 9 ['griot' 'epidemic' 'artistry' 'thundercat' 'gab' 'architects' 'tame'
 'lathan' 'stan' 'mos']
Topic 10 ['mnah' 'bonanza' 'johansen' 'marrying' 'devel