In [21]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob
import matplotlib.pyplot as plt
import regex as re
import nltk
import string
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer
from langdetect import detect, DetectorFactory, LangDetectException
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\stuti\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
df = pd.read_csv('polarity_subjectivity_data_without_LemmStemm.csv')
df.head(1)

Unnamed: 0.1,Unnamed: 0,URL,transcript,Comedian Name,word_count,Unique ID,neg_polarity,neu_polarity,pos_polarity,compound,subjectivity
0,0,https://scrapsfromtheloft.com/comedy/pete-holm...,\r\n \r\n [audience cheering and applauding] \...,Pete Holmes,10068,0,0.056,0.688,0.256,1.0,0.504545


In [4]:
# Removing punctuations from transcript
df['transcript'] = df['transcript'].str.replace(f'[{string.punctuation}]', '', regex = True)

# Removing special characters
df['transcript'] = df['transcript'].apply(lambda x: re.sub('[^\w\s]', '', x))

# Decapitalizing text
df['transcript'] = df['transcript'].apply(lambda x: x.lower())

In [5]:
df.head(1)

Unnamed: 0.1,Unnamed: 0,URL,transcript,Comedian Name,word_count,Unique ID,neg_polarity,neu_polarity,pos_polarity,compound,subjectivity
0,0,https://scrapsfromtheloft.com/comedy/pete-holm...,\r\n \r\n audience cheering and applauding \r\...,Pete Holmes,10068,0,0.056,0.688,0.256,1.0,0.504545


In [6]:
# Eliminating stop words
stop_words = stopwords.words('english')

df['tokens_lst'] = df['transcript'].apply(lambda x: [w for w in word_tokenize(x) if not w.lower() in stop_words])

In [7]:
# Counting most frequently ocurring words in the corpus
txt = df['tokens_lst'].apply(lambda x: ' '.join(x))
tokens = [w for lst in txt.apply(word_tokenize) for w in lst]
word_cnt = FreqDist(tokens)

In [8]:
# Map each word in list to adjective, nouns, verbs, adverbs
allowed_tag = ['NN', 'NNS', 'NNP', 'NNPS',     
               'RB', 'RBR', 'RBS', 
               'JJ', 'JJR', 'JJS',
               'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']

token_tag_lst = df['tokens_lst'].apply(lambda x: pos_tag(x))
df['all_tokens'] = token_tag_lst

df['select_tokens'] = df['all_tokens'].apply(lambda x: [word for word, tag in x if tag in allowed_tag])

In [9]:
df['diff'] = df.apply(lambda r: len(r['tokens_lst']) - len(r['select_tokens']), axis=1)
df.head(3)

Unnamed: 0.1,Unnamed: 0,URL,transcript,Comedian Name,word_count,Unique ID,neg_polarity,neu_polarity,pos_polarity,compound,subjectivity,tokens_lst,all_tokens,select_tokens,diff
0,0,https://scrapsfromtheloft.com/comedy/pete-holm...,\r\n \r\n audience cheering and applauding \r\...,Pete Holmes,10068,0,0.056,0.688,0.256,1.0,0.504545,"[audience, cheering, applauding, hello, hello,...","[(audience, NN), (cheering, VBG), (applauding,...","[audience, cheering, applauding, hello, hello,...",553
1,1,https://scrapsfromtheloft.com/comedy/jeff-dunh...,\r\n \r\n im funnier than he is but they told ...,Jeff Dunham,4943,1,0.062,0.673,0.264,1.0,0.535705,"[im, funnier, told, introduce, heres, jeff, du...","[(im, NN), (funnier, NN), (told, VBD), (introd...","[im, funnier, told, introduce, heres, jeff, du...",168
2,2,https://scrapsfromtheloft.com/comedy/taylor-to...,\r\n \r\n in her 2024 netflix standup comedy s...,Taylor Tomlinson,11197,2,0.077,0.717,0.206,1.0,0.510567,"[2024, netflix, standup, comedy, special, tayl...","[(2024, CD), (netflix, JJ), (standup, NN), (co...","[netflix, standup, comedy, special, taylor, to...",565


In [10]:
#df.to_csv('preprocess_final.csv')

In [33]:
# Lemmatizing the final tokens column

lem = WordNetLemmatizer()

def lem_tokens(tokens):
    return [lem.lemmatize(token) for token in tokens]

df['lem_tokens'] = df['select_tokens'].apply(lambda x: lem_tokens(x))

In [34]:
DetectorFactory.seed = 0

def lang_detect(transcripts):
    try:
        return detect(transcripts)
    except LangDetectException:
        return None

df['language'] = df['transcript'].apply(lang_detect)

df = df[df['language'] == 'en']

df['lem_txt'] = df['lem_tokens'].apply(lambda x: ' '.join(x)) 

#### Latent Dirichlet Allocation - Nouns

In [30]:
# Map each word in list to adjective, nouns, verbs, adverbs
noun_tag = ['NN', 'NNP']

#token_tag_lst = df['tokens_lst'].apply(lambda x: pos_tag(x))
#df['all_tokens'] = token_tag_lst

df['select_tokens'] = df['all_tokens'].apply(lambda x: [word for word, tag in x if tag in noun_tag])

In [32]:
len(df['select_tokens'][0])

1780

In [35]:
#df.drop(['Unnamed: 0'], axis = 1, inplace = True)

tf_idf = TfidfVectorizer()
tf_mat = tf_idf.fit_transform(df['lem_txt'])

num_topics = 5

lda = LatentDirichletAllocation(n_components = num_topics, random_state = 0)
lda.fit(tf_mat)

def topics(model, vect, top_n = 10):
    for idx, topic in enumerate(model.components_):
        print('Topic {}:'.format(idx))
        top_words_with_scores = [(word, score) for word, score in zip(vect.get_feature_names_out()[topic.argsort()[:-top_n - 1:-1]], topic[topic.argsort()[:-top_n - 1:-1]])]
        print(top_words_with_scores)

In [36]:
topics(lda, tf_idf)


Topic 0:
[('ryanhamiltonlivecom', 0.20004714002034601), ('ohhs', 0.20004714002034601), ('baffle', 0.20004714002034601), ('tadaaa', 0.200046617220609), ('handheld', 0.200046617220609), ('mmmhmm', 0.200046617220609), ('amd', 0.200046617220609), ('awwwww', 0.20004096942679275), ('downfield', 0.2000404158521646), ('pastoral', 0.2000404158521646)]
Topic 1:
[('ryanhamiltonlivecom', 0.20004713999600698), ('ohhs', 0.20004713999600698), ('baffle', 0.20004713999600698), ('tadaaa', 0.20004661684525069), ('handheld', 0.20004661684525069), ('mmmhmm', 0.20004661684525069), ('amd', 0.20004661684525069), ('awwwww', 0.200040969096327), ('downfield', 0.200040415801951), ('pastoral', 0.200040415801951)]
Topic 2:
[('im', 75.17536740931945), ('youre', 61.15024014450721), ('dont', 60.36569648753485), ('audience', 49.11723363265406), ('time', 40.22740031112249), ('man', 39.916950464927446), ('laughter', 39.47992291138559), ('gon', 39.15323933621652), ('cause', 32.0602567743328), ('thing', 30.15025554570044)]