# Exploratory Data Analysis

In [None]:
import pandas as pd
import numpy as np

import seaborn as sns
sns.set(color_codes=True)
%config InlineBackend.figure_formats = ['retina']

import string
from wordcloud import WordCloud
from collections import Counter

import nltk
nltk.download('stopwords')
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk import word_tokenize
from gensim.utils import simple_preprocess

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the transcript dataset from the working directory
frame = pd.read_csv(filepath_or_buffer="frame.csv")

## Stats and KDE plot

In [None]:
# Stats and KDE (Kernel Density Estimation) plot for the character count of each transcript
x = [len(transcript) for transcript in frame.Transcript]
# `fill` is the current spelling of the deprecated `shade` keyword
ax = sns.kdeplot(x, fill=True, color="b")
ax.set_title('Transcript Character Count KDE')
ax.set(xlabel='characters')
# np.mean / np.std on the list give the population statistics (ddof=0)
mean = np.mean(x)
sd = np.std(x)
print(f'Mean: {mean}')
print(f'SD: {sd}')

In [None]:
# Stats and KDE plot for runtime of each performance
# Keep only positive runtimes, truncated to whole numbers; `> 0` is False for
# NaN, so missing runtimes are dropped as well.
x = [int(runtime) for runtime in frame.runtime if runtime > 0]
# `fill` is the current spelling of the deprecated `shade` keyword
ax = sns.kdeplot(x, fill=True, color="r")
ax.set_title('Runtime KDE')
ax.set(xlabel='minutes')
# np.mean / np.std on the list give the population statistics (ddof=0)
mean = np.mean(x)
sd = np.std(x)
print(f'Mean: {mean}')
print(f'SD: {sd}')

In [None]:
# Stats and KDE plot for IMDb rating of each performance
# Keep only positive ratings; `> 0` is False for NaN, so missing ratings are
# dropped as well.
x = [rating for rating in frame.rating if rating > 0]
# `fill` is the current spelling of the deprecated `shade` keyword
ax = sns.kdeplot(x, fill=True, color="g")
ax.set_title('IMDb Rating KDE')
# np.mean / np.std on the list give the population statistics (ddof=0)
mean = np.mean(x)
sd = np.std(x)
print(f'Mean: {mean}')
print(f'SD: {sd}')

## Rating Type
Assign a 1 to any rating at or above the mean rating, and a 0 otherwise. This will be our target for a classification model.

In [None]:
# Label each special 1 when its rating is at or above the mean rating, else 0.
# The mean is computed once rather than once per row. A NaN rating compares
# False, so specials without a rating end up in class 0.
rating_mean = frame.rating.mean()
frame['rating_type'] = frame.rating.ge(rating_mean).astype('int64')
# Pin the bar order so the tick labels below always line up with the classes:
# position 0 is class 0 (low), position 1 is class 1 (high).
ax = sns.countplot(x='rating_type', data=frame, order=[0, 1])
ax.set_xticks([0, 1])
ax.set_xticklabels(['Low rating (< mean)', 'High rating (>= mean)'])
ax.set(title='Counts of specials with higher or lower than average ratings')

## Most Common Words

In [None]:
# Build the document-term matrix: one row per transcript, one column per term
cv = CountVectorizer(stop_words='english')
data_cv = cv.fit_transform(frame.Transcript)
data_dtm = pd.DataFrame(
    data_cv.toarray(),
    index=frame.index,
    columns=cv.get_feature_names_out(),
)
# Transposed view: terms as rows, transcripts as columns
data = data_dtm.T

In [None]:
# For every column of `data` (one per transcript), keep the 30 highest-count
# terms as (term, count) pairs.
top_dict = {}
for column in data.columns:
    ranked = data[column].sort_values(ascending=False)
    leaders = ranked.iloc[:30]
    top_dict[column] = list(zip(leaders.index, leaders.values))
top_dict

In [None]:
# Flatten the per-transcript top-term lists into a single list of terms
words = [
    term
    for column in data.columns
    for term, _count in top_dict[column]
]
# Each term paired with the number of transcripts whose top list contains it,
# most frequent first
Counter(words).most_common()

In [None]:
# Word cloud to visualize the most common words of a single transcript.
wordcloud = WordCloud(background_color="white", max_words=5000, contour_width=3, contour_color='midnightblue')
sample_row = 60  # index label of the transcript shown below
print(frame.Title[sample_row])
# Tokenize the transcript here instead of reading frame['words']: that column
# is only created by the "Size of Vocabulary" cell further down, so relying on
# it breaks a top-to-bottom run on a fresh kernel.
english_stopwords = set(stopwords.words('english'))
sample_tokens = [
    token
    for token in simple_preprocess(frame.Transcript[sample_row], deacc=True)
    if token not in english_stopwords
]
wordcloud.generate(' '.join(sample_tokens))
wordcloud.to_image()

## Size of Vocabulary

In [None]:
# Tokenize the transcripts with Gensim's simple_preprocess method
stop_words = stopwords.words('english')
stop_words.extend(['audience', 'laughter', 'laughing', 'announcer', 'narrator', 'cos']) # Extra words to remove, specific to this dataset
# Set copy for constant-time membership tests; `stop_words` itself stays a list
stop_word_set = set(stop_words)

# Tokenize, lowercase, remove punctuation and remove stopwords in one pass
frame['words'] = frame.Transcript.apply(
    lambda text: [word for word in simple_preprocess(text, deacc=True) if word not in stop_word_set]
)

# word count (number of tokens kept per transcript)
frame['word_count'] = frame.words.apply(len)

## Amount of Profanity

In [None]:
# Count the number of times an 'F' or 'S' word is used, then remove them
def get_swear_counts(input_list, swear_list):
    """Return how many items of ``input_list`` appear in ``swear_list``.

    Each item is lower-cased before the membership test, so the comparison is
    case-insensitive as long as ``swear_list`` holds lower-case entries.
    Repeated occurrences are counted every time they appear.
    """
    return sum(1 for token in input_list if token.lower() in swear_list)

# Spelling variants counted as the 'F' word
f_words = [
    'fuck', 'fucking', 'fckin', 'fucken', 'fucked', 'fck', 'fcking', 'fuckin',
    'fucker', 'muthafucka', 'motherfuckers', 'motherfucke', 'motha',
    'motherfucking', 'motherfuckin', 'motherfuckers', 'motherfucker',
]
# Spelling variants counted as the 'S' word
s_words = [
    'shit', 'shitter', 'shitting', 'shite', 'bullshit', 'shitty',
]

# Per-transcript tallies, taken while the swear words are still in frame['words'].
# NOTE: the last statement of this cell overwrites frame['words'], so running
# the cell a second time would count on the already-filtered tokens.
frame['f_words'] = frame['words'].apply(lambda tokens: get_swear_counts(tokens, f_words))
frame['s_words'] = frame['words'].apply(lambda tokens: get_swear_counts(tokens, s_words))

# Everything that gets stripped from the token lists
swears = f_words + s_words + [
    'cunt', 'asshole', 'damn', 'goddamn', 'cocksucker', 'sluts', 'dicks',
    'dick', 'pussy', 'ass', 'asshole', 'assholes', 'porn', 'penis', 'tit',
]

frame['words'] = frame['words'].apply(
    lambda tokens: [token for token in tokens if token not in swears]
)

# Feature Engineering
`diversity_ratio` feature: (number of unique words) / (total number of words)

In [None]:
# Number of distinct tokens in each transcript's word list
frame['diversity'] = frame['words'].apply(lambda tokens: len(set(tokens)))
# Distinct-token count relative to the stored word count
frame['diversity_ratio'] = frame['diversity'] / frame['word_count']

# Correlations

In [None]:
# Pairwise scatter plots and distributions for the engineered and rating columns
pairplot_columns = [
    'diversity_ratio',
    'diversity',
    'word_count',
    'runtime',
    'rating',
    'rating_type',
]
sns.pairplot(frame[pairplot_columns])

In [None]:
# Write the enriched frame to frame2.csv in the working directory, without the index column
frame.to_csv(path_or_buf="frame2.csv", index=False)

In [5]:
# NOTE(review): this is an absolute, machine-specific Windows path; the cell
# only runs where this exact location exists.
processed_csv_path = r"D:\PROJECTS\transnlp\data\processed\processed_content_data.csv"
df = pd.read_csv(processed_csv_path)

In [6]:
# Print the column names, non-null counts and dtypes of the processed dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 19 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   S No.                 500 non-null    int64  
 1   Tag                   500 non-null    object 
 2   URL                   500 non-null    object 
 3   Raw Transcript        500 non-null    object 
 4   Transcript            500 non-null    object 
 5   CleanTag              500 non-null    object 
 6   Year                  465 non-null    float64
 7   Names                 500 non-null    object 
 8   Title                 480 non-null    object 
 9   runtime               434 non-null    float64
 10  rating                425 non-null    float64
 11  language              500 non-null    object 
 12  preprocessed_content  500 non-null    object 
 13  rating_type           425 non-null    object 
 14  f_words               500 non-null    int64  
 15  s_words               5

In [7]:
# Preview the first rows of the processed dataset
df.head()

Unnamed: 0,S No.,Tag,URL,Raw Transcript,Transcript,CleanTag,Year,Names,Title,runtime,rating,language,preprocessed_content,rating_type,f_words,s_words,word_count,diversity,diversity_ratio
0,0,Michelle Buteau: Welcome to Buteaupia (2020) ...,https://scrapsfromtheloft.com/comedy/michelle-...,['Michelle Buteau’s Netflix special Welcome to...,michelle buteaus netflix special welcome to bu...,Michelle Buteau: Welcome to Buteaupia (2020),2020.0,Michelle Buteau,Welcome to Buteaupia,58.0,7.0,en,michelle buteaus welcome buteaupia showcase ch...,Above Average,22,24,3222,833,0.258535
1,1,Theo Von: No Offense (2016) | Transcript,https://scrapsfromtheloft.com/comedy/theo-von-...,['Theo Von: No Offense was recorded at the Civ...,theo von no offense was recorded at the civic ...,Theo Von: No Offense (2016),2016.0,Theo Von,No Offense,67.0,5.8,en,theo von offense wa recorded civic theatre orl...,Below Average,37,35,3777,1215,0.321684
2,2,Nate Bargatze’s Nashville Christmas (2024) | T...,https://scrapsfromtheloft.com/comedy/nate-barg...,['Nate Bargatze’s Nashville Christmas is a hea...,nate bargatzes nashville christmas is a heartw...,Nate Bargatze’s Nashville Christmas (2024),2024.0,Nate Bargatze’s,Nashville Christmas,61.0,6.8,en,nate bargatzes christmas heartwarming holiday ...,Below Average,0,0,2451,890,0.363117
3,3,"Your Friend, Nate Bargatze (2024) | Transcript",https://scrapsfromtheloft.com/comedy/your-frie...,"['Your Friend, Nate Bargatze (2024)\nGenre: Co...",your friend nate bargatze comedy standupdirec...,"Your Friend, Nate Bargatze (2024)",2024.0,Nate Bargatze,"Your Friend,",63.0,7.2,en,friend nate bargatze comedy standupdirector po...,Above Average,0,0,2684,755,0.281297
4,4,Ronny Chieng: Love to Hate It (2024) | Transcript,https://scrapsfromtheloft.com/comedy/ronny-chi...,"['[tuning]', '[gentle Hawaiian music playing o...",tuning gentle hawaiian music playing over radi...,Ronny Chieng: Love to Hate It (2024),2024.0,Ronny Chieng,Love to Hate It,65.0,7.1,en,tuning music playing radio revving announcer l...,Above Average,40,14,3640,1197,0.328846
