In [59]:
!pip install nltk

4458.98s - pydevd: Sending message related to process being replaced timed-out after 5 seconds




In [60]:
# Imports
import pandas as pd
import numpy as np
import re
from pathlib import Path

# Graphs
import plotly.express as px
import plotly.graph_objects as go

# NLP
import nltk
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize  # Ensure this import is present
from nltk.stem import WordNetLemmatizer
import string
from collections import Counter

nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

# Pandas settings
pd.set_option('display.max_colwidth', 200)
pd.set_option('display.float_format', lambda x: f'{x:,.2f}')

[nltk_data] Downloading package punkt to /home/blackbird/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/blackbird/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/blackbird/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/blackbird/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


---

In [61]:
DATA_DIR = Path("data")
sp_path = DATA_DIR / "south_park_all_seasons.csv"
slurs_path = DATA_DIR / "english_slurs.csv"

df = pd.read_csv(sp_path)
slurs_df = pd.read_csv(slurs_path)

In [62]:
slurs_df.sample(10)

Unnamed: 0,insult
1631,j*ckass
1785,dickface
43,arse-fucker
1086,phucup
1021,nobj0key
52,ashole
594,fook
684,fukkers
209,bum-bandits
1190,scummy


In [63]:
df['Line'] = df['Line'].str.rstrip('\n') # Remove \n

df.head()

Unnamed: 0,Season,Episode,Character,Line
0,10,1,Stan,"You guys, you guys! Chef is going away."
1,10,1,Kyle,Going away? For how long?
2,10,1,Stan,Forever.
3,10,1,Chef,I'm sorry boys.
4,10,1,Stan,"Chef said he's been bored, so he joining a group called the Super Adventure Club."


---

In [64]:
# Group by Character
lines_per_character = df.groupby('Character')['Line'].count().reset_index(name='lines')

top_characters = lines_per_character.nlargest(10, 'lines')

fig = px.bar(top_characters, 
             x='lines', 
             y='Character', 
             orientation='h',
             title='Top 10 Characters with Most Lines',
             labels={'lines': 'Number of Lines', 'Character': 'Character'},
             color='lines',
             color_continuous_scale='Bluered')

fig.show()

---

In [65]:
# Lemmatizer and stop words
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Clean text
def tokenize_and_clean(text):
    tokens = word_tokenize(text.lower())
    tokens = [t for t in tokens if t not in stop_words and t not in string.punctuation]
    tokens = [lemmatizer.lemmatize(t) for t in tokens]
    return tokens

df['tokens'] = df['Line'].apply(tokenize_and_clean)

df[['Character', 'Line', 'tokens']].head()

Unnamed: 0,Character,Line,tokens
0,Stan,"You guys, you guys! Chef is going away.","[guy, guy, chef, going, away]"
1,Kyle,Going away? For how long?,"[going, away, long]"
2,Stan,Forever.,[forever]
3,Chef,I'm sorry boys.,"['m, sorry, boy]"
4,Stan,"Chef said he's been bored, so he joining a group called the Super Adventure Club.","[chef, said, 's, bored, joining, group, called, super, adventure, club]"


In [67]:
all_tokens = [token for sublist in df['tokens'] for token in sublist]

token_freq = Counter(all_tokens)

top_20_tokens = token_freq.most_common(20)

words, counts = zip(*top_20_tokens)

fig = px.bar(
    x=words, 
    y=counts, 
    labels={'x': 'Words', 'y': 'Frequency'}, 
    title='Top 20 Most Frequent Words in South Park Scripts'
)

fig.show()

In [68]:
# Filter Cartman's dialogues
cartman_lines = df[df['Character'] == 'Cartman']
cartman_lines.sample(10)

Unnamed: 0,Season,Episode,Character,Line,tokens
28316,1,2,Cartman,"Ohh, rainbows. Yeah, I like those, those are cool.","[ohh, rainbow, yeah, like, cool]"
63940,8,1,Cartman,"Very well, Clyde. Kiyaaaa!","[well, clyde, kiyaaaa]"
39211,3,3,Cartman,Ey! Don't call me a little piggy!,"[ey, n't, call, little, piggy]"
61659,7,8,Cartman,Yeah. It's a charcoal-based thing that-,"[yeah, 's, charcoal-based, thing, that-]"
4511,11,4,Cartman,"Yes or no, Kyle?!","[yes, kyle]"
27513,18,8,Cartman,All right McNuggets!,"[right, mcnuggets]"
20623,16,4,Cartman,"Well, I better be going. Thanks so much Ms. Broflovski, I learned a ton.","[well, better, going, thanks, much, ms., broflovski, learned, ton]"
7049,12,1,Cartman,"Butters, helping people who have AIDS is one of the most important things you can do.","[butter, helping, people, aid, one, important, thing]"
15254,14,8,Cartman,"Whoa whoa whoa whoa whoa! Did you hear that?! ""In some cases Vagisil can lead to short-term memory loss."" Oh my God, Butters... we need Vagisil!","[whoa, whoa, whoa, whoa, whoa, hear, ``, case, vagisil, lead, short-term, memory, loss, '', oh, god, butter, ..., need, vagisil]"
69577,9,9,Cartman,"You aren't there to have fun, you black asshole! You were supposed to be getting the future-telling device.","[n't, fun, black, asshole, supposed, getting, future-telling, device]"


In [None]:
insults_list = slurs_df['insult'].str.lower().tolist()

# Count insults in a given list of tokens
def count_insults(tokens, insults_list):
    return [word for word in tokens if word in insults_list]

# Count insults for each of Cartman's lines
cartman_lines.loc[:, 'insults'] = cartman_lines['tokens'].apply(lambda x: count_insults(x, insults_list))

# Display
cartman_lines[['Character', 'Line', 'insults']].sample(20)

Unnamed: 0,Character,Line,insults
56918,Cartman,Just now - 1 2 3 dibs!,[]
65573,Cartman,"Yeh- no, no, wait, wait. I got a better idea you guys. What we should do is we should secretly go around and tell all the students we can to not check any of the mascots on this election sheet, an...","[sheet, turd]"
5189,Cartman,"Eh! Stop pushing, Kenny.",[]
16874,Cartman,"I left my charger at home! Fine! I'm gonna go home and charge my iPad and bring it tomorrow, and you guys are gonna feel really stupid!",[]
8664,Cartman,Nobody is going anywhere until the police arrive!,[]
49066,Cartman,Would somebody put this retard out of his misery?!,[retard]
25023,Cartman,"Sir Stan, I was hoping I could talk to you about the wedding.",[]
40354,Cartman,No. That's okay. I'll see you guys.,[]
35503,Cartman,As if we care.,[]
41709,Cartman,"Oh, you didn't get a special Chinpokomon game controller. T-heh heh heh heh. Jehesus C-hrist!",[]


In [None]:
all_insults = [insult for sublist in cartman_lines['insults'] for insult in sublist]

insult_freq = Counter(all_insults)

top_insults = insult_freq.most_common(20)

insults, counts = zip(*top_insults)

fig = px.bar(
    x=insults, 
    y=counts, 
    labels={'x': 'Insults', 'y': 'Frequency'},
    title="Top Insults Used by Cartman"
)

fig.show()