# Objective: using random word inputs, predict which South Park character is speaking from a list of top characters

### Data source: https://www.kaggle.com/tovarischsukhov/southparklines

## Import libraries

In [1]:
import numpy as np
import matplotlib as plt
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.pipeline import make_pipeline

---

## Import dataset

In [2]:
South_Park_raw = pd.read_csv('All-seasons.csv')
South_Park_raw.describe()


Unnamed: 0,Season,Episode,Character,Line
count,70896,70896,70896,70896
unique,19,19,3950,64301
top,2,10,Cartman,What?\n
freq,6416,5271,9774,361


In [3]:
# Head and shape of dataset
print(South_Park_raw.head())
print(South_Park_raw.shape)

  Season Episode Character                                               Line
0     10       1      Stan         You guys, you guys! Chef is going away. \n
1     10       1      Kyle                        Going away? For how long?\n
2     10       1      Stan                                         Forever.\n
3     10       1      Chef                                  I'm sorry boys.\n
4     10       1      Stan  Chef said he's been bored, so he joining a gro...
(70896, 4)


In [4]:
print (South_Park_raw.describe())

       Season Episode Character     Line
count   70896   70896     70896    70896
unique     19      19      3950    64301
top         2      10   Cartman  What?\n
freq     6416    5271      9774      361


In [5]:
#Select just speakers with more than 500 lines

top_speakers = South_Park_raw.groupby(['Character']).size().loc[South_Park_raw.groupby(['Character']).size() > 500]
print (top_speakers.sort_values(ascending=False))

#Select rows top speakers   
""" This is the dataset we will be working with"""

main_char_lines = pd.DataFrame(South_Park_raw.loc[South_Park_raw['Character'].isin(top_speakers.index.values)])
del main_char_lines['Season']
del main_char_lines['Episode']

main_char_lines = main_char_lines.reset_index(drop=True)

print (main_char_lines.describe())


Character
Cartman         9774
Stan            7680
Kyle            7099
Butters         2602
Randy           2467
Mr. Garrison    1002
Chef             917
Kenny            881
Sharon           862
Mr. Mackey       633
Gerald           626
Jimmy            597
Wendy            585
Liane            582
Sheila           566
Jimbo            556
dtype: int64
       Character     Line
count      37429    37429
unique        16    34196
top      Cartman  What?\n
freq        9774      237


---

## Define train and test datasets

In [6]:
# define X and y
X = main_char_lines.Line
y = main_char_lines.Character

#print (y.value_counts(normalize=True))

# split the new DataFrame into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

---

## Search for best parameters to use in model

In [7]:
#pipe = make_pipeline(TfidfVectorizer(), MultinomialNB())
#pipe.steps

#param_grid = {}
#param_grid["tfidfvectorizer__max_features"] = [500, 1000, 15000]
#param_grid["tfidfvectorizer__ngram_range"] = [(1,1), (1,2), (2,2)]
#param_grid["tfidfvectorizer__lowercase"] = [True, False]
#param_grid["tfidfvectorizer__stop_words"] = ["english", None]
#param_grid["tfidfvectorizer__strip_accents"] = ["ascii", "unicode", None]
#param_grid["tfidfvectorizer__analyzer"] = ["word", "char"]
#param_grid["tfidfvectorizer__binary"] = [True, False]
#param_grid["tfidfvectorizer__norm"] = ["l1", "l2", None]
#param_grid["tfidfvectorizer__use_idf"] = [True, False]
#param_grid["tfidfvectorizer__smooth_idf"] = [True, False]
#param_grid["tfidfvectorizer__sublinear_tf"] = [True, False]

#grid = GridSearchCV(pipe, param_grid, cv=5, scoring='accuracy')

#Helpful for understanding how to create your param grid.
#grid.get_params().keys()

#### (This can take a while to run)

In [8]:
#grid.fit(X,y)

In [9]:
#print(grid.best_params_)
#print(grid.best_score_)

## Define Model

In [10]:
vect = TfidfVectorizer(analyzer='word', stop_words='english', max_features = 850, ngram_range=(1, 1), 
                       binary=False, lowercase=True, norm=None, smooth_idf=True, strip_accents=None,
                       sublinear_tf=True, use_idf=False)

mcl_transformed = vect.fit_transform(X)

nb_SP_Model = MultinomialNB()
nb_SP_Model.fit(mcl_transformed, y)
print ("Model accuracy within dataset: ", nb_SP_Model.score(mcl_transformed, y))

Model accuracy within dataset:  0.40530070266370993


In [11]:
print ("Model accuracy with cross validation:", cross_val_score(MultinomialNB(), mcl_transformed.toarray(), 
                                                                y, cv=5, scoring="accuracy").mean())


Model accuracy with cross validation: 0.3333783091995065


---

## Test Model

In [40]:
# Predict on new text
new_text = ["I like Stan"]
new_text_transform = vect.transform(new_text)

print (nb_SP_Model.predict(new_text_transform)," most likely said it.")

['Kyle']  most likely said it.


##### Table with Characters' Line likelihood

In [41]:
SP_prob=pd.DataFrame(nb_SP_Model.predict_proba(new_text_transform))
SP_prob=pd.DataFrame.transpose(SP_prob)
SP_prob.columns = ['Likelihood']

top_speakers_index = top_speakers.reset_index()
top_speakers_index.columns = ['Character', 'Lines']
top_speakers_index = top_speakers_index.drop('Lines', 1)

Result = pd.concat([top_speakers_index, SP_prob], axis=1)

print (Result.sort_values('Likelihood',ascending=False))

       Character  Likelihood
7           Kyle    0.310769
1        Cartman    0.216608
11         Randy    0.204456
0        Butters    0.069615
15         Wendy    0.061166
12        Sharon    0.037349
14          Stan    0.031404
2           Chef    0.014917
5          Jimmy    0.013817
10    Mr. Mackey    0.008807
6          Kenny    0.008302
9   Mr. Garrison    0.008297
4          Jimbo    0.006116
3         Gerald    0.005636
8          Liane    0.001397
13        Sheila    0.001343


In [14]:
mcl_transformed

<37429x850 sparse matrix of type '<class 'numpy.float64'>'
	with 130687 stored elements in Compressed Sparse Row format>

---

## Calculate "spamminess" for the top 3 characters: Cartman, Stan and Kyle
### Used to test common words pertaining to these characters more than to others

#### Calculate "spaminess" for Cartman with detailed coding


In [15]:
cartman = pd.DataFrame(South_Park_raw.loc[South_Park_raw['Character'].isin(top_speakers.index.values)])
del cartman['Season']
del cartman['Episode']

cartman.Character[cartman.Character != 'Cartman'] = 'Not Cartman'
cartman.Character[cartman.Character == 'Cartman'] = 'Cartman'
print (cartman)


         Character                                               Line
0      Not Cartman         You guys, you guys! Chef is going away. \n
1      Not Cartman                        Going away? For how long?\n
2      Not Cartman                                         Forever.\n
3      Not Cartman                                  I'm sorry boys.\n
4      Not Cartman  Chef said he's been bored, so he joining a gro...
5      Not Cartman                                             Wow!\n
7      Not Cartman     What's the meaning of life? Why are we here?\n
9          Cartman  I'm gonna miss him.  I'm gonna miss Chef and I...
10     Not Cartman  Dude, how are we gonna go on? Chef was our fuh...
12     Not Cartman                                         Bye-bye!\n
13     Not Cartman                                        Good-bye!\n
14     Not Cartman                                         So long!\n
17     Not Cartman  Good-bye, Chef! Have a great time with the Sup...
18     Not Cartman  

In [16]:
cartman.Character.value_counts(normalize=True)

Not Cartman    0.738866
Cartman        0.261134
Name: Character, dtype: float64

In [17]:
X_cartman = cartman.Line
y_cartman = cartman.Character
vect_cartman =CountVectorizer(stop_words='english')
Xdtm_cartman = vect_cartman.fit_transform(X_cartman)
nb_cartman = MultinomialNB()
nb_cartman.fit(Xdtm_cartman,y_cartman)
nb_cartman.score(Xdtm_cartman,y_cartman)

0.8318950546367789

In [18]:
tokens_cartman = vect_cartman.get_feature_names()
len(tokens_cartman)

15271

In [19]:
print (vect_cartman.get_feature_names()[:50])

['000', '10', '100', '1000', '102', '104', '105', '106', '10th', '11', '12', '12mm', '12th', '13', '1340s', '13th', '14', '1421', '15', '16', '160', '1621', '167', '17', '1776', '18', '182', '19', '1924', '1956', '1960s', '1972', '1973', '1984', '1999', '20', '200', '2000', '2001', '2004', '2008you', '2009', '2010', '2012', '203', '21', '212', '213', '214', '22']


In [20]:
nb_cartman.feature_count_

array([[ 3., 19.,  0., ...,  0.,  0.,  0.],
       [13., 21.,  5., ...,  1.,  1.,  2.]])

In [21]:
nb_cartman.feature_count_.shape

(2, 15271)

In [22]:
token_count_cartman= nb_cartman.feature_count_[0,:]
token_count_cartman

array([ 3., 19.,  0., ...,  0.,  0.,  0.])

In [23]:
token_count_not_cartman = nb_cartman.feature_count_[1, :]
token_count_not_cartman

array([13., 21.,  5., ...,  1.,  1.,  2.])

In [24]:
# create a DataFrame of tokens with their separate Not-Cartman and Cartman counts
cartman_tokens = pd.DataFrame({'token':tokens_cartman, 'Cartman':token_count_cartman, 'Not_Cartman':token_count_not_cartman}).set_index('token')
cartman_tokens.sample(10, random_state=3)

Unnamed: 0_level_0,Cartman,Not_Cartman
token,Unnamed: 1_level_1,Unnamed: 2_level_1
programming,0.0,1.0
braved,0.0,1.0
exploit,2.0,2.0
poor,92.0,65.0
awwwwrrr,1.0,0.0
davin,4.0,0.0
slots,0.0,2.0
rl,1.0,0.0
babysitters,1.0,1.0
reporter,0.0,3.0


In [25]:
# add 1 to Cartman and Not Cartman counts to avoid dividing by 0
cartman_tokens['Cartman'] = cartman_tokens.Cartman + 1
cartman_tokens['Not_Cartman'] = cartman_tokens.Not_Cartman + 1
cartman_tokens.sample(10, random_state=3)

Unnamed: 0_level_0,Cartman,Not_Cartman
token,Unnamed: 1_level_1,Unnamed: 2_level_1
programming,1.0,2.0
braved,1.0,2.0
exploit,3.0,3.0
poor,93.0,66.0
awwwwrrr,2.0,1.0
davin,5.0,1.0
slots,1.0,3.0
rl,2.0,1.0
babysitters,2.0,2.0
reporter,1.0,4.0


In [26]:
# Naive Bayes counts the number of observations in each class
nb_cartman.class_count_

array([ 9774., 27655.])

In [27]:
# convert the Cartman and Not Cartman counts into frequencies
cartman_tokens['Cartman'] = cartman_tokens.Cartman / nb_cartman.class_count_[0]
cartman_tokens['Not_Cartman'] = cartman_tokens.Not_Cartman / nb_cartman.class_count_[1]
cartman_tokens.sample(10, random_state=3)

Unnamed: 0_level_0,Cartman,Not_Cartman
token,Unnamed: 1_level_1,Unnamed: 2_level_1
programming,0.000102,7.2e-05
braved,0.000102,7.2e-05
exploit,0.000307,0.000108
poor,0.009515,0.002387
awwwwrrr,0.000205,3.6e-05
davin,0.000512,3.6e-05
slots,0.000102,0.000108
rl,0.000205,3.6e-05
babysitters,0.000205,7.2e-05
reporter,0.000102,0.000145


In [28]:
# calculate the ratio of Cartman-to-Not_Cartman for each token
cartman_tokens['spam_ratio'] = cartman_tokens.Cartman / cartman_tokens.Not_Cartman
cartman_tokens.sample(10, random_state=3)

Unnamed: 0_level_0,Cartman,Not_Cartman,spam_ratio
token,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
programming,0.000102,7.2e-05,1.414723
braved,0.000102,7.2e-05,1.414723
exploit,0.000307,0.000108,2.829445
poor,0.009515,0.002387,3.986946
awwwwrrr,0.000205,3.6e-05,5.658891
davin,0.000512,3.6e-05,14.147227
slots,0.000102,0.000108,0.943148
rl,0.000205,3.6e-05,5.658891
babysitters,0.000205,7.2e-05,2.829445
reporter,0.000102,0.000145,0.707361


In [29]:
# examine the DataFrame sorted by spam_ratio
cartman_tokens.sort_values('spam_ratio', ascending=False).head(10)

Unnamed: 0_level_0,Cartman,Not_Cartman,spam_ratio
token,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
nyah,0.002455,3.6e-05,67.906691
kewl,0.002455,3.6e-05,67.906691
wicky,0.002149,3.6e-05,59.418355
sail,0.002046,3.6e-05,56.588909
polly,0.001944,3.6e-05,53.759464
tunh,0.001637,3.6e-05,45.271127
oink,0.001637,3.6e-05,45.271127
sucky,0.001432,3.6e-05,39.612237
smurf,0.00133,3.6e-05,36.782791
cartmaaanbrah,0.00133,3.6e-05,36.782791


In [30]:
#Try looking up scores of different words
word = "nyah"
cartman_tokens.loc[word, 'spam_ratio']

67.90669122160834

#### "Spamminess" for Stan

In [31]:
stan = pd.DataFrame(South_Park_raw.loc[South_Park_raw['Character'].isin(top_speakers.index.values)])
del stan['Season']
del stan['Episode']

stan.Character[stan.Character != 'Stan'] = 'Not Stan'
stan.Character[stan.Character == 'Stan'] = 'Stan'

X_stan = stan.Line
y_stan = stan.Character
vect_stan =CountVectorizer(stop_words='english')
Xdtm_stan = vect_stan.fit_transform(X_stan)
nb_stan = MultinomialNB()
nb_stan.fit(Xdtm_stan,y_stan)
nb_stan.score(Xdtm_stan,y_stan)

tokens_stan = vect_stan.get_feature_names()

token_count_stan= nb_stan.feature_count_[0,:]
token_count_not_stan = nb_stan.feature_count_[1, :]

stan_tokens = pd.DataFrame({'token':tokens_stan, 'Stan':token_count_stan, 'Not_Stan':token_count_not_stan}).set_index('token')

stan_tokens['Stan'] = stan_tokens.Stan + 1
stan_tokens['Not_Stan'] = stan_tokens.Not_Stan + 1

stan_tokens['Stan'] = stan_tokens.Stan / nb_stan.class_count_[0]
stan_tokens['Not_Stan'] = stan_tokens.Not_Stan / nb_stan.class_count_[1]

stan_tokens['spam_ratio'] = stan_tokens.Stan / stan_tokens.Not_Stan

# examine the DataFrame sorted by spam_ratio
stan_tokens.sort_values('spam_ratio', ascending=False)

Unnamed: 0_level_0,Stan,Not_Stan,spam_ratio
token,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
randy,0.006824,0.000130,52.406467
sharon,0.005345,0.000130,41.047430
fellas,0.004639,0.000130,35.626071
eric,0.016975,0.000521,32.592692
children,0.014521,0.000521,27.881273
stanley,0.007093,0.000260,27.235873
dreidel,0.003395,0.000130,26.074154
jews,0.002756,0.000130,21.169115
kitty,0.002387,0.000130,18.329356
awesom,0.002219,0.000130,17.038556


#### "Spamminess" for Kyle

In [32]:
kyle = pd.DataFrame(South_Park_raw.loc[South_Park_raw['Character'].isin(top_speakers.index.values)])
del kyle['Season']
del kyle['Episode']

kyle.Character[kyle.Character != 'Kyle'] = 'Not Kyle'
kyle.Character[kyle.Character == 'Kyle'] = 'Kyle'

X_kyle = kyle.Line
y_kyle = kyle.Character

vect_kyle = TfidfVectorizer(analyzer='word', stop_words='english', ngram_range=(1, 1), 
                       binary=False, lowercase=True, norm=None, smooth_idf=True, strip_accents=None,
                       sublinear_tf=True, use_idf=False)

#vect_kyle =CountVectorizer(stop_words='english')
Xdtm_kyle = vect_kyle.fit_transform(X_kyle)
nb_kyle = MultinomialNB()
nb_kyle.fit(Xdtm_kyle,y_kyle)
nb_kyle.score(Xdtm_kyle,y_kyle)

tokens_kyle = vect_kyle.get_feature_names()

token_count_kyle= nb_kyle.feature_count_[0,:]
token_count_not_kyle = nb_kyle.feature_count_[1, :]

kyle_tokens = pd.DataFrame({'token':tokens_kyle, 'Kyle':token_count_kyle, 'Not_Kyle':token_count_not_kyle}).set_index('token')

kyle_tokens['Kyle'] = kyle_tokens.Kyle + 1
kyle_tokens['Not_Kyle'] = kyle_tokens.Not_Kyle + 1

kyle_tokens['Kyle'] = kyle_tokens.Kyle / nb_kyle.class_count_[0]
kyle_tokens['Not_Kyle'] = kyle_tokens.Not_Kyle / nb_kyle.class_count_[1]

kyle_tokens['spam_ratio'] = kyle_tokens.Kyle / kyle_tokens.Not_Kyle

# examine the DataFrame sorted by spam_ratio
kyle_tokens.sort_values('spam_ratio', ascending=False).head(10)

Unnamed: 0_level_0,Kyle,Not_Kyle,spam_ratio
token,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
jambu,0.001391,3.3e-05,42.174042
willzy,0.00207,6.6e-05,31.387742
gabba,0.000856,3.3e-05,25.974005
lowered,0.000802,3.3e-05,24.323588
dreidel,0.008163,0.000375,21.743894
bother,0.000704,3.3e-05,21.362164
fruity,0.000704,3.3e-05,21.362164
mick,0.000704,3.3e-05,21.362164
ships,0.000704,3.3e-05,21.362164
clay,0.001929,9.9e-05,19.501017


---

### Word Clouds

from wordcloud import WordCloud

In [33]:
tokens = vect.get_feature_names()

token_count= nb_SP_Model.feature_count_[0,:]

All_tokens = pd.DataFrame({'Token':tokens, 'Token_Count':token_count}).set_index('Token')
All_tokens.sort(columns='Token_Count', axis=0, ascending=False)

AttributeError: 'DataFrame' object has no attribute 'sort'