# Text Mining The Simpsons



In [4]:
import pandas as pd

In [5]:
# Read the script lines from the CSV in the working directory, then preview the first five rows.
df = pd.read_csv(filepath_or_buffer='simpsons.csv')
df.head(n=5)

Unnamed: 0,raw_character_text,spoken_words
0,Miss Hoover,"No, actually, it was a little of both. Sometim..."
1,Lisa Simpson,Where's Mr. Bergstrom?
2,Miss Hoover,I don't know. Although I'd sure like to talk t...
3,Lisa Simpson,That life is worth living.
4,Edna Krabappel-Flanders,The polls will be open from now until the end ...


In [9]:
# Tally how many rows each character name has, most frequent first.
df.loc[:, 'raw_character_text'].value_counts()

Homer Simpson          29782
Marge Simpson          14141
Bart Simpson           13759
Lisa Simpson           11489
C. Montgomery Burns     3162
                       ...  
Beer Vendor                1
U2                         1
Boy Student                1
Homer-ish Husband          1
Wenches                    1
Name: raw_character_text, Length: 6758, dtype: int64

In [10]:
# Keep only the rows spoken by Bart or Lisa, then show the first ten of them.
kept_speakers = ['Bart Simpson', 'Lisa Simpson']
speaker_mask = df['raw_character_text'].isin(kept_speakers)
df_subset = df.loc[speaker_mask]
df_subset.head(10)


Unnamed: 0,raw_character_text,spoken_words
1,Lisa Simpson,Where's Mr. Bergstrom?
3,Lisa Simpson,That life is worth living.
7,Bart Simpson,Victory party under the slide!
9,Lisa Simpson,Mr. Bergstrom! Mr. Bergstrom!
11,Lisa Simpson,Do you know where I could find him?
13,Lisa Simpson,"The train, how like him... traditional, yet en..."
15,Lisa Simpson,"I see he touched you, too."
17,Bart Simpson,"Hey, thanks for your vote, man."
19,Bart Simpson,"Well, you got that right. Thanks for your vote..."
21,Bart Simpson,"Well, don't sweat it. Just so long as a couple..."


In [12]:
from sklearn.feature_extraction.text import CountVectorizer  # bag-of-words tokenizer/counter

# Missing lines become empty strings so they contribute no tokens.
# (Casting NaN with astype('U') would turn it into the literal word "nan" and
# allocate a fixed-width unicode array sized by the longest line.)
text = df['spoken_words'].fillna('').astype(str).to_numpy()

vect = CountVectorizer(stop_words='english')  # ignore scikit-learn's built-in English stop words
vect = vect.fit(text)  # learn the vocabulary from every spoken line in df

# get_feature_names() was removed in scikit-learn 1.2. Prefer get_feature_names_out()
# and fall back to the old method on releases that predate it. Either way
# feature_names ends up as a plain list of words, ordered by column position.
if hasattr(vect, 'get_feature_names_out'):
    feature_names = vect.get_feature_names_out().tolist()
else:
    feature_names = vect.get_feature_names()
print(f"There are {len(feature_names)} words in the vocabulary. A selection: {feature_names[500:520]}")

There are 38778 words in the vocabulary. A selection: ['abreast', 'abridged', 'abridging', 'abroad', 'abs', 'absa', 'absconded', 'absence', 'absent', 'absentee', 'abso', 'absolut', 'absolute', 'absolutely', 'absolution', 'absolve', 'absolved', 'absorb', 'absorbativity', 'absorbed']


In [13]:
# Encode every entry of `text` against the fitted vocabulary and keep the resulting matrix.
docu_feat = vect.transform(raw_documents=text)

In [14]:
# Print the stored entries that fall in the top-left 500x500 corner of the matrix.
top_left = docu_feat[:500, :500]
print(top_left)

  (48, 425)	1
  (53, 474)	1
  (289, 468)	1
  (387, 5)	1
  (444, 277)	1
  (486, 401)	1


In [None]:
# Attach the word counts to the original rows, one column per vocabulary word.
# The counts are kept sparse: docu_feat.toarray() would allocate a dense
# (rows x vocabulary) array, which runs to tens of gigabytes for this corpus.
# The word frame reuses df's index so concat lines the rows up without NaNs
# (and without the stray 'index' column that reset_index() used to add).
word_counts = pd.DataFrame.sparse.from_spmatrix(docu_feat, index=df.index, columns=feature_names)
df_words = pd.concat([df, word_counts], axis=1)
df_words.head(5)