In [20]:
import pandas as pd
import nltk
# Load the corpus from CSV; skipinitialspace makes pandas skip the spaces
# that directly follow a delimiter.
df = pd.read_csv(
    "princess_corpus.csv",
    skipinitialspace=True,
)
# Preview the first rows to check that the columns were parsed as expected.
df.head()


Unnamed: 0,Disney_Period,Text,Speaker_Status,Movie,Speaker,Year,UTTERANCE_NUMBER
0,EARLY,slave in the magic mirror come from the farthe...,NON-P,Snow White,queen,1937,1
1,EARLY,"what wouldst thou know, my queen ?",NON-P,Snow White,mirror,1937,2
2,EARLY,"magic mirror on the wall, who is the fairest o...",NON-P,Snow White,queen,1937,3
3,EARLY,"famed is thy beauty, majesty. but hold, a love...",NON-P,Snow White,mirror,1937,4
4,EARLY,alas for her ! reveal her name.,NON-P,Snow White,queen,1937,5


In [21]:
from nltk.corpus import stopwords
from pymorphy2 import MorphAnalyzer
from nltk import wordpunct_tokenize
# Module-level morphological analyzer; lemmatize() calls morph.parse() on every token.
# NOTE(review): pymorphy2 targets Russian/Ukrainian morphology, while the text
# shown in this notebook is English — confirm this is the intended lemmatizer.
morph = MorphAnalyzer()
# Set of NLTK's English stop words.
# NOTE(review): `stops` is not referenced by any cell visible here — confirm it
# is used elsewhere or was meant to be applied during vectorization.
stops = set(stopwords.words('english'))


def lemmatize(x):
    """Return the alphabetic tokens of ``x`` in normal form, joined by spaces.

    Any value whose type is not exactly ``str`` yields an empty string.
    The text is tokenized with ``wordpunct_tokenize``; tokens that are not
    purely alphabetic (punctuation, digits, mixed tokens) are dropped, and
    each remaining token is replaced by the ``normal_form`` of its first
    ``morph.parse`` result.
    """
    if type(x) is not str:
        return ""
    normal_forms = (
        morph.parse(token)[0].normal_form
        for token in wordpunct_tokenize(x)
        if token.isalpha()
    )
    return " ".join(normal_forms)


In [22]:
from pymystem3 import Mystem
# NOTE(review): `m` is not referenced by any cell visible here — confirm the
# Mystem instance is still needed.
m = Mystem()


def parse(x):
    """Return the part-of-speech tags of ``x`` as one space-separated string.

    ``x`` is split on whitespace and the resulting tokens are tagged with
    ``nltk.pos_tag``; only the tag of each (token, tag) pair is kept, in
    token order.
    """
    tagged_tokens = nltk.pos_tag(x.split())
    return ' '.join(tag for _, tag in tagged_tokens)


def length(x):
    """Return the number of items in ``x`` (characters, for a string).

    Values that have no length — such as ``None`` or the float ``NaN`` that
    pandas uses for a missing cell — count as 0 instead of raising
    ``TypeError``. This mirrors ``lemmatize``, which maps non-string input to
    an empty string.
    """
    try:
        return len(x)
    except TypeError:
        # Unsized input (e.g. a missing value): treat it as empty.
        return 0




In [23]:
# Measure each raw utterance first: the next statement overwrites 'Text', so
# the length has to be taken before lemmatization.
df['Length'] = df['Text'].apply(length)
# Replace each utterance with the output of lemmatize() (alphabetic tokens only).
# NOTE(review): re-running this cell recomputes 'Length' from the already
# lemmatized text, so the column changes on a second execution.
df['Text'] = df['Text'].apply(lemmatize)



In [24]:
# Tag the current 'Text' column: 'POS' holds one space-separated tag string per row.
df['POS'] = df['Text'].apply(parse)
# Display the frame to inspect the added 'Length' and 'POS' columns.
df


Unnamed: 0,Disney_Period,Text,Speaker_Status,Movie,Speaker,Year,UTTERANCE_NUMBER,Length,POS
0,EARLY,slave in the magic mirror come from the farthe...,NON-P,Snow White,queen,1937,1,125,NN IN DT JJ NN VBN IN DT JJS NN IN NN CC NN NN...
1,EARLY,what wouldst thou know my queen,NON-P,Snow White,mirror,1937,2,35,WP VBZ NN VB PRP$ NN
2,EARLY,magic mirror on the wall who is the fairest on...,NON-P,Snow White,queen,1937,3,58,JJ NN IN DT NN WP VBZ DT JJS CD IN DT
3,EARLY,famed is thy beauty majesty but hold a lovely ...,NON-P,Snow White,mirror,1937,4,130,VBN VBZ JJ NN NN CC VBP DT JJ NN NN VBP NNS VB...
4,EARLY,alas for her reveal her name,NON-P,Snow White,queen,1937,5,32,NN IN PRP$ NN PRP$ NN
...,...,...,...,...,...,...,...,...,...
7743,LATE,we are never closing them again,PRINCESS,Frozen,elsa,2013,984,33,PRP VBP RB VBG PRP RB
7744,LATE,form on anna s boots,PRINCESS,Frozen,elsa,2013,985,21,NN IN NN NN NNS
7745,LATE,what oh elsa they re beautiful but you know i ...,PRINCESS,Frozen,anna,2013,986,60,WP VBZ NN PRP VBP JJ CC PRP VBP JJ VBP JJ NN
7746,LATE,look out reindeer coming through,NON-P,Frozen,kristoff,2013,987,35,VB RP NN VBG IN


In [25]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer


In [26]:
# Keep only the rows whose 'Disney_Period' label is EARLY.
period_early = df.loc[df["Disney_Period"].eq("EARLY")]
period_early


Unnamed: 0,Disney_Period,Text,Speaker_Status,Movie,Speaker,Year,UTTERANCE_NUMBER,Length,POS
0,EARLY,slave in the magic mirror come from the farthe...,NON-P,Snow White,queen,1937,1,125,NN IN DT JJ NN VBN IN DT JJS NN IN NN CC NN NN...
1,EARLY,what wouldst thou know my queen,NON-P,Snow White,mirror,1937,2,35,WP VBZ NN VB PRP$ NN
2,EARLY,magic mirror on the wall who is the fairest on...,NON-P,Snow White,queen,1937,3,58,JJ NN IN DT NN WP VBZ DT JJS CD IN DT
3,EARLY,famed is thy beauty majesty but hold a lovely ...,NON-P,Snow White,mirror,1937,4,130,VBN VBZ JJ NN NN CC VBP DT JJ NN NN VBP NNS VB...
4,EARLY,alas for her reveal her name,NON-P,Snow White,queen,1937,5,32,NN IN PRP$ NN PRP$ NN
...,...,...,...,...,...,...,...,...,...
1320,EARLY,i know you i walked with you once upon a dream,NON-P,Sleeping Beauty,choir,1959,458,48,NN VBP PRP VBP VBN IN PRP RB IN DT NN
1321,EARLY,blue,NON-P,Sleeping Beauty,merryweather,1959,459,6,NN
1322,EARLY,i know you the gleam in your eyes is so famili...,NON-P,Sleeping Beauty,choir,1959,460,58,NN VBP PRP DT NN IN PRP$ NNS VBZ RB JJ DT NN
1323,EARLY,and i know it s true that visions are seldom a...,NON-P,Sleeping Beauty,choir,1959,461,99,CC NN VBP PRP VBZ JJ IN NNS VBP RB DT PRP VBP ...


In [27]:
# TF-IDF weights fitted on the EARLY-period utterances only.
tfIdfTransformer = TfidfTransformer(use_idf=True)
countVectorizer = CountVectorizer()
wordCount = countVectorizer.fit_transform(period_early["Text"])
newTfIdf = tfIdfTransformer.fit_transform(wordCount)
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; use get_feature_names_out() when it exists and fall back to
# the old method on older installations.
if hasattr(countVectorizer, "get_feature_names_out"):
    featureNames = countVectorizer.get_feature_names_out()
else:
    featureNames = countVectorizer.get_feature_names()
# NOTE(review): newTfIdf[0] selects only the first utterance of the period, so
# the table ranks the terms of that single utterance (all other vocabulary
# terms score 0) — confirm a per-period aggregate was not intended.
dfnew = pd.DataFrame(
    newTfIdf[0].T.todense(), index=featureNames, columns=["TF-IDF"])
dfnew = dfnew.sort_values('TF-IDF', ascending=False)
print(dfnew.head(25))


            TF-IDF
summon    0.271529
farthest  0.271529
darkness  0.271529
slave     0.271529
space     0.271529
wind      0.256843
speak     0.256843
face      0.246423
mirror    0.246423
thy       0.238341
thee      0.238341
magic     0.194017
through   0.194017
from      0.186363
the       0.185610
see       0.162131
let       0.159573
me        0.144343
come      0.142756
in        0.124776
and       0.106590
proper    0.000000
psst      0.000000
prove     0.000000
proud     0.000000




In [28]:
# Keep only the rows whose 'Disney_Period' label is MID.
period_mid = df.loc[df["Disney_Period"].eq("MID")]
period_mid


Unnamed: 0,Disney_Period,Text,Speaker_Status,Movie,Speaker,Year,UTTERANCE_NUMBER,Length,POS
1325,MID,i ll tell you a tale of the bottomless blue an...,NON-P,The Little Mermaid,sailors,1989,1,157,JJ VBP VBP PRP DT NN IN DT NN NN CC PRP VBZ NN...
1326,MID,isn t this great the salty sea air the wind bl...,PRINCE,The Little Mermaid,eric,1989,2,101,NN NN DT JJ DT NN NN NN DT NN NN IN PRP$ NN DT...
1327,MID,oh yes delightful,NON-P,The Little Mermaid,grimsby,1989,3,32,UH UH JJ
1328,MID,a fine strong wind and a following sea king tr...,NON-P,The Little Mermaid,sailor 1,1989,4,84,DT JJ JJ NN CC DT JJ NN VBG NN MD VB IN DT NN NN
1329,MID,king triton,PRINCE,The Little Mermaid,eric,1989,5,13,VBG NN
...,...,...,...,...,...,...,...,...,...
5475,MID,aaaaahhhhaaaaaaaa whoohoohoohoohoooo,NON-P,Mulan,mushu,1998,550,41,NN NN
5476,MID,take it crickee,NON-P,Mulan,mushu,1998,551,18,VB PRP VB
5477,MID,you know she gets it from my side of the family,NON-P,Mulan,ancestor #2,1998,552,50,PRP VBP PRP VBZ PRP IN PRP$ NN IN DT NN
5478,MID,guardians,NON-P,Mulan,great ancestor,1998,553,11,NNS


In [29]:
# TF-IDF weights fitted on the MID-period utterances only.
tfIdfTransformer = TfidfTransformer(use_idf=True)
countVectorizer = CountVectorizer()
wordCount = countVectorizer.fit_transform(period_mid["Text"])
newTfIdf = tfIdfTransformer.fit_transform(wordCount)
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; use get_feature_names_out() when it exists and fall back to
# the old method on older installations.
if hasattr(countVectorizer, "get_feature_names_out"):
    featureNames = countVectorizer.get_feature_names_out()
else:
    featureNames = countVectorizer.get_feature_names()
# NOTE(review): newTfIdf[0] selects only the first utterance of the period, so
# the table ranks the terms of that single utterance — confirm a per-period
# aggregate was not intended.
dfnew = pd.DataFrame(
    newTfIdf[0].T.todense(), index=featureNames, columns=["TF-IDF"])
dfnew = dfnew.sort_values('TF-IDF', ascending=False)
print(dfnew.head(25))


              TF-IDF
starboard   0.276724
fathoms     0.263737
waitin      0.263737
below       0.263737
bottomless  0.263737
heave       0.254522
tale        0.247374
blue        0.241534
mermaid     0.241534
mysterious  0.236596
lad         0.225172
ho          0.222119
tell        0.171154
the         0.169859
hey         0.158564
you         0.148811
look        0.146940
out         0.139515
ll          0.130355
for         0.122583
in          0.118782
be          0.116983
of          0.110387
and         0.103814
it          0.095611




In [30]:
# Keep only the rows whose 'Disney_Period' label is LATE.
period_late = df.loc[df["Disney_Period"].eq("LATE")]
period_late


Unnamed: 0,Disney_Period,Text,Speaker_Status,Movie,Speaker,Year,UTTERANCE_NUMBER,Length,POS
5480,LATE,evening star is shining bright so make a wish ...,NON-P,The Princess and the Frog,singing,2009,1,125,VBG NN VBZ VBG RB RB VB DT NN CC VB IN JJ EX J...
5481,LATE,just in that moment the ugly little frog looke...,NON-P,The Princess and the Frog,eudora,2009,2,212,RB IN DT NN DT RB JJ NN VBD RP IN PRP$ JJ NN N...
5482,LATE,there comes my favorite part,NON-P,The Princess and the Frog,charlotte,2009,3,30,RB VBZ PRP$ JJ NN
5483,LATE,and the beautiful princess was so moved by his...,NON-P,The Princess and the Frog,eudora,2009,4,299,CC DT JJ NN VBD RB VBN IN PRP$ JJ NN IN PRP VB...
5484,LATE,yay read it again read it again,NON-P,The Princess and the Frog,charlotte,2009,5,35,RB VB PRP RB VBD PRP RB
...,...,...,...,...,...,...,...,...,...
7743,LATE,we are never closing them again,PRINCESS,Frozen,elsa,2013,984,33,PRP VBP RB VBG PRP RB
7744,LATE,form on anna s boots,PRINCESS,Frozen,elsa,2013,985,21,NN IN NN NN NNS
7745,LATE,what oh elsa they re beautiful but you know i ...,PRINCESS,Frozen,anna,2013,986,60,WP VBZ NN PRP VBP JJ CC PRP VBP JJ VBP JJ NN
7746,LATE,look out reindeer coming through,NON-P,Frozen,kristoff,2013,987,35,VB RP NN VBG IN


In [31]:
# TF-IDF weights fitted on the LATE-period utterances only.
tfIdfTransformer = TfidfTransformer(use_idf=True)
countVectorizer = CountVectorizer()
wordCount = countVectorizer.fit_transform(period_late["Text"])
newTfIdf = tfIdfTransformer.fit_transform(wordCount)
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; use get_feature_names_out() when it exists and fall back to
# the old method on older installations.
if hasattr(countVectorizer, "get_feature_names_out"):
    featureNames = countVectorizer.get_feature_names_out()
else:
    featureNames = countVectorizer.get_feature_names()
# NOTE(review): newTfIdf[0] selects only the first utterance of the period, so
# the table ranks the terms of that single utterance (all other vocabulary
# terms score 0) — confirm a per-period aggregate was not intended.
dfnew = pd.DataFrame(
    newTfIdf[0].T.todense(), index=featureNames, columns=["TF-IDF"])
dfnew = dfnew.sort_values('TF-IDF', ascending=False)
print(dfnew.head(25))


               TF-IDF
shining      0.307719
tight        0.281169
bright       0.265639
happen       0.259735
air          0.254620
evening      0.254620
star         0.250109
and          0.234727
wish         0.233186
tonight      0.233186
hold         0.228071
magic        0.225749
anything     0.214171
make         0.201522
there        0.161073
so           0.157879
can          0.150578
on           0.145821
in           0.140359
is           0.133904
the          0.101305
prepared     0.000000
practically  0.000000
practicing   0.000000
pre          0.000000


