# Part 4 - Create a model to implement in Streamlit

1. Preprocessing?
1. Build a Corex Model
1. Test the Model with example sentences



In [1]:
import numpy as np
import re
import string
import nltk
import sklearn
from nltk.tokenize import MWETokenizer 
from nltk import word_tokenize,sent_tokenize
from nltk.corpus import stopwords
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
pd.set_option('display.max_rows', 500)

from corextopic import corextopic
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity

import scipy.sparse as ss

In [137]:
# Load the starting DataFrame from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle files from a trusted source.
df = pd.read_pickle('initial_test_dataframe.pkl')


In [139]:
# Same derivation as on the data page.
# The 'season' code is sliced positionally: the first two characters become the
# season number and the last two characters become the episode number.
df['season_number'] = df['season'].map(lambda code: int(code[:2]))
df['episode_number'] = df['season'].map(lambda code: int(code[-2:]))
# Keep every row except those belonging to season 2.
df = df.loc[df['season_number'] != 2]

In [84]:
# Work on an independent (deep) copy so the loaded frame stays untouched.
test_df = df.copy(deep=True)

==============================================================================================================

## 1. Preprocessing

#### A. Remove text inside parentheses or square brackets, which indicates stage directions

In [85]:
def parens(x):
    """Strip bracketed stage directions from a line of dialogue.

    Every span that opens with ``(`` or ``[`` and runs up to the nearest ``)``
    or ``]`` is deleted (the match is non-greedy and does not cross newlines).
    Text outside the brackets, including surrounding spaces, is left as is.

    Parameters
    ----------
    x : str
        A single line of dialogue.

    Returns
    -------
    str
        The line with all bracketed spans removed.
    """
    # Raw string: '\(' and '\[' are not valid Python string escapes, so the
    # non-raw form emits an invalid-escape warning on modern Python versions.
    return re.sub(r"[\(\[].*?[\)\]]", "", x)
# Build the cleaned-dialogue column by stripping bracketed spans from each raw line.
test_df['dial_clean'] = test_df['dialogue'].apply(parens)


#### B. Remove any lines where the word count is 1 or less

In [86]:
# Keep only lines longer than one word. The explicit .copy() makes the filtered
# frame independent, so the later column assignments on test_df do not trigger
# pandas' SettingWithCopyWarning.
test_df = test_df[test_df['sentence_length'] > 1].copy()

#### C. Remove the main character names from the dialogue, then append the speaker to the dialogue so we know who said what. 

In [87]:
# Every distinct speaker name, in order of first appearance.
characters = [name for name in test_df['speaker'].unique()]
def remove_name(sentence, names=None):
    """Drop character names from a sentence after blanking out punctuation.

    Each ASCII punctuation character is first replaced by a space, the result
    is split on single spaces, and every token that exactly matches (case
    sensitively) one of ``names`` is discarded. Each surviving token is written
    back followed by one space, so the result keeps a trailing space and any
    empty tokens produced by consecutive spaces.

    Parameters
    ----------
    sentence : str
        The text to clean.
    names : iterable of str, optional
        Names to remove. Defaults to the notebook-level ``characters`` list.

    Returns
    -------
    str
        The sentence without punctuation and without the given names.
    """
    if names is None:
        names = characters
    # Set membership is constant-time, unlike scanning a list once per word.
    blocked = frozenset(names)
    spaced = re.sub('[%s]' % re.escape(string.punctuation), ' ', sentence)
    return ''.join(word + ' ' for word in spaced.split(' ') if word not in blocked)

In [88]:
# Strip punctuation and speaker names from every cleaned line.
test_df['dial_clean'] = test_df['dial_clean'].map(remove_name)


In [89]:
# Speaker-tagged copy of the frame (used for the Word Cloud): each cleaned line
# is prefixed with the name of the character who said it.
test_df_char = test_df.assign(dial_clean=test_df['speaker'] + ' ' + test_df['dial_clean'])

#### D. Remove some choice stop words - Hold off on this for now

In [90]:
# Start from NLTK's English stop words ...
stop_word = stopwords.words('english')
# ... but keep pronouns, question words, and negations in the text.
accepted_words = [
    'i', 'me', 'my', 'myself', 'we', 'our', 'you',
    'he', 'him', 'his', 'she', 'her', 'they', 'them',
    'what', 'which', 'who', 'whom',
    'but', 'because', 'against',
    'when', 'where', 'why', 'how',
    'no', 'nor', 'not',
]
# The stop words actually removed are the NLTK ones minus the accepted words.
final_stops = list(filter(lambda word: word not in accepted_words, stop_word))
print(final_stops)

['ours', 'ourselves', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'himself', "she's", 'hers', 'herself', 'it', "it's", 'its', 'itself', 'their', 'theirs', 'themselves', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'if', 'or', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doe

#### E. Remove punctuation and any tokens containing digits, and lowercase the text

In [91]:
def alphanumeric(x):
    """Replace every token that contains at least one digit with a single space.

    A token here is a run of word characters (letters, digits, underscore);
    tokens made only of letters are left untouched.
    """
    # Raw string: '\w' and '\d' are not valid Python string escapes, so the
    # non-raw form emits an invalid-escape warning on modern Python versions.
    return re.sub(r'\w*\d\w*', ' ', x)


def punc_lower(x):
    """Lowercase the text and replace each ASCII punctuation character with a space.

    Only the characters in ``string.punctuation`` are replaced, so non-ASCII
    punctuation such as curly quotes is kept.
    """
    return re.sub('[%s]' % re.escape(string.punctuation), ' ', x.lower())

# Drop digit-bearing tokens first, then lowercase and strip punctuation.
test_df['dial_clean'] = test_df['dial_clean'].apply(alphanumeric).apply(punc_lower)
# Move the current index into a column and renumber the rows from 0.
test_df = test_df.reset_index()

#### F. Output the final DataFrame to a pickle file

In [132]:
# Discard the columns that are not needed in the exported frame.
test_df_char = test_df_char.drop(columns=['before_speaker', 'after_speaker', 'season'])

In [136]:
# Spot-check two randomly chosen rows (unseeded, so the rows differ between runs).
test_df_char.sample(n=2)

Unnamed: 0,episode,speaker,dialogue,writers,sentence_length,season_number,episode_number,dial_clean
496,The One Where Rachel Has A Baby,Phoebe,"I just can’t decide who she looks more alike,...",Scott Silveri,12,8,23,Phoebe I just can’t decide who she looks more...
95,The One With the Princess Leia Fantasy,Rachel,"You really, really need to get some sleep, ho...",Michael Curtis and Gregory S. Malins,9,3,1,Rachel You really really need to get some sl...


In [128]:
# Persist the speaker-tagged frame to a pickle file in the working directory.
# NOTE(review): the recorded execution counts show this cell ran before the column
# drop above, so re-run the notebook top to bottom before relying on the saved file.
test_df_char.to_pickle('streamlit_clean_dataframe.pkl')

===================================================================================================

## 2. Character-Based Analysis

#### A. Build the common functions

In [93]:
def vectorize(dataframe, column, vectorizer, stops, min_df = 0, max_df = 1.0, n_grams = (1, 1)):
    """Fit a binary bag-of-words or TF-IDF vectorizer on one text column.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding the documents.
    column : str
        Name of the text column to vectorize.
    vectorizer : {"cv", "tfidf"}
        "cv" selects ``CountVectorizer``; "tfidf" selects ``TfidfVectorizer``.
    stops : list of str or None
        Stop words forwarded to the vectorizer.
    min_df, max_df : int or float
        Document-frequency cut-offs forwarded to the vectorizer.
    n_grams : tuple of (int, int)
        Forwarded as ``ngram_range``.

    Returns
    -------
    doc_word : sparse matrix
        The fitted document-term matrix.
    feature_names : list
        Vocabulary terms in column order.
    id2word : dict
        Mapping from column index to term.

    Raises
    ------
    ValueError
        If ``vectorizer`` is neither "cv" nor "tfidf".
    """
    # Fail early with a clear message instead of hitting an unbound `vec` below.
    if vectorizer not in ("cv", "tfidf"):
        raise ValueError(
            "vectorizer must be 'cv' or 'tfidf', got {!r}".format(vectorizer)
        )

    # An integer threshold of 0 documents keeps the same terms as a threshold of 1,
    # and newer scikit-learn releases reject the integer 0 during validation.
    if isinstance(min_df, int) and min_df == 0:
        min_df = 1

    vectorizer_cls = CountVectorizer if vectorizer == "cv" else TfidfVectorizer
    vec = vectorizer_cls(stop_words=stops, binary=True, min_df=min_df,
                         max_df=max_df, ngram_range=n_grams)

    doc_word = vec.fit_transform(dataframe[column])

    # get_feature_names() was removed in scikit-learn 1.2; use its replacement when
    # present and convert to a list so the return type stays the same.
    if hasattr(vec, "get_feature_names_out"):
        feature_names = list(vec.get_feature_names_out())
    else:
        feature_names = vec.get_feature_names()

    id2word = {index: term for term, index in vec.vocabulary_.items()}

    return doc_word, feature_names, id2word

#### B. Applying the Corex Model to see if I can accurately separate the characters

In [130]:
# Lowercased speaker names, matching the casing of the vectorizer vocabulary.
chars_anchors = list(map(lambda name: name.lower(), characters))

In [131]:
#Here is where I build the model
# Binary bag-of-words over the speaker-tagged dialogue.
vec1 = CountVectorizer(stop_words=final_stops, binary=True)
docword1 = vec1.fit_transform(test_df_char['dial_clean'])
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# present and keep featnames1 a plain list either way.
if hasattr(vec1, 'get_feature_names_out'):
    featnames1 = list(vec1.get_feature_names_out())
else:
    featnames1 = vec1.get_feature_names()

# One hidden topic per main character; each topic is anchored to that character's name.
model_char = corextopic.Corex(n_hidden=6, words=featnames1, seed=1, max_iter=200)
model_char.fit(docword1, words=featnames1, anchor_strength=20,
               anchors=['joey', 'phoebe', 'monica', 'chandler', 'ross', 'rachel'])

topics = model_char.get_topics(n_words=10)
print("TC: ", model_char.tc)
for n, topic in enumerate(topics):
    # Take the word (first field) of each entry rather than unpacking a fixed number
    # of fields, so entries that carry extra fields still work.
    topic_words = [entry[0] for entry in topic]
    print('{}: \n'.format(n) + ', '.join(topic_words))



TC:  52.5659814018552
0: 
joey, hey, man, dude, audition, ya, agent, um, actor, whoa
1: 
phoebe, ooh, frank, cause, ursula, alice, pheebs, okay, mike, sergei
2: 
monica, yeah, pete, sweetie, we, no, wedding, guest, kinda, chip
3: 
chandler, umm, joe, janice, yes, wait, kathy, gym, wow, babe
4: 
ross, uh, carol, ben, emily, marcel, susan, paleontology, hanukkah, sister
5: 
rachel, honey, oh, god, hi, barry, thank, rach, ohh, joshua


In [101]:
# First element of predict_proba's result, laid out with one column per topic.
char_prediction = pd.DataFrame(
    data=model_char.predict_proba(docword1)[0],
    columns=['topic' + str(topic_id) for topic_id in range(6)],
)
char_prediction

Unnamed: 0,topic0,topic1,topic2,topic3,topic4,topic5
0,0.999999,0.000001,0.000001,0.000001,0.000001,0.000001
1,0.000001,0.999999,0.000001,0.000001,0.000001,0.000001
2,0.000001,0.999999,0.000001,0.000001,0.000001,0.000001
3,0.000001,0.000001,0.999999,0.000001,0.000001,0.000001
4,0.000001,0.000001,0.000001,0.999999,0.000001,0.000001
...,...,...,...,...,...,...
28231,0.999999,0.000001,0.000001,0.000001,0.000001,0.000001
28232,0.000001,0.999999,0.000001,0.000001,0.000001,0.000001
28233,0.999999,0.000001,0.000001,0.000001,0.000001,0.000001
28234,0.000001,0.999999,0.000001,0.000001,0.000001,0.000001


## 3. Test out a few sample sentences

In [122]:
# Three free-text sample answers; they are concatenated into one document below
# and scored against the fitted character model.
s1 = "Interesting, I would have to say seafood"
s2 = "How you doin?"
s3 = "I would say being a doctor has given me a great opportunity to save people's lives while also being a student forever because you keep learning."

In [141]:
# Join the three sample sentences into one document held in a one-row DataFrame,
# then clean it with the same two steps used on the training dialogue.
pred_char_df = pd.DataFrame({'combine_sentence': [' '.join((s1, s2, s3))]})
pred_char_df['combine_sentence'] = pred_char_df['combine_sentence'].apply(alphanumeric).apply(punc_lower)
pred_char_df = pred_char_df.reset_index()

pred_char_df

Unnamed: 0,index,combine_sentence
0,0,interesting i would have to say seafood how y...


In [144]:
#Build the Sparse Matrix from the combined sentence
# transform() (not fit_transform) reuses the vocabulary vec1 learned from the
# training dialogue, so the columns line up with what model_char was fitted on.
pred_doc = vec1.transform(pred_char_df['combine_sentence'])
pred_doc

<1x10435 sparse matrix of type '<class 'numpy.int64'>'
	with 20 stored elements in Compressed Sparse Row format>

In [145]:
#Create a prediction
# predict_proba returns a pair of arrays; the first is the one used above as
# per-topic probabilities.
# NOTE(review): confirm the meaning of the second array against the corextopic docs.
model_char.predict_proba(pred_doc)

(array([[1.e-06, 1.e-06, 1.e-06, 1.e-06, 1.e-06, 1.e-06]]),
 array([[2.5773616 , 3.09894896, 3.60197019, 3.85048332, 3.60399848,
         4.11179904]]))

In [116]:
# Index of the smallest value in the second array returned by predict_proba.
# NOTE(review): this is presumably meant as the predicted character topic — confirm
# that the minimum (rather than the maximum) of this array is the right criterion.
np.argmin(model_char.predict_proba(pred_doc)[1])

0