In [169]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

In [2]:
script_df = pd.read_csv('script.csv')

In [3]:
script_lab = script_df[['cast','dialog']]

In [239]:
#Do the filtering
min_samples = 1000
def filter_characters(df,min_samples):
    """Only allow classification of characters with at least some minimum number of samples."""
    df_cnts = df.groupby('cast').count()
    main_chars = df_cnts[df_cnts['dialog']>=min_samples].index
    return script_lab[script_lab['cast'].isin(main_chars)].copy()

def fix_chars(mySeries): 
    """Remove any chars in the dialogue string which are irrelevant"""
    mySeries = mySeries.str.replace(("\[.*\]"),'')
    return mySeries.apply(str)

def filter_length(df,min_words,max_words): 
    """Only allow samples between a min and max length. """
    num_words = df['dialog'].apply(lambda x: len(x.split()))
    return df[(num_words>=5) & (num_words<=50)]
    
    

In [237]:
script_filt1 = filter_characters(script_lab,min_samples)
script_filt1['dialog'] = fix_chars(script_filt1['dialog'])

min_len = 3 
max_len = 50
script_filt2 = filter_length(script_filt1,min_len,max_len)


In [171]:
# bag of words 
def get_vector_representation(script_df,tfidf=False): 
    """Change a dataframe of cast lines into a matrix representation of those lines, and a matrix representation of who said them."""
    
    corpus = script_df['dialog']
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(corpus)
    
    if tfidf==True: 
        transformer = TfidfTransformer()
        X = transformer.fit_transform(X)
    
    Y = pd.get_dummies(script_df['cast'])
    
    return X,Y 

In [172]:
X,Y = get_vector_representation(script_filt,tfidf=True)

In [184]:
from sklearn.model_selection import train_test_split 
xtrain, xtest, ytrain, ytest = train_test_split(X,Y,test_size=.2,random_state=1930)


In [185]:
num_training, num_features = xtrain.get_shape()

In [201]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

model = Sequential()
model.add(Dense(units=1200, activation='sigmoid',input_shape=(num_features,)))
#model.add(Dropout(.2,noise_shape=None, seed=None))
#model.add(Dense(units=4200, activation='sigmoid'))
#model.add(Dense(units=500, activation='softmax'))
#model.dropout(.2)
model.add(Dense(units=240, activation='tanh'))
model.add(Dense(units=5, activation='relu'))



In [202]:
model.compile(loss='categorical_crossentropy',optimizer='sgd',metrics=['accuracy'])

In [203]:
model.fit(xtrain,ytrain,epochs=1,batch_size=32)

Epoch 1/1


<tensorflow.python.keras.callbacks.History at 0x5c58e4358>

In [162]:
model.evaluate(xtest,ytest,batch_size=128)



[7.802059611640013, 0.43508951413357044]

In [95]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_39 (Dense)             (None, 5000)              101285000 
_________________________________________________________________
dense_40 (Dense)             (None, 780)               3900780   
_________________________________________________________________
dense_41 (Dense)             (None, 120)               93720     
_________________________________________________________________
dense_42 (Dense)             (None, 5)                 605       
Total params: 105,280,105
Trainable params: 105,280,105
Non-trainable params: 0
_________________________________________________________________
