# Final Project

For this project, I am attempting to create a model that takes a line from the movie *Finding Dory* and predicts which character said it.

## Loading packages

In [30]:
# %load_ext autoreload
# %autoreload 2
import numpy as np
import string
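# NOTE: the "GPT Method" section uses `tf` and `layers`; uncomment these two imports before running it
# (the `models` dict defined later in this notebook will then shadow `tensorflow.keras.models`).
# import tensorflow as tf
# from tensorflow.keras import layers, models, losses, callbacks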
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, LeaveOneOut, cross_val_score, KFold, GridSearchCV
from sklearn.ensemble import VotingClassifier, BaggingRegressor, BaggingClassifier, RandomForestRegressor, RandomForestClassifier, AdaBoostClassifier, AdaBoostRegressor, GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier, plot_tree, export_text
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge, Lasso
from sklearn.metrics import confusion_matrix, classification_report, precision_score, get_scorer_names, mean_squared_error, r2_score, mean_squared_error, roc_auc_score, ConfusionMatrixDisplay, accuracy_score
from xgboost import XGBClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt

In [32]:
# One-time setup: uncomment (and add `import nltk`) to download the tokenizer and stop-word data
# nltk.download('punkt_tab')
# nltk.download('stopwords')

## Loading and preprocessing data

In [35]:
# Read the movie script into a DataFrame and preview its first five rows
script = pd.read_csv(filepath_or_buffer='finding_dory.csv')
script.head(n=5)

Unnamed: 0,name,line
0,Young Dory,"Hi, I'm Dory. I suffer from short-term remembe..."
1,Jenny,Yes!
2,Charlie,That's exactly what you say!
3,Jenny,"Okay, okay. We'll pretend to be the other kids..."
4,Charlie,Ahoy there! Do you wanna play hide and seek?


In [37]:
# Build the English stop-word set once: clean_script runs on every row of the
# script, and the set was previously rebuilt on every single call.
ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))


# function to clean script
def clean_script(text):
    """Lower-case a line and drop stop words and non-alphanumeric tokens.

    Parameters
    ----------
    text : str
        One raw line of dialogue.

    Returns
    -------
    str
        The tokens that survive filtering, joined by single spaces
        (an empty string when nothing survives).
    """
    tokens = word_tokenize(text.lower())
    # str.isalnum() is False for punctuation tokens, so this removes them too
    kept_tokens = [
        token for token in tokens
        if token.isalnum() and token not in ENGLISH_STOP_WORDS
    ]
    return ' '.join(kept_tokens)


# filter out stopwords and punctuation
script['filtered_line'] = script['line'].apply(clean_script)

In [39]:
# keep main characters that have over 30 lines
line_counts = script['name'].value_counts()
main_characters = line_counts.loc[line_counts > 30].index
filtered_script = script.loc[script['name'].isin(main_characters)]

# Summarize the effect of the filter: (format string, value) pairs printed in order
summary = [
    ('Original number of characters: {}', len(script['name'].unique())),
    ('Original number of lines: {} \n', len(script)),
    ('Number of main characters: {}', len(filtered_script['name'].unique())),
    ('Number of filtered lines: {} \n', len(filtered_script)),
]
for template, value in summary:
    print(template.format(value))

print(filtered_script['name'].unique())

Original number of characters: 74
Original number of lines: 1284 

Number of main characters: 10
Number of filtered lines: 1049 

['Young Dory' 'Jenny' 'Charlie' 'Dory' 'Marlin' 'Nemo' 'Hank' 'Fluke'
 'Destiny' 'Bailey']


In [41]:
# Number of lines spoken by each main character (shows how imbalanced the classes are)
filtered_script['name'].value_counts()

name
Dory          359
Marlin        162
Hank          119
Destiny        84
Nemo           72
Bailey         66
Charlie        58
Jenny          55
Young Dory     40
Fluke          34
Name: count, dtype: int64

## Using Ensemble Methods

In [44]:
# Load vectorizers
# Binary (presence/absence) bag-of-n-grams vectorizers for n = 1, 2, 3
unigram_vectorizer, bigram_vectorizer, trigram_vectorizer = (
    CountVectorizer(binary=True, ngram_range=(n, n)) for n in (1, 2, 3)
)
# TF-IDF weighted vectorizer with default settings
tfidf_vectorizer = TfidfVectorizer()

In [70]:
# Loading classifiers
# One shared seed so that every stochastic estimator below gives repeatable results
RANDOM_STATE = 12345

# Multinomial logistic regression.
# `multi_class` is deprecated since scikit-learn 1.5; with a non-liblinear solver
# such as 'newton-cg' and more than two classes, the multinomial loss is already the default.
logreg = LogisticRegression(solver = 'newton-cg')

# Bagging (the base tree and the ensemble were previously unseeded, unlike the models below)
dt = DecisionTreeClassifier(random_state = RANDOM_STATE)
bc = BaggingClassifier(estimator = dt, n_estimators = 50, random_state = RANDOM_STATE)

# Random forest
rf = RandomForestClassifier(n_estimators = 100, random_state = RANDOM_STATE)

# Gradient boosting
gbr = GradientBoostingClassifier(max_depth = 5, n_estimators = 250, random_state = RANDOM_STATE)

# AdaBoosting (boosts the same fully grown decision tree used for bagging)
ada = AdaBoostClassifier(estimator = dt, n_estimators = 250, random_state = RANDOM_STATE)

# XGBoosting
xgb = XGBClassifier(n_estimators = 250, random_state = RANDOM_STATE)

## Analysis: Unfiltered Lines

In [61]:
# Create X and y variables
# Features: one n-gram matrix per vectorizer, fitted on the raw (unfiltered) lines
raw_lines = filtered_script['line']
uni_X, bi_X, tri_X = (
    vectorizer.fit_transform(raw_lines)
    for vectorizer in (unigram_vectorizer, bigram_vectorizer, trigram_vectorizer)
)

# Target: character names encoded as class labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(filtered_script['name'])

In [63]:
# Define the models, keyed by the label used when reporting results
models = dict([
    ('Logistic Regression', logreg),
    ('Bagging Classifier', bc),
    ('Random Forest', rf),
    ('Gradient Boosting', gbr),
    ('AdaBoosting', ada),
    ('XGBoosting', xgb),
])

# Number of folds for cross-validation
cv_folds = 10

In [65]:
def run_model(models, X, y, cv_folds):
    """Cross-validate each model and tabulate its mean accuracy.

    Parameters
    ----------
    models : dict
        Maps a display name to an estimator.
    X, y
        Features and labels handed to ``cross_val_score``.
    cv_folds
        Passed as ``cv`` to ``cross_val_score``.

    Returns
    -------
    pandas.DataFrame
        One row per model with columns ``model`` and ``mean_accuracy_score``.
    """
    rows = [
        {
            'model': model_name,
            'mean_accuracy_score': np.mean(
                cross_val_score(model, X, y, cv=cv_folds, scoring='accuracy')
            ),
        }
        for model_name, model in models.items()
    ]
    return pd.DataFrame(rows)

### Comparing models with unigrams, bigrams, and trigrams

In [68]:
# Cross-validated accuracy of every model on the unigram features
run_model(models=models, X=uni_X, y=y, cv_folds=cv_folds)



Unnamed: 0,model,mean_accuracy_score
0,Logistic Regression,0.41283
1,Bagging Classifier,0.417628
2,Random Forest,0.430971
3,Gradient Boosting,0.40619
4,AdaBoosting,0.373745
5,XGBoosting,0.394725


In [None]:
# Hold out 30% of the unigram features as a test set
X_train, X_test, y_train, y_test = train_test_split(uni_X, y, test_size=0.3, random_state=1000)

# Fit every model in `models` on the training split and report its test-set accuracy.
# Looping over the shared dict replaces six copies of the same fit/predict/print code;
# the dict keys are exactly the labels that were printed before.
holdout_predictions = {}
for model_name, model in models.items():
    model.fit(X_train, y_train)
    holdout_predictions[model_name] = model.predict(X_test)
    # '{:.3f}' rounds to three decimals; the previous '{:3f}' only set a minimum width of 3
    print('Test set Accuracy of {}: {:.3f}'.format(
        model_name, accuracy_score(y_test, holdout_predictions[model_name])))

## Analysis: Filtered Lines

In [None]:
# Unigram features of the stop-word-filtered lines, with 30% held out as a test set
filtered_X = unigram_vectorizer.fit_transform(filtered_script['filtered_line'])
X_train, X_test, y_train, y_test = train_test_split(filtered_X, y, test_size=0.3, random_state=1000)

# Fit every model in `models` on the training split and report its test-set accuracy.
# Looping over the shared dict replaces six copies of the same fit/predict/print code;
# the dict keys are exactly the labels that were printed before.
holdout_predictions = {}
for model_name, model in models.items():
    model.fit(X_train, y_train)
    holdout_predictions[model_name] = model.predict(X_test)
    # '{:.3f}' rounds to three decimals; the previous '{:3f}' only set a minimum width of 3
    print('Test set Accuracy of {}: {:.3f}'.format(
        model_name, accuracy_score(y_test, holdout_predictions[model_name])))

In [34]:
# Cross-validated accuracy of every model on unigram features of the filtered lines
filtered_X = unigram_vectorizer.fit_transform(filtered_script['filtered_line'])
run_model(models=models, X=filtered_X, y=y, cv_folds=cv_folds)



Unnamed: 0,model,mean_accuracy_score
0,Logistic Regression,0.418553
1,Bagging Classifier,0.373718
2,Random Forest,0.381364
3,Gradient Boosting,0.394679
4,AdaBoosting,0.355687
5,XGBoosting,0.396621


## Analysis: Using TF-IDF

In [None]:
# TF-IDF features of the stop-word-filtered lines, with 30% held out as a test set
tfidf_X = tfidf_vectorizer.fit_transform(filtered_script['filtered_line'])
X_train, X_test, y_train, y_test = train_test_split(tfidf_X, y, test_size=0.3, random_state=1000)

# Fit every model in `models` on the training split and report its test-set accuracy.
# Looping over the shared dict replaces six copies of the same fit/predict/print code;
# the dict keys are exactly the labels that were printed before.
holdout_predictions = {}
for model_name, model in models.items():
    model.fit(X_train, y_train)
    holdout_predictions[model_name] = model.predict(X_test)
    # '{:.3f}' rounds to three decimals; the previous '{:3f}' only set a minimum width of 3
    print('Test set Accuracy of {}: {:.3f}'.format(
        model_name, accuracy_score(y_test, holdout_predictions[model_name])))

In [38]:
# Cross-validated accuracy of every model on TF-IDF features of the filtered lines
tfidf_X = tfidf_vectorizer.fit_transform(filtered_script['filtered_line'])
run_model(models=models, X=tfidf_X, y=y, cv_folds=cv_folds)



Unnamed: 0,model,mean_accuracy_score
0,Logistic Regression,0.416676
1,Bagging Classifier,0.417619
2,Random Forest,0.412821
3,Gradient Boosting,0.401364
4,AdaBoosting,0.347088
5,XGBoosting,0.401419


# GPT Method

## Setting parameters

In [None]:
# Settings for the GPT-style model.
# TODO: still not sure whether these values should be updated.
VOCAB_SIZE = 10000  # passed as max_tokens to the TextVectorization layer
MAX_LEN = 50  # vectorized sequences are padded/truncated to MAX_LEN + 1 tokens
# NOTE(review): EMBEDDING_DIM through LOAD_MODEL, and EPOCHS, are not referenced
# by any cell visible in this notebook yet.
EMBEDDING_DIM = 256
KEY_DIM = 256
N_HEADS = 2
FEED_FORWARD_DIM = 256
VALIDATION_SPLIT = 0.2
SEED = 42
LOAD_MODEL = False
BATCH_SIZE = 64  # batch size used when building the text dataset
EPOCHS = 25

## Tokenize data

In [None]:
# Tokenize every line with nltk's tokenizer and keep the tokens on the DataFrame
script['line_tokens'] = script['line'].apply(word_tokenize)
# Rebuild each line as a single string of space-separated tokens
mylist = [' '.join(tokens) for tokens in script['line_tokens']]

In [None]:
# Convert to a Tensorflow Dataset: one element per line, grouped into batches,
# then shuffled (the shuffle is applied after batching, so it reorders whole batches)
text_ds = tf.data.Dataset.from_tensor_slices(mylist)
text_ds = text_ds.batch(BATCH_SIZE)
text_ds = text_ds.shuffle(1000)

In [None]:
# Create a vectorization layer
# One token longer than MAX_LEN, so that the one-token-shifted input/target
# pairs built later are each MAX_LEN tokens long
sequence_length = MAX_LEN + 1
vectorize_layer = layers.TextVectorization(
    max_tokens=VOCAB_SIZE,
    standardize="lower",
    output_mode="int",
    output_sequence_length=sequence_length,
)

In [None]:
# Adapt the layer to the training set (the batched script lines in text_ds)
vectorize_layer.adapt(text_ds)
# Keep the vocabulary the layer learned; it is enumerated below as token:word mappings
vocab = vectorize_layer.get_vocabulary()

In [None]:
# Display some token:word mappings (at most the first 15 vocabulary entries)
for i, word in zip(range(15), vocab):
    print(f"{i}: {word}")

In [None]:
# Display an example line from the script and its vector representation
example = mylist[110]  # arbitrary line picked for illustration
example_tokenized = vectorize_layer(example)
# Print the text first, then the integer ids the layer assigned to it
print(example)
print(example_tokenized.numpy())

## Creating the training set

In [None]:
# Create training set of bigrams
def prepare_bigrams(text):
    """Vectorize a batch of lines and pair each token with the token after it.

    Returns a tuple ``(x, y)``: ``x`` is every sequence without its last token,
    ``y`` is the same sequence shifted left by one position.
    """
    tokens = vectorize_layer(tf.expand_dims(text, -1))
    inputs, targets = tokens[:, :-1], tokens[:, 1:]
    return inputs, targets

train_bigrams = text_ds.map(prepare_bigrams)

In [None]:
# Create training set of trigrams
def prepare_trigrams(text):
    """Vectorize a batch of lines and pair each token with the token two steps ahead.

    Returns a tuple ``(x, y)``: ``x`` is every sequence without its last two tokens,
    ``y`` is the same sequence shifted left by two positions.
    """
    tokens = vectorize_layer(tf.expand_dims(text, -1))
    inputs, targets = tokens[:, :-2], tokens[:, 2:]
    return inputs, targets

train_trigrams = text_ds.map(prepare_trigrams)

Showing example input and output for trigram training set:

In [None]:
# Pull a single batch from the trigram training set as an (input, target) pair
example_input_output = train_trigrams.take(1).get_single_element()

In [None]:
# Example Input: the first sequence of the batch's inputs
example_input_output[0][0]

In [None]:
# Example Output (shifted by two tokens): the targets for that same first sequence
example_input_output[1][0]

## Create causal attention mask function

In [None]:
def causal_attention_mask(batch_size, n_dest, n_src, dtype):
    """Build a causal attention mask of shape (batch_size, n_dest, n_src).

    Entry ``[b, i, j]`` is 1 (cast to ``dtype``) when
    ``i >= j - n_src + n_dest`` and 0 otherwise; the same
    ``(n_dest, n_src)`` mask is repeated for every item in the batch.
    """
    dest_positions = tf.range(n_dest)[:, None]
    src_positions = tf.range(n_src)
    allowed = dest_positions >= src_positions - n_src + n_dest
    single_mask = tf.reshape(tf.cast(allowed, dtype), [1, n_dest, n_src])
    # Tile only along the batch axis: multiples are [batch_size, 1, 1]
    multiples = tf.concat(
        [tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)], 0
    )
    return tf.tile(single_mask, multiples)

# Show the transposed 10x10 mask for a single batch item
np.transpose(causal_attention_mask(1, 10, 10, dtype=tf.int32)[0])

## Don't know what to do next...

run a sequential model 
- attention head
- spit out result of attention head
- print result

- do the entire model
- extract what comes out of the attention head