In [1]:
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import numpy as np
import pandas as pd
import spacy
import exam_functions as fn
import nltk
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import optimizers
from tensorflow.keras.models import Sequential

from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.naive_bayes import MultinomialNB
from wordcloud import WordCloud
from nltk.probability import FreqDist
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from joblib import dump
from sklearn.preprocessing import LabelEncoder

ModuleNotFoundError: No module named 'exam_functions'

In [None]:
# Load the exam dataset from a CSV expected in the notebook's working directory
df= pd.read_csv('part1-aml-belt-exam.csv')
# Preview the first rows to confirm the load
df.head()

In [None]:
# Load spaCy's small English pipeline with the parser and NER components disabled
nlp_light = spacy.load("en_core_web_sm", disable=['parser','ner'])
# Show the pipeline components that remain enabled
nlp_light.pipe_names

In [None]:
# Preprocess every text with the project helper and the light spaCy pipeline.
# NOTE(review): fn.batch_preprocess_texts lives in exam_functions (not shown here);
# it presumably returns one list of token strings per row, since a later cell
# joins each entry with spaces — confirm against the helper.
df['tokens'] = fn.batch_preprocess_texts(df['text'], nlp = nlp_light)
df.head()

In [None]:
# Same preprocessing call as for 'tokens', but with use_lemmas=True.
# NOTE(review): presumably yields lemmatized tokens per row — confirm against
# exam_functions.batch_preprocess_texts, which is not shown here.
df['lemmas'] = fn.batch_preprocess_texts(df['text'], nlp = nlp_light, use_lemmas = True)
df.head()

In [None]:
# Collapse each row's token / lemma list back into one space-separated string
df['tokens-joined'] = df['tokens'].map(" ".join)
df['lemmas-joined'] = df['lemmas'].map(" ".join)

# Quick look at the two new columns
df.head(3)

**Comparing Groups: Poe vs Dickens**

In [None]:
# List the distinct author labels present in the data
df['author'].unique()

In [None]:
# Boolean row masks, one per author
filter_poe = df['author'].eq('Poe')
filter_dickens = df['author'].eq('Dickens')

# Number of rows selected by each mask
filter_poe.sum(), filter_dickens.sum()

In [None]:
# Single string for Poe: concatenate the joined lemmas of every Poe row, separated by spaces
poe_lemmas = " ".join( df.loc[filter_poe, 'lemmas-joined'])
# Preview the first 1000 characters
print(poe_lemmas[:1000],"\n")

In [None]:
# Single string for Dickens: concatenate the joined lemmas of every Dickens row, separated by spaces
dickens_lemmas = " ".join( df.loc[filter_dickens, 'lemmas-joined'])
# Preview the first 1000 characters
print(dickens_lemmas[:1000],"\n")

In [None]:
# Word clouds: one per author, same geometry and seed, different colour maps
poe_cloud = WordCloud(
    random_state=42, width=800, height=1000, min_word_length=2, colormap="Reds"
).generate(poe_lemmas)
dickens_cloud = WordCloud(
    random_state=42, width=800, height=1000, min_word_length=2, colormap="Blues"
).generate(dickens_lemmas)

# Draw the two clouds next to each other, hiding the axes on both panels
fig, axes = plt.subplots(ncols=2, figsize=(10, 7))
for panel, cloud, title in zip(axes, (poe_cloud, dickens_cloud), ("Poe", "Dickens")):
    panel.imshow(cloud, interpolation='bilinear')
    panel.set_title(title)
    panel.axis('off')
fig.tight_layout();
# y > 1 places the figure title above the panels
fig.suptitle('Authors', fontsize=20, y=1.05);

In [None]:
# Flatten the Poe rows' token lists into one flat list of strings.
# explode() emits NaN for rows whose token list is empty; drop those before
# astype(str), otherwise each one becomes a literal "nan" token that would be
# counted in the bigram statistics.
poe_tokens_exploded_list = df.loc[filter_poe, 'tokens'].explode().dropna().astype(str).to_list()
poe_tokens_exploded_list[:20]

In [None]:
# Flatten the Dickens rows' token lists into one flat list of strings.
# explode() emits NaN for rows whose token list is empty; drop those before
# astype(str), otherwise each one becomes a literal "nan" token that would be
# counted in the bigram statistics.
dickens_tokens_exploded_list = df.loc[filter_dickens, 'tokens'].explode().dropna().astype(str).to_list()
dickens_tokens_exploded_list[:20]

**Bigrams**

In [None]:
# NLTK association measures; the bigram cells below use its raw_freq scorer
bigram_measures = nltk.collocations.BigramAssocMeasures()

In [None]:
# Poe bigrams: build a collocation finder over the flat Poe token list.
# Named *_poe to mirror bigram_finder_dickens (previously bigram_finder_high).
bigram_finder_poe = nltk.BigramCollocationFinder.from_words(poe_tokens_exploded_list)

# Score every bigram by raw frequency and preview the first 20 entries
bigrams_scores_poe = bigram_finder_poe.score_ngrams(bigram_measures.raw_freq)
bigrams_scores_poe[:20]

In [None]:
# Dickens bigrams: build a collocation finder over the flat Dickens token list
bigram_finder_dickens = nltk.BigramCollocationFinder.from_words(dickens_tokens_exploded_list)

# Score every bigram by raw frequency and preview the first 20 entries
bigrams_scores_dickens = bigram_finder_dickens.score_ngrams(bigram_measures.raw_freq)
bigrams_scores_dickens[:20]

**Text Classification - Machine Learning**

In [None]:
# Features are the raw text; the target is the author label
X = df['text']
y = df['author']

In [None]:
# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25,
                                                    random_state=321)

In [None]:
# Vectorizer: learn the bag-of-words vocabulary from the training text only
count_vectorizer = CountVectorizer()
count_vectorizer.fit(X_train)

In [None]:
# Convert both splits to count matrices using the vocabulary fitted on the training text
X_train_counts = count_vectorizer.transform(X_train)
X_test_counts = count_vectorizer.transform(X_test)

In [None]:
def classification_metrics(y_true, y_pred, label='',
                           output_dict=False, figsize=(8,4),
                           normalize='true', cmap='Blues',
                           colorbar=False,values_format=".2f"):
    """Print a classification report and plot two confusion matrices side by side.

    The left panel always shows raw counts (colormap 'gist_gray_r', integer
    format). The right panel is drawn with the caller's `normalize`, `cmap`
    and `values_format`.

    Args:
        y_true: True labels.
        y_pred: Predicted labels.
        label: Text appended to the printed header.
        output_dict: When True, the report is also returned as a dictionary.
        figsize: Size of the two-panel figure.
        normalize: `normalize` argument for the right-hand confusion matrix.
        cmap: Colormap for the right-hand confusion matrix.
        colorbar: Whether either panel draws a colorbar.
        values_format: Number format for the right-hand confusion matrix.

    Returns:
        The classification report as a dict when `output_dict` is True,
        otherwise None.
    """
    # Header and text report
    rule = "-"*70
    print(rule, f" Classification Metrics: {label}", rule, sep='\n')
    print(classification_report(y_true, y_pred))

    # One (title, display-kwargs) entry per subplot, left to right
    panel_specs = (
        ("Raw Counts",
         dict(normalize=None, cmap='gist_gray_r', values_format="d")),
        ("Normalized Confusion Matrix",
         dict(normalize=normalize, cmap=cmap, values_format=values_format)),
    )

    fig, axes = plt.subplots(ncols=2, figsize=figsize)
    for ax, (title, cm_kwargs) in zip(axes, panel_specs):
        ConfusionMatrixDisplay.from_predictions(y_true, y_pred,
                                                colorbar=colorbar,
                                                ax=ax,
                                                **cm_kwargs)
        ax.set_title(title)

    fig.tight_layout()
    plt.show()

    # The dictionary form is only computed when the caller asks for it
    if output_dict == True:
        return classification_report(y_true, y_pred, output_dict=True)


def evaluate_classification(model, X_train, y_train, X_test, y_test,
                            figsize=(6,4), normalize='true', output_dict=False,
                            cmap_train='Blues', cmap_test="Reds", colorbar=False):
    """Report classification metrics for a fitted model on the train and test splits.

    Predicts on each split with `model.predict` and passes the predictions to
    `classification_metrics`, which prints a report and plots confusion matrices.

    Args:
        model: Fitted estimator exposing `predict`.
        X_train, y_train: Training features and labels.
        X_test, y_test: Test features and labels.
        figsize: Figure size forwarded to `classification_metrics`.
        normalize: Normalization for the right-hand confusion matrix. It is now
            forwarded; it was previously accepted but ignored.
        output_dict: When truthy, return the report dictionaries.
        cmap_train: Colormap for the training confusion matrix.
        cmap_test: Colormap for the test confusion matrix.
        colorbar: Whether the confusion matrices draw a colorbar.

    Returns:
        `{'train': <report dict>, 'test': <report dict>}` when `output_dict`
        is truthy, otherwise None.
    """
    # Training split: predictions and classification metrics
    y_train_pred = model.predict(X_train)
    results_train = classification_metrics(y_train, y_train_pred,
                                           output_dict=True, figsize=figsize,
                                           normalize=normalize,
                                           colorbar=colorbar, cmap=cmap_train,
                                           label='Training Data')
    print()

    # Test split: predictions and classification metrics
    y_test_pred = model.predict(X_test)
    results_test = classification_metrics(y_test, y_test_pred,
                                          output_dict=True, figsize=figsize,
                                          normalize=normalize,
                                          colorbar=colorbar, cmap=cmap_test,
                                          label='Test Data')

    if output_dict:
        # Bundle both report dictionaries for the caller
        return {'train': results_train,
                'test': results_test}

In [None]:
# Baseline: random forest trained on the count-vectorized training text (fixed seed)
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train_counts, y_train)
# metrics on both splits; the return value is None because output_dict is left at False
evaluate_classification(clf, X_train_counts, y_train, X_test_counts, y_test)

In [None]:
# Model Pipeline: vectorizer + classifier in one estimator that accepts raw text.
# Unlike the standalone vectorizer above, this one removes English stop words.
clf_pipe = Pipeline([('vectorizer', CountVectorizer(stop_words='english')),
                     ('clf',RandomForestClassifier(random_state=42))])
clf_pipe

In [None]:
# Fit the pipeline on the raw training text (vectorizing happens inside the pipeline)
clf_pipe.fit(X_train, y_train)

# Evaluate on raw text for both splits
evaluate_classification(clf_pipe, X_train,y_train, X_test, y_test)

In [None]:
#dump(clf_pipe, 'models/model_pipeline.joblib')

**Text Classification - Deep NLP**

In [None]:
# Batch size constant. NOTE(review): not referenced by any cell shown here;
# presumably intended for model training — confirm or remove.
batch_size = 32

In [None]:
# Class balance: share of rows per author
df['author'].value_counts(normalize = True)

In [None]:
# Encode the author labels as integers for the neural network
label_encoder = LabelEncoder()
df['author_encoded'] = label_encoder.fit_transform(df['author'])
# Preview the new column instead of displaying the whole frame
df.head()

In [None]:
# Length of each text in characters (len of the string).
# NOTE(review): TextVectorization's output_sequence_length counts tokens, not
# characters, so this column cannot be used directly to pick SEQUENCE_LENGTH.
df['paragraph_length'] = df['text'].apply(len)
# Summarise only the length column; df.max() also took maxima over the text
# and token-list columns, which are not meaningful here.
df['paragraph_length'].describe()

In [None]:
# Preview the frame with its new columns rather than displaying every row
df.head()

In [None]:
# Input for the deep-learning section: the raw text via .values.
# This rebinds X, which the machine-learning section set to the 'text' Series.
X = df['text'].values
X

In [None]:
# Target for the deep-learning section: the integer-encoded author (rebinds y)
y= df['author_encoded']
# Row count per encoded class
y.value_counts()

In [None]:
# Distinct encoded labels; the model builder only uses how many there are (len(classes))
classes = y.unique()
classes

In [None]:
# Fixed number of token ids produced per text, and the embedding width.
# NOTE(review): 71 is a magic number — no cell shown here derives it from the
# data; confirm it reflects the token-count distribution of the texts.
SEQUENCE_LENGTH = 71
EMBED_DIM = 100

# Text Vectorization layer: lowercases, strips punctuation, and maps each text
# to a sequence of integer token ids of length SEQUENCE_LENGTH
sequence_vectorizer = tf.keras.layers.TextVectorization(
    standardize="lower_and_strip_punctuation",
    output_mode="int",
    output_sequence_length=SEQUENCE_LENGTH)

In [None]:
# The layer has to learn its vocabulary from the texts before vocabulary_size()
# is meaningful; without adapt() it holds no learned tokens, and the Embedding
# would be sized for a vocabulary that does not match the data.
# NOTE(review): no train/validation split exists for the deep model in the
# cells shown; once one is added, adapt on the training texts only to avoid
# leaking held-out vocabulary.
sequence_vectorizer.adapt(X)

# Vocabulary size after adapting; used as the Embedding's input_dim
VOCAB_SIZE = sequence_vectorizer.vocabulary_size()
VOCAB_SIZE

In [None]:
def build_gru_model_bi_pool(text_vectorization_layer, VOCAB_SIZE, EMBED_DIM, SEQUENCE_LENGTH, classes):
    """Assemble and compile a bidirectional-GRU text classifier.

    Layer stack, in order: the supplied text-vectorization layer, an Embedding,
    a Bidirectional GRU (128 units, returning the full sequence), global max
    pooling, and a softmax Dense layer with one unit per entry in `classes`.

    Args:
        text_vectorization_layer: Layer placed first in the model.
        VOCAB_SIZE: `input_dim` of the Embedding layer.
        EMBED_DIM: `output_dim` of the Embedding layer.
        SEQUENCE_LENGTH: `input_length` of the Embedding layer.
        classes: Sized collection of class labels; only its length is used.

    Returns:
        The Sequential model compiled with Adam, sparse categorical
        cross-entropy loss and the accuracy metric.
    """
    # Layers are created in the same order they appear in the model
    embedding = layers.Embedding(input_dim=VOCAB_SIZE,
                                 output_dim=EMBED_DIM,
                                 input_length=SEQUENCE_LENGTH)
    # return_sequences=True keeps every time step so the pooling layer can reduce over them
    recurrent = layers.Bidirectional(layers.GRU(128, return_sequences=True))
    pooling = layers.GlobalMaxPooling1D()
    output = layers.Dense(len(classes), activation='softmax')

    model = Sequential([text_vectorization_layer, embedding, recurrent, pooling, output])

    # Sparse loss: targets are integer class ids rather than one-hot vectors
    model.compile(optimizer=optimizers.Adam(),
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    return model

In [None]:
# def get_callbacks(patience=3, monitor='val_accuracy'):
#     early_stop = tf.keras.callbacks.EarlyStopping(patience=patience, monitor=monitor)
#     return [early_stop]

In [None]:
# Build the model from the vectorizer and the constants defined in the cells above
model = build_gru_model_bi_pool(sequence_vectorizer, VOCAB_SIZE, EMBED_DIM, SEQUENCE_LENGTH, classes)

# The first layer is the TextVectorization layer, so the model consumes raw
# strings, not (None, SEQUENCE_LENGTH) integer sequences. Build it by calling
# it once on a real text sample of the correct dtype instead of model.build().
model(tf.constant(X[:1]))

# Print model summary
model.summary()