### Exercise B.1: Preprocessing
1. Merge the corpora and preprocess the data.
2. Train a naive Bayes classifier to differentiate between Taylor Swift and Bob Dylan lyrics.
3. Analyze the performance of the model on test data using the confusion matrix.
Hints:
• The Python package ’sklearn.model_selection’ provides the ’train_test_split’ function.
• The Python package ’sklearn.naive_bayes’ provides the ’MultinomialNB’ function.
• The Python package ’sklearn.metrics’ provides the ’confusion_matrix’ function.

In [14]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score

In [15]:
# read swift
file = open("taylor-swift.txt", "r", encoding="utf8")
swift = file.readlines()
file.close()

# read dylan
file = open("bob-dylan.txt", "r", encoding="utf8")
dylan = file.readlines()
file.close()

In [16]:
# add labels and merge
dylan_labels = [1] * len(dylan)
swift_labels = [0] * len(swift)

dict = {'lyrics': dylan, 'is_dylan': dylan_labels}
df = pd.DataFrame(dict)

dict = {'lyrics': swift, 'is_dylan': swift_labels}
df = pd.concat([df, pd.DataFrame(dict)], ignore_index=True)

In [17]:
# little bit of cleanup
del dict, dylan, dylan_labels, swift, swift_labels, file

In [18]:
X = df['lyrics'].to_numpy()
y = df['is_dylan'].to_numpy()

In [19]:
# pre-processing
# feature extraction
# vectorizer does tokenization, data already lowercased
# https://scikit-learn.org/stable/modules/feature_extraction.html
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html#sklearn.feature_extraction.text.CountVectorizer
# Bag of words
vectorizer = CountVectorizer(analyzer='word', ngram_range=(1, 1))
X_BOW = vectorizer.fit_transform(X)

In [20]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(X_BOW, y)

In [21]:
# train
# https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html#sklearn.naive_bayes.MultinomialNB
clf = MultinomialNB()
clf.fit(X_train, y_train)
# predict
y_hat = clf.predict(X_test)

In [22]:
# evaluate
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html#sklearn.metrics.accuracy_score
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html#sklearn.metrics.f1_score
tn, fp, fn, tp = confusion_matrix(y_test, y_hat).ravel()
accuracy = accuracy_score(y_test, y_hat)
f1 = f1_score(y_test, y_hat, zero_division=1.0)
print(tn, fp, fn, tp, accuracy, f1)

40 7 4 90 0.9219858156028369 0.9424083769633509


In [23]:
# trying to encode the documents using TFIDF instead of words
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfTransformer.html#sklearn.feature_extraction.text.TfidfTransformer
# encode
vectorizer = TfidfVectorizer()
X_TFIDF = vectorizer.fit_transform(X)
# split
X_train, X_test, y_train, y_test = train_test_split(X_TFIDF, y)
# train
clf = MultinomialNB()
clf.fit(X_train, y_train)
# predict
y_hat = clf.predict(X_test)
# evaluate
tn, fp, fn, tp = confusion_matrix(y_test, y_hat).ravel()
accuracy = accuracy_score(y_test, y_hat)
f1 = f1_score(y_test, y_hat, zero_division=1.0)
# got a bit worse
print(tn, fp, fn, tp, accuracy, f1)

0 41 0 100 0.7092198581560284 0.8298755186721991


In [24]:
# try removing stopwords, train, evaluate again
# encode
vectorizer = CountVectorizer(analyzer='word', ngram_range=(1, 1), stop_words='english')
X_BOW = vectorizer.fit_transform(X)
# split
X_train, X_test, y_train, y_test = train_test_split(X_BOW, y)
# train
clf = MultinomialNB()
clf.fit(X_train, y_train)
# predict
y_hat = clf.predict(X_test)
# evaluate
tn, fp, fn, tp = confusion_matrix(y_test, y_hat).ravel()
accuracy = accuracy_score(y_test, y_hat)
f1 = f1_score(y_test, y_hat, zero_division=1.0)
# a tiny bit worse. maybe the stopwords contribute to the writing style a bit
print(tn, fp, fn, tp, accuracy, f1)

35 5 14 87 0.8652482269503546 0.9015544041450778


In [25]:
# try to use bag of bigrams instead of bag of words
# encode
vectorizer = CountVectorizer(analyzer='word', ngram_range=(2, 2))
X_BOW = vectorizer.fit_transform(X)
# split
X_train, X_test, y_train, y_test = train_test_split(X_BOW, y)
# train
clf = MultinomialNB()
clf.fit(X_train, y_train)
# predict
y_hat = clf.predict(X_test)
# evaluate
tn, fp, fn, tp = confusion_matrix(y_test, y_hat).ravel()
accuracy = accuracy_score(y_test, y_hat)
f1 = f1_score(y_test, y_hat, zero_division=1.0)
# way worse
print(tn, fp, fn, tp, accuracy, f1)

40 2 52 47 0.6170212765957447 0.6351351351351352


In [26]:
# try to use bag of trigrams
# encode
vectorizer = CountVectorizer(analyzer='word', ngram_range=(3, 3))
X_BOW = vectorizer.fit_transform(X)
# split
X_train, X_test, y_train, y_test = train_test_split(X_BOW, y)
# train
clf = MultinomialNB()
clf.fit(X_train, y_train)
# predict
y_hat = clf.predict(X_test)
# evaluate
tn, fp, fn, tp = confusion_matrix(y_test, y_hat).ravel()
accuracy = accuracy_score(y_test, y_hat)
f1 = f1_score(y_test, y_hat, zero_division=1.0)
# waaaaaaay worse
print(tn, fp, fn, tp, accuracy, f1)

39 0 102 0 0.2765957446808511 0.0
