<h3>
<b>Final Project: Paraphrase Identification by Multi-layer Perceptron</b>
</h3>
<blockquote>
  <h5>Stephen Hullender<br/>Foundations in Machine Learning - CIS 4526<br/>Fall 2022</h5>
</blockquote>

<h4>Imports</h4>

In [None]:
# basics
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# sklearn
from sklearn.linear_model import Perceptron
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split 
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score, f1_score, classification_report 
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# scipy
from scipy import spatial

# nltk
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')
from nltk import WordPunctTokenizer
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.translate import bleu_score, meteor_score, nist_score
from nltk.util import ngrams

# tensorflow
import tensorflow as tf
from tensorflow.python import keras
from keras.models import Sequential
from keras.layers import Flatten, Dense, Activation 
from keras.metrics import BinaryAccuracy

# more NLP
try:
  from jiwer import compute_measures
except:
  !sudo pip3 install jiwer
  from jiwer import compute_measures

# others
import calendar
import sys
import re
import time
import math

# configurations
# lift pandas' display limits: max_rows is set to sys.maxsize and max_columns
# to 500, so displayed frames are not truncated
pd.set_option('display.max_rows', sys.maxsize)
pd.set_option('display.max_columns', 500)

<h4>Loading Text Files</h4>

In [None]:
# fetch from Google Drive
# SRC is a relative path; the Drive mount point used below is '/content/gdrive'.
SRC = "gdrive/My Drive/MLFinal"
SRC_TRN = f'{SRC}/train_with_label.txt'
SRC_DEV = f'{SRC}/dev_with_label.txt'
SRC_TST = f'{SRC}/test_without_label.txt'
# column names handed to read_csv (names=LL) for all three files
LL = ['instance_id', 'sentence_1', 'sentence_2', 'gold_label']

def load_files():
  """Read the train, dev and test files into the globals `train`, `dev`, `test`.

  Every file is parsed with the same settings: fields are separated by one or
  more tab characters, columns are named from `LL`, and malformed lines are
  skipped.
  """
  global train, dev, test

  def _read(path):
    # A regex separator is only supported by pandas' python engine; naming the
    # engine explicitly avoids the fallback ParserWarning on every call.
    return pd.read_csv(path, delimiter='[\t]+', names=LL, on_bad_lines='skip',
                       encoding='utf-8', engine='python')

  train = _read(SRC_TRN)
  dev = _read(SRC_DEV)
  test = _read(SRC_TST)

from google.colab import files, drive
try:
  load_files()
except FileNotFoundError:
  # The data files are not visible yet: mount Google Drive and retry once.
  # Any other error (e.g. a parsing problem) is no longer swallowed.
  drive.mount('/content/gdrive')
  load_files()

# train data: 7801 instances (estimated 2496 positive, 5305 negative)
# dev data: 4000 instances (1K positive, 3K negative)
# test data: 4000 instances (1K positive, 3K negative)

# after on_bad_lines='skip'
# train: 7578 (97.14%) || dev: 3809 (95.23%) || test: 3884 (97.10%)

# after adding delimiter from '\t' to '[\t]+'
# train: 7801 (100%) || dev: 4000 (100%) || test: 4000 (100%)

<h4>Data Preprocessing</h4>

<h6>Contractions</h6>

In [None]:
# for processing words, distinguish contractions and make them separate words...

# Maps a contraction to its expanded form.
# Every key is lowercase on purpose: clean_text() lowercases the sentence
# before looking words up here, so a key containing a capital letter
# (previously "I'd", "I'm", ...) could never match.
contractions = {
  "aren't": "are not",
  "can't": "cannot",
  "can't've": "cannot have",
  "'cause": "because",
  "could've": "could have",
  "couldn't": "could not",
  "couldn't've": "could not have",
  "didn't": "did not",
  "doesn't": "does not",
  "don't": "do not",
  "hadn't": "had not",
  "hadn't've": "had not have",
  "hasn't": "has not",
  "haven't": "have not",
  "he'd": "he would",
  "he'd've": "he would have",
  "he'll": "he will",
  "he'll've": "he will have",
  "he's": "he is",
  "how'd": "how did",
  "how'd'y": "how do you",
  "how'll": "how will",
  "how's": "how is",
  "i'd": "i would",
  "i'd've": "i would have",
  "i'll": "i will",
  "i'll've": "i will have",
  "i'm": "i am",
  "i've": "i have",
  "isn't": "is not",
  "it'd": "it would",
  "it'd've": "it would have",
  "it'll": "it will",
  "it'll've": "it will have",
  "it's": "it is",
  "let's": "let us",
  "ma'am": "madam",
  "mayn't": "may not",
  "might've": "might have",
  "mightn't": "might not",
  "mightn't've": "might not have",
  "must've": "must have",
  "mustn't": "must not",
  "mustn't've": "must not have",
  "needn't": "need not",
  "needn't've": "need not have",
  "o'clock": "of the clock",
  "oughtn't": "ought not",
  "oughtn't've": "ought not have",
  "shan't": "shall not",
  "sha'n't": "shall not",
  "shan't've": "shall not have",
  "she'd": "she would",
  "she'd've": "she would have",
  "she'll": "she will",
  "she'll've": "she will have",
  "she's": "she is",
  "should've": "should have",
  "shouldn't": "should not",
  "shouldn't've": "should not have",
  "so've": "so have",
  "that'd": "that had",
  "that'd've": "that would have",
  "that's": "that is",
  "there'd": "there would",
  "there'd've": "there would have",
  "there's": "there is",
  "they'd": "they would",
  "they'd've": "they would have",
  "they'll": "they will",
  "they'll've": "they will have",
  "they're": "they are",
  "they've": "they have",
  "to've": "to have",
  "wasn't": "was not",
  "we'd": "we would",
  "we'd've": "we would have",
  "we'll": "we will",
  "we'll've": "we will have",
  "we're": "we are",
  "we've": "we have",
  "weren't": "were not",
  "what'll": "what will",
  "what'll've": "what will have",
  "what're": "what are",
  "what's": "what is",
  "what've": "what have",
  "when's": "when is",
  "when've": "when have",
  "where'd": "where did",
  "where's": "where is",
  "where've": "where have",
  "who'll": "who will",
  "who'll've": "who will have",
  "who's": "who is",
  "who've": "who have",
  "why's": "why is",
  "why've": "why have",
  "will've": "will have",
  "won't": "will not",
  "won't've": "will not have",
  "would've": "would have",
  "wouldn't": "would not",
  "wouldn't've": "would not have",
  "y'all": "you all",
  "y'all'd": "you all would",
  "y'all'd've": "you all would have",
  "y'all're": "you all are",
  "y'all've": "you all have",
  "you'd": "you would",
  "you'd've": "you would have",
  "you'll": "you will",
  "you'll've": "you will have",
  "you're": "you are",
  "you've": "you have"
}

<h6>Regular Expressions, Stopwords</h6>

In [None]:
def regex(s: str):
  """Strip URLs, markup remnants and punctuation from `s`.

  Steps, in order: remove everything from an http(s) URL to the end of its
  line, blank out '<a href' and '<br />', delete '&amp;', then replace the
  listed punctuation characters and apostrophes with a space.

  Returns the cleaned string (whitespace is not collapsed).
  """
  s = re.sub(r'https?:\/\/.*[\r\n]*', '', s, flags=re.MULTILINE)
  s = re.sub(r'\<a href', ' ', s)
  # '<br />' must be handled BEFORE the punctuation pass: that pass replaces
  # '/' with a space, after which this pattern could never match.
  s = re.sub(r'<br />', ' ', s)
  s = re.sub(r'&amp;', '', s)
  s = re.sub(r'[_"\-;%()|+&=*.,!?:#$@\[\]/]', ' ', s)
  s = re.sub(r'\'', ' ', s)
  return s

def stops(s: str):
  """Tokenize `s` with word_tokenize, drop NLTK's English stopwords, and
  return the surviving tokens joined by single spaces."""
  english_stopwords = set(stopwords.words('english'))
  kept = [token for token in word_tokenize(s) if token not in english_stopwords]
  return " ".join(kept)

def clean_text(sentence: str):
  """Turn a raw sentence into a list of cleaned tokens.

  Pipeline: lowercase -> split on whitespace -> expand contractions ->
  regex() -> stops() -> WordPunctTokenizer.
  """
  # The lookup is done on lowercased, whitespace-separated words, so only
  # exact matches against the keys of `contractions` are expanded.
  lowered_words = sentence.lower().split()
  expanded = " ".join(contractions.get(word, word) for word in lowered_words)
  without_stopwords = stops(regex(expanded))
  return WordPunctTokenizer().tokenize(without_stopwords)

<h6>Lemmatization</h6>

In [None]:
# use the functions above to parse all words into separate elements in a list
# and take out punctuation and stopwords

# lemmatize text, last step in cleaning words
lemmatizer = WordNetLemmatizer()

for table in (train, dev, test):
  for raw_col, new_col in (('sentence_1', 'new_sentence_1'), ('sentence_2', 'new_sentence_2')):
    # clean_text() yields a token list; every token is then lemmatized
    table[new_col] = [
      [lemmatizer.lemmatize(token) for token in clean_text(text)]
      for text in table[raw_col]
    ]

  # the raw strings are dropped once the token columns exist
  # (in place, so this cell cannot be run twice on the same frames)
  table.drop(['sentence_1', 'sentence_2'], axis=1, inplace=True)

# the test file was read with the same four column names; remove its label column
test.drop('gold_label', axis=1, inplace=True)

In [None]:
# for training and dev data, separate labels from everything else...
# (the gold_label column of each frame, copied into a plain Python list)
gold_train = train['gold_label'].tolist()
gold_dev = dev['gold_label'].tolist()

<h4>TFIDF Vectorization</h4>

In [None]:
# TFIDF
# prepare data
n1 = train['new_sentence_1']
n2 = train['new_sentence_2']
nd1 = dev['new_sentence_1']
nd2 = dev['new_sentence_2']
nx1 = test['new_sentence_1']
nx2 = test['new_sentence_2']

SP = " " # important to keep it spaced

def _fit_pair_tfidf(col_1, col_2):
  """Fit ONE TfidfVectorizer on both sentence columns and encode each with it.

  The two arrays are later compared row by row (cosine / Euclidean), which is
  only meaningful when column k stands for the same word in both arrays.
  Fitting a separate vectorizer per column gives each array its own
  vocabulary and column order, so the vocabulary is shared here instead.

  Returns (vectorizer, dense array for col_1, dense array for col_2).
  """
  vectorizer = TfidfVectorizer(use_idf=True, smooth_idf=True, preprocessor=SP.join, stop_words='english', lowercase=False)
  vectorizer.fit(list(col_1) + list(col_2))
  return vectorizer, vectorizer.transform(col_1).toarray(), vectorizer.transform(col_2).toarray()

# vectorize (one shared vocabulary per dataset); the *_2 names are kept as
# aliases so later cells that read them keep working
tfidf_train_1, arr_train_1, arr_train_2 = _fit_pair_tfidf(n1, n2)
tfidf_train_2 = tfidf_train_1

tfidf_dev_1, arr_dev_1, arr_dev_2 = _fit_pair_tfidf(nd1, nd2)
tfidf_dev_2 = tfidf_dev_1

tfidf_test_1, arr_test_1, arr_test_2 = _fit_pair_tfidf(nx1, nx2)
tfidf_test_2 = tfidf_test_1

<h4>Include Missing Features for Training, Dev, and Testing Datasets</h4>

In [None]:
# add x number of arrays of same length to accommodate for missing words
# get words that are available in one set but not another
def _only_in_first(vectorizer_a, vectorizer_b):
  """Feature names known to `vectorizer_a` but not to `vectorizer_b`, as a list."""
  return list(set(vectorizer_a.get_feature_names_out()).difference(vectorizer_b.get_feature_names_out()))

train_1_not_2 = _only_in_first(tfidf_train_1, tfidf_train_2)
train_2_not_1 = _only_in_first(tfidf_train_2, tfidf_train_1)
dev_1_not_2 = _only_in_first(tfidf_dev_1, tfidf_dev_2)
dev_2_not_1 = _only_in_first(tfidf_dev_2, tfidf_dev_1)
test_1_not_2 = _only_in_first(tfidf_test_1, tfidf_test_2)
test_2_not_1 = _only_in_first(tfidf_test_2, tfidf_test_1)

In [None]:
# change feature names to fit all words
def change_features(arr, exclusion):
  """Right-pad every row of `arr` with len(exclusion) zero columns.

  Only the NUMBER of entries in `exclusion` is used. The padding makes rows
  longer; it does not reorder or rename existing columns.

  Args:
    arr: 2-D array-like of feature rows.
    exclusion: sequence whose length is the number of zero columns to add.

  Returns:
    A numpy array with the same rows as `arr`, each extended by zeros.
  """
  matrix = np.asarray(arr)
  if matrix.ndim != 2:
    # empty / non-matrix input: keep the row-by-row construction
    return np.array([list(row) + [0] * len(exclusion) for row in arr])
  # one vectorised allocation instead of converting every row to a Python list
  padding = np.zeros((matrix.shape[0], len(exclusion)), dtype=matrix.dtype)
  return np.hstack([matrix, padding])

# use this later to replace new_sentence_X with encoding
# each array is widened by the number of words found only in its counterpart
# (re-running this cell pads the already padded arrays again)
arr_train_1, arr_train_2 = (
  change_features(arr_train_1, train_2_not_1),
  change_features(arr_train_2, train_1_not_2),
)
arr_dev_1, arr_dev_2 = (
  change_features(arr_dev_1, dev_2_not_1),
  change_features(arr_dev_2, dev_1_not_2),
)
arr_test_1, arr_test_2 = (
  change_features(arr_test_1, test_2_not_1),
  change_features(arr_test_2, test_1_not_2),
)

In [None]:
# conjoin features
# names from the first vectorizer, followed by the words only the second one knows
features_train = list(tfidf_train_1.get_feature_names_out()) + train_2_not_1

features_dev = list(tfidf_dev_1.get_feature_names_out()) + dev_2_not_1

features_test = list(tfidf_test_1.get_feature_names_out()) + test_2_not_1

<h4>FEATURE: Overlapping Words</h4>

In [None]:
def get_overlapping(arr1, arr2):
  """Per-pair overlap score between two equally long sequences of token lists.

  For each pair, with S1 and S2 the sets of distinct tokens, the score is
  abs(|S1 & S2| / |S1| - |S1 & S2| / |S2|). Note that it is 0 whenever both
  sets have the same size, whatever their overlap.

  A pair in which either token list is empty scores 0.0 instead of raising
  ZeroDivisionError.

  Returns a list of floats, one per pair.
  """
  assert len(arr1) == len(arr2)
  over = []
  for doc1, doc2 in zip(arr1, arr2):
    set_doc1, set_doc2 = set(doc1), set(doc2)
    if not set_doc1 or not set_doc2:
      # nothing to compare against: no overlap information
      over.append(0.0)
      continue
    shared = len(set_doc1 & set_doc2)
    over.append(abs(shared / len(set_doc1) - shared / len(set_doc2)))
  return over

# one overlap score per sentence pair, for each of the three datasets
overlap_train, overlap_dev, overlap_test = (
  get_overlapping(first, second)
  for first, second in ((n1, n2), (nd1, nd2), (nx1, nx2))
)

print(overlap_train, '\n', overlap_dev, '\n', overlap_test)

<h4>
  FEATURE: Jaccard Similarity
</h4>

In [None]:
#
# FEATURE: Jaccard
#
def jaccard(arr1, arr2):
  """Jaccard similarity |S1 & S2| / |S1 | S2| for each pair of token lists.

  `arr1` and `arr2` must have the same length. A pair whose token lists are
  both empty scores 0.0 instead of raising ZeroDivisionError.

  Returns a list of floats, one per pair.
  """
  assert len(arr1) == len(arr2)
  ret = []
  for doc1, doc2 in zip(arr1, arr2):
    s1, s2 = set(doc1), set(doc2)
    union = s1 | s2
    ret.append(float(len(s1 & s2) / len(union)) if union else 0.0)
  return ret

# Jaccard similarity per sentence pair, for each of the three datasets
jaccard_similarity_train, jaccard_similarity_dev, jaccard_similarity_test = (
  jaccard(first, second)
  for first, second in ((n1, n2), (nd1, nd2), (nx1, nx2))
)

print(jaccard_similarity_train, '\n', jaccard_similarity_dev, '\n', jaccard_similarity_test)

<h4>
  FEATURE: Cosine Similarity (Average, Count, Average on Nonzero Occurrences)
</h4>

In [None]:
# each cosine_sim_XX returns a cosine similarity score based on comparing the two sentences
# the reason for 1 - spatial.distance.cosine is to only calculate similarity (otherwise, you'll get the difference)
def get_cosine(n1, n2):
  """Cosine similarity between corresponding rows of `n1` and `n2`.

  Similarity is 1 - scipy's cosine distance. When either row is all zeros the
  cosine is undefined (scipy yields NaN); such pairs are reported as 0.0 so
  no NaN reaches the feature table.

  Returns a list of floats, one per row pair.
  """
  assert len(n1) == len(n2)
  similarities = []
  for vec1, vec2 in zip(n1, n2):
    if not np.any(vec1) or not np.any(vec2):
      similarities.append(0.0)
    else:
      similarities.append(1 - spatial.distance.cosine(vec1, vec2))
  return similarities

# cosine similarity of the two TF-IDF rows of every sentence pair
cosine_sim_train, cosine_sim_dev, cosine_sim_test = (
  get_cosine(first, second)
  for first, second in (
    (arr_train_1, arr_train_2),
    (arr_dev_1, arr_dev_2),
    (arr_test_1, arr_test_2),
  )
)

print(len(arr_train_1)) # total number of rows (documents)
print(len(arr_train_1[0])) # length of one row, i.e. the number of feature columns

In [None]:
# average cosine score for all 
def get_avg_cosine(arr):
  """Arithmetic mean of all scores in `arr`."""
  return np.mean(arr)

# ::::: EXTRA: average cosine similarity over all sentence pairs of a dataset
avg_cosine_train, avg_cosine_dev, avg_cosine_test = (
  get_avg_cosine(scores)
  for scores in (cosine_sim_train, cosine_sim_dev, cosine_sim_test)
)

print("Average cosine similarity scores:")
print(avg_cosine_train, avg_cosine_dev, avg_cosine_test)

# out of the scores given for each data, how many are zeroes
def get_no_match(arr):
  """Fraction of the scores in `arr` that are exactly zero."""
  zero_scores = sum(1 for score in arr if score == 0.00)
  return zero_scores / len(arr)

# average cosine for only scores that are non-zero
def get_avg_cosine_nonzero(arr):
  """Mean of the strictly positive scores in `arr`.

  Returns 0.0 when no score is positive (previously a ZeroDivisionError).
  """
  positive = [score for score in arr if score > 0.00]
  if not positive:
    return 0.0
  return sum(positive) / len(positive)

score_sets = (cosine_sim_train, cosine_sim_dev, cosine_sim_test)

# ::::: EXTRA: share of sentence pairs whose cosine similarity is exactly zero
no_match_cosine_train, no_match_cosine_dev, no_match_cosine_test = (
  get_no_match(scores) for scores in score_sets
)

print("How many are no matches:")
print(no_match_cosine_train, no_match_cosine_dev, no_match_cosine_test)

# ::::: EXTRA: average cosine similarity over the non-zero scores only
nonzero_avg_train, nonzero_avg_dev, nonzero_avg_test = (
  get_avg_cosine_nonzero(scores) for scores in score_sets
)

print("Nonzero averages:")
print(nonzero_avg_train, nonzero_avg_dev, nonzero_avg_test)

<h4>
  FEATURE: BLEU Scores
</h4>

In [None]:
# calculate BLEU scores
# each array in the returned list contains the results of each document, differing by weight
def bleu(arr1, arr2):
  """Sentence-level BLEU for each pair, with uniform weights over 1..4 n-gram orders.

  arr1[i] is used as the single reference and arr2[i] as the hypothesis.
  Returns four lists (orders 1 to 4), each holding one score per pair,
  rounded to 12 decimals.
  """
  assert len(arr1) == len(arr2)
  max_order = 4
  bleus = [[] for _ in range(max_order)]
  for i in range(len(arr1)):
    reference, hypothesis = arr1[i], arr2[i]
    for order in range(1, max_order + 1):
      uniform_weights = [1 / order] * order
      score = bleu_score.sentence_bleu([reference], hypothesis, weights=uniform_weights)
      bleus[order - 1].append(round(score, 12))
  return bleus

#
# ::::: FEATURE: BLEU scores (4-set)
#
# each result is [[weights=1], [weights=(1/2)*2], [weights=(1/3)*3], [weights=(1/4)*4]]
blue_train, blue_dev, blue_test = (
  bleu(first, second)
  for first, second in ((n1, n2), (nd1, nd2), (nx1, nx2))
)

print(blue_train, '\n', blue_dev, '\n', blue_test)
print(len(blue_train))

<h4>
  FEATURE: METEOR Scores
</h4>

In [None]:
# calculate METEOR scores
#
def meteor(arr1, arr2):
  """METEOR score for each pair (arr1[i] passed first, arr2[i] second),
  rounded to 6 decimals; returns one float per pair."""
  assert len(arr1) == len(arr2)
  return [
    round(meteor_score.single_meteor_score(arr1[i], arr2[i]), 6)
    for i in range(len(arr1))
  ]

#
# ::::: FEATURE: METEOR scores
#
# one METEOR score per sentence pair, for each of the three datasets
meteor_train, meteor_dev, meteor_test = (
  meteor(first, second)
  for first, second in ((n1, n2), (nd1, nd2), (nx1, nx2))
)

print(meteor_train, '\n', meteor_dev, '\n', meteor_test)
print(len(meteor_train))

<h4>
  FEATURE: NIST Scores
</h4>

In [None]:
# calculate NIST scores
def nist(arr1, arr2):
  """Sentence-level NIST for each pair at n-gram orders 1, 2 and 3.

  arr1[i] is the single reference, arr2[i] the hypothesis. Returns three
  lists (n=1, n=2, n=3), each holding one score per pair.
  """
  assert len(arr1) == len(arr2)
  # original author's note: n=3 is the highest order that works without a
  # division-by-zero error
  orders = (1, 2, 3)
  scores = [[] for _ in orders]
  for i in range(len(arr1)):
    reference, hypothesis = arr1[i], arr2[i]
    for slot, order in enumerate(orders):
      scores[slot].append(nist_score.sentence_nist([reference], hypothesis, n=order))
  return scores

#
# ::::: FEATURE: NIST scores (3-set)
#
# each result is [[n=1], [n=2], [n=3]]
nist_train, nist_dev, nist_test = (
  nist(first, second)
  for first, second in ((n1, n2), (nd1, nd2), (nx1, nx2))
)

print(nist_train, '\n', nist_dev, '\n', nist_test)

<h4>
  FEATURES: Bigram & Trigram Features
</h4>

In [None]:
def get_gram(n, arr1, arr2):
  """Count n-grams for each pair of token lists.

  Args:
    n: n-gram order (2 for bigrams, 3 for trigrams).
    arr1, arr2: equally long sequences of token lists.

  Returns four lists with one entry per pair:
    ret_union: len(grams of doc1) + len(grams of doc2), i.e. the total
      count across both documents (a sum, not a set union);
    ret_intersection: number of doc1 n-grams that also occur in doc2;
    ret_grams1, ret_grams2: number of n-grams in doc1 / doc2.
  """
  assert len(arr1) == len(arr2)

  def _ngrams(tokens):
    # Stop at len - n + 1 so only full-length windows are produced; a
    # document shorter than n has no n-grams.
    tokens = list(tokens)
    return [tuple(tokens[j:j + n]) for j in range(len(tokens) - n + 1)]

  ret_union = []
  ret_intersection = []
  ret_grams1 = []
  ret_grams2 = []
  for doc1, doc2 in zip(arr1, arr2):
    g1 = _ngrams(doc1)
    g2 = _ngrams(doc2)
    seen_in_doc2 = set(g2)  # tuples are hashable: O(1) membership checks
    ret_union.append(len(g1) + len(g2))
    ret_intersection.append(sum(1 for gram in g1 if gram in seen_in_doc2))
    ret_grams1.append(len(g1))
    ret_grams2.append(len(g2))
  return ret_union, ret_intersection, ret_grams1, ret_grams2

sentence_pairs = ((n1, n2), (nd1, nd2), (nx1, nx2))

# for n=2
(
  (bigram_union_train, bigram_inter_train, bigram_1_train, bigram_2_train),
  (bigram_union_dev, bigram_inter_dev, bigram_1_dev, bigram_2_dev),
  (bigram_union_test, bigram_inter_test, bigram_1_test, bigram_2_test),
) = (get_gram(2, first, second) for first, second in sentence_pairs)

print(bigram_union_train, '\n', bigram_inter_train, '\n', bigram_1_train, '\n', bigram_2_train)

# for n=3
(
  (trigram_union_train, trigram_inter_train, trigram_1_train, trigram_2_train),
  (trigram_union_dev, trigram_inter_dev, trigram_1_dev, trigram_2_dev),
  (trigram_union_test, trigram_inter_test, trigram_1_test, trigram_2_test),
) = (get_gram(3, first, second) for first, second in sentence_pairs)

print(trigram_union_train, '\n', trigram_inter_train, '\n', trigram_1_train, '\n', trigram_2_train)

<h4>
  FEATURE: Levenshtein distance
</h4>

In [None]:
from functools import lru_cache

In [None]:
def lev_distance(arr1, arr2):
  """Levenshtein (edit) distance between two sequences.

  Insertions, deletions and substitutions each cost 1; equal elements are
  matched at no cost. Computed row by row, keeping only the previous row.
  """
  # previous[col] = distance between the processed prefix of arr1 and arr2[:col]
  previous = list(range(len(arr2) + 1))
  for row, left in enumerate(arr1, start=1):
    current = [row]
    for col, right in enumerate(arr2, start=1):
      if left == right:
        current.append(previous[col - 1])
      else:
        current.append(1 + min(current[col - 1], previous[col], previous[col - 1]))
    previous = current
  return previous[-1]

def calculate_lev(arr1, arr2):
  """Apply lev_distance to every pair (arr1[i], arr2[i]); returns one distance per pair."""
  assert len(arr1) == len(arr2)
  return [lev_distance(arr1[i], arr2[i]) for i in range(len(arr1))]

#
# ::::: FEATURES: levenshtein distance 
#
# token-level edit distance per sentence pair, for each of the three datasets
lev_train, lev_dev, lev_test = (
  calculate_lev(first, second)
  for first, second in ((n1, n2), (nd1, nd2), (nx1, nx2))
)

print(lev_train, '\n', lev_dev, '\n', lev_test)

<h4>
  FEATURE: Euclidean distance
</h4>

In [None]:
def euc(arr1, arr2):
  """Euclidean distance between corresponding rows of `arr1` and `arr2`.

  Each pair of rows is passed to sklearn's euclidean_distances as two
  single-row matrices; the single resulting value is returned as a float.
  """
  assert len(arr1) == len(arr2)
  return [
    euclidean_distances([arr1[i]], [arr2[i]])[0].tolist()[0]
    for i in range(len(arr1))
  ]

#
# ::::: FEATURES: euclidean
#
# Euclidean distance between the two TF-IDF rows of every sentence pair
euc_train, euc_dev, euc_test = (
  euc(first, second)
  for first, second in (
    (arr_train_1, arr_train_2),
    (arr_dev_1, arr_dev_2),
    (arr_test_1, arr_test_2),
  )
)

print(euc_train, '\n', euc_dev, '\n', euc_test)

<h4>
  FINAL FEATURE: Word Error Rate
</h4>

In [None]:
# using wer (from jiwer)
def conjoin_for_word_error(doc):
  """Join the tokens of `doc` into one space-separated string."""
  separator = " "
  return separator.join(doc)

def word_error(arr1, arr2):
  """Run jiwer's compute_measures on every pair of token lists.

  Each token list is first joined into a space-separated string; arr1[i] is
  passed as the first argument and arr2[i] as the second.

  Returns a list with one compute_measures result per pair. (The leftover
  debug print of the whole list was removed: it dumped one result per
  sentence pair into the cell output.)
  """
  assert len(arr1) == len(arr2)
  ret = []
  for i in range(len(arr1)):
    doc1 = conjoin_for_word_error(arr1[i])
    doc2 = conjoin_for_word_error(arr2[i])
    ret.append(compute_measures(doc1, doc2))
  return ret

def get_wer(arr1, arr2):
  """Collect five word-error measures for every sentence pair.

  Returns [wer, wil, substitutions, deletions, insertions], where each entry
  is a list with one value per pair, taken from word_error()'s results.
  """
  measures = word_error(arr1, arr2)
  wanted_keys = ('wer', 'wil', 'substitutions', 'deletions', 'insertions')
  return [[measure[key] for measure in measures] for key in wanted_keys]

# each result is [wer, wil, substitutions, deletions, insertions],
# one list per measure with one value per sentence pair
wer_train, wer_dev, wer_test = (
  get_wer(first, second)
  for first, second in ((n1, n2), (nd1, nd2), (nx1, nx2))
)

<h4>
  Putting Everything Together

</h4>

In [None]:
# keep the test instance ids as a list; the 'instance_id' column itself is
# dropped from the frame in a later cell
final_instance_ids = test['instance_id'].tolist()

In [None]:
# save results
def print_results(a, p, keras=0):
  """Score predictions, append the scores to the results log, and print them.

  Args:
    a: true labels.
    p: predicted labels.
    keras: a value > 0 tags the log entry as "keras", otherwise "sklearn".

  Precision, recall and F1 use weighted averaging. The scores are appended,
  as a small table with a timestamp and the model tag, to
  '{SRC}/mlfinal_list_of_results.txt'.
  """
  AVG = 'weighted'
  scores = {
    "accuracy": accuracy_score(a, p),
    "precision": precision_score(a, p, average=AVG),
    "recall": recall_score(a, p, average=AVG),
    "F1": f1_score(a, p, average=AVG),
  }

  df = pd.DataFrame(list(scores.values()), index=list(scores.keys()))
  df['timestamp'] = str(calendar.timegm(time.gmtime()))
  df['model'] = "keras" if keras > 0 else "sklearn"

  filename = f'{SRC}/mlfinal_list_of_results.txt'

  # the with-block closes the file on exit
  with open(filename, 'a') as f:
    f.write(df.to_string(header=True, index=True) + "\n")

  print('ACCURACY: ', scores["accuracy"])
  print('PRECISION: ', scores["precision"])
  print('RECALL: ', scores["recall"])
  print("F1-SCORE: ", scores["F1"])

In [None]:
# ::: All features :::
# overlapping words
# jaccard similarity
# cosine similarity
# BLEU (4)
# METEOR
# NIST (3)
# bigram
# trigram
# Levenshtein
# Euclidean
# & word error rate features { wer, wil, substitutions, deletions, insertions } (5)

# Build the training table as a NEW frame. Aliasing `train` and dropping its
# columns in place made this cell fail with a KeyError when run a second time.
train_features = {
  "overlapping_words": overlap_train,
  "jaccard_similarity": jaccard_similarity_train,
  "cosine_similarity": cosine_sim_train,
  "bleu_1": blue_train[0],
  "bleu_2": blue_train[1],
  "bleu_3": blue_train[2],
  "bleu_4": blue_train[3],
  "meteor": meteor_train,
  "nist_1": nist_train[0],
  "nist_2": nist_train[1],
  "nist_3": nist_train[2],
  "bigram_union": bigram_union_train,
  "bigram_intersection": bigram_inter_train,
  "bigram_1": bigram_1_train,
  "bigram_2": bigram_2_train,
  "trigram_union": trigram_union_train,
  "trigram_intersection": trigram_inter_train,
  "trigram_1": trigram_1_train,
  "trigram_2": trigram_2_train,
  "levenshtein": lev_train,
  "euclidean": euc_train,
  "word_error_rate": wer_train[0],
  "word_info_lost": wer_train[1],
  "wer_substitutions": wer_train[2],
  "wer_deletions": wer_train[3],
  "wer_insertions": wer_train[4],
}

# gold_label stays first; feature columns follow in the order listed above
data_train = (
  train
  .drop(columns=['new_sentence_1', 'new_sentence_2', 'instance_id'])
  .assign(**train_features)
)

# show all
data_train.head()

In [None]:
# summary of the assembled training frame (DataFrame.info)
data_train.info()

In [None]:
# descriptive statistics of the assembled training frame (DataFrame.describe)
data_train.describe()

In [None]:
# number of null values per column of the training frame
data_train.isnull().sum()

In [None]:
# same thing for both dev and test sets

def _with_features(table, overlap, jaccard_sim, cosine_sim, bleus, meteors, nists,
                   bigrams, trigrams, lev, euclid, wer):
  """Return a NEW frame: `table` without its sentence/id columns, plus all features.

  `bigrams` / `trigrams` are (union, intersection, grams_1, grams_2) tuples;
  `bleus`, `nists` and `wer` are the per-order / per-measure lists of lists.
  The feature columns are added in a fixed order, so every frame built by
  this helper has the same column layout. `table` itself is not modified,
  which keeps this cell re-runnable.
  """
  features = {
    "overlapping_words": overlap,
    "jaccard_similarity": jaccard_sim,
    "cosine_similarity": cosine_sim,
    "bleu_1": bleus[0],
    "bleu_2": bleus[1],
    "bleu_3": bleus[2],
    "bleu_4": bleus[3],
    "meteor": meteors,
    "nist_1": nists[0],
    "nist_2": nists[1],
    "nist_3": nists[2],
    "bigram_union": bigrams[0],
    "bigram_intersection": bigrams[1],
    "bigram_1": bigrams[2],
    "bigram_2": bigrams[3],
    "trigram_union": trigrams[0],
    "trigram_intersection": trigrams[1],
    "trigram_1": trigrams[2],
    "trigram_2": trigrams[3],
    "levenshtein": lev,
    "euclidean": euclid,
    "word_error_rate": wer[0],
    "word_info_lost": wer[1],
    "wer_substitutions": wer[2],
    "wer_deletions": wer[3],
    "wer_insertions": wer[4],
  }
  return (
    table
    .drop(columns=['new_sentence_1', 'new_sentence_2', 'instance_id'])
    .assign(**features)
  )

# DEV
data_dev = _with_features(
  dev, overlap_dev, jaccard_similarity_dev, cosine_sim_dev, blue_dev, meteor_dev, nist_dev,
  (bigram_union_dev, bigram_inter_dev, bigram_1_dev, bigram_2_dev),
  (trigram_union_dev, trigram_inter_dev, trigram_1_dev, trigram_2_dev),
  lev_dev, euc_dev, wer_dev,
)

# TEST
data_test = _with_features(
  test, overlap_test, jaccard_similarity_test, cosine_sim_test, blue_test, meteor_test, nist_test,
  (bigram_union_test, bigram_inter_test, bigram_1_test, bigram_2_test),
  (trigram_union_test, trigram_inter_test, trigram_1_test, trigram_2_test),
  lev_test, euc_test, wer_test,
)

In [None]:
# test train_test_split for now

# for train
# hold out 20% of the training data for a first check
X = data_train.drop('gold_label', axis=1)
y = data_train['gold_label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# ... works better when random_state is 123
clf = MLPClassifier(random_state=123, max_iter=1000)
clf.fit(X_train, y_train)
preds = clf.predict(X_test)
cm = confusion_matrix(y_test, preds)
print(cm)
print_results(y_test, preds)

# for dev
A = data_dev.drop('gold_label', axis=1)
b = data_dev['gold_label']
# The classifier was fitted on a DataFrame, so predict on a DataFrame as well:
# passing the bare array (A.values) discards the column names and makes
# scikit-learn warn that X has no valid feature names.
preds2 = clf.predict(A)
cm2 = confusion_matrix(b.values, preds2)
print(cm2)
print_results(b.values, preds2)

In [None]:
# Inputs for the Keras model.
# Source: https://www.youtube.com/watch?v=8_bT0z3AFmA&ab_channel=BhavyaSriYarlagadda
columns = data_train.columns.values
# every column except 'gold_label' is a feature, hence the -1
last_index = len(columns) - 1

label_column = 'gold_label'
j = data_train[label_column]
I = data_train.drop([label_column], axis=1)

In [None]:
# 70/30 split of the training table for the Keras model
I_train, I_test, j_train, j_test = train_test_split(I, j, test_size=0.3, random_state=1)

# two hidden ReLU layers of 15 units and a single sigmoid output unit
model = Sequential()
model.add(Dense(15, activation='relu', input_shape=(last_index,)))
model.add(Dense(15, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='sgd', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# if missing values are shown (should not be an issue)
# For every expected id 'test_id_0' .. 'test_id_3999' that is absent, insert
# the id at position i and a '0' prediction at the same position.
# NOTE(review): `final_preds` is first assigned in the training-loop cell
# further down, so on a fresh top-to-bottom run the branch below would raise
# NameError if any id were missing. It is only safe while no id is missing,
# or when this cell is run after the training loop — confirm intended order.
for i in range(4000):
  check = f'test_id_{i}'
  if not check in final_instance_ids:
    final_instance_ids.insert(i, check)
    final_preds = np.insert(final_preds, i, '0')

In [None]:
# hyper-parameter grid for the Keras runs
epochs = [3000, 1200, 500, 100]
batch_size = [50, 25]
# group by epoch: one list of fit histories per entry of `epochs`
amf = [[] for _ in range(4)]

# dev features and labels
label_column = 'gold_label'
M = data_dev.drop([label_column], axis=1)
n = data_dev[label_column]

In [None]:
def _to_labels(probabilities):
  """Threshold the model's sigmoid outputs at 0.5 into 0/1 labels."""
  return [1 if p >= 0.5 else 0 for p in probabilities]

# Snapshot the weights the model has when this cell starts. Without resetting
# to them, each (epochs, batch_size) run would continue training from the
# previous run's weights, so the runs could not be compared with each other.
initial_weights = model.get_weights()

for e in range(len(epochs)):
  for b in batch_size:
    model.set_weights(initial_weights)

    hist = model.fit(I_train, j_train, batch_size=b, epochs=epochs[e], validation_split=0.2)
    amf[e].append(hist)

    # training split
    predictions = _to_labels(model.predict(I_train))
    print(len(predictions))
    print_results(j_train, predictions, keras=1)

    # held-out split
    more_predictions = _to_labels(model.predict(I_test))
    print(len(more_predictions))
    print_results(j_test, more_predictions, keras=1)

    # dev set
    dev_predictions = _to_labels(model.predict(M.values))
    print(len(dev_predictions))
    print_results(n, dev_predictions, keras=1)

    # test set: one timestamped prediction file per configuration
    final_preds = _to_labels(model.predict(data_test.values))
    ts = str(calendar.timegm(time.gmtime()))
    with open(f'{SRC}/mlfinal_final_results_{ts}.txt', 'a') as f:
      for i in range(len(final_preds)):
        f.write(str(final_instance_ids[i]) + '\t' + str(final_preds[i]) + '\n')

<h4>Results, Visualizations</h4>

In [None]:
# show for epochs 3000, batchsize 50, loss
plt.style.use('fivethirtyeight')
fig, ax = plt.subplots(figsize=(40, 24), dpi=80)
history = amf[0][0].history
for series in ('loss', 'val_loss'):
  ax.plot(history[series])
ax.set(title="Model Loss", ylabel="Loss", xlabel="Epoch")
ax.legend(["Train", "Val"], loc='upper right')
plt.show()

In [None]:
# show for epochs 3000, batchsize 50, accuracy
plt.style.use('fivethirtyeight')
fig, ax = plt.subplots(figsize=(40, 24), dpi=80)
history = amf[0][0].history
for series in ('accuracy', 'val_accuracy'):
  ax.plot(history[series])
ax.set(title="Model Accuracy", ylabel="Accuracy", xlabel="Epoch")
ax.legend(["Train", "Val"], loc='lower right')
plt.show()

In [None]:
# show for epochs 3000, batchsize 25, loss
plt.style.use('fivethirtyeight')
fig, ax = plt.subplots(figsize=(40, 24), dpi=80)
history = amf[0][1].history
for series in ('loss', 'val_loss'):
  ax.plot(history[series])
ax.set(title="Model Loss", ylabel="Loss", xlabel="Epoch")
ax.legend(["Train", "Val"], loc='upper right')
plt.show()

In [None]:
# show for epochs 3000, batchsize 25, accuracy
plt.style.use('fivethirtyeight')
fig, ax = plt.subplots(figsize=(40, 24), dpi=80)
history = amf[0][1].history
for series in ('accuracy', 'val_accuracy'):
  ax.plot(history[series])
ax.set(title="Model Accuracy", ylabel="Accuracy", xlabel="Epoch")
ax.legend(["Train", "Val"], loc='lower right')
plt.show()

In [None]:
# show for epochs 1200, batchsize 50, loss
plt.style.use('fivethirtyeight')
fig, ax = plt.subplots(figsize=(40, 24), dpi=80)
history = amf[1][0].history
for series in ('loss', 'val_loss'):
  ax.plot(history[series])
ax.set(title="Model Loss", ylabel="Loss", xlabel="Epoch")
ax.legend(["Train", "Val"], loc='upper right')
plt.show()

In [None]:
# show for epochs 1200, batchsize 50, accuracy
plt.style.use('fivethirtyeight')
fig, ax = plt.subplots(figsize=(40, 24), dpi=80)
history = amf[1][0].history
for series in ('accuracy', 'val_accuracy'):
  ax.plot(history[series])
ax.set(title="Model Accuracy", ylabel="Accuracy", xlabel="Epoch")
ax.legend(["Train", "Val"], loc='lower right')
plt.show()

In [None]:
# show for epochs 1200, batchsize 25, loss
plt.style.use('fivethirtyeight')
fig, ax = plt.subplots(figsize=(40, 24), dpi=80)
history = amf[1][1].history
for series in ('loss', 'val_loss'):
  ax.plot(history[series])
ax.set(title="Model Loss", ylabel="Loss", xlabel="Epoch")
ax.legend(["Train", "Val"], loc='upper right')
plt.show()

In [None]:
# show for epochs 1200, batchsize 25, accuracy
plt.style.use('fivethirtyeight')
fig, ax = plt.subplots(figsize=(40, 24), dpi=80)
history = amf[1][1].history
for series in ('accuracy', 'val_accuracy'):
  ax.plot(history[series])
ax.set(title="Model Accuracy", ylabel="Accuracy", xlabel="Epoch")
ax.legend(["Train", "Val"], loc='lower right')
plt.show()

In [None]:
# show for epochs 500, batchsize 50, loss
plt.style.use('fivethirtyeight')
fig, ax = plt.subplots(figsize=(40, 24), dpi=80)
history = amf[2][0].history
for series in ('loss', 'val_loss'):
  ax.plot(history[series])
ax.set(title="Model Loss", ylabel="Loss", xlabel="Epoch")
ax.legend(["Train", "Val"], loc='upper right')
plt.show()

In [None]:
# show for epochs 500, batchsize 50, accuracy
plt.style.use('fivethirtyeight')
fig, ax = plt.subplots(figsize=(40, 24), dpi=80)
history = amf[2][0].history
for series in ('accuracy', 'val_accuracy'):
  ax.plot(history[series])
ax.set(title="Model Accuracy", ylabel="Accuracy", xlabel="Epoch")
ax.legend(["Train", "Val"], loc='lower right')
plt.show()

In [None]:
# show for epochs 500, batchsize 25, loss
plt.style.use('fivethirtyeight')
fig, ax = plt.subplots(figsize=(40, 24), dpi=80)
history = amf[2][1].history
for series in ('loss', 'val_loss'):
  ax.plot(history[series])
ax.set(title="Model Loss", ylabel="Loss", xlabel="Epoch")
ax.legend(["Train", "Val"], loc='upper right')
plt.show()

In [None]:
# show for epochs 500, batchsize 25, accuracy
plt.style.use('fivethirtyeight')
fig, ax = plt.subplots(figsize=(40, 24), dpi=80)
history = amf[2][1].history
for series in ('accuracy', 'val_accuracy'):
  ax.plot(history[series])
ax.set(title="Model Accuracy", ylabel="Accuracy", xlabel="Epoch")
ax.legend(["Train", "Val"], loc='lower right')
plt.show()

In [None]:
# show for epochs 100, batchsize 50, loss
plt.style.use('fivethirtyeight')
fig, ax = plt.subplots(figsize=(40, 24), dpi=80)
history = amf[3][0].history
for series in ('loss', 'val_loss'):
  ax.plot(history[series])
ax.set(title="Model Loss", ylabel="Loss", xlabel="Epoch")
ax.legend(["Train", "Val"], loc='upper right')
plt.show()

In [None]:
# show for epochs 100, batchsize 50, accuracy
plt.style.use('fivethirtyeight')
fig, ax = plt.subplots(figsize=(40, 24), dpi=80)
history = amf[3][0].history
for series in ('accuracy', 'val_accuracy'):
  ax.plot(history[series])
ax.set(title="Model Accuracy", ylabel="Accuracy", xlabel="Epoch")
ax.legend(["Train", "Val"], loc='lower right')
plt.show()

In [None]:
# show for epochs 100, batchsize 25, loss
plt.style.use('fivethirtyeight')
fig, ax = plt.subplots(figsize=(40, 24), dpi=80)
history = amf[3][1].history
for series in ('loss', 'val_loss'):
  ax.plot(history[series])
ax.set(title="Model Loss", ylabel="Loss", xlabel="Epoch")
ax.legend(["Train", "Val"], loc='upper right')
plt.show()

In [None]:
# show for epochs 100, batchsize 25, accuracy
plt.style.use('fivethirtyeight')
fig, ax = plt.subplots(figsize=(40, 24), dpi=80)
history = amf[3][1].history
for series in ('accuracy', 'val_accuracy'):
  ax.plot(history[series])
ax.set(title="Model Accuracy", ylabel="Accuracy", xlabel="Epoch")
ax.legend(["Train", "Val"], loc='lower right')
plt.show()

In [None]:
# for final results
# change after looking: set this to the list of predictions to export
best_results = None

if best_results is None:
  # Nothing has been selected yet: stop here so that running the whole
  # notebook never writes a submission file by accident.
  raise Exception("best_results is not set; choose the predictions to export first")

# one "<instance id>\t<prediction>" line per entry of best_results
# (the loop previously used the undefined name `final_emergency` for its length)
with open(f'{SRC}/mlfinal_final_results.txt', 'a') as f:
  for i in range(len(best_results)):
    f.write(str(final_instance_ids[i]) + '\t' + str(best_results[i]) + '\n')