In [None]:
%%capture
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')

In [None]:
# Load the sarcasm dataset from the mounted Google Drive folder.
datastore = pd.read_csv(
    '/content/drive/MyDrive/CS3244 Dataset/train-balanced-sarcasm.csv'
)
total_count = datastore.shape[0]  # number of rows that were read

In [None]:
import re, string, time
from nltk.corpus import stopwords as stopwordprovider
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

"""
Sequence of pre-processing

1. Make a copy of the dataframe. All modification will only be done on the duplicate
2. Iterate through each element in df and check if comment or label is NULL. Drop the row if comment or label is missing
3. Remove punctuation
4. Lowercase all words
5. Tokenize sentence into small units ( usually single words )
6. Remove stopwords         # decided against it as it removes sentimental information
7. Stemming the words
8. Lemmatizing words
9. Iterate through each element and drop if only contains numerics
10. Return a new DataFrame containing only the cleaned comments and labels
"""

# Helper methods
def remove_punctuation(text):
    """Return ``text`` with every ``string.punctuation`` character removed."""
    kept_chars = []
    for char in text:
        if char in string.punctuation:
            continue
        kept_chars.append(char)
    return "".join(kept_chars)

def tokenization(text):
    """Split ``text`` into tokens on runs of non-word characters.

    The pattern is written as a raw string: ``"\\W"`` inside a normal string
    literal is an invalid escape sequence and triggers a warning on modern
    Python versions.

    Leading or trailing separators produce empty-string tokens, exactly as
    ``re.split`` returns them (for example ``"hi!"`` gives ``["hi", ""]``).
    """
    return re.split(r"\W+", text)

def remove_stopwords(text):
    """Filter stopwords out of an iterable of tokens.

    A token is dropped only when it is listed in the module-level
    ``stopwords`` and is not protected by the module-level
    ``unsafe_stopwords``. Both globals must be defined before this is called;
    ``stopwords`` can be shortened (e.g. by slicing) to remove fewer words.
    """
    kept = []
    for token in text:
        if token in stopwords and token not in unsafe_stopwords:
            continue
        kept.append(token)
    return kept

def stemming(text):
    """Return a list with the Porter stem of every token in ``text``."""
    stem = PorterStemmer().stem
    return list(map(stem, text))

def lemmatizer(text):
    """Lemmatize tokens with WordNet, leaving "-er"/"-est" words untouched.

    Tokens ending in "er" or "est" are passed through unchanged; every other
    token is replaced by ``WordNetLemmatizer().lemmatize(token)``.
    """
    lemmatize = WordNetLemmatizer().lemmatize
    protected_suffixes = ("er", "est")
    result = []
    for word in text:
        if word.endswith(protected_suffixes):
            result.append(word)
        else:
            result.append(lemmatize(word))
    return result

def default_lemmatizer(text):
    """Return a list with the WordNet lemma of every token in ``text``."""
    lemmatize = WordNetLemmatizer().lemmatize
    lemmas = []
    for word in text:
        lemmas.append(lemmatize(word))
    return lemmas

def printDropNaNSummary(comment_initial, comment_final, label_initial, label_final):
  """Print how many rows were dropped and the before/after sizes.

  The dropped count is ``comment_initial - comment_final``; the label sizes
  are only echoed in the summary.
  """
  dropped = comment_initial - comment_final
  print(f"Found {dropped} missing data entry with NaN values in either col_name or 'label'")
  summary_lines = [
      "",
      "After dropping missing and duplicate data:",
      f"{comment_initial} col --> {comment_final} col ",
      f"{label_initial} labels --> {label_final} labels",
      "",
  ]
  print("\n".join(summary_lines))

def alphabet_rep(text):
  """Return 1 if any character occurs three or more times in a row, else 0.

  Fixes over the previous implementation:
  - the run counter restarts whenever the character changes, so unrelated
    repeats (e.g. "aabbcc") no longer add up to a false positive;
  - the threshold is checked right after counting, so a run at the very end
    of the text (e.g. "sooo") is detected;
  - empty input returns 0 instead of raising IndexError.

  Any character counts, not only letters (the comparison is plain equality).
  """
  previous = None
  run_length = 0
  for char in text:
    if char == previous:
      run_length += 1
    else:
      previous = char
      run_length = 1
    if run_length > 2:
      return 1
  return 0

def exclamation_freq(text):
  """Return the fraction of characters in ``text`` that are "!".

  Empty input yields 0.0 instead of raising ZeroDivisionError.
  """
  total = len(text)
  if total == 0:
    return 0.0
  return text.count("!") / total

def question_freq(text):
  """Return the fraction of characters in ``text`` that are "?".

  Empty input yields 0.0 instead of raising ZeroDivisionError.
  """
  total = len(text)
  if total == 0:
    return 0.0
  return text.count("?") / total

def dots_freq(text):
  """Return the fraction of characters in ``text`` that are ".".

  Empty input yields 0.0 instead of raising ZeroDivisionError.
  """
  total = len(text)
  if total == 0:
    return 0.0
  return text.count(".") / total

def caps_freq(text):
  """Return the fraction of characters in ``text`` that are uppercase.

  Empty input yields 0.0 instead of raising ZeroDivisionError.
  """
  total = len(text)
  if total == 0:
    return 0.0
  upper_count = sum(1 for char in text if char.isupper())
  return upper_count / total

def quotes_freq(text):
  """Return the fraction of characters that are single or double quotes.

  Empty input yields 0.0 instead of raising ZeroDivisionError.
  """
  total = len(text)
  if total == 0:
    return 0.0
  quote_count = sum(1 for char in text if char in ("'", '"'))
  return quote_count / total

def consecutive_caps(text):
  """Return the fraction of characters that sit in a run of 2+ uppercase letters.

  An uppercase character is counted when at least one direct neighbour is
  also uppercase, which gives the same count as the previous pair-based loop.
  Empty input yields 0.0 instead of raising ZeroDivisionError.
  """
  n = len(text)
  if n == 0:
    return 0.0
  is_upper = [char.isupper() for char in text]
  count = 0
  for i, hit in enumerate(is_upper):
    if not hit:
      continue
    has_left = i > 0 and is_upper[i - 1]
    has_right = i + 1 < n and is_upper[i + 1]
    if has_left or has_right:
      count += 1
  return count / n

def consecutive_dots(text):
  """Return the fraction of characters that sit in a run of 2+ "." characters.

  A dot is counted when at least one direct neighbour is also a dot, which
  gives the same count as the previous pair-based loop.
  Empty input yields 0.0 instead of raising ZeroDivisionError.
  """
  n = len(text)
  if n == 0:
    return 0.0
  is_dot = [char == "." for char in text]
  count = 0
  for i, hit in enumerate(is_dot):
    if not hit:
      continue
    has_left = i > 0 and is_dot[i - 1]
    has_right = i + 1 < n and is_dot[i + 1]
    if has_left or has_right:
      count += 1
  return count / n

def consecutive_exclamations(text):
  """Return the fraction of characters that sit in a run of 2+ "!" characters.

  An exclamation mark is counted when at least one direct neighbour is also
  "!", which gives the same count as the previous pair-based loop.
  Empty input yields 0.0 instead of raising ZeroDivisionError.
  """
  n = len(text)
  if n == 0:
    return 0.0
  is_bang = [char == "!" for char in text]
  count = 0
  for i, hit in enumerate(is_bang):
    if not hit:
      continue
    has_left = i > 0 and is_bang[i - 1]
    has_right = i + 1 < n and is_bang[i + 1]
    if has_left or has_right:
      count += 1
  return count / n

def consecutive_question(text):
  """Return the fraction of characters that sit in a run of 2+ "?" characters.

  A question mark is counted when at least one direct neighbour is also "?",
  which gives the same count as the previous pair-based loop.
  Empty input yields 0.0 instead of raising ZeroDivisionError.
  """
  n = len(text)
  if n == 0:
    return 0.0
  is_question = [char == "?" for char in text]
  count = 0
  for i, hit in enumerate(is_question):
    if not hit:
      continue
    has_left = i > 0 and is_question[i - 1]
    has_right = i + 1 < n and is_question[i + 1]
    if has_left or has_right:
      count += 1
  return count / n

def consecutive_punctuations(text):
  """Return the fraction of characters that sit in a run of 2+ punctuation marks.

  Punctuation means membership in ``string.punctuation``; the marks in a run
  do not have to be identical (e.g. "?!"). A mark is counted when at least one
  direct neighbour is also punctuation, which gives the same count as the
  previous pair-based loop.
  Empty input yields 0.0 instead of raising ZeroDivisionError.
  """
  n = len(text)
  if n == 0:
    return 0.0
  is_punct = [char in string.punctuation for char in text]
  count = 0
  for i, hit in enumerate(is_punct):
    if not hit:
      continue
    has_left = i > 0 and is_punct[i - 1]
    has_right = i + 1 < n and is_punct[i + 1]
    if has_left or has_right:
      count += 1
  return count / n

# Apply the cleaning
def _meta_feature_string(comment):
  """Return the nine space-separated meta features of one raw comment."""
  if comment != comment:  # NaN is the only value that is not equal to itself
    return "0 0 0 0 0 0 0 0 0"
  extractors = (
      alphabet_rep,
      exclamation_freq,
      dots_freq,
      question_freq,
      quotes_freq,
      consecutive_caps,
      consecutive_exclamations,
      consecutive_question,
      consecutive_punctuations,
  )
  return " ".join(str(extract(comment)) for extract in extractors)


def _timed_apply(series, func, description):
  """Apply ``func`` to every element of ``series`` and print the elapsed time."""
  start = time.perf_counter()
  result = series.apply(func)
  end = time.perf_counter()
  print("Completed {} ...... time taken: {}".format(description, end-start))
  return result


def text_preprocessing(df, col_name="comment", size="all", mode=(1,1,1,1,1,1,1)):
  """Clean the text column of ``df`` and return a new, reduced DataFrame.

  ``df`` itself is never modified; all work happens on a copy.

  Parameters
  ----------
  df : DataFrame that contains ``col_name`` and the columns removed at the
      end ('author', 'subreddit', 'score', 'ups', 'downs', 'date',
      'created_utc', 'parent_comment').
  col_name : name of the text column to clean. An empty string prints a
      message and returns None.
  size : number of leading rows to process, or "all" for every row.
  mode : sequence of seven flags, one per step, in this order:
      0 remove punctuation, 1 lowercase, 2 tokenize, 3 remove stopwords,
      4 stem, 5 lemmatize (1 = keep "-er"/"-est" words, 2 = plain WordNet),
      6 drop comments that are empty or contain only numeric tokens.
      A step runs when its flag equals 1 (or 2 for lemmatizing).
      Steps 3-5 expect tokenized input, i.e. flag 2 enabled.

  Returns
  -------
  The processed copy with a 'meta_features' column added and the columns
  listed above removed, or None when ``col_name`` is empty.

  Changes from the previous version:
  - meta features are computed on the truncated, NaN-free copy and on
    ``col_name`` instead of on the whole ``df["comment"]``; the result is
    the same for the default column but no work is spent on rows that are
    discarded anyway;
  - "all" now resolves to the row count (``len(df)``) rather than
    ``df.size`` (rows x columns);
  - the default ``mode`` is an immutable tuple;
  - the unreachable length comparison and unused locals were removed.
  """
  if col_name == "":
    print("NO COL NAME SPECIFIED")
    return
  if size == "all":
    size = len(df)

  # Work on a private copy and drop rows with missing data in any column.
  df_copy = df.head(size).copy()
  df_copy = df_copy.dropna()

  # Meta features must be taken from the raw text, before any cleaning.
  df_copy['meta_features'] = _timed_apply(
      df_copy[col_name], _meta_feature_string, "extracting meta data")

  # Remove punctuation
  if mode[0] == 1:
    df_copy[col_name] = _timed_apply(
        df_copy[col_name], remove_punctuation, "removing punctuation")

  # Lowercase all words
  if mode[1] == 1:
    df_copy[col_name] = _timed_apply(
        df_copy[col_name], lambda x: x.lower(), "lowercasing")

  # Tokenize sentence into small units
  if mode[2] == 1:
    df_copy[col_name] = _timed_apply(
        df_copy[col_name], tokenization, "tokenizing")

  # Remove stopwords
  if mode[3] == 1:
    df_copy[col_name] = _timed_apply(
        df_copy[col_name], remove_stopwords, "removing stopwords")

  # Stemming the words
  if mode[4] == 1:
    df_copy[col_name] = _timed_apply(
        df_copy[col_name], stemming, "stemming")

  # Lemmatizing words
  if mode[5] == 1:
    df_copy[col_name] = _timed_apply(
        df_copy[col_name], lemmatizer, "lemmatizing")
  elif mode[5] == 2:
    df_copy[col_name] = _timed_apply(
        df_copy[col_name], default_lemmatizer, "lemmatizing")

  # Drop numeric and empty comments
  if mode[6] == 1:
    start = time.perf_counter()
    # all() over an empty comment is True, so empty comments are dropped too.
    numeric_only = df_copy[col_name].apply(lambda x: all(y.isnumeric() for y in x))
    # astype(bool) keeps the mask boolean even when the frame is empty.
    df_copy = df_copy.loc[~numeric_only.astype(bool)].copy()
    end = time.perf_counter()
    print("Completed dropping empty/numerical comments ...... time taken: {}".format(end-start))

  # Join if tokenized
  if mode[2] == 1:
    df_copy[col_name] = df_copy[col_name].apply(lambda x: ' '.join(x))

  df_copy = df_copy.drop(['author', 'subreddit', 'score', 'ups', 'downs', 'date', 'created_utc', 'parent_comment'], axis=1)
  print('\n***Completed***')
  print('─' * 100)
  return df_copy

In [None]:
# Generate one pre-processed CSV per configuration.
#
# Order of the flags in the `mode` array
# -----------------------------------------------------------------
# - Remove punctuation
# - Lowercase all words
# - Tokenize sentence into small units
# - Remove stopwords
# - Stemming the words
# - Lemmatizing words (1 = keeps "-er"/"-est" words, 2 = default lemmatizer)
# - Drop numeric and empty comments
#
# NOTE: remove_stopwords() reads the module-level `stopwords` and
# `unsafe_stopwords`, so the assignments below change what later calls remove.
# The order of the statements in this cell therefore matters.
# NOTE(review): the output paths are relative ("drive/...") while the input
# path is absolute ("/content/drive/...") - presumably the working directory
# is /content; confirm.

size = "all"

# No pre-processing
no_preprocessing = text_preprocessing(datastore, "comment", size=size, mode=[0,0,0,0,0,0,0])
no_preprocessing.to_csv("drive/MyDrive/CS3244 Dataset/pre-processed/no_pre_processing.csv", encoding='utf-8', index=False)

# Remove punctuation
no_puncutation = text_preprocessing(datastore, "comment", size=size, mode=[1,0,0,0,0,0,0])
no_puncutation.to_csv("drive/MyDrive/CS3244 Dataset/pre-processed/no_puncutation.csv", encoding='utf-8', index=False)

# Lowercase and remove "safe" stopwords: the full English list is used, but the
# words in `unsafe_stopwords` are kept in the text.
stopwords = stopwordprovider.words('english')
stopwords = stopwords      # No-op; slice here to shrink the list of stopwords that get removed
unsafe_stopwords = ['not', 'you', 'so', 'just', 'because', 'but', 'if', 'as', 'no', 'only', 'more', 'how', 'than', 'too', 'i', 'me', 'such', 'most', 'very',  ]
lowercase_no_stopwords_refined = text_preprocessing(datastore, "comment", size=size, mode=[0,1,1,1,0,0,0])
lowercase_no_stopwords_refined.to_csv("drive/MyDrive/CS3244 Dataset/pre-processed/lowercase_no_stopwords_refined.csv", encoding='utf-8', index=False)

# Lowercase and remove first 10 stopwords
# (`unsafe_stopwords` from above is still in effect for this run.)
stopwords = stopwordprovider.words('english')
stopwords = stopwords[0:10]       # Only the first 10 stopwords are removed from here on
lowercase_no_stopwords = text_preprocessing(datastore, "comment", size=size, mode=[0,1,1,1,0,0,0])
lowercase_no_stopwords.to_csv("drive/MyDrive/CS3244 Dataset/pre-processed/lowercase_no_stopwords.csv", encoding='utf-8', index=False)

# Lowercase and lemmatized
lowercase_lemmatized = text_preprocessing(datastore, "comment", size=size, mode=[0,1,1,0,0,1,0])
lowercase_lemmatized.to_csv("drive/MyDrive/CS3244 Dataset/pre-processed/lowercase_lemmatized.csv", encoding='utf-8', index=False)

# Lowercase
lowercased = text_preprocessing(datastore, "comment", size=size, mode=[0,1,1,0,0,0,0])
lowercased.to_csv("drive/MyDrive/CS3244 Dataset/pre-processed/lowercased.csv", encoding='utf-8', index=False)

# Remove punctuation and drop numeric/empty comments
no_punctuation_numeric_empty = text_preprocessing(datastore, "comment", size=size, mode=[1,0,1,0,0,0,1])
no_punctuation_numeric_empty.to_csv("drive/MyDrive/CS3244 Dataset/pre-processed/no_punctuation_numeric_empty.csv", encoding='utf-8', index=False)

# Everything (every step except stemming; no stopword is protected)
# NOTE(review): `stopwords` is still the 10-word slice assigned above, so only
# those 10 words are removed in this and the following runs - confirm intended.
unsafe_stopwords = []
everything = text_preprocessing(datastore, "comment", size=size, mode=[1,1,1,1,0,1,1])
everything.to_csv("drive/MyDrive/CS3244 Dataset/pre-processed/everything.csv", encoding='utf-8', index=False)

# Everything except punctuation (punctuation removal and stemming are off)
everything_except_punctuation = text_preprocessing(datastore, "comment", size=size, mode=[0,1,1,1,0,1,1])
everything_except_punctuation.to_csv("drive/MyDrive/CS3244 Dataset/pre-processed/everything_except_punctuation.csv", encoding='utf-8', index=False)

# Everything-stemming (stemming is used instead of lemmatizing)
unsafe_stopwords = []
everything_stemming = text_preprocessing(datastore, "comment", size=size, mode=[1,1,1,1,1,0,1])
everything_stemming.to_csv("drive/MyDrive/CS3244 Dataset/pre-processed/everything_stemming.csv", encoding='utf-8', index=False)

# Lowercase and default lemmatized (mode 2 selects default_lemmatizer)
lowercase_default_lemmatized = text_preprocessing(datastore, "comment", size=size, mode=[0,1,1,0,0,2,0])
lowercase_default_lemmatized.to_csv("drive/MyDrive/CS3244 Dataset/pre-processed/lowercase_default_lemmatized.csv", encoding='utf-8', index=False)

Completed extracting meta data ...... time taken: 54.73973207899999

***Completed***
────────────────────────────────────────────────────────────────────────────────────────────────────
Completed extracting meta data ...... time taken: 51.93829261399998
Completed removing punctuation ...... time taken: 6.794852780000014

***Completed***
────────────────────────────────────────────────────────────────────────────────────────────────────
Completed extracting meta data ...... time taken: 51.05047398000002
Completed lowercasing ...... time taken: 0.3273212259999809
Completed tokenizing ...... time taken: 6.38494426799997
Completed removing stopwords ...... time taken: 29.041811886999994

***Completed***
────────────────────────────────────────────────────────────────────────────────────────────────────
Completed extracting meta data ...... time taken: 50.696658592999995
Completed lowercasing ...... time taken: 0.27651418199997124
Completed tokenizing ...... time taken: 6.158338190000052
Co

# Results using CNN Model 
These results were obtained with a CNN model using vocab_size = 10000, embedding_dim = 16, max_length = 120, batch_size = 32 and epochs = 10.

https://colab.research.google.com/drive/1oBSJardm70rn2mzlYEfqHEmi-nFuD9lW?usp=sharing

## No preprocessing
Epoch 10/10
25270/25270 - 97s <br>- loss: 0.4846 - accuracy: 0.7667 <br>- val_loss: 0.5580 - <b>val_accuracy: 0.7181</b> <br>- 97s/epoch - 4ms/step

## lowercase_lemmatized
Epoch 10/10
25270/25270 - 107s <br>- loss: 0.4396 - accuracy: 0.7938 <br>- val_loss: 0.5903 - <b>val_accuracy: 0.7116</b> <br>- 107s/epoch - 4ms/step

## lowercase_no_stopwords
Epoch 10/10
25266/25266 - 408s <br>- loss: 0.4415 - accuracy: 0.7939 <br>- val_loss: 0.6042 - <b>val_accuracy: 0.7081</b> <br>- 408s/epoch - 16ms/step

## no_punctuation_numeric_empty
Epoch 10/10
25212/25212 - 96s <br>- loss: 0.4411 - accuracy: 0.7946 <br>- val_loss: 0.5970 - <b>val_accuracy: 0.7095</b> <br>- 96s/epoch - 4ms/step

## no_puncutation
Epoch 10/10
25244/25244 - 106s <br>- loss: 0.4411 - accuracy: 0.7937 <br>- val_loss: 0.5997 - <b>val_accuracy: 0.7081</b> <br>- 106s/epoch - 4ms/step
## no_pre_processing

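## lowercased
Epoch 10/10
25270/25270 - 98s <br>- loss: 0.4384 - accuracy: 0.7946 <br>- val_loss: 0.5988 - <b>val_accuracy: 0.7121</b> <br>- 98s/epoch - 4ms/step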



## No preprocessing

Epoch 1/10
25270/25270 - 124s - loss: 0.5667 - accuracy: 0.7039 - val_loss: 0.5488 - val_accuracy: 0.7194 - 124s/epoch - 5ms/step
Epoch 2/10
25270/25270 - 111s - loss: 0.5362 - accuracy: 0.7292 - val_loss: 0.5431 - val_accuracy: 0.7235 - 111s/epoch - 4ms/step
Epoch 3/10
25270/25270 - 100s - loss: 0.5231 - accuracy: 0.7389 - val_loss: 0.5435 - val_accuracy: 0.7239 - 100s/epoch - 4ms/step
Epoch 4/10
25270/25270 - 98s - loss: 0.5139 - accuracy: 0.7462 - val_loss: 0.5453 - val_accuracy: 0.7240 - 98s/epoch - 4ms/step
Epoch 5/10
25270/25270 - 106s - loss: 0.5061 - accuracy: 0.7521 - val_loss: 0.5469 - val_accuracy: 0.7224 - 106s/epoch - 4ms/step
Epoch 6/10
25270/25270 - 98s - loss: 0.5004 - accuracy: 0.7563 - val_loss: 0.5513 - val_accuracy: 0.7204 - 98s/epoch - 4ms/step
Epoch 7/10
25270/25270 - 97s - loss: 0.4958 - accuracy: 0.7590 - val_loss: 0.5504 - val_accuracy: 0.7217 - 97s/epoch - 4ms/step
Epoch 8/10
25270/25270 - 111s - loss: 0.4914 - accuracy: 0.7620 - val_loss: 0.5538 - val_accuracy: 0.7204 - 111s/epoch - 4ms/step
Epoch 9/10
25270/25270 - 106s - loss: 0.4875 - accuracy: 0.7647 - val_loss: 0.5588 - val_accuracy: 0.7179 - 106s/epoch - 4ms/step
Epoch 10/10
25270/25270 - 97s - loss: 0.4846 - accuracy: 0.7667 - val_loss: 0.5580 - val_accuracy: 0.7181 - 97s/epoch - 4ms/step

## lowercase_lemmatized
Epoch 1/10
25270/25270 - 110s - loss: 0.5586 - accuracy: 0.7093 - val_loss: 0.5443 - val_accuracy: 0.7222 - 110s/epoch - 4ms/step
Epoch 2/10
25270/25270 - 106s - loss: 0.5264 - accuracy: 0.7349 - val_loss: 0.5499 - val_accuracy: 0.7163 - 106s/epoch - 4ms/step
Epoch 3/10
25270/25270 - 100s - loss: 0.5083 - accuracy: 0.7481 - val_loss: 0.5424 - val_accuracy: 0.7252 - 100s/epoch - 4ms/step
Epoch 4/10
25270/25270 - 107s - loss: 0.4935 - accuracy: 0.7586 - val_loss: 0.5473 - val_accuracy: 0.7250 - 107s/epoch - 4ms/step
Epoch 5/10
25270/25270 - 107s - loss: 0.4817 - accuracy: 0.7669 - val_loss: 0.5571 - val_accuracy: 0.7211 - 107s/epoch - 4ms/step
Epoch 6/10
25270/25270 - 98s - loss: 0.4707 - accuracy: 0.7742 - val_loss: 0.5595 - val_accuracy: 0.7180 - 98s/epoch - 4ms/step
Epoch 7/10
25270/25270 - 98s - loss: 0.4616 - accuracy: 0.7798 - val_loss: 0.5695 - val_accuracy: 0.7165 - 98s/epoch - 4ms/step
Epoch 8/10
25270/25270 - 97s - loss: 0.4533 - accuracy: 0.7853 - val_loss: 0.5704 - val_accuracy: 0.7149 - 97s/epoch - 4ms/step
Epoch 9/10
25270/25270 - 106s - loss: 0.4462 - accuracy: 0.7896 - val_loss: 0.5816 - val_accuracy: 0.7126 - 106s/epoch - 4ms/step
Epoch 10/10
25270/25270 - 107s - loss: 0.4396 - accuracy: 0.7938 - val_loss: 0.5903 - val_accuracy: 0.7116 - 107s/epoch - 4ms/step

## lowercase_no_stopwords
Epoch 1/10
25266/25266 - 403s - loss: 0.5612 - accuracy: 0.7073 - val_loss: 0.5451 - val_accuracy: 0.7205 - 403s/epoch - 16ms/step
Epoch 2/10
25266/25266 - 431s - loss: 0.5289 - accuracy: 0.7334 - val_loss: 0.5411 - val_accuracy: 0.7232 - 431s/epoch - 17ms/step
Epoch 3/10
25266/25266 - 419s - loss: 0.5110 - accuracy: 0.7466 - val_loss: 0.5465 - val_accuracy: 0.7228 - 419s/epoch - 17ms/step
Epoch 4/10
25266/25266 - 410s - loss: 0.4960 - accuracy: 0.7575 - val_loss: 0.5497 - val_accuracy: 0.7211 - 410s/epoch - 16ms/step
Epoch 5/10
25266/25266 - 414s - loss: 0.4837 - accuracy: 0.7665 - val_loss: 0.5622 - val_accuracy: 0.7183 - 414s/epoch - 16ms/step
Epoch 6/10
25266/25266 - 423s - loss: 0.4728 - accuracy: 0.7739 - val_loss: 0.5593 - val_accuracy: 0.7170 - 423s/epoch - 17ms/step
Epoch 7/10
25266/25266 - 416s - loss: 0.4635 - accuracy: 0.7803 - val_loss: 0.5694 - val_accuracy: 0.7126 - 416s/epoch - 16ms/step
Epoch 8/10
25266/25266 - 408s - loss: 0.4553 - accuracy: 0.7849 - val_loss: 0.5726 - val_accuracy: 0.7110 - 408s/epoch - 16ms/step
Epoch 9/10
25266/25266 - 418s - loss: 0.4479 - accuracy: 0.7898 - val_loss: 0.5975 - val_accuracy: 0.7103 - 418s/epoch - 17ms/step
Epoch 10/10
25266/25266 - 408s - loss: 0.4415 - accuracy: 0.7939 - val_loss: 0.6042 - val_accuracy: 0.7081 - 408s/epoch - 16ms/step

## no_punctuation_numeric_empty
Epoch 1/10
25212/25212 - 107s - loss: 0.5603 - accuracy: 0.7084 - val_loss: 0.5430 - val_accuracy: 0.7228 - 107s/epoch - 4ms/step
Epoch 2/10
25212/25212 - 106s - loss: 0.5286 - accuracy: 0.7333 - val_loss: 0.5392 - val_accuracy: 0.7251 - 106s/epoch - 4ms/step
Epoch 3/10
25212/25212 - 97s - loss: 0.5108 - accuracy: 0.7468 - val_loss: 0.5413 - val_accuracy: 0.7260 - 97s/epoch - 4ms/step
Epoch 4/10
25212/25212 - 97s - loss: 0.4962 - accuracy: 0.7575 - val_loss: 0.5505 - val_accuracy: 0.7203 - 97s/epoch - 4ms/step
Epoch 5/10
25212/25212 - 97s - loss: 0.4838 - accuracy: 0.7661 - val_loss: 0.5553 - val_accuracy: 0.7176 - 97s/epoch - 4ms/step
Epoch 6/10
25212/25212 - 98s - loss: 0.4731 - accuracy: 0.7735 - val_loss: 0.5598 - val_accuracy: 0.7163 - 98s/epoch - 4ms/step
Epoch 7/10
25212/25212 - 95s - loss: 0.4639 - accuracy: 0.7794 - val_loss: 0.5708 - val_accuracy: 0.7155 - 95s/epoch - 4ms/step
Epoch 8/10
25212/25212 - 97s - loss: 0.4555 - accuracy: 0.7856 - val_loss: 0.5782 - val_accuracy: 0.7141 - 97s/epoch - 4ms/step
Epoch 9/10
25212/25212 - 107s - loss: 0.4480 - accuracy: 0.7898 - val_loss: 0.5889 - val_accuracy: 0.7097 - 107s/epoch - 4ms/step
Epoch 10/10
25212/25212 - 96s - loss: 0.4411 - accuracy: 0.7946 - val_loss: 0.5970 - val_accuracy: 0.7095 - 96s/epoch - 4ms/step

## no_puncutation
Epoch 1/10
25244/25244 - 98s - loss: 0.5596 - accuracy: 0.7092 - val_loss: 0.5450 - val_accuracy: 0.7201 - 98s/epoch - 4ms/step
Epoch 2/10
25244/25244 - 106s - loss: 0.5265 - accuracy: 0.7348 - val_loss: 0.5431 - val_accuracy: 0.7226 - 106s/epoch - 4ms/step
Epoch 3/10
25244/25244 - 107s - loss: 0.5089 - accuracy: 0.7480 - val_loss: 0.5437 - val_accuracy: 0.7233 - 107s/epoch - 4ms/step
Epoch 4/10
25244/25244 - 97s - loss: 0.4946 - accuracy: 0.7576 - val_loss: 0.5503 - val_accuracy: 0.7200 - 97s/epoch - 4ms/step
Epoch 5/10
25244/25244 - 98s - loss: 0.4825 - accuracy: 0.7665 - val_loss: 0.5644 - val_accuracy: 0.7161 - 98s/epoch - 4ms/step
Epoch 6/10
25244/25244 - 105s - loss: 0.4724 - accuracy: 0.7729 - val_loss: 0.5623 - val_accuracy: 0.7149 - 105s/epoch - 4ms/step
Epoch 7/10
25244/25244 - 107s - loss: 0.4629 - accuracy: 0.7793 - val_loss: 0.5702 - val_accuracy: 0.7134 - 107s/epoch - 4ms/step
Epoch 8/10
25244/25244 - 106s - loss: 0.4549 - accuracy: 0.7849 - val_loss: 0.5845 - val_accuracy: 0.7108 - 106s/epoch - 4ms/step
Epoch 9/10
25244/25244 - 106s - loss: 0.4476 - accuracy: 0.7896 - val_loss: 0.5936 - val_accuracy: 0.7098 - 106s/epoch - 4ms/step
Epoch 10/10
25244/25244 - 106s - loss: 0.4411 - accuracy: 0.7937 - val_loss: 0.5997 - val_accuracy: 0.7081 - 106s/epoch - 4ms/step

## lowercased

Epoch 1/10
25270/25270 - 100s - loss: 0.5588 - accuracy: 0.7092 - val_loss: 0.5441 - val_accuracy: 0.7219 - 100s/epoch - 4ms/step
Epoch 2/10
25270/25270 - 105s - loss: 0.5259 - accuracy: 0.7352 - val_loss: 0.5393 - val_accuracy: 0.7265 - 105s/epoch - 4ms/step
Epoch 3/10
25270/25270 - 106s - loss: 0.5074 - accuracy: 0.7490 - val_loss: 0.5420 - val_accuracy: 0.7267 - 106s/epoch - 4ms/step
Epoch 4/10
25270/25270 - 104s - loss: 0.4927 - accuracy: 0.7595 - val_loss: 0.5476 - val_accuracy: 0.7234 - 104s/epoch - 4ms/step
Epoch 5/10
25270/25270 - 97s - loss: 0.4804 - accuracy: 0.7681 - val_loss: 0.5549 - val_accuracy: 0.7223 - 97s/epoch - 4ms/step
Epoch 6/10
25270/25270 - 105s - loss: 0.4695 - accuracy: 0.7750 - val_loss: 0.5577 - val_accuracy: 0.7197 - 105s/epoch - 4ms/step
Epoch 7/10
25270/25270 - 96s - loss: 0.4602 - accuracy: 0.7808 - val_loss: 0.5708 - val_accuracy: 0.7170 - 96s/epoch - 4ms/step
Epoch 8/10
25270/25270 - 107s - loss: 0.4522 - accuracy: 0.7863 - val_loss: 0.5718 - val_accuracy: 0.7148 - 107s/epoch - 4ms/step
Epoch 9/10
25270/25270 - 106s - loss: 0.4448 - accuracy: 0.7907 - val_loss: 0.5835 - val_accuracy: 0.7140 - 106s/epoch - 4ms/step
Epoch 10/10
25270/25270 - 98s - loss: 0.4384 - accuracy: 0.7946 - val_loss: 0.5988 - val_accuracy: 0.7121 - 98s/epoch - 4ms/step