In [7]:
# ------------------------- # 
#        SET - UP           # 
# ------------------------- # 

# ---- Requirements ----- # 

import sys
from google.colab import drive
import pandas as pd
import numpy as np
from transformers import PegasusForConditionalGeneration, PegasusTokenizer, Trainer, TrainingArguments
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers.optimization import Adafactor, AdafactorSchedule
import torch
import huggingface_hub
from datasets import load_dataset, Dataset, load_metric
import nltk
nltk.download("punkt")
import gc
import random
from torch import nn 

# ----- Mounting Google Drive ----- # 

drive.mount('/content/drive')
sys.path.append('/content/drive/MyDrive/CIS6930_final')

# ----------------------------------------------------------------------

# ----- Reading in the Dataset
train = pd.read_csv('/content/drive/MyDrive/CIS6930_final/tweetsum_train.csv')
valid = pd.read_csv('/content/drive/MyDrive/CIS6930_final/tweetsum_valid.csv')
test = pd.read_csv('/content/drive/MyDrive/CIS6930_final/tweetsum_test.csv')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Exploratory Analysis 
Brief exploratory data analysis to familiarize myself with the data. 

**Components:**


1.   Length
2.   Vocabulary size 
3.   Manual inspection of the data



In [11]:
# ------------------------- # 
#     VARIABLE LENGTH       # 
# ------------------------- # 

# 5 number summary for length of inputs/summaries across datasets 

for i in [train, valid, test]:
  i["input_length"] = i.inputs.str.len()
  i["summary_length"] = i.summaries.str.len()
  # print(f"Summary:\n{i.summary_length.describe()}\nInputs:\n{i.input_length.describe()}")

# For all of the data: 
# --- Approximate how much information loss there will be when the inputs are truncated 
# --- in the tokenizer. Input length for all models will be held consistent to allow for 
# --- comparisions of across-model performance 

temp = pd.concat([train, valid, test])
print(f"Summary:\n{temp.summary_length.describe()}\nInputs:\n{temp.input_length.describe()}")

Summary:
count    1087.000000
mean      193.574057
std        63.335348
min        72.000000
25%       146.000000
50%       182.000000
75%       226.000000
max       515.000000
Name: summary_length, dtype: float64
Inputs:
count    1087.000000
mean     1105.671573
std       413.260190
min       419.000000
25%       831.000000
50%      1013.000000
75%      1270.500000
max      3484.000000
Name: input_length, dtype: float64


In [13]:
# ------------------------- # 
#     VOCABULARY SIZE       # 
# ------------------------- # 

summary_vocab_size = len(set(' '.join(temp.summaries.to_list()).split(' ')))
input_vocab_size = len(set(' '.join(temp.inputs.to_list()).split(' ')))
print(f"VOCABULARY SIZE\nSummary vocabulary size: {summary_vocab_size}\nInput vocabulary size: {input_vocab_size}")


VOCABULARY SIZE
Summary vocabulary size: 4673
Input vocabulary size: 19817


In [None]:
# --------------------- # 
#    QUALITY CONTROL    #
# --------------------- #

# The quality of a automatic summary depends on the quality of the ground truth 
# summaries it is provided. The old adage, "garbage in, garbage out", is particularly
# relevant here. I randomly sample 20 summaries to inspect the quality of. This is,
# of course, being subjective. If this were a project with a greater scrope than a term
# project, I would likely have others looking at the summaries and assigning quality scores, 
# to gauge inter-rater reliability. However, it is just me. 

# ----------------------------------------------------------------------------------

# Thoughts on summaries themselves: 
# The quality of the summaries is sub-par. There are frequent spelling mistakes, 
# grammatical errors, and misused words. I am concerned about the quality of the potential
# results, from an interpretability standpoint. There are also a surplus of examples that
# start with the phrase: "Customer is complaining". If the model is able to pick up
# on this pattern, it could inflate the scores of metrics which depend on shared n-grams. 
# Proper capitalization is also inconsistent. 

# Examples: 
# "Customer equires" -- should be "inquires"
# "Customer is disappointed for the delay of the products for two days" -- improper grammar
# "Customer is complaining  about the why insn't the weather's widget is not working" -- many issues

import random
sample = random.sample(range(0, len(temp.summaries)), 20)
#print(temp.summaries.iloc[sample].to_list())

# ----------------------------------------------------------------------------------

# Do the summaries reflect the original conversation?

# Thoughts:
# Despite what I generally believe is poor summary quality, in terms of "readability", the 
# ground truth summaries do reflect the key points of the dialogues. 

for i in sample: 
  print(f"Conversation: {temp.inputs.iloc[i]}")
  print(f"Summary: {temp.summaries.iloc[i]}\n")

# ----------------------------------------------------------------------------------