In [1]:
# Mount Google Drive at /content/drive; the data paths configured below live under it.
# This import is only available when the notebook runs inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Paths (on the mounted Drive) to the input data files and the prediction output file
train_data_file = '/content/drive/MyDrive/train_data.json'
val_data_file = '/content/drive/MyDrive/valid_data.json'  # change this path to run on the test file instead
pred_out_file = '/content/drive/MyDrive/prediction_out.json'

In [3]:
# Import all dependencies
import numpy as np
import pandas as pd
import spacy
import json
from transformers import PegasusTokenizer, TFPegasusForConditionalGeneration

# Load spaCy's small English pipeline and keep a reference to its default stop-word set.
# NOTE(review): in the cells visible here `stopwords` is referenced only by commented-out code.
nlp = spacy.load("en_core_web_sm")
stopwords = nlp.Defaults.stop_words


# Load the pretrained 'google/pegasus-xsum' checkpoint (TensorFlow model class) and the
# tokenizer published under the same name, so both agree on the vocabulary.
model = TFPegasusForConditionalGeneration.from_pretrained('google/pegasus-xsum')
tokenizer = PegasusTokenizer.from_pretrained('google/pegasus-xsum')

In [4]:
# Read the train and validation JSON files into DataFrames.
df_train_init = pd.read_json(train_data_file)
df_val_init = pd.read_json(val_data_file)

# Preview the first five validation rows to check the columns loaded as expected.
display(df_val_init.head(5))

Unnamed: 0,conclusion,id,argument
0,Slang prevents young people to become potentia...,51640,People want to hire potential employers that a...
1,"Tesla Motors should have a dealer network, eve...",61343,Tesla Motors have mostly been in the business ...
2,"Earthlings could be ""early bloomers"", the firs...",93408,Radiation in the cosmos has decreased That has...
3,I don't think income equality is a problem.,108747,I don't think income inequality is a problem. ...
4,A UBI wouldn't have to be paid for by taxes. A...,47729,An economy using a fiat currency is not revenu...


In [None]:
# Lookup table mapping English contractions to their expanded forms (taken from an online source).
# PreprocessData lower-cases the text before looking tokens up in this table, so the
# capitalised keys ("I'd", "I'll", ...) never match there; their lower-case twins do.

contraction_mapping = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not",

                          "didn't": "did not", "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not",

                           "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",

                           "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would",

                           "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would",

                           "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam",

                           "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have",

                           "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock",

                           "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have",

                           "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is",

                           "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as",

                           "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would",

                           "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have",

                           "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have",

                           "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are",

                           "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",

                           "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is",

                           "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have",

                           "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have",

                           "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all",

                           "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",

                           "you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have",

                           "you're": "you are", "you've": "you have"}


In [12]:
# Function to preprocess the text data: lower-casing, URL removal, contraction expansion,
# punctuation removal and whitespace normalisation
def PreprocessData(df):
  """Add cleaned `clean_conclusion` and `clean_argument` columns to `df`.

  For each of the `conclusion` and `argument` columns the text is lower-cased,
  URLs are dropped, contractions are expanded through the module-level
  `contraction_mapping`, punctuation is removed and runs of whitespace are
  collapsed to single spaces.

  Args:
    df: DataFrame containing string columns `conclusion` and `argument`.

  Returns:
    The same DataFrame object that was passed in. It is modified in place
    (two `clean_*` columns are added or overwritten), so the return value
    aliases the argument.

  Raises:
    KeyError: if `conclusion` or `argument` is missing from `df`.
  """
  # Raw strings keep the regex escapes (\(, \w, \s) away from Python's own
  # string-escape handling.
  url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
  # Punctuation that commonly hugs a word; the apostrophe is deliberately
  # absent because it is part of the contraction keys ("don't", "'cause").
  edge_punct = '.,;:!?"()[]{}'

  def expand_contractions(text):
    # Missing values (NaN/None) in an object column pass through untouched.
    if not isinstance(text, str):
      return text
    expanded = []
    for token in text.split():
      # Try the raw token first, then the token without surrounding
      # punctuation so that "don't," still expands. Dropping that punctuation
      # is harmless: all punctuation is removed in the next step anyway.
      core = token.strip(edge_punct)
      expanded.append(contraction_mapping.get(token, contraction_mapping.get(core, token)))
    return ' '.join(expanded)

  for column in ['conclusion', 'argument']:
    cleaned = df[column].str.lower()
    # Typographic apostrophes would otherwise miss every contraction key.
    cleaned = cleaned.str.replace('\u2019', "'", regex=False)
    # regex=True must be explicit: since pandas 2.0 the default is a literal
    # match, which would turn both pattern-based replacements into no-ops.
    cleaned = cleaned.str.replace(url_pattern, ' ', regex=True)  # remove URL
    cleaned = cleaned.apply(expand_contractions)
    cleaned = cleaned.str.replace(r'[^\w\s]', '', regex=True)
    # Removing stand-alone punctuation ("a - b") leaves double spaces; collapse
    # every whitespace run (newlines included) and trim the ends.
    cleaned = cleaned.str.replace(r'\s+', ' ', regex=True).str.strip()
    df['clean_' + column] = cleaned
  return df

# df['response'] = df['response'].apply(lambda x: [item for item in x.split() if item not in stop])

In [13]:
# Clean both splits. PreprocessData adds the clean_* columns to the frame it is given and
# returns that same object, so df_train_clean / df_val_clean are aliases of
# df_train_init / df_val_init rather than independent copies.
df_train_clean = PreprocessData(df_train_init)
df_val_clean = PreprocessData(df_val_init)

In [14]:
# Preview the validation frame to compare the raw columns with the new clean_* columns.
display(df_val_clean.head(5))

Unnamed: 0,conclusion,id,argument,clean_conclusion,clean_argument
0,Slang prevents young people to become potentia...,51640,People want to hire potential employers that a...,slang prevents young people to become potentia...,people want to hire potential employers that a...
1,"Tesla Motors should have a dealer network, eve...",61343,Tesla Motors have mostly been in the business ...,tesla motors should have a dealer network even...,tesla motors have mostly been in the business ...
2,"Earthlings could be ""early bloomers"", the firs...",93408,Radiation in the cosmos has decreased That has...,earthlings could be early bloomers the first t...,radiation in the cosmos has decreased that has...
3,I don't think income equality is a problem.,108747,I don't think income inequality is a problem. ...,i do not think income equality is a problem,i do not think income inequality is a problem ...
4,A UBI wouldn't have to be paid for by taxes. A...,47729,An economy using a fiat currency is not revenu...,a ubi would not have to be paid for by taxes a...,an economy using a fiat currency is not revenu...


In [15]:
# # Argument tokenizer

# # Fit the tokenizer on train data
# arg_tokenizer = Tokenizer(oov_token=1)
# arg_tokenizer.fit_on_texts(df_train_clean['clean_argument'].values)

# # Transform the texts to integer sequences - train and val data
# x_train_arg = arg_tokenizer.texts_to_sequences(df_train_clean['clean_argument'].values)
# x_val_arg = arg_tokenizer.texts_to_sequences(df_val_clean['clean_argument'].values)

# # Pad length for uniformity
# max_length = max(len(s.split()) for s in df_train_clean['clean_argument'].values)
# x_train_arg = pad_sequences(x_train_arg,maxlen=max_length)
# x_val_arg = pad_sequences(x_val_arg,maxlen=max_length)

# # Conclusion tokenizer

# # Fit the tokenizer on train data
# conc_tokenizer = Tokenizer(oov_token=1)
# conc_tokenizer.fit_on_texts(df_train_clean['clean_conclusion'].values)

# # Transform the texts to integer sequences - train and val data
# x_train_conc = conc_tokenizer.texts_to_sequences(df_train_clean['clean_conclusion'].values)
# x_val_conc = conc_tokenizer.texts_to_sequences(df_val_clean['clean_conclusion'].values)

# # Pad length for uniformity
# max_length = max(len(s.split()) for s in df_train_clean['clean_conclusion'].values)
# x_train_conc = pad_sequences(x_train_conc,maxlen=max_length)
# x_val_conc = pad_sequences(x_val_conc,maxlen=max_length)