# Data Loading, Cleaning and Wrangling

In [1]:
#Need to run this to have the right path
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
#Installing the right stuff
!pip install -q sentencepiece
!pip install -q transformers
!pip install -q evaluate
!pip install -q rouge_score

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone


In [3]:
#Importing necessary libraries
import pandas as pd
import numpy as np
import evaluate
from pprint import pprint

In [4]:
#Building path and opening the files to create the dataframe
data_path = "/content/drive/MyDrive/266 NLP - Final Project/all_the_news_SUBSET_Kaggle/"

df1 = pd.read_csv(data_path + "articles1.csv").iloc[:, 1:]
df2 = pd.read_csv(data_path + "articles2.csv").iloc[:, 1:]
df3 = pd.read_csv(data_path + "articles3.csv").iloc[:, 1:]

data = pd.concat([df1, df2, df3])

In [5]:
### Data Characteristics
## Each id is unique to each row/headline with no duplicate articles/rows
## 142,570 total headline and articles

## 1.85% of the data doesn't have dates but I feel like it's ok to get rid of them since it's such a small sample
data[data["date"].isnull()].shape[0]/data.shape[0]
data = data[~data["date"].isnull()]

## Out of all the articles, 99.7% of them are from 2015-2017 so it might not be helpful to keep the older articles before 2015 since there are so few
data.value_counts("year").sort_index(ascending=False)
data = data[data["year"]>2014]

data = data.reset_index()

#Added word count column
data['article_wd_ct'] = data['content'].str.split().str.len() ## word count of each article

## Only subsetting for article that have at least 100 words- could do more analysis into including ones between 70-100 but the ones I've seen in that range aren't fully populated articles or poorly extracted articles
data = data[data["article_wd_ct"]>=100]

## Further subsetting the data for articles below 500 words for ease of running the models
data = data[data["article_wd_ct"]<=500]

Data Cleaning for Headlines.

A manual process done publication by publication

In [6]:
#Atlantic
data = data.drop(data.loc[(data.publication=="Atlantic") & (data.title.str.contains("Atlantic"))].index)

#Breitbart
data.loc[((data.publication=="Breitbart") & (data.title.str.contains("- Breitbart"))), "title"] = data[(data["publication"] == "Breitbart") & (data["title"].str.contains("- Breitbart"))].title.str[:-12]
data = data.drop(data.loc[(data.publication=="Breitbart") & ( (data.title.str.contains("Breitbart")) | (data.title.str.contains("BREITBART")) )].index)

#Business Insider
data = data.drop(data.loc[(data.publication=="Business Insider") & ((data.title.str.contains("Business Insider")) | (data.title.str.contains("BUSINESS INSIDER")) )].index)

#BuzzFeed
data = data.drop(data.loc[(data.publication=="Buzzfeed News") & ((data.title.str.contains("BuzzFeed")) | (data.title.str.contains("BUZZFEED")) )].index)

#CNN
data = data.drop(data.loc[(data.publication=="CNN") & (data.title.str.contains("CNN"))].index)

#Fox News
#Fix all the aforementioned appendings. Data frame should be equal size to above, 53089, can use that to validate.
data.loc[((data.publication=="Fox News") & (data.title.str.contains("Fox News Poll:"))), "title"] = data[(data["publication"] == "Fox News") & (data["title"].str.contains("Fox News Poll:"))].title.str[15:]
data.loc[((data.publication=="Fox News") & (data.title.str.contains("Fox News Electoral Scorecard:"))), "title"] = data[(data["publication"] == "Fox News") & (data["title"].str.contains("Fox News Electoral Scorecard:"))].title.str[30:]
data.loc[((data.publication=="Fox News") & (data.title.str.contains("Fox News projects:"))), "title"] = data[(data["publication"] == "Fox News") & (data["title"].str.contains("Fox News projects:"))].title.str[19:]
data = data.drop(data.loc[(data.publication=="Fox News") & ((data.title.str.contains("Fox")) | (data.title.str.contains("FOX")) )].index)

#Guardian
data = data.drop(data.loc[(data.publication=="Guardian") & (data.title.str.contains("Guardian"))].index)

#National Review
data = data.drop(data.loc[(data.publication=="National Review") & ( (data.title.str.contains("NR ")) | (data.title.str.contains("NR:")) | (data.title.str.contains("NRI")) | (data.title.str.contains("NRO")) )].index)

#New York Post
data = data.drop(data.loc[(data.publication=="New York Post") & (data.title.str.contains("New York Post"))].index)

#New York Times
data.loc[data.publication=="New York Times", "title"] = data[data['publication'] == "New York Times"].title.str[:-21]
data = data.drop(data.loc[(data.publication=="New York Times") & (data.title.str.contains("New York Times"))].index)

#NPR
data = data.drop(data.loc[(data.publication=="NPR") & (data.title.str.contains("NPR"))].index)

#Reuters
data = data.drop(data.loc[(data.publication=="Reuters") & (data.title.str.contains("Reuters"))].index)

#Talking Points Memo
data = data.drop(data.loc[(data.publication=="Talking Points Memo") & (data.title.str.contains("TPM"))].index)

#Vox
data = data.drop(data.loc[(data.publication=="Vox") & (data.title.str.contains("Vox"))].index)

#Washington Post
data = data.drop(data.loc[(data.publication=="Washington Post") & (data.title.str.contains("Washington Post"))].index)

In [7]:
#Totals got iffy, but it's at 52959
data.reset_index()

data

#Do not uncomment unless you want to download
#data.to_excel("CLEANED compact.xlsx")

Unnamed: 0,index,id,title,publication,author,date,year,month,url,content,article_wd_ct
5,5,17288,"Sick With a Cold, Queen Elizabeth Misses New Y...",New York Times,Sewell Chan,2017-01-02,2017.0,1.0,,"LONDON — Queen Elizabeth II, who has been b...",159
19,19,17303,Fecal Pollution Taints Water at Melbourne’s Be...,New York Times,Brett Cole,2017-01-03,2017.0,1.0,,"SYDNEY, Australia — The annual beach pilgri...",292
33,33,17323,"Airline Pilot, Believed to Be Drunk, Is Pulled...",New York Times,Ian Austen,2017-01-03,2017.0,1.0,,OTTAWA — It was 7 a. m. and 99 passengers a...,410
47,47,17338,It’s Time to Ignore Advice About Which Stocks ...,New York Times,Damon Darlin,2017-01-09,2017.0,1.0,,It’s that time of year when financial advice s...,395
52,52,17344,Chinese City Official Shoots 2 Others and Kill...,New York Times,Chris Buckley and Adam Wu,2017-01-05,2017.0,1.0,,BEIJING — A city official in southwest Chin...,358
...,...,...,...,...,...,...,...,...,...,...,...
139478,42535,218042,Debbie Reynolds was a trouper all the way to t...,Washington Post,Ann Hornaday,2016-12-29,2016.0,12.0,https://web.archive.org/web/20161230000842/htt...,Y ou can almost hear Carrie Fishe...,367
139487,42544,218051,An obscure E.U. regulation may have saved live...,Washington Post,Adam Taylor,2016-12-29,2016.0,12.0,https://web.archive.org/web/20161230000842/htt...,The truck attack on a Christmas market in...,387
139492,42549,218056,A woman ordered canaries to brighten her home....,Washington Post,Avi Selk,2016-12-29,2016.0,12.0,https://web.archive.org/web/20161230000842/htt...,"Somewhere between Texas and Alabama, some...",373
139494,42551,218063,"Trump praises Putin’s response to sanctions, c...",Washington Post,Karoun Demirjian,2016-12-30,2016.0,12.0,https://web.archive.org/web/20161231004909/htt...,Donald Trump on Friday expressed his ap...,314


#T5 Headline Evaluation

In [8]:
#Import all necessary libraries here
from transformers import T5Tokenizer, TFT5ForConditionalGeneration

#Initializing model and tokenizer
t5model = TFT5ForConditionalGeneration.from_pretrained("t5-base")
t5tokenizer = T5Tokenizer.from_pretrained("t5-base")

#Rouge evaluator
rouge = evaluate.load('rouge')

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [9]:
#data[data['publication'] == 'New York Times'].iloc[0:100]

In [10]:
for i in data.publication.unique():
  print(i)
  #print(data[data['publication'] == i].iloc[0:5, 2:4])

New York Times
Breitbart
CNN
Business Insider
Atlantic
Fox News
Talking Points Memo
Buzzfeed News
National Review
New York Post
Guardian
NPR
Reuters
Vox
Washington Post


In [11]:
#Current batch
publication = "Washington Post"

token_range = 5
number_of_beams_parameter = 3
no_repeat_ngram_size_parameter = 2

In [12]:
data[data['publication'] == publication].iloc[0:100]

Unnamed: 0,index,id,title,publication,author,date,year,month,url,content,article_wd_ct
128409,31464,204040,Is a Trump-Putin Pact on the horizon?,Washington Post,Colbert I. King,2016-12-31,2016.0,12.0,https://web.archive.org/web/20170101001641/htt...,Could there be a Pact in our future? Could P...,351
128421,31476,204053,"5 habits worth cultivating in 2017, according ...",Washington Post,Carrie Dennett,2016-12-29,2016.0,12.0,https://web.archive.org/web/20170101001641/htt...,Whether you make formal New Year’s resolution...,304
128422,31477,204054,Looking back at New Year’s Eve in Times Square,Washington Post,Karly Domb Sadof,2016-12-30,2016.0,12.0,https://web.archive.org/web/20170101001641/htt...,1930. (Associated Press) “Auld Lang S...,235
128439,31494,204087,Ethics advocates warn Trump that he needs to d...,Washington Post,Tom Hamburger,2017-01-02,2017.0,1.0,https://web.archive.org/web/20170103000951/htt...,A bipartisan group of ethics advocates lobbed ...,343
128451,31506,204099,Israeli police question PM over corruption all...,Washington Post,Griff Witte,2017-01-02,2017.0,1.0,https://web.archive.org/web/20170103000951/htt...,JERUSALEM — Israeli police investigators ...,197
...,...,...,...,...,...,...,...,...,...,...,...
129223,32278,205035,These photos show what the U.S.-Mexican border...,Washington Post,Karly Domb Sadof,2017-02-06,2017.0,2.0,https://web.archive.org/web/20170207002306/htt...,A house on the U. S. border in the Ri...,469
129235,32290,205050,This monster tornado just rolled through New O...,Washington Post,Angela Fritz,2017-02-07,2017.0,2.0,https://web.archive.org/web/20170208002026/htt...,Severe thunderstorms barreled through...,162
129252,32307,205073,Nor’easter to slam Northeast Thursday with thu...,Washington Post,Jason Samenow,2017-02-08,2017.0,2.0,https://web.archive.org/web/20170209002805/htt...,A powerful winter storm is predicted to h...,425
129259,32314,205080,Turning ordinary into magical: Amateur photogr...,Washington Post,Kenneth Dickerman,2017-02-08,2017.0,2.0,https://web.archive.org/web/20170209002805/htt...,"(Swarat Ghosh) More often than not, p...",420


In [13]:
#Headlines of the current batch
headlines = []

#Generated summaries of the current batch
generated_summary = []

#Rouge scores of the current batch
rouge_scores = []

In [14]:
#T5 is trained on several tasks, not just summarization. So we prepend the input with this to let the model know we want to summarie
PROMPT = 'summarize: '

#The range of
for i in range(100):

  #Getting current row
  Current_Article = data[data['publication'] == publication].iloc[i]

  #Article Content
  ARTICLE_TO_SUMMARIZE = Current_Article.content

  #Headline as reference
  REFERENCE = Current_Article.title
  headlines.append(REFERENCE)

  #Need to prompt BERT since it's trained on several tasks
  T5ARTICLE_TO_SUMMARIZE = PROMPT + ARTICLE_TO_SUMMARIZE

  #Tokenizing the input, aka prepping to put it into BERT
  inputs = t5tokenizer(T5ARTICLE_TO_SUMMARIZE, max_length=1024, truncation=True, return_tensors="tf")

  #Generating summary
  summary_ids = t5model.generate(inputs["input_ids"],
                                 num_beams=number_of_beams_parameter,
                                 max_length=len(t5tokenizer(REFERENCE)['input_ids']) + token_range,
                                 min_length=len(t5tokenizer(REFERENCE)['input_ids']) - token_range,
                                 no_repeat_ngram_size=no_repeat_ngram_size_parameter
                                 )
  #Decoding
  candidate = t5tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
  generated_summary.append(candidate[0])

  predictions = candidate
  references = [REFERENCE]
  results = rouge.compute(predictions=predictions,
                          references=references)
  rouge_scores.append(results)

  print(str(i), " done")

0  done
1  done
2  done
3  done
4  done
5  done
6  done
7  done
8  done
9  done
10  done
11  done
12  done
13  done
14  done
15  done
16  done
17  done
18  done
19  done
20  done
21  done
22  done
23  done
24  done
25  done
26  done
27  done
28  done
29  done
30  done
31  done
32  done
33  done
34  done
35  done
36  done
37  done
38  done
39  done
40  done
41  done
42  done
43  done
44  done
45  done
46  done
47  done
48  done
49  done
50  done
51  done
52  done
53  done
54  done
55  done
56  done
57  done
58  done
59  done
60  done
61  done
62  done
63  done
64  done
65  done
66  done
67  done
68  done
69  done
70  done
71  done
72  done
73  done
74  done
75  done
76  done
77  done
78  done
79  done
80  done
81  done
82  done
83  done
84  done
85  done
86  done
87  done
88  done
89  done
90  done
91  done
92  done
93  done
94  done
95  done
96  done
97  done
98  done
99  done


In [15]:
headlines

['Is a Trump-Putin Pact on the horizon?',
 '5 habits worth cultivating in 2017, according to a dietitian',
 'Looking back at New Year’s Eve in Times Square',
 'Ethics advocates warn Trump that he needs to do more to divest from family business',
 'Israeli police question PM over corruption allegations',
 'Seldom-seen photos show what America looked like in the 1940s…in color',
 'Megyn Kelly leaving Fox News for NBC',
 'Militant who denounced Islamic State faces murder, war crimes charges in Germany',
 'A man got a middle seat on a 13-hour flight. Passengers recorded his ‘fit of rage,’ then arrest.',
 'Mysterious radio burst came from a galaxy 2.5 billion light years away, astronomers discover',
 'Berlin attacker had 14 aliases, was subject of repeated counterterror briefings',
 'Obama administration begins final tranche of Guantanamo prisoner transfers',
 'Why losing Megyn Kelly probably won’t even dent Fox News’s armor',
 'How Donald Trump totally destroyed Chris Christie',
 'Mexicans

In [16]:
generated_summary

['could there be a nonaggression agreement between the u.s. and russia? will the',
 'cook more Creating and serving even the simplest of meals is a profound way of caring',
 'since 1907, there have been only two Januaries — 1942 and 1943',
 'the letter comes after Trump announced some changes in his business empire, with more expected before Inauguration Day',
 'no charges have been filed against prime minister Benjamin Netanyahu .',
 'the farm security administration (FSA) was created in 1937 from an earlier agency named the Resettlement Administration,',
 'a person at NBC News confirmed that Kelly will leave the network for',
 'german citizen Harry Sarfo appeared in front page articles and television broadcasts last year in which he offered',
 "the man boarded a United Airlines flight on new year's day . he was seated between two of them, and didn't like their cross",
 'dim dwarf galaxy 2. 5 billion light years from earth is sending out the mysterious blasts of radio waves . the burs

In [17]:
rouge_scores

[{'rouge1': 0.18181818181818182,
  'rouge2': 0.0,
  'rougeL': 0.18181818181818182,
  'rougeLsum': 0.18181818181818182},
 {'rouge1': 0.07692307692307693,
  'rouge2': 0.0,
  'rougeL': 0.07692307692307693,
  'rougeLsum': 0.07692307692307693},
 {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0},
 {'rouge1': 0.1818181818181818,
  'rouge2': 0.0,
  'rougeL': 0.1212121212121212,
  'rougeLsum': 0.1212121212121212},
 {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0},
 {'rouge1': 0.13333333333333333,
  'rouge2': 0.0,
  'rougeL': 0.13333333333333333,
  'rougeLsum': 0.13333333333333333},
 {'rouge1': 0.4, 'rouge2': 0.0, 'rougeL': 0.2, 'rougeLsum': 0.2},
 {'rouge1': 0.06666666666666667,
  'rouge2': 0.0,
  'rougeL': 0.06666666666666667,
  'rougeLsum': 0.06666666666666667},
 {'rouge1': 0.22727272727272727,
  'rouge2': 0.0,
  'rougeL': 0.1818181818181818,
  'rougeLsum': 0.1818181818181818},
 {'rouge1': 0.5,
  'rouge2': 0.2941176470588235,
  'rougeL': 0.3333333333333333,
  'rou

In [18]:
import statistics
rouge1 = []
rouge2 = []
rougeL = []
rougeLsum = []

In [19]:
for i in rouge_scores:
  rouge1.append(i['rouge1'])
  rouge2.append(i['rouge2'])
  rougeL.append(i['rougeL'])
  rougeLsum.append(i['rougeLsum'])

In [20]:
print("Variance of Rouge1: ", np.var(rouge1))
print("Variance of Rouge2: ", np.var(rouge2))
print("Variance of RougeL: ", np.var(rougeL))
print("Variance of RougeLSum: ", np.var(rougeLsum))

print("Median of Rouge1: ", statistics.median(rouge1))
print("Median of Rouge2: ", statistics.median(rouge2))
print("Median of RougeL: ", statistics.median(rougeL))
print("Median of RougeLSum: ", statistics.median(rougeLsum))

print("Average of Rouge1: ", sum(rouge1)/len(rouge1))
print("Average of Rouge2: ", sum(rouge2)/len(rouge2))
print("Average of RougeL: ", sum(rougeL)/len(rougeL))
print("Average of RougeLSum: ", sum(rougeLsum)/len(rougeLsum))

print("Max of Rouge1: ", max(rouge1))
print("Max of Rouge2: ", max(rouge2))
print("Max of RougeL: ", max(rougeL))
print("Max of RougeLSum: ", max(rougeLsum))

print("Min of Rouge1: ", min(rouge1))
print("Min of Rouge2: ", min(rouge2))
print("Min of RougeL: ", min(rougeL))
print("Min of RougeLSum: ", min(rougeLsum))

rougeL.index(max(rougeL))
print("\nMax")
print(headlines[rougeL.index(max(rougeL))])
print(generated_summary[rougeL.index(max(rougeL))])
print(rouge_scores[rougeL.index(max(rougeL))])
print("\nMin")
print(headlines[rougeL.index(min(rougeL))])
print(generated_summary[rougeL.index(min(rougeL))])
print(rouge_scores[rougeL.index(min(rougeL))])

Variance of Rouge1:  0.017460567104681395
Variance of Rouge2:  0.006111782061118335
Variance of RougeL:  0.010037224319846973
Variance of RougeLSum:  0.010037224319846973
Median of Rouge1:  0.13118279569892474
Median of Rouge2:  0.0
Median of RougeL:  0.10128205128205128
Median of RougeLSum:  0.10128205128205128
Average of Rouge1:  0.15715018842609657
Average of Rouge2:  0.03720060232095358
Average of RougeL:  0.12159704343095655
Average of RougeLSum:  0.12159704343095655
Max of Rouge1:  0.6153846153846154
Max of Rouge2:  0.4166666666666667
Max of RougeL:  0.6153846153846154
Max of RougeLSum:  0.6153846153846154
Min of Rouge1:  0.0
Min of Rouge2:  0.0
Min of RougeL:  0.0
Min of RougeLSum:  0.0

Max
Trump seeks to revive Dakota Access, Keystone XL oil pipelines
the president signed executive orders to revive the controversial Dakota Access and Keystone XL oil pipelines
{'rouge1': 0.6153846153846154, 'rouge2': 0.4166666666666667, 'rougeL': 0.6153846153846154, 'rougeLsum': 0.6153846153846