# Data Loading, Cleaning and Wrangling

In [1]:
#Need to run this to have the right path
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
#Installing the right stuff
!pip install -q sentencepiece
!pip install -q transformers
!pip install -q evaluate
!pip install -q rouge_score

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone


In [3]:
#Importing necessary libraries for cleaning
import pandas as pd
import numpy as np
import evaluate
from pprint import pprint

In [4]:
#Building path and opening the files to create the dataframe
data_path = "/content/drive/MyDrive/266 NLP - Final Project/all_the_news_SUBSET_Kaggle/"

df1 = pd.read_csv(data_path + "articles1.csv").iloc[:, 1:]
df2 = pd.read_csv(data_path + "articles2.csv").iloc[:, 1:]
df3 = pd.read_csv(data_path + "articles3.csv").iloc[:, 1:]

data = pd.concat([df1, df2, df3])

In [5]:
### Data Characteristics
## Each id is unique to each row/headline with no duplicate articles/rows
## 142,570 total headline and articles

## 1.85% of the data doesn't have dates but I feel like it's ok to get rid of them since it's such a small sample
data[data["date"].isnull()].shape[0]/data.shape[0]
data = data[~data["date"].isnull()]

## Out of all the articles, 99.7% of them are from 2015-2017 so it might not be helpful to keep the older articles before 2015 since there are so few
data.value_counts("year").sort_index(ascending=False)
data = data[data["year"]>2014]

data = data.reset_index()

#Added word count column
data['article_wd_ct'] = data['content'].str.split().str.len() ## word count of each article

## Only subsetting for article that have at least 100 words- could do more analysis into including ones between 70-100 but the ones I've seen in that range aren't fully populated articles or poorly extracted articles
data = data[data["article_wd_ct"]>=100]

## Further subsetting the data for articles below 500 words for ease of running the models
data = data[data["article_wd_ct"]<=500]

Data Cleaning for Headlines.

A manual process done publication by publication

In [6]:
#Atlantic
data = data.drop(data.loc[(data.publication=="Atlantic") & (data.title.str.contains("Atlantic"))].index)

#Breitbart
data.loc[((data.publication=="Breitbart") & (data.title.str.contains("- Breitbart"))), "title"] = data[(data["publication"] == "Breitbart") & (data["title"].str.contains("- Breitbart"))].title.str[:-12]
data = data.drop(data.loc[(data.publication=="Breitbart") & ( (data.title.str.contains("Breitbart")) | (data.title.str.contains("BREITBART")) )].index)

#Business Insider
data = data.drop(data.loc[(data.publication=="Business Insider") & ((data.title.str.contains("Business Insider")) | (data.title.str.contains("BUSINESS INSIDER")) )].index)

#BuzzFeed
data = data.drop(data.loc[(data.publication=="Buzzfeed News") & ((data.title.str.contains("BuzzFeed")) | (data.title.str.contains("BUZZFEED")) )].index)

#CNN
data = data.drop(data.loc[(data.publication=="CNN") & (data.title.str.contains("CNN"))].index)

#Fox News
#Fix all the aforementioned appendings. Data frame should be equal size to above, 53089, can use that to validate.
data.loc[((data.publication=="Fox News") & (data.title.str.contains("Fox News Poll:"))), "title"] = data[(data["publication"] == "Fox News") & (data["title"].str.contains("Fox News Poll:"))].title.str[15:]
data.loc[((data.publication=="Fox News") & (data.title.str.contains("Fox News Electoral Scorecard:"))), "title"] = data[(data["publication"] == "Fox News") & (data["title"].str.contains("Fox News Electoral Scorecard:"))].title.str[30:]
data.loc[((data.publication=="Fox News") & (data.title.str.contains("Fox News projects:"))), "title"] = data[(data["publication"] == "Fox News") & (data["title"].str.contains("Fox News projects:"))].title.str[19:]
data = data.drop(data.loc[(data.publication=="Fox News") & ((data.title.str.contains("Fox")) | (data.title.str.contains("FOX")) )].index)

#Guardian
data = data.drop(data.loc[(data.publication=="Guardian") & (data.title.str.contains("Guardian"))].index)

#National Review
data = data.drop(data.loc[(data.publication=="National Review") & ( (data.title.str.contains("NR ")) | (data.title.str.contains("NR:")) | (data.title.str.contains("NRI")) | (data.title.str.contains("NRO")) )].index)

#New York Post
data = data.drop(data.loc[(data.publication=="New York Post") & (data.title.str.contains("New York Post"))].index)

#New York Times
data.loc[data.publication=="New York Times", "title"] = data[data['publication'] == "New York Times"].title.str[:-21]
data = data.drop(data.loc[(data.publication=="New York Times") & (data.title.str.contains("New York Times"))].index)

#NPR
data = data.drop(data.loc[(data.publication=="NPR") & (data.title.str.contains("NPR"))].index)

#Reuters
data = data.drop(data.loc[(data.publication=="Reuters") & (data.title.str.contains("Reuters"))].index)

#Talking Points Memo
data = data.drop(data.loc[(data.publication=="Talking Points Memo") & (data.title.str.contains("TPM"))].index)

#Vox
data = data.drop(data.loc[(data.publication=="Vox") & (data.title.str.contains("Vox"))].index)

#Washington Post
data = data.drop(data.loc[(data.publication=="Washington Post") & (data.title.str.contains("Washington Post"))].index)

In [7]:
#Totals got iffy, but it's at 52958
data.reset_index()

#data

#Do not uncomment unless you want to download
#data.to_excel("CLEANED compact.xlsx")

Unnamed: 0,level_0,index,id,title,publication,author,date,year,month,url,content,article_wd_ct
0,5,5,17288,"Sick With a Cold, Queen Elizabeth Misses New Y...",New York Times,Sewell Chan,2017-01-02,2017.0,1.0,,"LONDON — Queen Elizabeth II, who has been b...",159
1,19,19,17303,Fecal Pollution Taints Water at Melbourne’s Be...,New York Times,Brett Cole,2017-01-03,2017.0,1.0,,"SYDNEY, Australia — The annual beach pilgri...",292
2,33,33,17323,"Airline Pilot, Believed to Be Drunk, Is Pulled...",New York Times,Ian Austen,2017-01-03,2017.0,1.0,,OTTAWA — It was 7 a. m. and 99 passengers a...,410
3,47,47,17338,It’s Time to Ignore Advice About Which Stocks ...,New York Times,Damon Darlin,2017-01-09,2017.0,1.0,,It’s that time of year when financial advice s...,395
4,52,52,17344,Chinese City Official Shoots 2 Others and Kill...,New York Times,Chris Buckley and Adam Wu,2017-01-05,2017.0,1.0,,BEIJING — A city official in southwest Chin...,358
...,...,...,...,...,...,...,...,...,...,...,...,...
52953,139478,42535,218042,Debbie Reynolds was a trouper all the way to t...,Washington Post,Ann Hornaday,2016-12-29,2016.0,12.0,https://web.archive.org/web/20161230000842/htt...,Y ou can almost hear Carrie Fishe...,367
52954,139487,42544,218051,An obscure E.U. regulation may have saved live...,Washington Post,Adam Taylor,2016-12-29,2016.0,12.0,https://web.archive.org/web/20161230000842/htt...,The truck attack on a Christmas market in...,387
52955,139492,42549,218056,A woman ordered canaries to brighten her home....,Washington Post,Avi Selk,2016-12-29,2016.0,12.0,https://web.archive.org/web/20161230000842/htt...,"Somewhere between Texas and Alabama, some...",373
52956,139494,42551,218063,"Trump praises Putin’s response to sanctions, c...",Washington Post,Karoun Demirjian,2016-12-30,2016.0,12.0,https://web.archive.org/web/20161231004909/htt...,Donald Trump on Friday expressed his ap...,314


# T5 for Generic Summarization

In [8]:
#A lot of the stuff got imported from the libraries up top.
#If anything else needs to be imported ill do so here
#Importing the t5model and the t5 tokenizer
from transformers import T5Tokenizer, TFT5ForConditionalGeneration

t5model = TFT5ForConditionalGeneration.from_pretrained("t5-base")
t5tokenizer = T5Tokenizer.from_pretrained("t5-base")

#Rouge evaluator
rouge = evaluate.load('rouge')

#These are the current number of publications
data.publication.value_counts()

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Breitbart              14959
New York Post          11427
Business Insider        4047
CNN                     4042
Reuters                 3192
NPR                     3052
Fox News                2222
Buzzfeed News           2076
Guardian                1901
Talking Points Memo     1717
Washington Post         1362
Atlantic                1025
Vox                      742
New York Times           645
National Review          549
Name: publication, dtype: int64

In [9]:
publications_unique = data.publication.unique()
publications_unique

array(['New York Times', 'Breitbart', 'CNN', 'Business Insider',
       'Atlantic', 'Fox News', 'Talking Points Memo', 'Buzzfeed News',
       'National Review', 'New York Post', 'Guardian', 'NPR', 'Reuters',
       'Vox', 'Washington Post'], dtype=object)

In [10]:
#This is where we will store the headlines
headlines_rand = []

#This is where we will store the stories
articles_rand = []

#Just to check, publication. Should all be 75 in size
publication_rand= []

#Getting a random sample of 5 from every publication.
for i in publications_unique:

  #Createing a data frame from the current publication, and sampling 5.
  current_df =  data[data['publication'] == i].sample(5)
  current_df = current_df[["title", "publication", "content"]]

  for row in range(5):
    headlines_rand.append(current_df.iloc[row].title)
    articles_rand.append(current_df.iloc[row].content)
    publication_rand.append(current_df.iloc[row].publication)

^^ DO NOT RUN

In [51]:
#This is where we will store the summaries
generated_summary = []

#Rouge results
rogue_results = []

In [52]:
#Generate summaries
#T5 is trained on several tasks, not just summarization. So we prepend the input with this to let the model know we want to summarie
PROMPT = 'summarize: '
#Token range parameter. How much bigger (or smaller, but usually bigger) can the summary be? 5 seems to be a good number. probably won't change this.
token_range = 10
#Number of beams. How "deep" does the model look at the next word (token I think) before generating. 4 seems to be good and stable. Headlines are not that big.
number_of_beams_parameter = 5
#No repeat n gram.
no_repeat_ngram_size_parameter = 4

for i in range(75):
  ARTICLE_TO_SUMMARIZE = articles_rand[i]
  REFERENCE = headlines_rand[i]
  T5ARTICLE_TO_SUMMARIZE = PROMPT + ARTICLE_TO_SUMMARIZE

  #Tokenizing the input, aka prepping to put it into BERT
  inputs = t5tokenizer(T5ARTICLE_TO_SUMMARIZE, max_length=1024, truncation=True, return_tensors="tf")

  #Generating summary
  summary_ids = t5model.generate(inputs["input_ids"],
                                 num_beams=number_of_beams_parameter,
                                 max_length=len(t5tokenizer(REFERENCE)['input_ids']) + token_range,
                                 min_length=len(t5tokenizer(REFERENCE)['input_ids']) - token_range,
                                 no_repeat_ngram_size=no_repeat_ngram_size_parameter
                                 )
  #Decoding
  candidate = t5tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)

  predictions = candidate
  references = [REFERENCE]
  results = rouge.compute(predictions=predictions,
                          references=references)
  rogue_results.append(results)

  print(str(i), " done")


0  done
1  done
2  done
3  done
4  done
5  done
6  done
7  done
8  done
9  done
10  done
11  done
12  done
13  done
14  done
15  done
16  done
17  done
18  done
19  done
20  done
21  done
22  done
23  done
24  done
25  done
26  done
27  done
28  done
29  done
30  done
31  done
32  done
33  done
34  done
35  done
36  done
37  done
38  done
39  done
40  done
41  done
42  done
43  done
44  done
45  done
46  done
47  done
48  done
49  done
50  done
51  done
52  done
53  done
54  done
55  done
56  done
57  done
58  done
59  done
60  done
61  done
62  done
63  done
64  done
65  done
66  done
67  done
68  done
69  done
70  done
71  done
72  done
73  done
74  done


In [53]:
import statistics
rouge1 = []
rouge2 = []
rougeL = []
rougeLsum = []

In [54]:
for i in rogue_results:
  rouge1.append(i['rouge1'])
  rouge2.append(i['rouge2'])
  rougeL.append(i['rougeL'])
  rougeLsum.append(i['rougeLsum'])

In [55]:
#rogue_results

In [56]:
print("Variance of Rouge1: ", np.var(rouge1))
print("Variance of Rouge2: ", np.var(rouge2))
print("Variance of RougeL: ", np.var(rougeL))
print("Variance of RougeLSum: ", np.var(rougeLsum))

print("Median of Rouge1: ", statistics.median(rouge1))
print("Median of Rouge2: ", statistics.median(rouge2))
print("Median of RougeL: ", statistics.median(rougeL))
print("Median of RougeLSum: ", statistics.median(rougeLsum))

print("Average of Rouge1: ", sum(rouge1)/len(rouge1))
print("Average of Rouge2: ", sum(rouge2)/len(rouge2))
print("Average of RougeL: ", sum(rougeL)/len(rougeL))
print("Average of RougeLSum: ", sum(rougeLsum)/len(rougeLsum))

print("Max of Rouge1: ", max(rouge1))
print("Max of Rouge2: ", max(rouge2))
print("Max of RougeL: ", max(rougeL))
print("Max of RougeLSum: ", max(rougeLsum))

print("Min of Rouge1: ", min(rouge1))
print("Min of Rouge2: ", min(rouge2))
print("Min of RougeL: ", min(rougeL))
print("Min of RougeLSum: ", min(rougeLsum))

rougeL.index(max(rougeL))

Variance of Rouge1:  0.015098128493945859
Variance of Rouge2:  0.005730592838088815
Variance of RougeL:  0.010826490558594479
Variance of RougeLSum:  0.010826490558594479
Median of Rouge1:  0.14814814814814817
Median of Rouge2:  0.0
Median of RougeL:  0.13333333333333333
Median of RougeLSum:  0.13333333333333333
Average of Rouge1:  0.1832051843952944
Average of Rouge2:  0.04992303512332332
Average of RougeL:  0.15170849578783552
Average of RougeLSum:  0.15170849578783552
Max of Rouge1:  0.5185185185185185
Max of Rouge2:  0.29629629629629634
Max of RougeL:  0.3846153846153846
Max of RougeLSum:  0.3846153846153846
Min of Rouge1:  0.0
Min of Rouge2:  0.0
Min of RougeL:  0.0
Min of RougeLSum:  0.0


26

In [50]:
headlines_rand

['Donald Trump Takes Credit for Helping to Save a Ford Plant That Wasn’t Closing',
 '‘Ring of Fire’ Eclipse Travels Across South America and Africa',
 'Bruce Springsteen’s Archive Is Headed to Monmouth University',
 'California Official Says Trump’s Claim of Voter Fraud Is ‘Absurd’',
 'Monkey in Kenya Survives After Setting Off Nationwide Blackout',
 'Hillary Clinton Warns of ‘Conspiracy Theory Machine Factory’',
 'Report: GOP Differences on Healthcare ’Have Narrowed,’ Healthcare Reform ’Very Much Alive’',
 'Donald Trump Appoints Top Deputies, More White House Staff',
 'LISTEN: Green Day Single Focuses on American ’Culture of Mass Shooting’',
 'Five Facts About Donald Trump’s Expected Pick for Interior Secretary, Rep. Cathy McMorris Rodgers',
 'George W. Bush to attend fundraiser in support of Gillespie',
 'Justin Timberlake and Jessica Biel expecting first baby',
 'Bill Cosby drops lawsuit against accuser',
 'Lupita Nyong’o’s dress possibly recovered',
 'New Trump accuser: GOP nominee

In [80]:
print(headlines_rand[11])

#Tokenizing the input, aka prepping to put it into BERT
inputs = t5tokenizer("summarize: " + articles_rand[11], max_length=1024, truncation=True, return_tensors="tf")

#Generating summary
summary_ids = t5model.generate(inputs["input_ids"],
                               num_beams=number_of_beams_parameter,
                               max_length=len(t5tokenizer(REFERENCE)['input_ids']) + token_range,
                               min_length=len(t5tokenizer(REFERENCE)['input_ids']) - token_range,
                               no_repeat_ngram_size=no_repeat_ngram_size_parameter
                               )
#Decoding
candidate = t5tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
print(candidate[0])

Justin Timberlake and Jessica Biel expecting first baby
Justin Timberlake has confirmed that his wife, Jessica Biel, is expecting . the singer said they don't know the sex of the baby yet


Below is old, use only as reference.

In [None]:
#This is a trial run.
#I want to get a n=5 sample of all different publications (first I'm doing 3 publications), and then running T5 on them to measure variance. The samples should be the same.

#T5 is trained on several tasks, not just summarization. So we prepend the input with this to let the model know we want to summarie
PROMPT = 'summarize: '

#Token range parameter. How much bigger (or smaller, but usually bigger) can the summary be? 5 seems to be a good number. probably won't change this.
token_range = 5

#Number of beams. How "deep" does the model look at the next word (token I think) before generating. 4 seems to be good and stable. Headlines are not that big.
number_of_beams_parameter = 4

#No repeat n gram.
no_repeat_ngram_size_parameter = 3

#This is where the results will be stored
rogue_results = []

for i in publications_unique:

  print("Publication: ", str(i))

  #Createing a data frame from the current publication, and sampling 5.
  current_df =  data[data['publication'] == i].sample(5)
  current_df = current_df[["title", "publication", "content"]]

  #print(current_df)

  for article in range(5):
    #The article to summarize will be the content
    ARTICLE_TO_SUMMARIZE = current_df.iloc[article].content
    REFERENCE = current_df.iloc[article].title
    #pprint(ARTICLE_TO_SUMMARIZE)
    T5ARTICLE_TO_SUMMARIZE = PROMPT + ARTICLE_TO_SUMMARIZE
    print("REFERENCE: ", REFERENCE)

    #Tokenizing the input, aka prepping to put it into BERT
    inputs = t5tokenizer(T5ARTICLE_TO_SUMMARIZE, max_length=1024, truncation=True, return_tensors="tf")

    #Generating summary
    #summary_ids = t5model.generate(inputs["input_ids"],
    #                               num_beams=number_of_beams_parameter,
    #                               max_length=len(t5tokenizer(REFERENCE)['input_ids']) + token_range,
    #                               min_length=len(t5tokenizer(REFERENCE)['input_ids']) - token_range,
    #                               no_repeat_ngram_size=no_repeat_ngram_size_parameter
    #                               )

    #Decoding
    #candidate = t5tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)

    print("------")
    #pprint(REFERENCE)
    #pprint(candidate[0], compact=True)

    #predictions = candidate
    #references = [REFERENCE]
    #results = rouge.compute(predictions=predictions,
    #                        references=references)

    #print(results)
    #rogue_results.append(results)
    print("----------------")

  #print(data[data['publication'] == i].iloc[0:5])

Publication:  New York Times
REFERENCE:  Beyoncé Is Pregnant, and Twitter Loses It
------
----------------
REFERENCE:  Keith Lamont Scott Was Killed by Two Gunshot Wounds, Family Autopsy Finds
------
----------------
REFERENCE:  Clinton to Trump on Twitter: ‘Delete Your Account’
------
----------------
REFERENCE:  10 Key Moments From Thursday’s Trump Nominee Hearings
------
----------------
REFERENCE:  Suspect in Istanbul Rampage on New Year’s Is Captured, Turkey Says
------
----------------
Publication:  Breitbart
REFERENCE:  David Axelrod: Hillary Doesn’t Have a Core Message
------
----------------
REFERENCE:  Marco Rubio Wants to Keep His Delegates at Republican Convention
------
----------------
REFERENCE:  Texas Black Lives Matter Protesters Called for Killing of White Cops in 2015
------
----------------
REFERENCE:  LaSalvia: Matt Drudge’s Army Bigger, Badder Than RNC’s
------
----------------
REFERENCE:  The Washington Post: Should the Democrats Be Panicking?
------
------------

In [None]:
rouge_results