<a href="https://colab.research.google.com/github/skbetz54/Samuel_DATA606/blob/main/Notebooks/Web_Scrape_Fox_News.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
# Python library installations that do not come with Colab

!pip install contractions # used to get rid of contractions (in real-world news articles)
!pip install textblob # used for quantifying polarity of the tweets in the dataset.
!pip install newspaper3k

# Library Imports

import os
import re

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import torchtext
import pickle
import contractions

#Preprocessing Functions
from sklearn.utils import shuffle
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet, stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('punkt') # punkt is a dependency used for nltk's  function "word_tokenize" that does not come when you import nltk
nltk.download('stopwords') # NLTK has a built-in dictionary of stop words that will be used when removing stopwords from the data
nltk.download('averaged_perceptron_tagger') # package that assigns parts of speech, to be used in lemmatization
nltk.download('wordnet') # Added because the above import from corpus sometimes does not work.
from nltk.tag import pos_tag # Another package used in lemmatization to import parts of speech for words

from wordcloud import WordCloud # EDA Tool
from textblob import TextBlob

from google.colab import files, drive

from newspaper import Article
import newspaper

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [21]:
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
foxnews_opinions = 'https://www.foxnews.com/opinion'

opinion_foxnews = newspaper.build(foxnews_opinions, language='en', memoize_articles=False)

string1 = 'foxnews.com/opinion/'
links_foxnews_opinion = []

for article in opinion_foxnews.articles:
  if string1 in article.url:
    links_foxnews_opinion.append(article.url)


print(len(
    links_foxnews_opinion))

27


In [3]:
list_foxnews = list()

for link in links_foxnews_opinion:
  article = Article(link)
  article.download()
  article.parse()
  list_foxnews.append({"Title":article.title,
                   "Date":article.publish_date,
                   "URL": article.url,
                   "Text":article.text})
  

df_foxnews = pd.DataFrame.from_dict(list_foxnews)

df_foxnews.head()

Unnamed: 0,Title,Date,URL,Text
0,Star Trek writers take Starship Enterprise whe...,,https://www.foxnews.com/opinion/star-trek-star...,NEW You can now listen to Fox News articles!\n...
1,Homeland Security peddles disinformation about...,,https://www.foxnews.com/opinion/homeland-secur...,NEW You can now listen to Fox News articles!\n...
2,"How to build an American majority, not a Repub...",,https://www.foxnews.com/opinion/american-major...,NEW You can now listen to Fox News articles!\n...
3,Robert 'Bud' McFarlane was an unsung hero in A...,,https://www.foxnews.com/opinion/robert-bud-mcf...,NEW You can now listen to Fox News articles!\n...
4,Greg Gutfeld: Patti LuPone’s outburst is furth...,,https://www.foxnews.com/opinion/greg-gutfeld-p...,NEW You can now listen to Fox News articles!\n...


In [6]:
df_foxnews['Text'] = df_foxnews['Text'].str.lower()

df_foxnews['tokenized_text'] = df_foxnews.apply(lambda row: nltk.word_tokenize(row['Text']), axis = 1)

df_foxnews.head()

Unnamed: 0,Title,Date,URL,Text,tokenized_text,no_stopwords
0,Star Trek writers take Starship Enterprise whe...,,https://www.foxnews.com/opinion/star-trek-star...,new you can now listen to fox news articles!\n...,"[new, you, can, now, listen, to, fox, news, ar...","[N, E, W, , Y, u, , c, n, , n, w, , l, e, ..."
1,Homeland Security peddles disinformation about...,,https://www.foxnews.com/opinion/homeland-secur...,new you can now listen to fox news articles!\n...,"[new, you, can, now, listen, to, fox, news, ar...","[N, E, W, , Y, u, , c, n, , n, w, , l, e, ..."
2,"How to build an American majority, not a Repub...",,https://www.foxnews.com/opinion/american-major...,new you can now listen to fox news articles!\n...,"[new, you, can, now, listen, to, fox, news, ar...","[N, E, W, , Y, u, , c, n, , n, w, , l, e, ..."
3,Robert 'Bud' McFarlane was an unsung hero in A...,,https://www.foxnews.com/opinion/robert-bud-mcf...,new you can now listen to fox news articles!\n...,"[new, you, can, now, listen, to, fox, news, ar...","[N, E, W, , Y, u, , c, n, , n, w, , l, e, ..."
4,Greg Gutfeld: Patti LuPone’s outburst is furth...,,https://www.foxnews.com/opinion/greg-gutfeld-p...,new you can now listen to fox news articles!\n...,"[new, you, can, now, listen, to, fox, news, ar...","[N, E, W, , Y, u, , c, n, , n, w, , l, e, ..."


In [13]:
# Stop Words

regex = re.compile('[!@#$%^&*()?!],.:;\"\'')

stop_words = set(stopwords.words('english')) # Declaring only english words as the ones that we want to use

# Because huggingface has already gotten rid of contractions, I need to add some common contractions WITHOUT the apostrophe so they are still recognized
# NOTE: This will likely not be required in real-world text since the incoming text will not be partially cleaned already.
cont_no_apostrophe = ['dont','id','im','ive','ill','shouldve','couldve','arent','its','itll','itd','thats','thatd','whod','youve','were',
                      'whod','whats','theyre','theres','thered', 'CNN','/n', 'fox','news','to', regex,"!","","'s","''"]
# Adding above words to the set
for word in cont_no_apostrophe:
  stop_words.add(word)


df_foxnews['no_stopwords'] = df_foxnews['tokenized_text'] # Copying our previous step into a new one

# Finally applying a function to remove all of the 
df_foxnews['no_stopwords'] = df_foxnews['no_stopwords'].apply(lambda x:[word for word in x if word not in stop_words])

df_foxnews.head()


Unnamed: 0,Title,Date,URL,Text,tokenized_text,no_stopwords
0,Star Trek writers take Starship Enterprise whe...,,https://www.foxnews.com/opinion/star-trek-star...,new you can now listen to fox news articles!\n...,"[new, you, can, now, listen, to, fox, news, ar...","[new, listen, articles, quintessential, americ..."
1,Homeland Security peddles disinformation about...,,https://www.foxnews.com/opinion/homeland-secur...,new you can now listen to fox news articles!\n...,"[new, you, can, now, listen, to, fox, news, ar...","[new, listen, articles, recently, ,, much, att..."
2,"How to build an American majority, not a Repub...",,https://www.foxnews.com/opinion/american-major...,new you can now listen to fox news articles!\n...,"[new, you, can, now, listen, to, fox, news, ar...","[new, listen, articles, contract, america, bec..."
3,Robert 'Bud' McFarlane was an unsung hero in A...,,https://www.foxnews.com/opinion/robert-bud-mcf...,new you can now listen to fox news articles!\n...,"[new, you, can, now, listen, to, fox, news, ar...","[new, listen, articles, robert, c., ``, bud, m..."
4,Greg Gutfeld: Patti LuPone’s outburst is furth...,,https://www.foxnews.com/opinion/greg-gutfeld-p...,new you can now listen to fox news articles!\n...,"[new, you, can, now, listen, to, fox, news, ar...","[new, listen, articles, let, talk, cranky, jac..."


In [15]:
df_foxnews['part_of_speech'] = df_foxnews['no_stopwords'].apply(nltk.tag.pos_tag) # Using nltk's built in "pos_tag" to add parts of speech for each token (word)

df_foxnews.head()

Unnamed: 0,Title,Date,URL,Text,tokenized_text,no_stopwords,part_of_speech
0,Star Trek writers take Starship Enterprise whe...,,https://www.foxnews.com/opinion/star-trek-star...,new you can now listen to fox news articles!\n...,"[new, you, can, now, listen, to, fox, news, ar...","[new, listen, articles, quintessential, americ...","[(new, JJ), (listen, JJ), (articles, VBZ), (qu..."
1,Homeland Security peddles disinformation about...,,https://www.foxnews.com/opinion/homeland-secur...,new you can now listen to fox news articles!\n...,"[new, you, can, now, listen, to, fox, news, ar...","[new, listen, articles, recently, ,, much, att...","[(new, JJ), (listen, JJ), (articles, NNS), (re..."
2,"How to build an American majority, not a Repub...",,https://www.foxnews.com/opinion/american-major...,new you can now listen to fox news articles!\n...,"[new, you, can, now, listen, to, fox, news, ar...","[new, listen, articles, contract, america, bec...","[(new, JJ), (listen, NN), (articles, NNS), (co..."
3,Robert 'Bud' McFarlane was an unsung hero in A...,,https://www.foxnews.com/opinion/robert-bud-mcf...,new you can now listen to fox news articles!\n...,"[new, you, can, now, listen, to, fox, news, ar...","[new, listen, articles, robert, c., ``, bud, m...","[(new, JJ), (listen, JJ), (articles, NNS), (ro..."
4,Greg Gutfeld: Patti LuPone’s outburst is furth...,,https://www.foxnews.com/opinion/greg-gutfeld-p...,new you can now listen to fox news articles!\n...,"[new, you, can, now, listen, to, fox, news, ar...","[new, listen, articles, let, talk, cranky, jac...","[(new, JJ), (listen, JJ), (articles, NNS), (le..."


In [16]:
"""
--------------------------------------
As we see above, the "part_of_speech" column now contains tuples with both the word and the part of speech. 
However, for our lemmatizer we need to transform these parts of speech into things that nltk's wordnet can recognize.
Therefore I create a function 'wordnet_tagger' (taken from https://www.holisticseo.digital/python-seo/nltk/lemmatize)
to allow this to happen.
--------------------------------------
"""
#Helper function to transform the parts of speech into 
def wordnet_tagger(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:          
        return wordnet.NOUN # This part was changed from 'return NONE' because there are several other tags found here not in the above categories, which will be changed to nouns

# Creating new column for the wordnet part of speech to use, applying the above function
df_foxnews['wn_part_of_speech'] = df_foxnews['part_of_speech'].apply(lambda x: [(word, wordnet_tagger(pos_tag)) for (word, pos_tag) in x])

df_foxnews.head()

Unnamed: 0,Title,Date,URL,Text,tokenized_text,no_stopwords,part_of_speech,wn_part_of_speech
0,Star Trek writers take Starship Enterprise whe...,,https://www.foxnews.com/opinion/star-trek-star...,new you can now listen to fox news articles!\n...,"[new, you, can, now, listen, to, fox, news, ar...","[new, listen, articles, quintessential, americ...","[(new, JJ), (listen, JJ), (articles, VBZ), (qu...","[(new, a), (listen, a), (articles, v), (quinte..."
1,Homeland Security peddles disinformation about...,,https://www.foxnews.com/opinion/homeland-secur...,new you can now listen to fox news articles!\n...,"[new, you, can, now, listen, to, fox, news, ar...","[new, listen, articles, recently, ,, much, att...","[(new, JJ), (listen, JJ), (articles, NNS), (re...","[(new, a), (listen, a), (articles, n), (recent..."
2,"How to build an American majority, not a Repub...",,https://www.foxnews.com/opinion/american-major...,new you can now listen to fox news articles!\n...,"[new, you, can, now, listen, to, fox, news, ar...","[new, listen, articles, contract, america, bec...","[(new, JJ), (listen, NN), (articles, NNS), (co...","[(new, a), (listen, n), (articles, n), (contra..."
3,Robert 'Bud' McFarlane was an unsung hero in A...,,https://www.foxnews.com/opinion/robert-bud-mcf...,new you can now listen to fox news articles!\n...,"[new, you, can, now, listen, to, fox, news, ar...","[new, listen, articles, robert, c., ``, bud, m...","[(new, JJ), (listen, JJ), (articles, NNS), (ro...","[(new, a), (listen, a), (articles, n), (robert..."
4,Greg Gutfeld: Patti LuPone’s outburst is furth...,,https://www.foxnews.com/opinion/greg-gutfeld-p...,new you can now listen to fox news articles!\n...,"[new, you, can, now, listen, to, fox, news, ar...","[new, listen, articles, let, talk, cranky, jac...","[(new, JJ), (listen, JJ), (articles, NNS), (le...","[(new, a), (listen, a), (articles, n), (let, v..."


In [19]:
"""
--------------------------------------
Finally we can apply the parts of speech and words to our word lemmatizer using 
NLTK's WordNetLemmatizer again by applying the 'wn_part_of_speech' tuples with my lemmatizer
--------------------------------------
"""

lemmatizer = WordNetLemmatizer()

df_foxnews['lemmatized'] = df_foxnews['wn_part_of_speech'].apply(lambda x: [lemmatizer.lemmatize(word, pos) for word, pos in x])

df_foxnews['lemma_string'] = [' '.join(map(str,l)) for l in df_foxnews['lemmatized']]

df_foxnews.head()

Unnamed: 0,Title,Date,URL,Text,tokenized_text,no_stopwords,part_of_speech,wn_part_of_speech,lemmatized,lemma_string
0,Star Trek writers take Starship Enterprise whe...,,https://www.foxnews.com/opinion/star-trek-star...,new you can now listen to fox news articles!\n...,"[new, you, can, now, listen, to, fox, news, ar...","[new, listen, articles, quintessential, americ...","[(new, JJ), (listen, JJ), (articles, VBZ), (qu...","[(new, a), (listen, a), (articles, v), (quinte...","[new, listen, article, quintessential, america...",new listen article quintessential american sto...
1,Homeland Security peddles disinformation about...,,https://www.foxnews.com/opinion/homeland-secur...,new you can now listen to fox news articles!\n...,"[new, you, can, now, listen, to, fox, news, ar...","[new, listen, articles, recently, ,, much, att...","[(new, JJ), (listen, JJ), (articles, NNS), (re...","[(new, a), (listen, a), (articles, n), (recent...","[new, listen, article, recently, ,, much, atte...","new listen article recently , much attention f..."
2,"How to build an American majority, not a Repub...",,https://www.foxnews.com/opinion/american-major...,new you can now listen to fox news articles!\n...,"[new, you, can, now, listen, to, fox, news, ar...","[new, listen, articles, contract, america, bec...","[(new, JJ), (listen, NN), (articles, NNS), (co...","[(new, a), (listen, n), (articles, n), (contra...","[new, listen, article, contract, america, beco...",new listen article contract america become fam...
3,Robert 'Bud' McFarlane was an unsung hero in A...,,https://www.foxnews.com/opinion/robert-bud-mcf...,new you can now listen to fox news articles!\n...,"[new, you, can, now, listen, to, fox, news, ar...","[new, listen, articles, robert, c., ``, bud, m...","[(new, JJ), (listen, JJ), (articles, NNS), (ro...","[(new, a), (listen, a), (articles, n), (robert...","[new, listen, article, robert, c., ``, bud, mc...",new listen article robert c. `` bud mcfarlane ...
4,Greg Gutfeld: Patti LuPone’s outburst is furth...,,https://www.foxnews.com/opinion/greg-gutfeld-p...,new you can now listen to fox news articles!\n...,"[new, you, can, now, listen, to, fox, news, ar...","[new, listen, articles, let, talk, cranky, jac...","[(new, JJ), (listen, JJ), (articles, NNS), (le...","[(new, a), (listen, a), (articles, n), (let, v...","[new, listen, article, let, talk, cranky, jack...",new listen article let talk cranky jackass scr...
