In [110]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from scipy.sparse import coo_matrix
import re
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
import pickle
import argparse
import calendar
import datetime
from os import path

def get_posts(subreddit):
    """Download recent submissions of ``subreddit`` from Pushshift and pickle them.

    Pages backwards (newest first) through the Pushshift submission search,
    starting just before 2020-09-11 00:00 UTC, and keeps requesting older pages
    until the oldest post fetched is at or before 2020-09-10 00:00 UTC. Every
    post of every fetched page is kept, so the final page may contain posts
    older than that lower bound.

    The collected list of post dicts is written to ``posts-<subreddit>.pkl`` in
    the current working directory. Nothing is returned.

    Paging stops early when a page comes back empty or after five consecutive
    non-JSON responses; whatever was collected so far is still saved.
    """
    # Imported locally: the import block next to this function does not bring
    # in ``requests`` or ``time``.
    import time
    import requests

    max_failures = 5

    # Naive datetimes are treated as UTC by utctimetuple(), so both bounds are
    # UTC midnights expressed as epoch seconds.
    timestamp = calendar.timegm(datetime.datetime(2020, 9, 11).utctimetuple())
    first = calendar.timegm(datetime.datetime(2020, 9, 10).utctimetuple())
    posts = []
    failures = 0

    while timestamp > first:
        url = ("https://api.pushshift.io/reddit/search/submission/"
            "?subreddit={}&sort=desc&sort_type=created_utc&"
            "before={}&size=1000").format(subreddit,timestamp)
        r = requests.get(url, timeout=30)

        # The header may be absent; treat that like any other non-JSON reply.
        if 'json' not in (r.headers.get('Content-Type') or ''):
            failures += 1
            if failures >= max_failures:
                print("Giving up after {} consecutive non-JSON responses".format(failures))
                break
            time.sleep(1)  # brief pause instead of hammering the API
            continue
        failures = 0

        batch = r.json().get('data') or []
        if not batch:
            # No older posts exist; without this the loop could never advance.
            break

        posts += batch
        print("Added posts from {} to {}".format(datetime.datetime.fromtimestamp(batch[-1]['created_utc']),
                                                 datetime.datetime.fromtimestamp(batch[0]['created_utc'])))
        # Results are sorted newest-first, so the last entry is the oldest:
        # use it as the upper bound of the next page.
        timestamp = batch[-1]['created_utc']

    with open("posts-{}.pkl".format(subreddit), "wb") as fh:
        pickle.dump(posts, fh)
    


In [133]:
from sklearn.feature_extraction.text import CountVectorizer
import re
from sklearn.feature_extraction.text import TfidfTransformer
import random 
import time
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import matplotlib.pyplot as plt
def _clean_post_text(raw, tokenizer, lemmatizer, stop_words):
    """Lower-case ``raw``, strip URLs, tokenize, drop stop words and lemmatize."""
    without_urls = re.sub(r'https?://\S+', '', str(raw).lower())
    tokens = tokenizer.tokenize(without_urls)
    # Stop words are filtered on the raw token, before lemmatization.
    return " ".join(lemmatizer.lemmatize(tok) for tok in tokens if tok not in stop_words)


def get_scores(subreddit):
    """Score the sentiment of the saved posts of ``subreddit`` and pickle the result.

    Reads ``posts-<subreddit>.pkl`` from the current working directory, drops
    posts authored by ``AutoModerator``, cleans each post's title + selftext
    (lower-casing, URL removal, stop-word removal, lemmatization) and scores
    the cleaned text with VADER.

    The result, a list of ``(title, compound_score)`` tuples in post order, is
    printed (preceded by its length) and written to ``results-<subreddit>.pkl``.
    Nothing is returned.
    """
    # NOTE(review): unpickling can execute arbitrary code; only load files that
    # were produced locally.
    posts_list = pd.read_pickle("posts-{}.pkl".format(subreddit))
    df_posts = pd.DataFrame(posts_list)
    df_posts = df_posts[df_posts['author'] != 'AutoModerator']

    # A NaN title or selftext would turn the whole concatenation into NaN and
    # the post would be scored as the literal string "nan".
    texts = df_posts['title'].fillna('') + ' ' + df_posts['selftext'].fillna('')

    stop_words = set(stopwords.words('english'))
    stop_words.update(['removed', 'x200b', 'amp', 'hi', 'like', 'get'])
    tokenizer = RegexpTokenizer(r'\w+')
    lem = WordNetLemmatizer()

    nltk.download('vader_lexicon')
    # Instantiate the sentiment intensity analyzer with the existing lexicon
    vader = SentimentIntensityAnalyzer()

    # Global matplotlib side effect: plots drawn after this call use this style.
    plt.style.use("fivethirtyeight")

    # Iterate titles and texts together: after the AutoModerator filter the
    # frame's index labels are no longer 0..n-1, so a running counter must not
    # be used to look titles up.
    results_sentiment = [
        (title, vader.polarity_scores(_clean_post_text(text, tokenizer, lem, stop_words))['compound'])
        for title, text in zip(df_posts['title'], texts)
    ]

    print(len(results_sentiment))
    print(results_sentiment)
    with open("results-{}.pkl".format(subreddit), "wb") as fh:
        pickle.dump(results_sentiment, fh)

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/aveekd/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


1200


In [135]:
# Download r/news submissions, then score them. Order matters: get_scores
# reads the posts-news.pkl file that get_posts writes.
get_posts('news')
get_scores('news')

Added posts from 2020-09-10 19:07:27 to 2020-09-10 19:59:59
Added posts from 2020-09-10 18:03:34 to 2020-09-10 19:07:00
Added posts from 2020-09-10 17:01:02 to 2020-09-10 18:03:33
Added posts from 2020-09-10 16:04:17 to 2020-09-10 17:00:39
Added posts from 2020-09-10 15:23:45 to 2020-09-10 16:04:13
Added posts from 2020-09-10 14:37:19 to 2020-09-10 15:22:20
Added posts from 2020-09-10 13:40:42 to 2020-09-10 14:37:17
Added posts from 2020-09-10 13:04:35 to 2020-09-10 13:40:30
Added posts from 2020-09-10 12:23:27 to 2020-09-10 13:04:31
Added posts from 2020-09-10 11:37:14 to 2020-09-10 12:23:09
Added posts from 2020-09-10 10:44:56 to 2020-09-10 11:37:13
Added posts from 2020-09-10 10:10:52 to 2020-09-10 10:44:42
Added posts from 2020-09-10 09:25:33 to 2020-09-10 10:10:48
Added posts from 2020-09-10 08:42:30 to 2020-09-10 09:25:31
Added posts from 2020-09-10 08:04:14 to 2020-09-10 08:42:29
Added posts from 2020-09-10 07:23:32 to 2020-09-10 08:04:12
Added posts from 2020-09-10 06:35:37 to 

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/aveekd/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


3000


In [136]:
# Download r/government submissions, then score them. Order matters:
# get_scores reads the posts-government.pkl file that get_posts writes.
get_posts('government')
get_scores('government')

Added posts from 2020-07-07 16:09:03 to 2020-09-08 21:06:46
100
[('The DOJ Files an Antitrust Case Against Google Monopoly; The Fight Against Big Tech, Explained', -0.3818), ('Silenced reality', 0.0), ('Union Minister Ravi Shankar Prasad: India Emerging as Big Manufacturing Centre', 0.0), ('What is the process of changing your legal name and how do you file for a new SSN after the name change?', 0.128), ('Recommendations for reading or resources on city government purchasing\\rfp process\\procurement', 0.0), ('what places are vigilantes most effective?', 0.4767), ('[QUESTION] Difference Between Executive &amp; LEA', 0.0), ('Trump EPA’s Rollback of Toxic Waste Environmental Regulations, Explained', -0.4215), ('Freshmen Government', 0.0), ('Attention Indian Government! Practice What You Preach on COVID-19 Safety', 0.4215), ('Tucker: Joe Biden suggests more violence ahead if Trump wins', -0.0772), ('Kitni baar bola hai ki padhne likhne wale chhatron ko rajniti se dur rahana chahiye', 0.0)

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/aveekd/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [139]:
# Download r/conservative submissions, then score them. Order matters:
# get_scores reads the posts-conservative.pkl file that get_posts writes.
get_posts('conservative')
get_scores('conservative')

Added posts from 2020-09-10 17:29:05 to 2020-09-10 19:59:53
Added posts from 2020-09-10 14:41:28 to 2020-09-10 17:20:12
Added posts from 2020-09-10 12:07:43 to 2020-09-10 14:40:32
Added posts from 2020-09-10 09:23:53 to 2020-09-10 12:06:58
Added posts from 2020-09-10 05:42:34 to 2020-09-10 09:21:06
Added posts from 2020-09-09 22:30:00 to 2020-09-10 05:30:47
Added posts from 2020-09-09 18:46:28 to 2020-09-09 22:29:52


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/aveekd/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


700


In [140]:
# Download r/democrats submissions, then score them. Order matters:
# get_scores reads the posts-democrats.pkl file that get_posts writes.
get_posts('democrats')
get_scores('democrats')

Added posts from 2020-09-09 19:41:59 to 2020-09-10 19:52:06
100
[('Trump Saw His Generals As “Pussies.” They Saw Him As Completely Unfit', 0.0), ('Was banned for posting photos of Mitch McConnell in front of a confederate flag 👍👍🤷🤣', -0.4588), ('Need advice on how to talk to my Trump-supporting family member', 0.4404), ('Trump is not sympathetic to the struggles working class people face, so Emily is voting for Biden.', 0.25), ('Miles Taylor Killing It Again!!!', -0.6597), ('Accrual workplace', 0.0), ('Trump Lied About Corona - This Trump Lie, Killed Thousands - What Else Has He Lied About?', -0.8658), ('Joe Biden Stumbles, Makes No Sense', 0.0), ('When did a Darlene Snell move to Michigan and start signing? #Ozark', 0.0), ('No walk today.', 0.0), ('Tangkasnet &amp;amp; 88Tangkas - Bola Tangkas Online', 0.0), ('Last good Republican president?', 0.4404), ('Here are the fourteen Senate seats most likely to flip — Twelve are held by Republicans', 0.0), ('Feds are stiffing FDNY 9/11 health

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/aveekd/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [145]:
# Download r/Libertarian submissions, then score them. Order matters:
# get_scores reads the posts-Libertarian.pkl file that get_posts writes.
get_posts('Libertarian')
get_scores('Libertarian')

Added posts from 2020-09-10 10:32:29 to 2020-09-10 19:51:05
Added posts from 2020-09-09 15:06:40 to 2020-09-10 10:31:18


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/aveekd/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


200
[('A Nebraska County Took His $25,000 Property To Settle a $986 Tax Debt. Now the U.S. Supreme Court Could Get Involved.', 0.2732), ('SCOTUS Contender James Ho Combines Respect for Free Speech and Gun Rights With a Troubling Deference to Cops', 0.25), ('So what do you golds think about the nfl?', -0.2551), ('DOJ records show members of Government ‘wiped’ phones', 0.0), ("It's big brain time", 0.0), ('Personal rights are at odds with privatization', 0.5509), ('Google Promoting DuckDuckGo???', 0.3612), ('Environmentally Friendly Libertarian', 0.6369), ('Trust fund heir admits to watching TV for hours a day between golf trips, all on taxpayer dime', 0.6705), ('Why I support abortion legally while disliking it personally', 0.9226), ("Biden's Lies About Positions, Media Runs Cover", 0.0), ('COVID-19, weed: 4 things to know about the Libertarian party’s Indiana governor hopeful Donald Rainwater | Webb', 0.7845), ('Governments Never Give Up Power Voluntarily | Ludwig von Mises', 0.0), ('P

In [2]:
import requests
import json


def func(url):
    """Collect the ``headline`` of every result across ten paged requests.

    ``url`` must contain a single ``{}`` placeholder, which is filled with the
    offsets 1, 101, ..., 901 in turn. Each response is parsed as JSON and is
    expected to hold a ``result`` list whose items carry a ``headline`` key.

    Returns the headlines as one flat list, in the order they were received.
    """
    headlines = []
    with requests.Session() as session:
        offset = 1
        while offset < 1000:
            payload = session.get(url.format(offset)).json()
            headlines.extend(hit['headline'] for hit in payload['result'])
            offset += 100
    return headlines

# CNN search headlines for "coronavirus"; func fills the trailing `from={}`
# placeholder with each page offset.
data_cnn = func("https://search.api.cnn.io/content?q=coronavirus&sort=newest&category=business,us,politics,world,opinion,health&size=100&from={}")

In [11]:
import json
import bz2
import nltk
import os
import pymongo

# nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# VADER analyzer used to score each headline.
sid = SentimentIntensityAnalyzer()

# The connection string carries credentials, so it is read from the
# environment instead of being stored in the notebook.
mongo_uri = os.environ.get("MONGODB_URI")
if not mongo_uri:
    raise RuntimeError("Set the MONGODB_URI environment variable to the MongoDB connection string")
client = pymongo.MongoClient(mongo_uri)
db = client['PennApps2020']
cnn_sentiment = db['cnn_sentiment']

politicians = ['Biden', 'Trump', 'Kamala Harris', 'Mike Pence']

# One document per headline that mentions at least one politician
# (plain, case-sensitive substring match).
cnn = []
for line in data_cnn:
    subjects = [name for name in politicians if name in line]
    if subjects:
        cnn.append({
            'text': line,
            'sentiment': sid.polarity_scores(line)['compound'],
            'subject': subjects,
        })

# Skip the write entirely when no headline matched.
if cnn:
    cnn_sentiment.insert_many(cnn)


In [None]:
import json
import bz2
import nltk
import os
import pymongo

# nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# VADER analyzer used to score each headline.
sid = SentimentIntensityAnalyzer()

# The connection string carries credentials, so it is read from the
# environment instead of being stored in the notebook.
mongo_uri = os.environ.get("MONGODB_URI")
if not mongo_uri:
    raise RuntimeError("Set the MONGODB_URI environment variable to the MongoDB connection string")
client = pymongo.MongoClient(mongo_uri)
db = client['PennApps2020']
fox_sentiment = db['fox_sentiment']

politicians = ['Biden', 'Trump', 'Kamala Harris', 'Mike Pence']

# One document per headline that mentions at least one politician
# (plain, case-sensitive substring match).
fox = []
for line in stories:
    # `stories` holds {"headline", "url"} dicts straight after scraping and
    # plain headline strings once it has been flattened; accept both so a dict
    # is never matched against its keys.
    headline = line['headline'] if isinstance(line, dict) else line
    subjects = [name for name in politicians if name in headline]
    if subjects:
        fox.append({
            'text': headline,
            'sentiment': sid.polarity_scores(headline)['compound'],
            'subject': subjects,
        })

# Skip the write entirely when no headline matched.
if fox:
    fox_sentiment.insert_many(fox)


In [6]:
import requests
from bs4 import BeautifulSoup
import urllib

# Module-level accumulator filled by getTheGoodStuff; each entry is a
# {"headline": ..., "url": ...} dict.
stories = []

def getTheGoodStuff(newsstories):
    """Append the headline and link of each article element to ``stories``.

    ``newsstories`` is an iterable of parsed elements exposing ``find``. For
    each one, the ``<a>`` inside its ``<h2 class="title">`` supplies the
    headline text and the ``href``. Elements without that heading or link are
    skipped instead of raising ``AttributeError``.

    Mutates the module-level ``stories`` list; returns nothing.
    """
    global stories
    for data in newsstories:
        heading = data.find("h2", class_="title")
        link = heading.find("a") if heading is not None else None
        if link is None:
            # find() returns None when nothing matches; there is nothing to record.
            continue
        stories.append({"headline": link.getText(),
                        "url": link.get("href")})

def scrapeWebsites():
    """Fetch the Fox News front page and hand its story articles to getTheGoodStuff.

    The page is parsed with the ``lxml`` parser. For each rank 0..14, all
    ``<article>`` elements whose class is ``"article story-<rank>"`` are passed
    to ``getTheGoodStuff``, which appends them to the module-level ``stories``.
    """
    global stories

    # Getting stories from Fox News.
    homepage_url = "http://www.foxnews.com/"
    markup = requests.get(homepage_url).text
    soup = BeautifulSoup(markup, "lxml")

    story_classes = ("article story-" + str(rank) for rank in range(0, 15))
    for story_class in story_classes:
        getTheGoodStuff(soup.find_all("article", class_=story_class))
    
def displayStories():
    """Print the ``headline`` of every entry in the module-level ``stories`` list."""
    global stories
    for story in stories:
        print(story["headline"])
    
# Scrape the Fox News front page into `stories`, then print each headline.
scrapeWebsites()
displayStories()

President Trump on violence in Democrat-run cities, reversing Obama's agenda, debate preparation
Nancy Pelosi joins G7 summit leaders to warn of the ‘existential threat of our time'
President Trump on violence in Democrat-run cities, reversing Obama's agenda, debate preparation
Postal Service offering $50,000 reward for info related to mail carrier shooting
On the 19th anniversary of 9/11, join Pete Hegseth from the 9/11 Memorial as we remember the day that forever changed America.
US coronavirus-case count is highest in a week
What to do after paying off credit card debt
President Trump on COVID response, Bob Woodward's book, push for coronavirus vaccine, mail-in voting concerns
'Transsexual Satanist anarchist' wins GOP nod for sheriff in NH
President Trump on COVID response, Bob Woodward's book, push for coronavirus vaccine, mail-in voting concerns
TIM GRAHAM: How would liberal media report on coronavirus deaths under President Hillary Clinton?
Clemson athletics announces 24 new COVI

In [23]:
import requests
from bs4 import BeautifulSoup
import urllib

# Module-level accumulator filled by getTheGoodStuff; each entry is a
# {"headline": ..., "url": ...} dict. Re-running this cell discards earlier results.
stories = []

def getTheGoodStuff(newsstories):
    """Append the headline and link of each article element to ``stories``.

    ``newsstories`` is an iterable of parsed elements exposing ``find``. For
    each one, the ``<a>`` inside its ``<h2 class="title">`` supplies the
    headline text and the ``href``. Elements without that heading or link are
    skipped instead of raising ``AttributeError``.

    Mutates the module-level ``stories`` list; returns nothing.
    """
    global stories
    for data in newsstories:
        heading = data.find("h2", class_="title")
        link = heading.find("a") if heading is not None else None
        if link is None:
            # find() returns None when nothing matches; there is nothing to record.
            continue
        stories.append({"headline": link.getText(),
                        "url": link.get("href")})

def scrapeWebsites():
    """Fetch the MSNBC front page and hand its story articles to getTheGoodStuff.

    The page is parsed with the ``lxml`` parser. For each rank 0..14, all
    ``<article>`` elements whose class is ``"article story-<rank>"`` are passed
    to ``getTheGoodStuff``, which appends them to the module-level ``stories``.
    """
    global stories

    # Getting stories from MSNBC.
    homepage_url = "https://www.msnbc.com/"
    markup = requests.get(homepage_url).text
    soup = BeautifulSoup(markup, "lxml")

    story_classes = ("article story-" + str(rank) for rank in range(0, 15))
    for story_class in story_classes:
        getTheGoodStuff(soup.find_all("article", class_=story_class))
    
def displayStories():
    """Print the ``headline`` of every entry in the module-level ``stories`` list."""
    global stories
    for story in stories:
        print(story["headline"])

# Scrape the MSNBC front page into `stories`, then print each headline.
scrapeWebsites()
displayStories()

In [8]:
# Flatten the scraped {"headline", "url"} dicts to plain headline strings.
# Entries that are already strings are kept, so re-running this cell is safe.
stories = [story['headline'] if isinstance(story, dict) else story for story in stories]

In [9]:
# Show the flattened list of headline strings.
stories

["President Trump on violence in Democrat-run cities, reversing Obama's agenda, debate preparation",
 "Nancy Pelosi joins G7 summit leaders to warn of the ‘existential threat of our time'",
 "President Trump on violence in Democrat-run cities, reversing Obama's agenda, debate preparation",
 'Postal Service offering $50,000 reward for info related to mail carrier shooting',
 'On the 19th anniversary of 9/11, join Pete Hegseth from the 9/11 Memorial as we remember the day that forever changed America.',
 'US coronavirus-case count is highest in a week',
 'What to do after paying off credit card debt',
 "President Trump on COVID response, Bob Woodward's book, push for coronavirus vaccine, mail-in voting concerns",
 "'Transsexual Satanist anarchist' wins GOP nod for sheriff in NH",
 "President Trump on COVID response, Bob Woodward's book, push for coronavirus vaccine, mail-in voting concerns",
 'TIM GRAHAM: How would liberal media report on coronavirus deaths under President Hillary Clinto