# Sentiment Analysis For News Articles
### Imports

In [34]:
import config
import requests
from nltk.corpus import twitter_samples
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
import re, string
from nltk.corpus import stopwords
from nltk import FreqDist
from nltk import classify
from nltk import NaiveBayesClassifier
from nltk.tokenize import word_tokenize
import pandas as pd
from io import StringIO

In [35]:
# Load the raw tweet strings from the NLTK twitter_samples corpus files.
# NOTE(review): none of these three names is referenced again in the visible
# part of the notebook (the tokenized variants are used instead) — confirm
# they are still needed.
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')
text = twitter_samples.strings('tweets.20150430-223406.json')

### Tokenizing

In [36]:
# Tokenized form of the positive tweets; the first tweet is printed as a sample.
tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
print(tweet_tokens[0])

['#FollowFriday', '@France_Inte', '@PKuchly57', '@Milipol_Paris', 'for', 'being', 'top', 'engaged', 'members', 'in', 'my', 'community', 'this', 'week', ':)']


In [37]:
# Token lists for both sentiment classes; these are the inputs to the cleaning step.
# NOTE(review): positive_tweet_tokens reads the same corpus file as tweet_tokens.
positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

### Normalization

See https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html for the meanings of the Penn Treebank part-of-speech tags used below.

In [38]:
# Show the part-of-speech tag assigned to each token of the sample tweet.
print(pos_tag(tweet_tokens[0]))

[('#FollowFriday', 'JJ'), ('@France_Inte', 'NNP'), ('@PKuchly57', 'NNP'), ('@Milipol_Paris', 'NNP'), ('for', 'IN'), ('being', 'VBG'), ('top', 'JJ'), ('engaged', 'VBN'), ('members', 'NNS'), ('in', 'IN'), ('my', 'PRP$'), ('community', 'NN'), ('this', 'DT'), ('week', 'NN'), (':)', 'NN')]


#### Lemmatization

In [39]:
def lemmatize_sentence(tokens):
    """Lemmatize every token of a tokenized sentence.

    Each token is POS-tagged first; tags starting with 'NN' are lemmatized
    as nouns ('n'), tags starting with 'VB' as verbs ('v'), and everything
    else as adjectives ('a').

    Args:
        tokens: Sequence of token strings forming one sentence.

    Returns:
        List of lemmatized tokens, in the same order as the input.
    """
    wordnet = WordNetLemmatizer()

    def wordnet_pos(treebank_tag):
        # Map the tag prefix onto the POS code passed to the lemmatizer.
        if treebank_tag.startswith('NN'):
            return 'n'
        if treebank_tag.startswith('VB'):
            return 'v'
        return 'a'

    return [
        wordnet.lemmatize(word, wordnet_pos(tag))
        for word, tag in pos_tag(tokens)
    ]

In [40]:
# Lemmatize the sample tweet to compare against the raw tokens printed earlier.
print(lemmatize_sentence(tweet_tokens[0]))

['#FollowFriday', '@France_Inte', '@PKuchly57', '@Milipol_Paris', 'for', 'be', 'top', 'engage', 'member', 'in', 'my', 'community', 'this', 'week', ':)']


### Removing Noise

Remove hyperlinks, Twitter handles, punctuation, and stop words, then lemmatize and lowercase the remaining tokens.

In [41]:
# Raw strings keep regex escapes such as "\(" from being parsed as (invalid)
# Python string escapes, which newer interpreters warn about. The patterns
# themselves are unchanged.
_URL_PATTERN = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'
    r'(?:%[0-9a-fA-F][0-9a-fA-F]))+'
)
_HANDLE_PATTERN = re.compile(r"(@[A-Za-z0-9_]+)")


def remove_noise(tweet_tokens, stop_words = ()):
    """Clean and normalize the tokens of a single tweet.

    For every token: strip hyperlinks and @handles, lemmatize it using its
    part-of-speech tag (noun / verb / adjective fallback), and lowercase it.
    A token is dropped when it is empty after stripping, when it occurs
    inside ``string.punctuation``, or when its lowercase form is in
    ``stop_words``.

    Args:
        tweet_tokens: Sequence of token strings for one tweet.
        stop_words: Collection of lowercase words to discard. Defaults to an
            empty tuple, i.e. no stop-word filtering.

    Returns:
        List of cleaned, lowercased tokens in their original order.
    """
    # Built once per call rather than once per token.
    lemmatizer = WordNetLemmatizer()
    cleaned_tokens = []

    # Tagging is done on the original tokens, before any stripping.
    for token, tag in pos_tag(tweet_tokens):
        token = _URL_PATTERN.sub('', token)
        token = _HANDLE_PATTERN.sub('', token)

        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'

        token = lemmatizer.lemmatize(token, pos)
        lowered = token.lower()

        # The length check comes first because "" is "in" every string.
        if len(token) > 0 and token not in string.punctuation and lowered not in stop_words:
            cleaned_tokens.append(lowered)
    return cleaned_tokens

In [42]:
# English stop-word list from NLTK, applied to the sample tweet as a check.
stop_words = stopwords.words('english')
print(remove_noise(tweet_tokens[0], stop_words))

['#followfriday', 'top', 'engage', 'member', 'community', 'week', ':)']


In [43]:
# Run the cleaning step over every tweet of each sentiment class.
positive_cleaned = [
    remove_noise(tweet, stop_words) for tweet in positive_tweet_tokens
]
negative_cleaned = [
    remove_noise(tweet, stop_words) for tweet in negative_tweet_tokens
]

In [44]:
# Before/after comparison of one tweet per class to sanity-check the cleaning.
print(positive_tweet_tokens[500])
print(positive_cleaned[500])
print(negative_tweet_tokens[500])
print(negative_cleaned[500])

['Dang', 'that', 'is', 'some', 'rad', '@AbzuGame', '#fanart', '!', ':D', 'https://t.co/bI8k8tb9ht']
['dang', 'rad', '#fanart', ':d']
['Can', 'u', 'feel', 'it', '?', ':(', '(:', '(', '#exo', 'http://t.co/ghsa262ORm']
['u', 'feel', ':(', '(:', '#exo']


### Word Density

In [45]:
def get_all_words(cleaned_tokens):
    """Lazily yield every token from an iterable of token lists, in order."""
    for tweet in cleaned_tokens:
        yield from tweet

In [46]:
# get_all_words is a generator function, so this can be iterated only once.
all_pos_words = get_all_words(positive_cleaned)

In [47]:
# Count token frequencies across the positive tweets and show the ten most common.
freq_dist_pos = FreqDist(all_pos_words)
print(freq_dist_pos.most_common(10))

[(':)', 3691), (':-)', 701), (':d', 658), ('thanks', 388), ('follow', 357), ('love', 333), ('...', 290), ('good', 283), ('get', 263), ('thank', 253)]


### Prepare Data

In [48]:
def get_tweets_for_model(cleaned_tokens):
    """Lazily convert token lists into feature dicts for the classifier.

    Each yielded dict maps every token of one tweet to ``True``.
    """
    for tweet_tokens in cleaned_tokens:
        yield {token: True for token in tweet_tokens}

In [49]:
# get_tweets_for_model returns a single-use generator. Materializing the
# results as lists keeps any cell that consumes them re-runnable; a spent
# generator would silently yield nothing the second time.
positive_tokens_for_model = list(get_tweets_for_model(positive_cleaned))
negative_tokens_for_model = list(get_tweets_for_model(negative_cleaned))

In [50]:
import random

# Pair each feature dict with its class label and pool both classes.
positive_dataset = [(tweet_dict, "Positive") for tweet_dict in positive_tokens_for_model]
negative_dataset = [(tweet_dict, "Negative") for tweet_dict in negative_tokens_for_model]
dataset = positive_dataset + negative_dataset

# Shuffle with a dedicated, seeded generator so the train/test split (and
# therefore the reported accuracy) is reproducible across runs, without
# touching the global random state.
SHUFFLE_SEED = 42
TRAIN_SIZE = 7000  # number of examples used for training; the rest are held out
random.Random(SHUFFLE_SEED).shuffle(dataset)

train_data = dataset[:TRAIN_SIZE]
test_data = dataset[TRAIN_SIZE:]

### Build Model

In [51]:
# Train a Naive Bayes classifier on the training split.
classifier = NaiveBayesClassifier.train(train_data)

In [52]:
# Report accuracy on the held-out test split and list the ten most informative features.
print("accuracy: {}".format(classify.accuracy(classifier, test_data)))
classifier.show_most_informative_features(10)

accuracy: 0.9933333333333333
Most Informative Features
                      :( = True           Negati : Positi =   2075.2 : 1.0
                      :) = True           Positi : Negati =   1649.7 : 1.0
                follower = True           Positi : Negati =     22.4 : 1.0
                     bam = True           Positi : Negati =     20.9 : 1.0
                     sad = True           Negati : Positi =     18.9 : 1.0
                    glad = True           Positi : Negati =     18.9 : 1.0
                     x15 = True           Negati : Positi =     16.4 : 1.0
                     via = True           Positi : Negati =     16.3 : 1.0
                 welcome = True           Positi : Negati =     14.3 : 1.0
                     ugh = True           Negati : Positi =     13.8 : 1.0


In [53]:
import pickle

# Persist the trained classifier. The context manager closes the file even
# if pickling raises, so the handle is never leaked.
with open('classifier.pickle', 'wb') as f:
    pickle.dump(classifier, f)

In [20]:
# Classify a hand-written example and show its probability of being negative.
custom = "rip"
custom_tokens = remove_noise(word_tokenize(custom))
custom_features = {token: True for token in custom_tokens}
print(classifier.classify(custom_features))
dist = classifier.prob_classify(custom_features)
print(dist.prob("Negative"))

Negative
0.7307692628772023


In [21]:
import config

In [22]:
# The API key is read from the local config module rather than hard-coded here.
api_key = config.API_KEY
url = (r'https://newsapi.org/v2/everything?q=happy&from=2020-07-10&to=2020-07-19&sortBy=popularity&apiKey=' + api_key)

# A timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status surfaces HTTP errors here instead of as a confusing
# KeyError when the payload is read later.
http_response = requests.get(url, timeout=30)
http_response.raise_for_status()
response = http_response.json()

In [28]:
# NOTE(review): this dumps the entire API payload into the notebook output;
# consider printing only a summary (e.g. status and result count).
print(response)

{'status': 'ok', 'totalResults': 14316, 'articles': [{'source': {'id': None, 'name': 'Lifehacker.com'}, 'author': 'Claire Lower on Skillet, shared by Claire Lower to Lifehacker', 'title': 'Turn Your Frappucino Into an Alcoholic Slushie', 'description': 'Bottled Frappuccinos will never fail to transport me to the year 1999. I am 13 years old—all Clinique Happy and butterfly clips—and sipping a chilled Mocha coffee drink from a glass bottle seems like the height of adult sophistication. I hadn’t really thought…', 'url': 'https://skillet.lifehacker.com/turn-your-frappucino-into-an-alcoholic-slushie-1844380775', 'urlToImage': 'https://i.kinja-img.com/gawker-media/image/upload/c_fill,f_auto,fl_progressive,g_center,h_675,pg_1,q_80,w_1200/hkjnyc9g4fdywpvxkytw.jpg', 'publishedAt': '2020-07-15T13:30:00Z', 'content': 'Bottled Frappuccinos will never fail to transport me to the year 1999. I am 13 years oldall Clinique Happy and butterfly clipsand sipping a chilled Mocha coffee drink from a glass 

In [24]:
def getProbs(text):
    """Return the classifier's probability that ``text`` is "Positive".

    The text is tokenized, cleaned with ``remove_noise`` (without stop-word
    filtering), turned into a ``{token: True}`` feature dict and scored with
    the trained ``classifier``.

    Args:
        text: The text to score. ``None`` is treated as an empty string so
            that missing article fields do not raise.

    Returns:
        The probability assigned to the "Positive" label.
    """
    # Article fields may be None in the API payload; score those as empty text.
    tokens = remove_noise(word_tokenize(text or ""))
    # Build the feature dict once; only the probability is needed, so the
    # separate classify() call is not made.
    features = {token: True for token in tokens}
    return classifier.prob_classify(features).prob("Positive")

In [29]:
# Score every article and collect the rows first. The DataFrame is then
# built once instead of being grown row by row.
records = []
for article in response["articles"]:
    # Fields may be None in the API payload; score those as empty text.
    scores = [
        getProbs(article[field] or "")
        for field in ("title", "content", "description")
    ]
    # Overall sentiment is the mean of the three per-field probabilities.
    overallSent = sum(scores) / len(scores)
    records.append([article["title"], article["content"], article["url"], article["urlToImage"], overallSent])

df = pd.DataFrame(records, columns=["Title", "Content", "URL", "imgURL", "Sentiment"])
# Order from least to most positive. reset_index(drop=False) keeps each
# article's original position in the API response as an "index" column.
df = df.sort_values(by=['Sentiment'])
df = df.reset_index(drop=False)

In [32]:
# Print the article URL and image URL of the most positive articles, best
# first. tail() copes with frames shorter than TOP_N, where label lookups
# with negative indices would raise KeyError.
TOP_N = 5
for _, row in df.tail(TOP_N).iloc[::-1].iterrows():
    print(row["URL"])
    print(row["imgURL"])

https://mashable.com/article/sir-ian-mckellen-birthday-sonnet-patrick-stewart/
https://mondrian.mashable.com/2020%252F07%252F14%252F82%252F57c022de59fb45cc864c0a6881008a8e.1200a.jpg%252F1200x630.jpg?signature=VrolWbpTOKYid5z7IFZF8GB2WKY=
https://www.engadget.com/kano-build-yourself-windows-10-pc-coding-relaunch-130016706.html
https://o.aolcdn.com/images/dims?resize=1200%2C630&crop=1200%2C630%2C0%2C0&quality=95&image_uri=https%3A%2F%2Fs.yimg.com%2Fos%2Fcreatr-uploaded-images%2F2020-07%2Fb35cf710-c5b8-11ea-b7ea-017285b2969b&client=amp-blogside-v2&signature=986ec080123439942cf3df9869924edd9e279666
https://mashable.com/article/chance-the-rapper-kanye-west-2020/
https://mondrian.mashable.com/2020%252F07%252F13%252Fcd%252Fe7d5f4c5fd7446a2b3679e5b0fda9716.75452.jpg%252F1200x630.jpg?signature=rvTgZ9gsK2JxAep4siVsc7V2FyA=
https://skillet.lifehacker.com/turn-your-frappucino-into-an-alcoholic-slushie-1844380775
https://i.kinja-img.com/gawker-media/image/upload/c_fill,f_auto,fl_progressive,g_cente

In [31]:
# Display the scored articles, ordered from lowest to highest "Sentiment".
df

Unnamed: 0,index,Title,Content,URL,imgURL,Sentiment
0,10,Dozens of U.S. Marines Test Positive For Covid...,At least 62 U.S. Marines have tested positive ...,https://gizmodo.com/dozens-of-u-s-marines-test...,https://i.kinja-img.com/gawker-media/image/upl...,0.056106
1,12,How to Take Impossible Burgers to the Next Lev...,"Last night, I logged into a Zoom cooking class...",https://gizmodo.com/how-to-take-impossible-bur...,https://i.kinja-img.com/gawker-media/image/upl...,0.135095
2,15,"Almost 20 years on, Ivanisevic’s Wimbledon sto...",Every sport has stories of underdogs who have ...,https://mashable.com/ad/article/ivanisevic-wim...,https://mondrian.mashable.com/2020%252F07%252F...,0.149794
3,3,What's Coming to Hulu in August 2020,Early in lockdownwhen I was trying to tell mys...,https://lifehacker.com/whats-coming-to-hulu-in...,https://i.kinja-img.com/gawker-media/image/upl...,0.301395
4,17,Seniors recreating famous album covers is your...,Residents of senior care facilities around the...,https://mashable.com/article/senior-care-facil...,https://mondrian.mashable.com/2020%252F07%252F...,0.387102
5,13,"Yes, You Should Still Change Your DNS Settings...",The DNS (Domain Name System) server settings o...,https://gizmodo.com/yes-you-should-still-chang...,https://i.kinja-img.com/gawker-media/image/upl...,0.418243
6,6,Netflix says 'Extraction' and 'Bird Box' are i...,Of interest is that while Netflix’s commission...,https://www.engadget.com/netflix-top-10-origin...,https://o.aolcdn.com/images/dims?resize=1200%2...,0.433266
7,11,Researchers Created an App That Monitors Your ...,Even though I appreciate that its for a good c...,https://gizmodo.com/researchers-created-an-app...,https://i.kinja-img.com/gawker-media/image/upl...,0.445912
8,8,Hitting the Books: How to huck a human into lo...,The phrase orbital mechanics—like nuclear part...,https://www.engadget.com/hitting-the-books-shu...,https://o.aolcdn.com/images/dims?resize=1200%2...,0.473907
9,18,Unlock American Netflix with this discounted VPN,TL;DR: A three-year subscription to Surfshark ...,https://mashable.com/uk/shopping/july-13-surfs...,https://mondrian.mashable.com/2020%252F07%252F...,0.496291
