News sentiment analysis

Sentiment analysis of the Twitter activity of various news outlets, based on the most recent tweets from each account and VADER polarity scores. All commentary is inside the notebook. Three trends stand out:

  1. CNN sends tweets with more negative polarity than the other sources; CBS is the source of the most positive tweets.
  2. CNN and FoxNews have the highest negative rates but also the highest 'Like' rates, which suggests that people like the way these news sources deliver their opinions.
  3. The daily sentiment comparison shows that, quite often, when CBS and BBC post tweets with positive polarity, CNN, NPR and FoxNews lean negative on the same day. This observation needs more data.
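To make the polarity numbers below concrete, here is a minimal illustration of what the VADER analyzer returns; the headline is invented for the example and is not from the collected data.

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()
# Invented example headline, used only to show the shape of the output
scores = analyzer.polarity_scores("Wildfire forces thousands to evacuate their homes")
print(scores)
# -> a dict with 'neg', 'neu', 'pos' (proportions that sum to roughly 1)
#    and 'compound', a normalized overall score between -1 and 1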
import tweepy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns
from datetime import datetime

from config import *  # Twitter API credentials: consumer_key, consumer_secret, access_token, access_token_secret
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()

# Authenticate with the Twitter API; the JSONParser makes every call return plain dicts,
# which is why tweets are indexed as tweet['text'], tweet['id'], etc. below
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth, parser=tweepy.parsers.JSONParser())

# Set the news sources list
targets = ['@BBC', '@CBS', '@CNN', '@FoxNews', '@nytimes', '@NPR']
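The credentials come from a local config.py that is kept out of the repository (hence the .gitignore). Below is a minimal sketch of what it is assumed to contain; the variable names match the ones used in the authentication cell, and the values are placeholders.

# config.py (placeholders only; never commit real keys)
consumer_key = "YOUR_CONSUMER_KEY"
consumer_secret = "YOUR_CONSUMER_SECRET"
access_token = "YOUR_ACCESS_TOKEN"
access_token_secret = "YOUR_ACCESS_TOKEN_SECRET"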
# Get 100 tweets from each account (5 pages of 20 tweets)

# List holding one sentiment record per tweet, across all sources
total_mood = []

for target in targets:

    last_tweet = None   # max_id boundary for pagination
    tweet_counter = 0   # 0 for the newest tweet, then -1, -2, ... going back in time

    for x in range(5):

        all_data = api.user_timeline(target, count=20, max_id=last_tweet, page=x)

        for tweet in all_data:

            emotions = analyzer.polarity_scores(tweet['text'])

            total_mood.append({'source': target,
                               'compound': emotions['compound'],
                               'positive': emotions['pos'],
                               'negative': emotions['neg'],
                               'neutral': emotions['neu'],
                               'tweets_ago': tweet_counter,
                               'text': tweet['text'],
                               'time': tweet['created_at'],
                               'likes': tweet['favorite_count'],
                               'RT': tweet['retweet_count']})

            tweet_counter -= 1

        # Start the next page just below the oldest tweet seen so far
        last_tweet = tweet["id"] - 1
    
len(total_mood)
600
# Create a DataFrame from the collected records (6 sources x 5 pages x 20 tweets = 600 rows)
df = pd.DataFrame(total_mood)
df.head(5)
|   | RT | compound | likes | negative | neutral | positive | source | text | time | tweets_ago |
|---|----|----------|-------|----------|---------|----------|--------|------|------|------------|
| 0 | 9 | 0.0000 | 27 | 0.000 | 1.000 | 0.000 | @BBC | 🌶🤯 A man who ate the world's hottest chilli pe... | Tue Apr 10 16:58:05 +0000 2018 | 0 |
| 1 | 32 | 0.0000 | 0 | 0.000 | 1.000 | 0.000 | @BBC | RT @bbccomedy: Henry of Eight, the Tudor Kim K... | Tue Apr 10 16:34:54 +0000 2018 | -1 |
| 2 | 5 | 0.0000 | 21 | 0.000 | 1.000 | 0.000 | @BBC | 👭 Ten celebrity pairs who look so freakily ali... | Tue Apr 10 16:32:08 +0000 2018 | -2 |
| 3 | 15 | 0.4227 | 56 | 0.150 | 0.514 | 0.336 | @BBC | 😱 That's quite the party trick! \n#Doodlebugs ... | Tue Apr 10 16:02:03 +0000 2018 | -3 |
| 4 | 10 | 0.6705 | 0 | 0.066 | 0.726 | 0.208 | @BBC | RT @TWBBC: "Our liberty is at risk when we han... | Tue Apr 10 15:47:43 +0000 2018 | -4 |
# Look at each news source separately
plt.rcParams.update(plt.rcParamsDefault) # Set default plot style
sentilist = ['compound', 'positive', 'negative']
current_date = datetime.now().date().strftime("%m.%d.%Y")
for i in range(len(sentilist)):
    sns.factorplot(data=df, x="tweets_ago", y=sentilist[i], col="source", hue='source')
    plt.title(f'Sentiment comparison by {sentilist[i]} ({current_date})')
    plt.savefig(f'Output/Sentiment_comparison_by_{sentilist[i]}_on_{current_date}.png')
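A portability note, not part of the original notebook: in seaborn 0.9 and later, factorplot was renamed to catplot and the default plot kind changed from 'point' to 'strip', so an equivalent call on a newer installation would look roughly like this:

# Equivalent of the factorplot call above for seaborn >= 0.9
sns.catplot(data=df, x="tweets_ago", y=sentilist[i], col="source", hue="source", kind="point")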

From these graphs we can see that CBS and BBC are generally more positive than the other sources, while CNN is mostly on the negative side.

# Let's take a closer look at the distribution of each sentiment for each source, using bar charts.

plt.figure(figsize=(13,9))
for x in range(len(sentilist)):
    plt.subplot(2,2,x+1)
    ax = sns.barplot('source', sentilist[x], data=df, linewidth=1, edgecolor=".1")
    ax.set_title(f'Average {sentilist[x]} rate for each news source {current_date}')
    ax.set_xlabel('')
    ax.set_xticklabels(ax.get_xticklabels(), rotation=15)
    # Label each bar with its average value, just above the x-axis
    for p in ax.patches:
        ax.text(p.get_x()+p.get_width()/2., p.get_height()*0.1, '{:1.3f}'.format(p.get_height()), ha="center")
plt.savefig(f'Output/Sentiment_distribution_by_source_on_{current_date}.png')

Our assumptions are confirmed: CBS leads in positive tweets, with BBC in second place. The NY Times and NPR stay close to a neutral average score. CNN has the highest negative rate, and FoxNews is the second largest source of negative tweets.

It is interesting to see which source has more likes and retweets.

newlist = ['likes', 'RT']
plt.figure(figsize=(10,5))
for x in range(len(newlist)):
    plt.subplot(1,2,x+1)
    ax = sns.barplot('source', newlist[x], data=df, linewidth=1, edgecolor=".1")
    ax.set_title(f'Number of {newlist[x]} for each news source')
    ax.set_xlabel('')
    ax.set_xticklabels(ax.get_xticklabels(), rotation=30)
    # Label each bar with its average value
    for p in ax.patches:
        ax.text(p.get_x()+p.get_width()/2., 10, '{:1.0f}'.format(p.get_height()), ha="center", color='w', weight='bold')
plt.savefig('Output/Likes_and_RT_by_source.png')

An interesting relationship: CNN and FoxNews, with their negative tweets, collect significantly more likes than the positive CBS and BBC. NPR's negative polarity seems to work differently from FoxNews's, because its tweets do not attract many likes. FoxNews and CNN are also the sources with the highest retweet rates, which cannot be said about NPR and CBS. A quick way to quantify this is sketched below.
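As a rough, optional check of that observation (not in the original notebook), the snippet below reuses the df built earlier and compares per-source averages, plus the overall correlation between polarity and likes:

# Per-source averages of polarity, likes and retweets
per_source = df.groupby('source')[['compound', 'likes', 'RT']].mean()
print(per_source.sort_values('likes', ascending=False))

# Pearson correlation between compound polarity and likes across all collected tweets
print(df[['compound', 'likes']].corr())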

Below I want to compare the tweet creation date with the overall sentiment on that date for each source.

Some sources tweet much more often than others, so I adjust the number of tweets requested per source to cover comparable date ranges.

# Per-source page sizes, so that roughly the same date range is covered for every account
tweets_per_page = {'@NPR': 35, '@nytimes': 50, '@CNN': 70,
                   '@FoxNews': 130, '@BBC': 10, '@CBS': 2}

total_mood2 = []

for target in targets:

    last_tweet = None
    tweet_counter = 0

    for x in range(10):

        all_data = api.user_timeline(target, count=tweets_per_page[target],
                                     max_id=last_tweet, page=x)

        for tweet in all_data:

            emotions = analyzer.polarity_scores(tweet['text'])

            total_mood2.append({'source': target,
                                'compound': emotions['compound'],
                                'positive': emotions['pos'],
                                'negative': emotions['neg'],
                                'neutral': emotions['neu'],
                                'tweets_ago': tweet_counter,
                                'text': tweet['text'],
                                'time': tweet['created_at'],
                                'likes': tweet['favorite_count'],
                                'RT': tweet['retweet_count']})

            tweet_counter -= 1

        last_tweet = tweet["id"] - 1
        
len(total_mood2)
2560
df2 = pd.DataFrame(total_mood2)
df2.head(5)
|   | RT | compound | likes | negative | neutral | positive | source | text | time | tweets_ago |
|---|----|----------|-------|----------|---------|----------|--------|------|------|------------|
| 0 | 1 | -0.5106 | 4 | 0.216 | 0.784 | 0.000 | @BBC | Meet Rebekah - a former professional footballe... | Tue Apr 10 18:00:27 +0000 2018 | 0 |
| 1 | 10 | 0.0000 | 29 | 0.000 | 1.000 | 0.000 | @BBC | 🌶🤯 A man who ate the world's hottest chilli pe... | Tue Apr 10 16:58:05 +0000 2018 | -1 |
| 2 | 34 | 0.0000 | 0 | 0.000 | 1.000 | 0.000 | @BBC | RT @bbccomedy: Henry of Eight, the Tudor Kim K... | Tue Apr 10 16:34:54 +0000 2018 | -2 |
| 3 | 5 | 0.0000 | 22 | 0.000 | 1.000 | 0.000 | @BBC | 👭 Ten celebrity pairs who look so freakily ali... | Tue Apr 10 16:32:08 +0000 2018 | -3 |
| 4 | 15 | 0.4227 | 59 | 0.150 | 0.514 | 0.336 | @BBC | 😱 That's quite the party trick! \n#Doodlebugs ... | Tue Apr 10 16:02:03 +0000 2018 | -4 |
# Convert Twitter's date format into plain MM-DD-YY dates and store them in a new column.
# (Originally adapted from a StackOverflow answer.)

import re
datedf = df2.copy()
datedf['conv_time'] = ''
for ind, row in datedf.iterrows():
    twitter_time = row['time']                                # e.g. 'Tue Apr 10 16:58:05 +0000 2018'
    remove_ms = lambda x: re.sub(r"\+\d+\s", "", x)           # drop the '+0000 ' UTC offset
    mk_dt = lambda x: datetime.strptime(remove_ms(x), "%a %b %d %H:%M:%S %Y")  # parse the remainder
    my_form = lambda x: "{:%m-%d-%y}".format(mk_dt(x))        # reformat as MM-DD-YY
    datedf.at[ind, 'conv_time'] = my_form(twitter_time)
datedf.head()
|   | RT | compound | likes | negative | neutral | positive | source | text | time | tweets_ago | conv_time |
|---|----|----------|-------|----------|---------|----------|--------|------|------|------------|-----------|
| 0 | 1 | -0.5106 | 4 | 0.216 | 0.784 | 0.000 | @BBC | Meet Rebekah - a former professional footballe... | Tue Apr 10 18:00:27 +0000 2018 | 0 | 04-10-18 |
| 1 | 10 | 0.0000 | 29 | 0.000 | 1.000 | 0.000 | @BBC | 🌶🤯 A man who ate the world's hottest chilli pe... | Tue Apr 10 16:58:05 +0000 2018 | -1 | 04-10-18 |
| 2 | 34 | 0.0000 | 0 | 0.000 | 1.000 | 0.000 | @BBC | RT @bbccomedy: Henry of Eight, the Tudor Kim K... | Tue Apr 10 16:34:54 +0000 2018 | -2 | 04-10-18 |
| 3 | 5 | 0.0000 | 22 | 0.000 | 1.000 | 0.000 | @BBC | 👭 Ten celebrity pairs who look so freakily ali... | Tue Apr 10 16:32:08 +0000 2018 | -3 | 04-10-18 |
| 4 | 15 | 0.4227 | 59 | 0.150 | 0.514 | 0.336 | @BBC | 😱 That's quite the party trick! \n#Doodlebugs ... | Tue Apr 10 16:02:03 +0000 2018 | -4 | 04-10-18 |
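For reference (not part of the original notebook), the same conversion can be done in a single vectorized step, assuming pandas' parser accepts Twitter's created_at strings, which it normally does via dateutil:

# Vectorized alternative to the row-by-row conversion above
datedf['conv_time'] = pd.to_datetime(df2['time']).dt.strftime('%m-%d-%y')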
# Average the numeric columns per day and per source
grouped_date = datedf.groupby(['conv_time', 'source'])
grouped_df = grouped_date.mean().reset_index('source')
grouped_df.head()
| conv_time | source | RT | compound | likes | negative | neutral | positive | tweets_ago |
|-----------|--------|----|----------|-------|----------|---------|----------|------------|
| 03-07-18 | @CBS | 35.000000 | -0.101150 | 11.500000 | 0.115000 | 0.8095 | 0.075000 | -18.5 |
| 03-16-18 | @CBS | 24.000000 | 0.325700 | 6.000000 | 0.000000 | 0.9110 | 0.089000 | -16.5 |
| 03-20-18 | @BBC | 45.400000 | 0.120520 | 71.200000 | 0.043300 | 0.8609 | 0.095800 | -94.5 |
| 03-20-18 | @CNN | 305.833333 | -0.191357 | 542.666667 | 0.112733 | 0.8464 | 0.040867 | -664.5 |
| 03-21-18 | @CNN | 572.750000 | -0.137655 | 961.850000 | 0.082600 | 0.8785 | 0.038900 | -639.5 |

CBS makes too much noise with its super-positive tweets, so I exclude it from this part of the data.

nocbs = grouped_df.reset_index()
nocbs_df = nocbs[nocbs['source'] !='@CBS'].set_index('conv_time')
nocbs_df.head()
| conv_time | source | RT | compound | likes | negative | neutral | positive | tweets_ago |
|-----------|--------|----|----------|-------|----------|---------|----------|------------|
| 03-20-18 | @BBC | 45.400000 | 0.120520 | 71.200000 | 0.043300 | 0.860900 | 0.095800 | -94.5 |
| 03-20-18 | @CNN | 305.833333 | -0.191357 | 542.666667 | 0.112733 | 0.846400 | 0.040867 | -664.5 |
| 03-21-18 | @CNN | 572.750000 | -0.137655 | 961.850000 | 0.082600 | 0.878500 | 0.038900 | -639.5 |
| 03-21-18 | @NPR | 67.057143 | -0.014871 | 121.342857 | 0.066429 | 0.872429 | 0.061171 | -332.0 |
| 03-21-18 | @nytimes | 162.500000 | 0.004614 | 333.642857 | 0.068643 | 0.873000 | 0.058214 | -492.5 |
plt.rcParams.update(plt.rcParamsDefault)
plt.figure(figsize=(13,20))
for i in range(len(sentilist)):
    plt.subplot(4,1,i+1)
    ax = sns.barplot(x=nocbs_df.index, y=sentilist[i], hue='source', data=nocbs_df, ci=None)
    ax.set_xticklabels(ax.get_xticklabels(),rotation=20)
    ax.set_xlabel('')
    ax.set_title(f'{sentilist[i]} comparison over time on {current_date}')
    ax.grid(ls='dotted', linewidth=1.5)
plt.savefig(f'Output/Daily_sentiment_comparison_on_{current_date}.png')

Generally we see the same trends as before: same-day tweets from BBC are more positive compared to CNN, FoxNews and NPR. These plots need more research; one approach is to pick a specific headline and compare how different sources covered it.

# Finally, an overall scatter of compound polarity by tweet recency (colored dots, what can be better?)
sns.set()
sns.lmplot(x='tweets_ago', y='compound', data=df, hue="source", fit_reg=False, scatter=True, size=10,
           scatter_kws={"s":85})
plt.ylabel('Tweet Polarity')
plt.title(f'Sentiment analysis of media tweets ({current_date})')
plt.savefig(f'Output/Sentiment_analysis_of_media_tweets({current_date}).png')
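One last portability note that is not in the original notebook: in seaborn 0.9 and later, the size argument of lmplot was renamed to height, so on a newer installation the final plot would be created along these lines:

# Equivalent lmplot call for seaborn >= 0.9
sns.lmplot(x='tweets_ago', y='compound', data=df, hue="source", fit_reg=False, scatter=True,
           height=10, scatter_kws={"s": 85})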