This notebook demonstrates sentiment analysis and topic modeling on a selection of tweets from Twitter. The tweets were pulled on October 16, 2017, via the Twitter API, using the search term "Tesla" (the car company).  The tweets were tokenized, evaluated for positive/negative sentiment and broken down by topics and further analyzed (e.g., for hashtags).   

In [3]:
import tweepy
import numpy as np
import pandas as pd
from collections import defaultdict, Counter 

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer

import json
import time

Collect tweets and store them in a dataframe 

In [5]:
# Twitter API credentials.
# Values are read from environment variables so that real secrets never need to
# be saved inside the notebook; the placeholder strings are only fallbacks and
# must be overridden (via the environment or by editing them) before downloading.
import os

access_token = os.environ.get('TWITTER_ACCESS_TOKEN', 'ACCESS TOKEN')
access_token_secret = os.environ.get('TWITTER_ACCESS_TOKEN_SECRET', 'ACCESS TOKEN SECRET')
consumer_key = os.environ.get('TWITTER_CONSUMER_KEY', 'CONSUMER KEY')
consumer_secret = os.environ.get('TWITTER_CONSUMER_SECRET', 'CONSUMER SECRET')

# Build the authenticated client (`api`) that the download cells use.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)

In [6]:
def get_tweets(api, input_query):
    """Lazily yield English-language search results for `input_query`.

    Wraps a `tweepy.Cursor` over `api.search`; being a generator, no work is
    done until the caller starts iterating.
    """
    results = tweepy.Cursor(api.search, q=input_query, lang="en")
    yield from results.items()

# One lazy tweet generator per search term, keyed by the term itself.
input_queries = ['Tesla']
tweets = {term: get_tweets(api, term) for term in input_queries}

In [7]:
# Download up to `download_tweet_count` tweets per query into column-oriented
# lists (`dataset[column]` -> list with one entry per stored tweet).
dataset = defaultdict(list)
download_tweet_count = 1500
retry_sleep_seconds = 15 * 60   # wait between retries (15 minutes)
max_consecutive_failures = 5    # stop retrying a query instead of sleeping forever


def _tweet_to_record(tweet, topic):
    """Flatten one tweepy status object into a {column: value} dict.

    The whole record is built before anything is stored, so a tweet that is
    missing a field can never leave the columns of `dataset` with unequal lengths.
    """
    entities = tweet.entities
    mentions = entities['user_mentions']
    return {
        'topic': topic,
        'id': tweet.id,
        # user related information
        'username': tweet.author.screen_name,
        'name': tweet.author.name,
        'user_followers_count': tweet.author.followers_count,
        'user_friends_count': tweet.author.friends_count,
        # tweet related information
        'text': tweet.text,
        'created_at': tweet.created_at.strftime("%Y-%m-%d %H:%M:%S"),
        'favorite_count': tweet.favorite_count,
        'retweet_count': tweet.retweet_count,
        # entity lists are flattened to comma-separated strings
        'hashtags': ','.join(ht['text'] for ht in entities['hashtags']),
        'mentioned_urls': ','.join(url['url'] for url in entities['urls']),
        'mentioned_user_ids': ','.join(m['id_str'] for m in mentions),
        'mentioned_user_names': ','.join(m['screen_name'] for m in mentions),
    }


for input_query in input_queries:
    print(input_query)
    counter = 0
    failures = 0
    seen_ids = set()
    while counter < download_tweet_count:
        try:
            tweet = next(tweets[input_query])
        except StopIteration:
            # The search has no more results; retrying would never produce any.
            print('No more results for ' + input_query + ' after ' + str(counter) + ' tweets')
            break
        except Exception as exc:
            # Presumably a rate-limit or network error raised by the API client.
            # KeyboardInterrupt is not an Exception, so the cell stays interruptible.
            failures += 1
            print(len(dataset['id']))
            if failures >= max_consecutive_failures:
                print('Giving up on ' + input_query + ': ' + repr(exc))
                break
            print('Sleeping for 15 minutes')
            time.sleep(retry_sleep_seconds)
            # A generator that raised is finished for good, so start a fresh one;
            # `seen_ids` below skips results that were already stored.
            tweets[input_query] = get_tweets(api, input_query)
            continue
        failures = 0
        if tweet.id in seen_ids:
            continue
        seen_ids.add(tweet.id)
        try:
            record = _tweet_to_record(tweet, input_query)
        except (AttributeError, KeyError) as exc:
            print('Skipping malformed tweet ' + str(tweet.id) + ': ' + repr(exc))
            continue
        for column, value in record.items():
            dataset[column].append(value)
        counter += 1

Tesla


In [8]:
# Turn the collected columns into a dataframe and preview a handful of fields
preview_columns = ['created_at', 'text', 'hashtags', 'username',
                   'user_followers_count', 'topic']
df_Tesla = pd.DataFrame.from_dict(dataset)
df_Tesla[preview_columns].head()

Unnamed: 0,created_at,text,hashtags,username,user_followers_count,topic
0,2017-10-16 23:54:32,This F-350 might as well be a tv show — oh for...,,tesla_ebooks,67,Tesla
1,2017-10-16 23:54:31,Tesla unveils a brand new dual-charging port f...,,uohanalilly,9116,Tesla
2,2017-10-16 23:54:27,RT @Forbes: 4 takeaways from Tesla firing hund...,,LB_Hudson,648,Tesla
3,2017-10-16 23:54:26,@pcanella It’s so obvious haha. Like today I w...,,toddbodene,550,Tesla
4,2017-10-16 23:54:18,#pinkpelicanshoutout to @Team5937 Renaissance ...,pinkpelicanshoutout,PinkPelicansFTC,138,Tesla


Initial Sentiment Analysis 

In [9]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [10]:
# Vader 'compound' score for every tweet, stored as a column, plus its mean
sid = SentimentIntensityAnalyzer()
vader_compound = [sid.polarity_scores(text)['compound'] for text in df_Tesla.text]
df_Tesla['Vader_Compound'] = vader_compound
vader_compound_average = df_Tesla['Vader_Compound'].mean()
vader_compound_average

0.05881499999999996

In [11]:
# Vader 'neg' score for every tweet, stored as a column, plus its mean
sid = SentimentIntensityAnalyzer()
vader_neg = list(map(lambda text: sid.polarity_scores(text)['neg'], df_Tesla.text))
df_Tesla['Vader_Negative'] = vader_neg
vader_neg_average = df_Tesla['Vader_Negative'].mean()
vader_neg_average

0.04759

In [12]:
# Vader 'neu' score for every tweet, stored as a column, plus its mean
sid = SentimentIntensityAnalyzer()
vader_neu = [
    sid.polarity_scores(text)['neu']
    for text in df_Tesla.text
]
df_Tesla['Vader_Neutral'] = vader_neu
vader_neutral_average = df_Tesla['Vader_Neutral'].mean()
vader_neutral_average

0.8825553333333345

In [13]:
# Vader 'pos' score for every tweet, stored as a column, plus its mean
sid = SentimentIntensityAnalyzer()
vader_pos = [scores['pos'] for scores in map(sid.polarity_scores, df_Tesla.text)]
df_Tesla['Vader_Positive'] = vader_pos
vader_positive_average = df_Tesla['Vader_Positive'].mean()
vader_positive_average

0.06985066666666667

In [14]:
# Print the four averages as one report, one score per line
summary_lines = [
    'Vader averages for Tesla Oct 16 tweets: ',
    'Vader compound: ' + str(vader_compound_average),
    'Vader negative: ' + str(vader_neg_average),
    'Vader neutral:  ' + str(vader_neutral_average),
    'Vader positive: ' + str(vader_positive_average),
    '',  # the report ends with a newline of its own, before print's
]
print('\n'.join(summary_lines))

Vader averages for Tesla Oct 16 tweets: 
Vader compound: 0.05881499999999996
Vader negative: 0.04759
Vader neutral:  0.8825553333333345
Vader positive: 0.06985066666666667



Based on analysis of the raw text, the tweets mentioning Tesla are mostly neutral in tone, with slightly more positive than negative sentiment expressed. The tweets were pulled at a time when Tesla was in the news for employee layoffs and production delays, as well as for an effort to help Puerto Rico recover from Hurricane Maria by shipping its Powerpacks to hospitals and other critical facilities.

In [15]:
# Pull the raw tweet text out of the dataframe as an array and spot-check one entry
all_tweets = df_Tesla.text.values
print(all_tweets[2])

RT @Forbes: 4 takeaways from Tesla firing hundreds of people: https://t.co/HwarKVcEEJ https://t.co/ZlcuJBHpfx


In [16]:
# Text tokenization: lowercase, tokenize with TweetTokenizer, then delete every
# character in string.punctuation and re-split on whitespace
import string
exclude = set(string.punctuation)
strip_punctuation = str.maketrans('', '', string.punctuation)
tokenizer = TweetTokenizer()
tokenized_all_tweets = [
    ' '.join(tokenizer.tokenize(tweet.lower())).translate(strip_punctuation).split()
    for tweet in all_tweets
]
print(tokenized_all_tweets[2])

['rt', 'forbes', '4', 'takeaways', 'from', 'tesla', 'firing', 'hundreds', 'of', 'people', 'httpstcohwarkvceej', 'httpstcozlcujbhpfx']


In [17]:
# Stop-word removal
sws = set(stopwords.words('english'))
sws.update(['rt', u'\u2026', u'\u2019'])  # tweet-specific terms removed as well
sws_removed_all_tweets = [
    [token for token in sent if token not in sws]
    for sent in tokenized_all_tweets
]
print(sws_removed_all_tweets[2])

['forbes', '4', 'takeaways', 'tesla', 'firing', 'hundreds', 'people', 'httpstcohwarkvceej', 'httpstcozlcujbhpfx']


In [18]:
# Word stemming
# Explicit import (rather than `import *`) so that only the stemmer class is
# added to the notebook namespace.
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
# One list of stemmed tokens per tweet, in the same order as the input.
stemmed = [[stemmer.stem(token) for token in sent] for sent in sws_removed_all_tweets]
print(stemmed[2])


['forb', '4', 'takeaway', 'tesla', 'fire', 'hundr', 'peopl', 'httpstcohwarkvceej', 'httpstcozlcujbhpfx']


In [19]:
# Store each tweet's stemmed token list in a new 'tokens' column
df_Tesla['tokens'] = pd.Series(stemmed, index=df_Tesla.index)
df_Tesla.head(2)

Unnamed: 0,created_at,favorite_count,hashtags,id,mentioned_urls,mentioned_user_ids,mentioned_user_names,name,retweet_count,text,topic,user_followers_count,user_friends_count,username,Vader_Compound,Vader_Negative,Vader_Neutral,Vader_Positive,tokens
0,2017-10-16 23:54:32,0,,920075519177121792,,,,alseT,0,This F-350 might as well be a tv show — oh for...,Tesla,67,10,tesla_ebooks,0.2732,0.0,0.884,0.116,"[f, 350, might, well, tv, show, —, oh, forev, ..."
1,2017-10-16 23:54:31,0,,920075513309335553,https://t.co/XxAk0viIQW,,,Uzzi Ohana,0,Tesla unveils a brand new dual-charging port f...,Tesla,9116,7114,uohanalilly,0.0,0.0,1.0,0.0,"[tesla, unveil, brand, new, dualcharg, port, c..."


Topic Modeling 

In [None]:
# Gensim Library: build the token dictionary from the stop-word-filtered
# (unstemmed) token lists
from gensim import corpora, models
dictionary = corpora.Dictionary(documents=sws_removed_all_tweets)

In [21]:
# Encode every tweet's token list as a bag-of-words via the dictionary
corpus_Tesla = list(map(dictionary.doc2bow, sws_removed_all_tweets))

In [22]:
# Spot-check the bag-of-words encoding of the third tweet
print(corpus_Tesla[2])

[(14, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1)]


In [23]:
# LDA Model
# Training is stochastic; a fixed seed keeps the topic numbering (0/1/2) that the
# later cells and the narrative refer to stable across re-runs.
lda_random_state = 42
ldamodel_Tesla = models.ldamodel.LdaModel(corpus_Tesla, num_topics=3, id2word=dictionary,
                                          passes=20, random_state=lda_random_state)

In [24]:
# Show 7 words (with their weights) for each of the 3 topics
ldamodel_Tesla.print_topics(num_topics=3, num_words=7)

[(0,
  '0.072*"tesla" + 0.047*"puerto" + 0.045*"rico" + 0.032*"powerpacks" + 0.027*"reportedly" + 0.023*"shipped" + 0.008*"trump"'),
 (1,
  '0.049*"tesla" + 0.011*"car" + 0.011*"time" + 0.010*"company" + 0.010*"work" + 0.010*"technology" + 0.009*"nothing"'),
 (2,
  '0.069*"tesla" + 0.016*"hundreds" + 0.009*"model" + 0.009*"workers" + 0.007*"people" + 0.006*"solar" + 0.006*"city"')]

In [25]:
# Topic distribution of every tweet, as returned by the LDA model for its
# bag-of-words; the primary topic is picked from these in the next step
topic_ratings_Tesla_tweets = [
    ldamodel_Tesla.get_document_topics(bow) for bow in corpus_Tesla
]
topic_ratings_Tesla_tweets[0:5]

[[(0, 0.76020290426846937),
  (1, 0.21669543729371346),
  (2, 0.023101658437817273)],
 [(0, 0.035495829775179739),
  (1, 0.036484506967565257),
  (2, 0.92801966325725493)],
 [(0, 0.033880882928287989),
  (1, 0.033757698872328205),
  (2, 0.93236141819938378)],
 [(0, 0.95131256434257139),
  (1, 0.023342839698279796),
  (2, 0.025344595959148775)],
 [(0, 0.024284461334445454),
  (1, 0.95143490815363896),
  (2, 0.024280630511915702)]]

In [26]:
from operator import itemgetter

# Reduce each tweet's (topic, score) pairs to the topic with the highest score
by_score = itemgetter(1)
topic_codes_Tesla_tweets = [
    max(ratings, key=by_score)[0] for ratings in topic_ratings_Tesla_tweets
]

topic_codes_Tesla_tweets[:5]
    

[0, 2, 2, 0, 1]

In [27]:
# Record each tweet's primary topic code in a new 'topic_codes' column
df_Tesla['topic_codes'] = pd.Series(topic_codes_Tesla_tweets, index=df_Tesla.index)
df_Tesla.head()

Unnamed: 0,created_at,favorite_count,hashtags,id,mentioned_urls,mentioned_user_ids,mentioned_user_names,name,retweet_count,text,topic,user_followers_count,user_friends_count,username,Vader_Compound,Vader_Negative,Vader_Neutral,Vader_Positive,tokens,topic_codes
0,2017-10-16 23:54:32,0,,920075519177121792,,,,alseT,0,This F-350 might as well be a tv show — oh for...,Tesla,67,10,tesla_ebooks,0.2732,0.0,0.884,0.116,"[f, 350, might, well, tv, show, —, oh, forev, ...",0
1,2017-10-16 23:54:31,0,,920075513309335553,https://t.co/XxAk0viIQW,,,Uzzi Ohana,0,Tesla unveils a brand new dual-charging port f...,Tesla,9116,7114,uohanalilly,0.0,0.0,1.0,0.0,"[tesla, unveil, brand, new, dualcharg, port, c...",2
2,2017-10-16 23:54:27,0,,920075497945686016,https://t.co/HwarKVcEEJ,91478624.0,Forbes,L.B. Hudson,19,RT @Forbes: 4 takeaways from Tesla firing hund...,Tesla,648,361,LB_Hudson,-0.34,0.194,0.806,0.0,"[forb, 4, takeaway, tesla, fire, hundr, peopl,...",2
3,2017-10-16 23:54:26,0,,920075493059072001,,8685732.0,pcanella,Todd Bodene,0,@pcanella It’s so obvious haha. Like today I w...,Tesla,550,738,toddbodene,0.722,0.0,0.792,0.208,"[pcanella, obviou, haha, like, today, follow, ...",0
4,2017-10-16 23:54:18,0,pinkpelicanshoutout,920075460389801984,https://t.co/DafTRy34qL,2170839804.0,Team5937,PinkPelicanRobotics,0,#pinkpelicanshoutout to @Team5937 Renaissance ...,Tesla,138,230,PinkPelicansFTC,0.5229,0.0,0.816,0.184,"[pinkpelicanshoutout, team5937, renaiss, robot...",1


In [33]:
# Mean Vader compound score of the tweets whose primary topic code is 0
# (reported as "Topic 1"), followed by that topic's words
df1 = df_Tesla[df_Tesla['topic_codes'] == 0]

sid = SentimentIntensityAnalyzer()
vader_totals = [sid.polarity_scores(text)['compound'] for text in df1.text]
vader_compound = np.array(vader_totals)
print('Vader compound score for Topic 1: ' + str(vader_compound.mean()) + '\n')
print('Topic words :', ldamodel_Tesla.print_topic(0))

Vader compound score for Topic 1: 0.0272567692308

Topic words : 0.072*"tesla" + 0.047*"puerto" + 0.045*"rico" + 0.032*"powerpacks" + 0.027*"reportedly" + 0.023*"shipped" + 0.008*"trump" + 0.008*"elon" + 0.007*"donald" + 0.007*"shipping"


In [34]:
# Mean Vader compound score of the tweets whose primary topic code is 1
# (reported as "Topic 2"), followed by that topic's words
df2 = df_Tesla[df_Tesla['topic_codes'] == 1]

sid = SentimentIntensityAnalyzer()
vader_totals = [scores['compound'] for scores in map(sid.polarity_scores, df2.text)]
vader_compound = np.array(vader_totals)
print('Vader compound score for Topic 2: ' + str(vader_compound.mean()) + '\n')
print('Topic words :', ldamodel_Tesla.print_topic(1))

Vader compound score for Topic 2: 0.209445757576

Topic words : 0.049*"tesla" + 0.011*"car" + 0.011*"time" + 0.010*"company" + 0.010*"work" + 0.010*"technology" + 0.009*"nothing" + 0.009*"waste" + 0.009*"hard" + 0.009*"marvel"


In [35]:
# Mean Vader compound score of the tweets whose primary topic code is 2
# (reported as "Topic 3"), followed by that topic's words
df3 = df_Tesla[df_Tesla['topic_codes'] == 2]

sid = SentimentIntensityAnalyzer()
vader_totals = [
    sid.polarity_scores(text)['compound']
    for text in df3.text
]
vader_compound = np.array(vader_totals)
print('Vader compound score for Topic 3: ' + str(vader_compound.mean()) + '\n')
print('Topic words :', ldamodel_Tesla.print_topic(2))

Vader compound score for Topic 3: 0.00267019230769

Topic words : 0.069*"tesla" + 0.016*"hundreds" + 0.009*"model" + 0.009*"workers" + 0.007*"people" + 0.006*"solar" + 0.006*"city" + 0.005*"firing" + 0.005*"employees" + 0.005*"spacex"


Not surprisingly, the third topic, which captures the employee layoffs, had the lowest sentiment score.  The first topic, capturing the sending of Powerpacks to Puerto Rico, was more positive. The second topic, which seems to capture Tesla's general innovation, yielded the highest sentiment score.  See below for the most common hashtags by topic, which add further context to the above.    

In [31]:
# Count most common hashtags by topic 

In [36]:
# Each `hashtags` cell is the tweet's tags joined with ',' ('' when it has none),
# so the tags are recovered with a plain split. A word tokenizer is not a reliable
# inverse of that join: it does not always split on commas (e.g. before a digit)
# and can break a single tag into several tokens.
hashtags = [tag for joined in df1.hashtags for tag in joined.split(',') if tag]
most_common_hashtags = Counter(hashtags).most_common(10)

print ("Top 10 Hashtags in Topic 1: ")
print('Topic words :', ldamodel_Tesla.print_topic(0)) 

print ('='* 60)
# Tabulate the (hashtag, count) pairs for display
rslt = pd.DataFrame(most_common_hashtags, columns=['Hashtag', 'Count']).set_index('Hashtag')
print(rslt)
print ('='* 60)

Top 10 Hashtags in Topic 1: 
Topic words : 0.072*"tesla" + 0.047*"puerto" + 0.045*"rico" + 0.032*"powerpacks" + 0.027*"reportedly" + 0.023*"shipped" + 0.008*"trump" + 0.008*"elon" + 0.007*"donald" + 0.007*"shipping"
             Count
Hashtag           
Tesla           41
tesla            8
business         5
ElonMusk         5
PuertoRico       5
science          4
powerwall        4
feedly           4
TeslaModel3      4
Model3           4


In [37]:
# Each `hashtags` cell is the tweet's tags joined with ',' ('' when it has none),
# so the tags are recovered with a plain split. A word tokenizer is not a reliable
# inverse of that join: it does not always split on commas (e.g. before a digit)
# and can break a single tag into several tokens.
hashtags = [tag for joined in df2.hashtags for tag in joined.split(',') if tag]
most_common_hashtags = Counter(hashtags).most_common(10)

print ("Top 10 Hashtags in Topic 2: ")
print('Topic words :', ldamodel_Tesla.print_topic(1)) 

print ('='* 60)
# Tabulate the (hashtag, count) pairs for display
rslt = pd.DataFrame(most_common_hashtags, columns=['Hashtag', 'Count']).set_index('Hashtag')
print(rslt)
print ('='* 60)

Top 10 Hashtags in Topic 2: 
Topic words : 0.049*"tesla" + 0.011*"car" + 0.011*"time" + 0.010*"company" + 0.010*"work" + 0.010*"technology" + 0.009*"nothing" + 0.009*"waste" + 0.009*"hard" + 0.009*"marvel"
             Count
Hashtag           
Tesla           14
ElectricGT       3
TeslaModelS      3
techAU           3
motorsport       3
lawyer           3
legaljobs        3
teslamodels      3
debate           3
Dominica         2


In [38]:
# Each `hashtags` cell is the tweet's tags joined with ',' ('' when it has none),
# so the tags are recovered with a plain split. A word tokenizer is not a reliable
# inverse of that join: it does not always split on commas (e.g. before a digit)
# and can break a single tag into several tokens.
hashtags = [tag for joined in df3.hashtags for tag in joined.split(',') if tag]
most_common_hashtags = Counter(hashtags).most_common(10)

print ("Top 10 Hashtags in Topic 3: ")
print('Topic words :', ldamodel_Tesla.print_topic(2)) 

print ('='* 60)
# Tabulate the (hashtag, count) pairs for display
rslt = pd.DataFrame(most_common_hashtags, columns=['Hashtag', 'Count']).set_index('Hashtag')
print(rslt)
print ('='* 60)

Top 10 Hashtags in Topic 3: 
Topic words : 0.069*"tesla" + 0.016*"hundreds" + 0.009*"model" + 0.009*"workers" + 0.007*"people" + 0.006*"solar" + 0.006*"city" + 0.005*"firing" + 0.005*"employees" + 0.005*"spacex"
             Count
Hashtag           
Tesla           45
Model3           8
model3           8
tesla            7
smallcap         7
tech             5
electriccar      4
China            4
AI               4
PuertoRico       4
