In [None]:
import tweepy
import pandas as pd
import numpy as np

In [None]:
from IPython.display import display
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
import sys
# Display the interpreter's current module search path.
sys.path

In [None]:
# Extend the module search path with a local folder (presumably the one that
# holds api_keys.py -- confirm for your setup).
# NOTE(review): this is a machine-specific absolute path; adjust it for your
# environment.
TWITTER_DATA_DIR = 'D:\\Analytics\\Python ML\\Twitter Data'

# Guarded so that re-running this cell does not keep appending duplicates.
if TWITTER_DATA_DIR not in sys.path:
    sys.path.append(TWITTER_DATA_DIR)

In [None]:
# Display the module search path again to check the result of the preceding append.
sys.path

# 1. Creating a Twitter App

In order to extract tweets for later analysis, we need to log in to our Twitter account and create an app. The website to do this is https://apps.twitter.com/. (If you don't know how to do this, you can follow this tutorial video to create an account and an application.)

From the app that we're creating, we will save the following information in a script called api_keys.py (the module imported below):

Consumer Key (API Key)
Consumer Secret (API Secret)
Access Token
Access Token Secret

The reason for creating this extra file is that we want to import only the values of these variables while keeping them out of our main code (our notebook). We are now able to consume Twitter's API. In order to do this, we will create a function that handles authentication with our keys. We will add this function in another code cell and run it:

In [None]:
# We import our access keys by name (instead of `import *`) so it is clear
# which variables come from the api_keys module and nothing else leaks into
# the notebook namespace:
from api_keys import ACCESS_SECRET, ACCESS_TOKEN, CONSUMER_KEY, CONSUMER_SECRET

In [None]:
# API's setup:
def twitter_setup():
    """
    Build an authenticated Tweepy API client.

    Uses the module-level CONSUMER_KEY / CONSUMER_SECRET for the OAuth
    handler and ACCESS_TOKEN / ACCESS_SECRET as the access token pair.

    Returns:
        The tweepy.API object created from that handler.
    """
    handler = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
    handler.set_access_token(ACCESS_TOKEN, ACCESS_SECRET)
    return tweepy.API(handler)

# 2. Tweets extraction

Now that we've created a function to setup the Twitter API, we can use this function to create an "extractor" object. After this, we will use Tweepy's function extractor.user_timeline(screen_name, count) to extract from screen_name's user the quantity of count tweets.

For this analysis, I've chosen @IncomeTaxIndia as the user whose tweets we extract (this is the screen_name passed to user_timeline in the next cell).

In [None]:
# Account to read and how many tweets to request in one call.
# (Per the original note, 200 is the most this particular API call returns.)
SCREEN_NAME = 'IncomeTaxIndia'
TWEET_COUNT = 200

# Authenticated client used to talk to the API:
extractor = twitter_setup()

# Most recent tweets from the account's timeline:
tweets = extractor.user_timeline(screen_name=SCREEN_NAME, count=TWEET_COUNT)
print('No of tweets extracted: ', len(tweets))

In [None]:
# Print the 5 most recent tweets as full objects (every attribute included).
PREVIEW_COUNT = 5
for status in tweets[:PREVIEW_COUNT]:
    print(status)

In [None]:
# Print only the text of the 5 most recent tweets, with blank space between them.
recent_tweets = tweets[:5]
for status in recent_tweets:
    print(status.text, '\n')

# We now have an extractor and extracted data, which is listed in the tweets variable. I must mention at this point that each element in that list is a tweet object from Tweepy

In [None]:
# Type of the container returned by user_timeline (not of an individual tweet):
type(tweets)

# 3. Creating a (pandas) DataFrame

We now have initial information to construct a pandas DataFrame, in order to manipulate the info in a very easy way.

IPython's display function renders an output in a friendly way, and the head method of a dataframe allows us to visualize the first 5 elements of the dataframe (or the first number of elements that are passed as an argument).

In [None]:
# One row per tweet, with the tweet text in a single 'Tweets' column.
tweet_texts = [status.text for status in tweets]
data = pd.DataFrame({'Tweets': tweet_texts})
data

In [None]:
# Preview the first rows of the dataframe (head() shows 5 rows by default).
data.head()

In [None]:
# We display the first 5 elements of the dataframe (head() is called without
# an argument; pass a number, e.g. head(10), to show more rows):
display(data.head())

In [None]:
# Names of the attributes and methods available on a single tweet object:
print(dir(tweets[0]))

# print(dir(tweets)) would instead list the attributes of the `tweets` container itself.

# The interesting part from here is the quantity of metadata contained in a single tweet. If we want to obtain data such as the creation date, or the source of creation, we can access the info with this attributes. 

In [None]:
# Show a selection of metadata fields for the tweet at index 0.
print('Metadata of the recent Tweets, mean at zero index of tweets object:\n')

# (label printed, attribute read from the tweet object), in display order.
metadata_fields = (
    ('Tweet:', 'text'),
    ('Id:', 'id'),
    ('Created At:', 'created_at'),
    ('Source:', 'source'),
    ('Like:', 'favorite_count'),
    ('Retweets:', 'retweet_count'),
    ('Geo:', 'geo'),
    ('Coordinates:', 'coordinates'),
    ('Entities:', 'entities'),
)
for label, attribute in metadata_fields:
    print(label, getattr(tweets[0], attribute))

# 4. Adding relevant info to our dataframe

As we can see, we can obtain a lot of data from a single tweet. But not all of this data is always useful for a specific task. In our case we will just add some of it to our dataframe. For this we will use Python's list comprehensions; a new column is added to the dataframe by simply writing the column name between square brackets and assigning the content. The code goes as follows:

In [None]:
# Kept for reference only (not executed): removes the extra columns from the
# dataframe. The column names must match those added in the next cell.
# del data['Len'], data['ID'], data['Date'], data['Source'], data['Likes'], data['Retweets']
# data

In [None]:
# Add one column per piece of tweet metadata we want to analyse.

# Length of the tweet text, in characters:
data['Len'] = np.array([len(status.text) for status in tweets])

# Remaining columns are copied straight from a tweet attribute
# (dataframe column name -> attribute name), in the order they are added.
column_attributes = {
    'ID': 'id',
    'Date': 'created_at',
    'Source': 'source',
    'Likes': 'favorite_count',
    'Retweets': 'retweet_count',
}
for column, attribute in column_attributes.items():
    data[column] = np.array([getattr(status, attribute) for status in tweets])

data

Now that we have extracted and have the data in a easy-to-handle ordered way, we're ready to do a bit more of manipulation to visualize some plots and gather some statistical data.

# 5. Visualization and basic statistics

1. Averages and popularity

We first want to calculate some basic statistical data, such as the mean of the length of characters of all tweets, the tweet with more likes and retweets, etc.

In [None]:
# Average tweet length, in characters, over all extracted tweets.
mean = data['Len'].mean()
print("The lenght's average in tweets:", mean)

In [None]:
# Highest like count and highest retweet count among the tweets:
fav_max, rt_max = data['Likes'].max(), data['Retweets'].max()

# Row label of the first tweet that reaches each maximum (several tweets may
# tie, so we keep only the first match):
fav_tweet = data.index[data['Likes'] == fav_max][0]
rt_tweet = data.index[data['Retweets'] == rt_max][0]

# Most liked tweet: text, like count and length.
print("The tweet with more likes is: \n{}".format(data.loc[fav_tweet, 'Tweets']))
print("Number of likes: {}".format(fav_max))
print("{} characters.\n".format(data.loc[fav_tweet, 'Len']))

# Most retweeted tweet: text, retweet count and length.
print("The tweet with more retweets is: \n{}".format(data.loc[rt_tweet, 'Tweets']))
print("Number of retweets: {}".format(rt_max))
print("{} characters.\n".format(data.loc[rt_tweet, 'Len']))

This is common, but it won't necessarily happen: the tweet with more likes is the tweet with more retweets. What we're doing is finding the maximum number of likes in the 'Likes' column and the maximum number of retweets in the 'Retweets' column using numpy's max function. With this we just look for the index in each of the two columns that attains the maximum. Since more than one tweet could have the same (maximum) number of likes/retweets, we just take the first one found, and that's why we use .index[0] to assign the index to the variables fav_tweet and rt_tweet. To print the tweet that satisfies this, we access the data in the same way we would access a matrix or any indexed object.

# 6. Time series

Pandas has its own object for time series. Since we have a whole vector with creation dates, we can construct time series respect tweets lengths, likes and retweets.

In [None]:
# Build one time series per metric, each indexed by the tweet creation date.
tweet_dates = data['Date']
tlen, tfav, trt = (
    pd.Series(data=data[column].values, index=tweet_dates)
    for column in ('Len', 'Likes', 'Retweets')
)

And if we want to plot the time series, pandas already has its own method in the object. We can plot a time series as follows:

In [None]:
# Tweet length over time: x-axis is the tweet's creation date (the series
# index), y-axis is the tweet's length in characters; drawn in red.
tlen.plot(figsize=(16,4), color='r')

In [None]:
# Likes and retweets over time, plotted from the same cell; each series gets
# its own legend label so the two lines can be told apart.
tfav.plot(figsize=(16,4), label='Likes', legend=True)
trt.plot(figsize=(16,4), label='Retweets', legend=True)

# 7. Pie charts of sources

We're almost done with this second section of the post. Now we will plot the sources in a pie chart, since we realized that not every tweet is tweeted from the same source

In [None]:
# Distinct tweet sources, kept in the order in which they first appear.
# dict.fromkeys drops repeats while preserving insertion order.
sources = list(dict.fromkeys(data['Source']))

# Numbered listing of the sources, starting at 1:
print('The distinct sources of tweets are:\n')
for position, source in enumerate(sources, start=1):
    print(str(position)+'. '+source)
        

We now count the number of each source and create a pie chart.

In [None]:
# Create a 1-D numpy array with one slot per distinct source, in the same
# order as `sources`, so position i in `percent` corresponds to sources[i].
# np.zeros(len(sources)) initialises every slot to 0.0; the next cell fills it in.
percent = np.zeros(len(sources))
percent

In [None]:
# Count how many tweets came from each distinct source (same order as
# `sources`). The array is rebuilt from scratch, so re-running this cell
# cannot double-count the way an in-place `+=` accumulation would.
percent = np.array(
    [(data['Source'] == source).sum() for source in sources],
    dtype=float,
)

# Convert counts to percentages of all counted tweets. Dividing by the actual
# total (rather than a fixed 100) keeps the result correct for any number of
# extracted tweets; the guard avoids dividing by zero when there are none.
source_total = percent.sum()
if source_total > 0:
    percent = 100 * percent / source_total
percent

In [None]:
# Pie chart of the per-source values in `percent`, one wedge per entry of
# `sources` (used as the wedge labels via the Series index).
# autopct labels each wedge with its share of the whole pie, formatted with
# Python string formatting: with autopct='%.2f' the label is '%.2f' % pct,
# i.e. the wedge's percentage with two decimals.
pie_chart = pd.Series(percent, index=sources, name='Sources')
pie_chart.plot.pie(fontsize=11, autopct='%.2f', figsize=(7,7))

we can see the percentage of tweets per source.

# 8. Sentiment analysis

As we mentioned at the beginning of this post, textblob will allow us to do sentiment analysis in a very simple way. We will also use the re library from Python, which is used to work with regular expressions. For this, I'll provide you two utility functions to: a) clean text (which means that any symbol distinct to an alphanumeric value will be remapped into a new one that satisfies this condition), and b) create a classifier to analyze the polarity of each tweet after cleaning the text in it.

In [None]:
from textblob import TextBlob
import re

# A) Removing Twitter Handles (@user)

As mentioned above, the tweets contain lots of twitter handles (@user), that is how a Twitter user acknowledged on Twitter. We will remove all these twitter handles from the data as they don’t convey much information.

# B) Removing Punctuations, Numbers, and Special Characters

As discussed, punctuations, numbers and special characters do not help much. It is better to remove them from the text just as we removed the twitter handles. Here we will replace everything except characters and hashtags with spaces.

# C) Others

1. Removing Short Words - We have to be a little careful here in selecting the length of the words which we want to remove. So, I have decided to remove all the words having length 3 or less. For example, terms like “hmm”, “oh” are of very little use. It is better to get rid of them.

Tokenization  - Now we will tokenize all the cleaned tweets in our dataset. Tokens are individual terms or words, and tokenization is the process of splitting a string of text into tokens.

2. Stemming - Stemming is a rule-based process of stripping the suffixes (“ing”, “ly”, “es”, “s” etc) from a word. For example, “play”, “player”, “played”, “plays” and “playing” are different variations of the word “play”.

In [None]:
def clean_tweet(tweet):
    '''
    Utility function to clean the text in a tweet by removing
    Twitter handles, links and special characters using regex.

    Parameters:
        tweet (str): raw tweet text.

    Returns:
        str: the text with every match replaced by a space and runs of
        whitespace collapsed to single spaces (no leading/trailing space).
    '''
    # Raw string so the regex escapes (\w, \S, \t) reach the re module
    # untouched instead of being treated as (invalid) Python string escapes.
    # Alternatives, tried left to right at each position:
    #   1. @handles -- underscore included so "@Income_Tax" is removed whole
    #   2. any character that is not a letter, digit, space or tab
    #   3. URLs of the form scheme://rest
    pattern = r"(@[A-Za-z0-9_]+)|([^0-9A-Za-z \t])|(\w+://\S+)"
    return ' '.join(re.sub(pattern, " ", tweet).split())

In [None]:
def analyze_sentiment(tweet):
    '''
    Classify a tweet as positive, neutral or negative using textblob.

    The tweet is cleaned with clean_tweet first; the sign of the resulting
    TextBlob polarity decides the class.

    Returns:
        1 for positive polarity, 0 for exactly neutral, -1 otherwise.
    '''
    polarity = TextBlob(clean_tweet(tweet)).sentiment.polarity
    if polarity == 0:
        return 0  # neutral
    return 1 if polarity > 0 else -1  # positive / negative

The way it works is that textblob already provides a trained analyzer (cool, right?). Textblob can work with different machine learning models used in natural language processing. If you want to train your own classifier (or at least check how it works) feel free to check the following link [https://textblob.readthedocs.io/en/dev/classifiers.html]. This may be relevant, since we're working with a pre-trained model (for which we don't know the data that was used).

In [None]:
# Classify every tweet and store the result (1, 0 or -1) in a new column.
sentiment_labels = list(map(analyze_sentiment, data['Tweets']))
data['Sentiment-Analysis'] = np.array(sentiment_labels)
data

# 9. Analyzing the results

To have a simple way to verify the results, we will count the number of neutral, positive and negative tweets and extract the percentages.

In [None]:
# We construct lists with classified tweets.
# Boolean masks select the rows directly, so the result does not depend on
# the dataframe's index labels matching positions 0..n-1, and no per-row
# Series lookup is needed.
sentiment = data['Sentiment-Analysis']
pos_tweets = data.loc[sentiment > 0, 'Tweets'].tolist()
neu_tweets = data.loc[sentiment == 0, 'Tweets'].tolist()
neg_tweets = data.loc[sentiment < 0, 'Tweets'].tolist()

Now that we have the lists, we just print the percentages:

In [None]:
# Share of each sentiment class, as a percentage of all tweets.
total_tweets = len(data['Tweets'])
print('Percentage of Positive Tweets: ', len(pos_tweets) * 100 / total_tweets)
print('Percentage of Neutral Tweets: ', len(neu_tweets) * 100 / total_tweets)
print('Percentage of Negative Tweets: ', len(neg_tweets) * 100 / total_tweets)

We have to consider that we're working only with the 200 most recent tweets from IncomeTaxIndia. For more accurate results we can consider more tweets. An interesting thing (an invitation to the readers) is to analyze the polarity of the tweets from different sources, it might be deterministic that by only considering the tweets from one source the polarity would result more positive/negative. Anyway, I hope this resulted interesting.

As we saw, we can extract, manipulate, visualize and analyze data in a very simple way with Python. I hope that this leaves the reader with some curiosity for further exploration using these tools.

# Story Generation and Visualization from Tweets

In this section, we will explore the cleaned tweet text. Exploring and visualizing data, no matter whether it's text or any other kind of data, is an essential step in gaining insights. Do not limit yourself to the methods shown in this tutorial; feel free to explore the data as much as possible.

Before we begin exploration, we must think and ask questions related to the data in hand. A few probable questions are as follows:

1. What are the most common words in the entire dataset?
2. What are the most common words in the dataset for negative and positive tweets, respectively?
3. How many hashtags are there in a tweet?
4. Which trends are associated with my dataset?
5. Which trends are associated with either of the sentiments? Are they compatible with the sentiments?

In [None]:
# Concatenate every tweet into one space-separated string, then clean it once.
all_words = ' '.join(data['Tweets'])
clean_all_words = clean_tweet(all_words)
print(clean_all_words)

In [None]:
from wordcloud import WordCloud

# Word cloud over the cleaned text of all tweets; the fixed random_state
# keeps the layout the same from run to run.
wordcloud = WordCloud(
    width=800,
    height=500,
    random_state=21,
    max_font_size=110,
).generate(clean_all_words)

# Render the cloud as an image with the axes hidden.
fig, ax = plt.subplots(figsize=(10, 7))
ax.imshow(wordcloud, interpolation="bilinear")
ax.axis('off')
plt.show()

# 1. Words in +ve sentiment tweets

In [None]:
# Text of the tweets classified as positive (sentiment == 1), joined and cleaned.
is_positive = data['Sentiment-Analysis'] == 1
pos_words = ' '.join(data.loc[is_positive, 'Tweets'])
clean_pos_words = clean_tweet(pos_words)

wordcloud = WordCloud(
    width=800,
    height=500,
    random_state=21,
    max_font_size=110,
).generate(clean_pos_words)

# Render the cloud as an image with the axes hidden.
fig, ax = plt.subplots(figsize=(10, 7))
ax.imshow(wordcloud, interpolation="bilinear")
ax.axis('off')
plt.show()

# 2. Words in neutral sentiment tweets

In [None]:
# Text of the tweets classified as neutral (sentiment == 0), joined and cleaned.
is_neutral = data['Sentiment-Analysis'] == 0
neu_words = ' '.join(data.loc[is_neutral, 'Tweets'])
clean_neu_words = clean_tweet(neu_words)

wordcloud = WordCloud(
    width=800,
    height=500,
    random_state=21,
    max_font_size=110,
).generate(clean_neu_words)

# Render the cloud as an image with the axes hidden.
fig, ax = plt.subplots(figsize=(10, 7))
ax.imshow(wordcloud, interpolation="bilinear")
ax.axis('off')
plt.show()

# 3. Words in -ve sentiment tweets

In [None]:
# Text of the tweets classified as negative (sentiment == -1), joined and cleaned.
neg_words = ' '.join(data.loc[data['Sentiment-Analysis'] == -1, 'Tweets'])
# Clean the *negative* text here (cleaning pos_words would make this cloud
# show the positive tweets' words instead).
clean_neg_words = clean_tweet(neg_words)

if clean_neg_words:
    wordcloud = WordCloud(width=800, height=500, random_state=21, max_font_size=110).generate(clean_neg_words)
    plt.figure(figsize=(10, 7))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis('off')
    plt.show()
else:
    # No tweet was classified as negative, so there are no words to draw
    # (WordCloud is expected to reject empty input -- TODO confirm).
    print('No negative tweets found; skipping the word cloud.')