In [1]:
#Question
#1. Word Count

#(1) Use the tweets you collected in Problem 1, and compute the frequencies of the words being used in these tweets.
#(2) Plot a table of the top 30 words with their counts 

#2. Find the most popular Tweet Entities in your collection of tweets

#(1) plot a table of the top 10 hashtags, 
#(2) top 10 user mentions that are the most popular in your collection of tweets.
 

In [1]:
%matplotlib inline
import json
import pandas as pd
from IPython.display import display
from collections import Counter
import matplotlib.pyplot as plt
import re
pd.set_option('display.mpl_style', 'default') #Pretty print graph

#start pre process_tweet  
def preprocesstweet(tweet):        
    # process the tweets
    #Convert to lower case
    tweet = tweet.lower()
    #Convert www.* or https?://* to URL
    tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','url',tweet)
    #trim
    tweet = tweet.strip('\'"')
    return tweet

#start getStopWordList
def getStopWordList(stopWordListFileName):
    #Stop word list source : 
    #https://github.com/ravikiranj/twitter-sentiment-analyzer/blob/master/data/feature_list/stopwords.txt        
    #read the stopwords file and build a list
    stopWords = []
    stopWords.append('url')        

    fp = open(stopWordListFileName, 'r')
    line = fp.readline()
    while line:
        word = line.strip()
        stopWords.append(word)
        line = fp.readline()
    fp.close()
    return stopWords

#Function to convert JSON data to panda Datasets
def jsonto_PandaDataFrame(tweets):
    import pandas as pd
    #Create a instance of Panda Dataframe
    DataSet = pd.DataFrame()    
    #print tweets
    for item in tweets:
        DataSet['word'] = [l[0] for l in tweets]
        DataSet['Count'] = [l[1] for l in tweets]
    return DataSet

def print_frequency_count(item_list):    
    c = Counter(item_list)
    result = c.most_common()[:500]
    #DataSet = jsonto_PandaDataFrame(result)
    return result

def get_word_frequency(tweets_data):
    #Get the words filtered with stop words and process the frequency.
    stopWords = getStopWordList('StopWordList.txt')
    status_texts = [ status['text'] 
                    for status in tweets_data ]

    screen_names = [ user_mention['screen_name']
                    for status in tweets_data
                        for user_mention in status['entities']['user_mentions'] ]

    hashtags = [ hashtag['text'] 
             for status in tweets_data
                 for hashtag in status['entities']['hashtags'] ]

    # Compute a collection of all words from all tweets
    words = [ w.lower()
             for t in status_texts                  
              for w in preprocesstweet(t).split() if not w.startswith(('#', '@')) if w.lower() not in stopWords ]

    return words,hashtags,screen_names


def plot_graph(tzs):
    # Create a bar-graph figure of the specified size
    plt.rcParams['figure.figsize'] = (15, 5)
    # Plot the Time Zone data as a bar-graph
    tzs.plot(kind='bar')
    # Assign labels and title to the graph to make it more presentable
    plt.xlabel('Words')
    plt.ylabel('Count')
    plt.title('Top 10 retweets')
    
fls = ['condolence.json','R_I_P.json','RIP.json','restInPeace.json']
RT_tweets = []
total_tweets = 0
total_words,total_ht,total_sn = [],[],[]
for f in fls: 
    tweets_data = []
    print "Analysing Data from File - " + f
    tweets_data_path = f #'condolence.json'    
    tweets_file = open(tweets_data_path, "r")
    for line in tweets_file:
        try:
            tweet = json.loads(line)        
            tweets_data.append(tweet)
        except:
            continue
    total_tweets += len(tweets_data)
    words,ht,sn = get_word_frequency(tweets_data)
    w_result = print_frequency_count(words)
    ht_result = print_frequency_count(ht)
    sn_result = print_frequency_count(sn)    
    
    total_words += w_result
    total_ht += ht_result
    total_sn += sn_result

print "Total Tweet Count : "+str(total_tweets)

print "***** PRINTING TOP 30 WORD COUNT *****"
DataSet = jsonto_PandaDataFrame(total_words)
display(DataSet.head(30))

print "***** PRINTING TOP 10 HASHTAG COUNT *****"
DataSet = jsonto_PandaDataFrame(total_ht)
display(DataSet.head(10))

print "***** PRINTING TOP 10 SCREEN NAMES COUNT *****"
DataSet = jsonto_PandaDataFrame(total_sn)
display(DataSet.head(10))
   


Analysing Data from File - condolence.json
Analysing Data from File - R_I_P.json
Analysing Data from File - RIP.json
Analysing Data from File - restInPeace.json
Total Tweet Count : 360011
***** PRINTING TOP 30 WORD COUNT *****


Unnamed: 0,word,Count
0,condolence,79478
1,prayers,13350
2,allah,13200
3,hang.,12900
4,strongly,12900
5,condemn,12900
6,injured,12900
7,people.,12900
8,culprits,12900
9,martyrs,12900


***** PRINTING TOP 10 HASHTAG COUNT *****


Unnamed: 0,word,Count
0,Quetta,12900
1,QuettaBlast,12900
2,ALDUBBoojieWonderLand,9322
3,MarkFarren,3000
4,ALDUBPangalawangPagsubok,2850
5,Saudi,2204
6,Shia,2204
7,TerryWogan,900
8,auspol,900
9,ABSCBNZEROINTEGRITY,750


***** PRINTING TOP 10 SCREEN NAMES COUNT *****


Unnamed: 0,word,Count
0,TanzeelSHK,12900
1,me_gicana28,6636
2,derrycityfc,6450
3,aldenrichards02,2606
4,mainedcm,2606
5,Abunass3r,2204
6,MamuStefie2325,1650
7,EatBulaga,1240
8,ALDUBabaji,1200
9,RajivPratapRudy,1200
