In [18]:
%matplotlib
import matplotlib.pyplot as plt
import json

# Path to the mined tweets; the loop below expects one JSON document per line.
TWEETS_DATA_PATH = "tweet_mining.json"

# Parsed statuses, in file order.
results = []
# Number of non-blank lines that could not be parsed as JSON.
skipped_lines = 0

# Decode explicitly as UTF-8 (the standard JSON text encoding) rather than
# relying on the platform's locale default, which can mangle or reject the
# non-ASCII text found in multilingual tweets.
with open(TWEETS_DATA_PATH, encoding="utf-8") as tweet_file:
    for tweet_line in tweet_file:
        # Blank lines carry no data; skip them without counting as errors.
        if not tweet_line.strip():
            continue
        try:
            status = json.loads(tweet_line)
        except ValueError:
            # Malformed line: keep loading (best effort) but remember it so
            # data loss is visible instead of silently swallowed.
            skipped_lines += 1
        else:
            results.append(status)

print(len(results))
if skipped_lines:
    print("Skipped {} malformed line(s)".format(skipped_lines))

Using matplotlib backend: MacOSX
500


In [19]:
import pandas

def _country_of(tweet):
    """Return the country of the tweet's place, or "N/A" when it has no place."""
    place = tweet['place']
    if not place:
        return "N/A"
    return place['country']


# Start from an empty frame and add one column per extracted field.
statuses = pandas.DataFrame()

# Column 1: the raw text of every tweet
statuses['text'] = [tweet['text'] for tweet in results]
# Column 2: the language code attached to every tweet
statuses['lang'] = [tweet['lang'] for tweet in results]
# Column 3: the country of the tweet's place, with "N/A" as the fallback
statuses['country'] = [_country_of(tweet) for tweet in results]

# Preview the first rows of the frame
print(statuses.head())

                                                text lang country
0  As a kdrama fan sa lahat nang napanood ko lagi...   tl     N/A
1  RT @RejiYates: @MikezAFC_ Happy Birthday, have...   en     N/A
2  RT @ChinaPlusNews: Opinion: The EU and China s...   en     N/A
3  RT @godblessbrendon: happy #OnlineFriendApprec...   en     N/A
4  RT @itz_amarah: Hey there,\n\nDo you know anyo...   en     N/A


In [20]:
# Tally how often each language and each country occurs among the tweets;
# value_counts() returns the distinct values with their frequencies.
tweets_by_lang, tweets_by_country = (
    statuses[column].value_counts() for column in ('lang', 'country')
)

# Show both frequency tables, languages first
for tally in (tweets_by_lang, tweets_by_country):
    print(tally)

en     452
tl      13
ja      11
und      7
es       4
th       4
ko       3
in       2
tr       1
pt       1
ta       1
fr       1
Name: lang, dtype: int64
N/A               493
United Kingdom      4
Nederland           1
United States       1
South Africa        1
Name: country, dtype: int64


In [21]:
# Colours shared by both charts
AXIS_COLOR = '#666666'   # axis labels, tick labels and spines
TITLE_COLOR = '#aaaaaa'  # subplot titles
BAR_COLOR = '#FF7A00'    # bars


def plot_top_counts(ax, counts, title, xlabel, ylabel='Number of tweets', top_n=10):
    """Draw the first `top_n` entries of `counts` as a styled bar chart on `ax`.

    Parameters
    ----------
    ax : matplotlib Axes to draw on.
    counts : pandas Series of counts; the first `top_n` entries are plotted
        in their existing order (value_counts() output is sorted descending
        by default).
    title : text shown above the chart.
    xlabel : label for the x axis (the categories).
    ylabel : label for the y axis (the counts).
    top_n : maximum number of bars to draw.
    """
    # Plot first and style afterwards, so that any axis label pandas derives
    # from the Series' index name does not replace the labels set below.
    counts.head(top_n).plot(ax=ax, kind='bar', color=BAR_COLOR)

    # style the title
    ax.set_title(title, fontsize=15, color=TITLE_COLOR)

    # style the axes: x label names the categories, y label names the counts
    # (previously both calls used set_xlabel, leaving the y axis unlabelled)
    ax.set_xlabel(xlabel, fontsize=15, color=AXIS_COLOR)
    ax.set_ylabel(ylabel, fontsize=15, color=AXIS_COLOR)
    ax.tick_params(axis='x', labelsize=15, colors=AXIS_COLOR)
    ax.tick_params(axis='y', labelsize=10, colors=AXIS_COLOR)

    # color the spines (border)
    for spine in ax.spines.values():
        spine.set_edgecolor(AXIS_COLOR)


# create our drawing space (figure)
fig = plt.figure()
# leave vertical room between the two charts
fig.subplots_adjust(hspace=.9)

# prepare to plot two charts on the same figure, stacked vertically
ax1 = fig.add_subplot(2, 1, 1)
ax2 = fig.add_subplot(2, 1, 2)

# plot the top 10 tweet languages and the top 10 tweet countries
plot_top_counts(ax1, tweets_by_lang, 'Top 10 languages', 'Tweet languages')
plot_top_counts(ax2, tweets_by_country, 'Top 10 Countries', 'Tweet Countries')

# render the two graphs at once
plt.show()