In [1]:
from __future__ import print_function
%matplotlib inline
import matplotlib.pylab as plt
import sys, os, glob
import numpy as np

In [2]:
# Apply the style sheet first: a style sheet overwrites every rcParam it
# defines, so notebook-specific overrides must be set afterwards or they
# can be silently discarded.
plt.style.use('fivethirtyeight')
plt.rcParams['figure.figsize'] = (10,6)
plt.rcParams['font.size'] = 18

In [3]:
from pyspark.sql import SQLContext

In [4]:
# Entry point for DataFrame / SQL operations.
# NOTE(review): `sc` is not defined in this notebook - presumably the
# SparkContext injected by the PySpark kernel; confirm before a fresh run.
sqc = SQLContext(sparkContext=sc)

In [5]:
%%time
data = sqc.read.parquet('/user/roskarr/twitter/2014_10')

CPU times: user 5 ms, sys: 0 ns, total: 5 ms
Wall time: 5.95 s


In [6]:
# Mark the DataFrame for caching and run count() over it, timing the call;
# the resulting row count is the cell's displayed output.
%time data.cache().count()

CPU times: user 36 ms, sys: 4 ms, total: 40 ms
Wall time: 3min 28s


162751019

In [154]:
from pyspark.sql.functions import instr, regexp_extract, size, to_date, locate
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

In [134]:
def number_of_hashtags(row):
    """Return how many entries a hashtag collection holds.

    Parameters
    ----------
    row : sized collection or None
        The hashtag entries of one record; ``None`` means the record has
        no hashtag collection at all.

    Returns
    -------
    int
        ``len(row)``, or ``0`` when ``row`` is ``None``.
    """
    return 0 if row is None else len(row)

# Register the counter as a Spark UDF with an explicit integer return type.
# Without a declared type the column takes udf's default return type, and
# numeric comparisons on it (e.g. `> 0`) depend on an implicit cast.
num_hashtags = udf(number_of_hashtags, IntegerType())

In [142]:
# Keep only the timestamp and hashtag list of tweets that carry at least one
# hashtag. The built-in `size` is evaluated by Spark itself, avoiding a
# Python UDF call per row; a null array never yields a size greater than
# zero, so records without a hashtag list are dropped as before.
hashtags = data.select('created_at', 'entities.hashtags').filter(size('hashtags') > 0)

In [143]:
# Preview the first ten filtered rows as a list of Row objects.
hashtags.head(10)

[Row(created_at=u'Wed Oct 01 10:38:00 +0000 2014', hashtags=[Row(indices=[9, 19], text=u'WengerOut')]),
 Row(created_at=u'Wed Oct 01 10:38:00 +0000 2014', hashtags=[Row(indices=[0, 10], text=u'CRISEnaPF'), Row(indices=[27, 33], text=u'Dilma')]),
 Row(created_at=u'Wed Oct 01 10:38:00 +0000 2014', hashtags=[Row(indices=[35, 52], text=u'RT\u3057\u305f\u4eba\u306b\u3076\u3093\u6295\u3052\u305f\u3044\u7269\u3092\u8a00\u3046')]),
 Row(created_at=u'Wed Oct 01 10:38:00 +0000 2014', hashtags=[Row(indices=[0, 7], text=u'\uc0dd\ubc29\uc1a1\uce74\uc9c0\ub178'), Row(indices=[43, 56], text=u'\ubc14\uce74\ub77c\uc0ac\uc774\ud2b8\uc628\ub77c\uc778\ubc14\uce74\ub77c')]),
 Row(created_at=u'Wed Oct 01 10:38:00 +0000 2014', hashtags=[Row(indices=[83, 99], text=u'\u30c9\u30af\u30bf\u30fc\u306b\u8a00\u308f\u308c\u305f\u885d\u6483\u7684\u306a\u8a00\u8449')]),
 Row(created_at=u'Wed Oct 01 10:38:00 +0000 2014', hashtags=[Row(indices=[0, 11], text=u'\uc6d4\ub4dc\ub77c\uc774\ube0c\ubc14\uce74\ub77c\uac8c\uc784')

In [141]:
def hashtag_text(row):
    """Join the ``text`` of every hashtag entry into one space-separated string.

    Parameters
    ----------
    row : iterable or None
        Hashtag entries, each exposing a ``text`` attribute. ``None`` is
        treated as "no hashtags", matching how ``number_of_hashtags``
        handles a missing list.

    Returns
    -------
    str
        The hashtag texts separated by single spaces; ``""`` when there
        are no entries.
    """
    # Guard against a missing list so one null record cannot fail the job.
    if row is None:
        return ""
    return " ".join(r.text for r in row)
# Wrap hashtag_text as a Spark UDF so it can be applied to a DataFrame column;
# no return type is passed, so udf's default return type applies.
hash_text_udf = udf(hashtag_text)

In [169]:
# Stage 1: add a column with each tweet's hashtag texts joined by spaces.
with_text = hashtags.withColumn('hash_text', hash_text_udf('hashtags'))

# Stage 2: drop rows whose joined text has a backslash at position 1
# (instr reports 1-based positions), then keep only the text column.
# NOTE(review): the printed counts still contain non-ASCII hashtags, so this
# filter may not be removing what it was meant to remove - confirm intent.
text_only = with_text.filter(instr('hash_text', '\\') != 1).select('hash_text')

# Stage 3: emit one (hashtag, 1) pair per space-separated token.
tag_pairs = text_only.rdd.flatMap(lambda row: [(word, 1) for word in row.hash_text.split(' ')])

# Stage 4: sum the ones per hashtag, giving an RDD of (hashtag, count).
hash_count = tag_pairs.reduceByKey(lambda a,b: a+b)

In [172]:
# Print the 100 most frequent hashtags, highest count first.
# The sort key indexes the (hashtag, count) pair rather than using a
# tuple-unpacking lambda, which is a syntax error on Python 3.
for hashtag, count in hash_count.sortBy(lambda kv: kv[1], ascending=False).take(100):
    print(hashtag, count)

EMABiggestFans1D 997652
EMABiggestFansJustinBieber 966601
KCAArgentina 736812
gameinsight 366820
تطبيق_قرآنى 282558
androidgames 183913
android 179361
كنز_المسلم 173564
رتويت 162817
RT 161550
الهلال 154979
相互フォロー 143476
RTした人全員フォローする 137458
sougofollow 135996
ipadgames 125524
الرياض 120912
porn 116836
السعودية 116091
ipad 114143
TeamFollowBack 113479
GonzaloHiguain 109841
LaliEsposito 109672
MGWV 108014
النصر 88614
FOLLOWTRICK 84330
RETWEET 83261
TFB 82303
sex 78239
BabyOneMoreTime 77890
FF 77640
openfollow 75209
TFBJP 75153
سكس 69524
拡散希望 66190
Android 64966
OrianaSabatini 62640
Love 61172
JulianSerrano 58967
iphone 57469
AMAs 57444
ريتويت 56489
ANDROID 55906
followme 54173
Wannabe 52066
TEAMFOLLOWBACK 52050
MercedesLambre 51999
EMABiggestFansArianaGrande 51817
FOLLOWBACK 50667
الاتحاد 50412
FOLLOW 49931
news 49307
PeterLanzani 48396
JorgeBlanco 47379
job 47073
VoteVampsVevo 45738
followback 45397
xxx 44745
teen 44725
طيز 44052
Follow 43651
iPad 43012
TuitUtil 42990
nowplaying 42606
ج