In [1]:
from __future__ import print_function
%matplotlib inline
import matplotlib.pylab as plt
import sys, os, glob
import numpy as np

# set some nicer plotting options
# Apply the style sheet first and the explicit overrides afterwards:
# style.use() updates rcParams, so any key the style sheet defines would
# otherwise replace the figure size / font size chosen here.
plt.style.use('fivethirtyeight')
plt.rcParams['figure.figsize'] = (10, 6)
plt.rcParams['font.size'] = 18

In [2]:
from pyspark.sql import SQLContext

In [3]:
# Create the SQLContext used below to read the Parquet data.
# `sc` is not created anywhere in this notebook -- presumably the PySpark
# kernel provides the SparkContext under that name (TODO confirm).
sqc = SQLContext(sc)

In [4]:
%%time
# Open the Parquet dataset at the given path as a DataFrame.
# NOTE(review): the path is hard-coded to one user's directory; consider
# moving it to a configurable constant.
data = sqc.read.parquet('/user/roskarr/twitter/2014_10')

CPU times: user 4 ms, sys: 3 ms, total: 7 ms
Wall time: 9.93 s


In [27]:
from pyspark.sql.functions import instr, regexp_extract, size, to_date, locate
from pyspark.sql.functions import udf, date_format, to_date, from_utc_timestamp, sumDistinct, desc, denseRank
import pyspark.sql.functions as func
from pyspark.sql import Window
from pyspark.sql.types import ArrayType, StringType, StructField, StructType, DateType, DataType, DateConverter, DatetimeConverter, TimestampType

In [6]:
from datetime import datetime

In [7]:
def number_of_hashtags(row):
    """Return the length of `row`, counting a missing (None) value as zero."""
    return 0 if row is None else len(row)

# Declare the return type explicitly: when none is given, udf() falls back
# to StringType, so the count would come back as a string and the `> 0`
# comparison applied to it would have to rely on an implicit cast.
from pyspark.sql.types import IntegerType

num_hashtags = udf(number_of_hashtags, IntegerType())

In [8]:
import time

In [9]:
import datetime

In [10]:
def _reformat_created_at(date_string):
    """Re-render a 'Mon Oct 06 08:53:37 +0000 2014'-style string as 'YYYY-MM-DD HH:MM:SS'.

    `datetime` here is the module (bound by the `import datetime` cell), so
    the class is reached as `datetime.datetime`.
    """
    parsed = datetime.datetime.strptime(date_string, '%a %b %d %H:%M:%S +0000 %Y')
    return parsed.strftime('%Y-%m-%d %H:%M:%S')

# UDF wrapper returning the reformatted date as a string column.
convert_date_string = udf(_reformat_created_at, StringType())

In [11]:
# make UDF for converting the date string to a datetime object
def _parse_created_at(date_string):
    """Parse a 'Mon Oct 06 08:53:37 +0000 2014'-style string into a datetime.

    The import is local and aliased because this notebook binds the name
    `datetime` twice (`from datetime import datetime`, later `import datetime`).
    Once the second import has run, a bare `datetime.strptime` raises
    AttributeError, since the module has no `strptime` attribute.
    """
    import datetime as _dt
    return _dt.datetime.strptime(date_string, '%a %b %d %H:%M:%S +0000 %Y')

# NOTE(review): the declared return type is DateType, so the time-of-day is
# presumably dropped by Spark; switch to TimestampType() if the full
# timestamp is wanted -- confirm against intended use.
datetime_udf = udf(_parse_created_at, DateType())

In [12]:
# register a UDF to extract a list of hashtags
def _hashtag_texts(row):
    """Return the `text` attribute of every entry in `row`.

    A missing (None) `row` yields an empty list, mirroring how
    `number_of_hashtags` treats None as zero hashtags. The guard lives inside
    the function because Spark does not promise that an upstream filter is
    evaluated before this UDF runs.
    """
    if row is None:
        return []
    return [r.text for r in row]

hash_text_udf = udf(_hashtag_texts, returnType=ArrayType(StringType()))

In [13]:
# Stage 1: only keep the tweets with at least one hashtag
tweets_with_tags = (data.select('created_at', 'entities.hashtags')
                        .filter(num_hashtags('hashtags') > 0))

# Stage 2: pull the hashtag strings out into their own array column
tweets_with_text = tweets_with_tags.withColumn('hash_text', hash_text_udf('hashtags'))

# Stage 3: keep the reformatted creation date plus the hashtag strings,
# spread over 1200 partitions
hashtag_df = (tweets_with_text
              .select(convert_date_string('created_at').alias('date'), 'hash_text')
              .repartition(1200))

# Mark the result for caching; as the last expression it also echoes the schema.
hashtag_df.cache()

DataFrame[date: string, hash_text: array<string>]

In [14]:
# Quick check: run the `date` string through from_utc_timestamp (zone 'utc')
# and look at the first resulting row.
hashtag_df.select(from_utc_timestamp('date', 'utc').alias('timedate'), 'hash_text').first()

Row(timedate=datetime.datetime(2014, 10, 6, 8, 53, 37), hash_text=[u'\u062a\u0637\u0628\u064a\u0642_\u0642\u0631\u0622\u0646\u0649'])

In [15]:
def _day_hashtag_pairs(row):
    """Return one (day-of-year, hashtag) tuple for each hashtag in `row.hash_text`."""
    pairs = []
    for hashtag in row.hash_text:
        pairs.append((row.timedate.timetuple().tm_yday, hashtag))
    return pairs

timestamped_tags = hashtag_df.select(from_utc_timestamp('date', 'utc').alias('timedate'), 'hash_text')

# One output row per (day, hashtag); note that the resulting `timedate`
# column holds the day-of-year integer, not a timestamp.
flat_hash = timestamped_tags.flatMap(_day_hashtag_pairs).toDF(['timedate', 'hashtags'])

In [16]:
# Number of occurrences of each hashtag on each day.
per_day_and_tag = flat_hash.groupby('timedate', 'hashtags')
day_counts = per_day_and_tag.count()

In [20]:
# Show the counts ordered by day, most frequent hashtag first within each day.
day_counts.sort('timedate', desc('count')).show()

+--------+--------------------+-----+
|timedate|            hashtags|count|
+--------+--------------------+-----+
|     274|               رتويت| 9469|
|     274|        KCAArgentina| 8811|
|     274|         gameinsight| 8695|
|     274|        androidgames| 4937|
|     274|              الهلال| 4818|
|     274|             android| 4818|
|     274|الهلال_إلى_نهائي_...| 4114|
|     274|                  RT| 3888|
|     274|              الرياض| 3611|
|     274|      متي_تصير_نفسيه| 3372|
|     274|              相互フォロー| 3348|
|     274|       RTした人全員フォローする| 3130|
|     274|                MGWV| 3070|
|     274|            السعودية| 3028|
|     274|      TeamFollowBack| 2956|
|     274|         sougofollow| 2826|
|     274|           ipadgames| 2796|
|     274|لو_يخلون_جنسيتك_ع...| 2793|
|     274|                porn| 2765|
|     274|     เกาหลีใต้ขี้โกง| 2557|
+--------+--------------------+-----+
only showing top 20 rows



In [56]:
# Window over each day's rows, ordered by descending count.
daily_window = Window.partitionBy('timedate').orderBy(desc('count'))

In [57]:
# Rank of each hashtag within its day (1 = highest count). The alias gives
# the column a usable name; without it the header is the raw expression text
# ("'rank() WindowSpecDefinition UnspecifiedFrame").
daily_rank = func.rank().over(daily_window).alias('rank')

In [45]:
# NOTE: on a DataFrame created through the plain SQLContext this raises
# AnalysisException ("Could not resolve window function 'rank'"): window
# functions require a HiveContext here. Kept to show the failure.
day_counts.select('timedate', 'count', 'hashtags', daily_rank)

AnalysisException: Could not resolve window function 'rank'. Note that, using window functions currently requires a HiveContext;

In [58]:
# Window functions need a HiveContext (the plain SQLContext fails with an
# AnalysisException). Import it explicitly: no cell in this notebook imports
# HiveContext, so the cell otherwise only works if the kernel happens to
# pre-load that name.
from pyspark.sql import HiveContext

hc = HiveContext(sc)

In [52]:
# Re-create the per-day counts as a DataFrame owned by the HiveContext,
# built from the underlying RDD of `day_counts`.
hc.createDataFrame(day_counts.rdd)

DataFrame[timedate: bigint, hashtags: string, count: bigint]

In [53]:
# Build the HiveContext-backed copy of the per-day counts directly instead
# of pulling it from Out[52]: the output history is keyed by execution
# count and is empty after a kernel restart, so `Out[52]` does not survive
# Restart & Run All.
df2 = hc.createDataFrame(day_counts.rdd)

In [63]:
# Show the ranked hashtags for a single day. `timedate` holds a day-of-year
# value (it is built from tm_yday), so 275 selects day 275 of the year
# (2 October in a non-leap year).
df2.select('timedate', 'count', 'hashtags', daily_rank).filter(df2.timedate == 275).show()

+--------+-----+----------------+---------------------------------------------+
|timedate|count|        hashtags|'rank() WindowSpecDefinition UnspecifiedFrame|
+--------+-----+----------------+---------------------------------------------+
|     275|13777|    KCAArgentina|                                            1|
|     275|13246|           رتويت|                                            2|
|     275|12278|     gameinsight|                                            3|
|     275| 6432|         android|                                            4|
|     275| 6388|    androidgames|                                            5|
|     275| 6355|      1DProposal|                                            6|
|     275| 5558|   NashsNewVideo|                                            7|
|     275| 5263|          الهلال|                                            8|
|     275| 4865|              RT|                                            9|
|     275| 4845|          ريتويت|       