## Streaming Tweets from the Twitter API and Performing Classification
This example uses the text of each tweet, the user description, and the hashtags.

In [1]:
import time
from pyspark.streaming import StreamingContext
from pyspark.mllib.classification import LogisticRegressionModel
from pyspark.ml.feature import HashingTF, IDF, RegexTokenizer
from pyspark.sql import Row
from pyspark.mllib.linalg import SparseVector
import twitter
import dateutil.parser
import json
from pyspark.sql import functions as funs
import numpy as np

In [2]:
def clean_dataframe(input_df):
  """Normalise the ``text`` column of a tweet DataFrame.

  Two regex passes are applied to ``text``; ``id`` and ``hashtags`` are
  carried through untouched.

  1. Everything matching the noise pattern is deleted and the result is
     lower-cased. The pattern covers a handful of punctuation characters,
     single-character words (with an optional trailing whitespace
     character), the literal ``RT``, @mentions, tokens starting with
     ``http``, ``#`` signs and runs of digits.
  2. Each ``+``, ``_``, ``-``, ``,`` or run of whitespace is replaced by a
     single space, and the ends are trimmed.

  :param input_df: DataFrame exposing ``text``, ``id`` and ``hashtags``.
  :return: DataFrame with columns ``text``, ``id``, ``hashtags``.
  """
  noise_pattern = '(\^|\?|\!|\$|\(|\)|\\b\\w{1}\\b\\s?|\"|\'|RT|:|\.|@[^\s]+|http[^\s]+|#|[0-9]+)'
  separator_pattern = '(\+|_|-|\,|\s+)'

  # Pass 1: strip noise first, then lower-case (so only upper-case "RT" is removed).
  without_noise = funs.regexp_replace(input_df.text, noise_pattern, '')
  lowered_df = input_df.select(funs.lower(without_noise).alias('text'), 'id', 'hashtags')

  # Pass 2: turn separators into spaces and trim the edges.
  spaced = funs.regexp_replace(lowered_df.text, separator_pattern, ' ')
  return lowered_df.select(funs.trim(spaced).alias('text'), 'id', 'hashtags')

In [3]:
def combine_hash_text(text,l):
  """Return ``text``, a space, then the items of ``l`` joined by single spaces.

  An empty ``l`` therefore yields ``text`` followed by one trailing space.
  """
  joined_tags = ' '.join(l)
  return ' '.join((text, joined_tags))

In [4]:
class Tweet(dict):
  """Flat ``dict`` holding the fields of a raw Twitter status used downstream.

  Keys always present: ``timestamp``, ``text``, ``hashtags``, ``urls``,
  ``id``, ``screen_name``, ``user_description``, ``user_id``.

  English tweets (``lang == 'en'``) are copied field by field. Anything
  else - another language, ``None``, an empty message, or a stream message
  without a ``lang`` key - becomes a placeholder record whose text is
  ``'No TEXT'``.
  """
  def __init__(self, tweet_in):
    """Populate the record from ``tweet_in``.

    :param tweet_in: dict-like raw status as delivered by the stream, or a
        falsy value.
    """
    super(Tweet, self).__init__()
    # Normalise falsy input so the lookups below never hit None.
    tweet_in = tweet_in or {}
    user = tweet_in.get('user') or {}
    if tweet_in.get('lang') == 'en':
      entities = tweet_in.get('entities') or {}
      # Drop the timezone so the timestamp is a naive ISO-8601 string.
      self['timestamp'] = dateutil.parser.parse(tweet_in[u'created_at']).replace(tzinfo=None).isoformat()
      self['text'] = tweet_in['text']
      self['hashtags'] = [tag['text'] for tag in entities.get('hashtags') or []]
      self['urls'] = list(entities.get('urls') or [])
      self['id'] = tweet_in['id']
      # Kept as text in both branches: UTF-8 bytes would serialise to the
      # same JSON on Python 2 but are rejected by json.dumps on Python 3.
      self['screen_name'] = user.get('screen_name', '')
      # A null description is stored as '' so later string concatenation is safe.
      self['user_description'] = user.get('description') or ''
      self['user_id'] = user.get('id', '')
    else:
      self['timestamp'] = ''
      self['text'] = 'No TEXT'
      self['hashtags'] = []
      self['urls'] = []
      # Non-tweet messages carry no id or user; fall back to None / ''.
      self['id'] = tweet_in.get('id')
      self['screen_name'] = user.get('screen_name', '')
      self['user_description'] = ''
      self['user_id'] = ''

In [5]:
def connect_twitter():
  """Create a ``twitter.TwitterStream`` authenticated through OAuth.

  The four credential values are the literal string "..." in this file
  (presumably redacted); real keys have to be filled in before use.

  :return: the constructed ``twitter.TwitterStream``.
  """
  credentials = twitter.OAuth(
      token = "...",
      token_secret = "...",
      consumer_key = "...",
      consumer_secret = "...")
  return twitter.TwitterStream(auth=credentials)

In [6]:
def get_next_tweet(twitter_stream):
  """Fetch the next usable status from the filtered stream as a JSON string.

  Opens a filter stream tracking ``#universalchampionship`` (English only),
  skips falsy messages and deletion notices, wraps the first remaining
  message in ``Tweet``, prints it and returns it JSON-encoded.

  :param twitter_stream: stream client as returned by ``connect_twitter``.
  :return: ``str`` - JSON serialisation of the parsed ``Tweet``.
  """
  # Alternative, unfiltered source:
  #stream = twitter_stream.statuses.sample(block=True)
  stream = twitter_stream.statuses.filter(track="#universalchampionship",lang='en')
  tweet_in = None
  while not tweet_in or 'delete' in tweet_in:
    # Builtin next() works on both Python 2 and Python 3 iterators.
    tweet_in = next(stream)
  tweet_parsed = Tweet(tweet_in)
  # Single-argument print(...) behaves the same on Python 2 and Python 3.
  print(tweet_parsed)
  return json.dumps(tweet_parsed)

In [7]:
def getFeatures(time,rdd):
  """Turn one batch of JSON-encoded tweets into an RDD of hashed feature rows.

  Each JSON string is decoded, its text, hashtags and user description are
  merged into one text field, the text is cleaned with ``clean_dataframe``,
  tokenised and hashed into a 1600-dimensional term-frequency vector.
  Both the cleaned and the hashed DataFrames are printed with ``show()``.

  :param time: batch time passed in by ``DStream.transform`` (unused).
  :param rdd: RDD of JSON strings as produced by ``get_next_tweet``.
  :return: RDD of rows with columns ``text``, ``id``, ``hashtags``,
      ``words`` and ``features``.
  """
  # Imported here so this function stays self-contained.
  from pyspark.sql.types import ArrayType, LongType, StringType, StructField, StructType

  def to_record(json_str):
    """Decode one tweet and merge its text sources into a single string."""
    tweet = json.loads(json_str)
    hashtags = tweet['hashtags'] or []
    # A missing user description arrives as JSON null; treat it as empty
    # text instead of failing on str + None.
    description = tweet['user_description'] or ''
    merged_text = combine_hash_text(tweet['text'], hashtags) + ' ' + description
    return (tweet['id'], merged_text, hashtags)

  # An explicit schema avoids type inference, which cannot determine the
  # element type of an empty hashtag list.
  # NOTE(review): assumes tweet ids are integers (or null) - confirm.
  combined_schema = StructType([
      StructField('id', LongType(), True),
      StructField('text', StringType(), True),
      StructField('hashtags', ArrayType(StringType()), True)])
  combined_df = rdd.map(to_record).toDF(combined_schema)

  cleaned_df = clean_dataframe(combined_df)

  cleaned_df.show()

  tokenizer = RegexTokenizer(inputCol="text", outputCol="words")
  hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")

  # NOTE(review): 1600 presumably matches the feature size the loaded model
  # was trained with - confirm.
  hashingTF.setNumFeatures(1600)
  tokenized = tokenizer.transform(cleaned_df)
  hashed = hashingTF.transform(tokenized)

  hashed.show()
  return hashed.rdd
  
  #features = feature_extraction_with_hash(cleaned_df,400,1900)
  #features.show()
  #return features.rdd
  
  
def process_rdd_queue(twitter_stream,lr_model,num_tweets=1):
  """Queue tweets as RDDs and register a stream that prints predictions.

  Blocks until ``num_tweets`` tweets have been read from the stream, puts
  each one into its own RDD, feeds the RDDs through a queue stream on the
  module-level ``ssc``, extracts features with ``getFeatures`` and prints
  ``(prediction, text)`` pairs with ``pprint()``. Nothing runs until the
  streaming context is started by the caller.

  :param twitter_stream: stream client as returned by ``connect_twitter``.
  :param lr_model: model whose ``predict`` is applied to each feature vector.
  :param num_tweets: number of tweets to wait for before returning
      (default 1, the previously hard-coded value).
  """
  # One RDD per tweet; each RDD is parallelized into 5 slices.
  rddQueue = [
      ssc.sparkContext.parallelize([get_next_tweet(twitter_stream)], 5)
      for _ in range(num_tweets)]

  predictions = (ssc.queueStream(rddQueue)
                 .transform(getFeatures)
                 .map(lambda row:(lr_model.predict(row['features']),row['text'])))
  predictions.pprint()
  

In [8]:
# Streaming context with a 1-second batch interval.
# NOTE(review): `sc` is not defined in this file - presumably the
# SparkContext supplied by the notebook environment.
ssc = StreamingContext(sc, 1)
# Instantiate the twitter_stream
twitter_stream = connect_twitter()
# Load the stored logistic-regression model
filename_model = '/FileStore/models/lr_29_8.lrm'
lr_model = LogisticRegressionModel.load(sc,filename_model)
# With the threshold cleared, predict() outputs the raw score rather than a 0/1 label
lr_model.clearThreshold()
# Read tweet(s), queue them as RDDs and register the prediction output
process_rdd_queue(twitter_stream,lr_model)
ssc.start()
# Let the streaming job run for 10 seconds before the next cell stops it
time.sleep(10)

In [9]:
# Stop streaming and the underlying SparkContext; the graceful flag asks
# Spark to finish processing already-received data first.
ssc.stop(stopSparkContext=True, stopGraceFully=True)