# Twitter #cassandra classification with Spark MLlib

In [1]:
from pyspark.sql import functions as funs
from pyspark.ml.feature import HashingTF, IDF, RegexTokenizer
from pyspark.mllib.classification import LabeledPoint
from pyspark.mllib.classification import LogisticRegressionWithSGD
from pyspark.mllib.linalg import SparseVector
from pyspark.sql import Row
from pyspark.mllib.classification import NaiveBayes
from pyspark.mllib.classification import SVMWithSGD
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.mllib.classification import LogisticRegressionWithLBFGS
from pyspark.mllib.classification import LogisticRegressionModel
import numpy as np

In [2]:
# Bare string kept as a note: the hyper-parameters used further down
# (regParam 0.244, 400 hashtag features, 1900 word features).
# NOTE(review): presumably "PR"/"ROC" are the areas under the PR and ROC
# curves printed at the end of the notebook, in percent -- confirm.
'''
regularization = 0.244, hash = 400, words = 1900 ---- PR-99, ROC -- 97
'''
# Locations of the input datasets, one JSON file per tweet source; each one is
# read with sqlContext.read.json in the "Loading dataset" section.
filename_datastax = '/FileStore/tables/v1a51v731472210317176/data_stax_dataset.json'
filename_bigdata = '/FileStore/tables/v1a51v731472210317176/big_data_dataset.json'
filename_porn = '/FileStore/tables/v1a51v731472210317176/porn_tweets.json'
filename_cass = '/FileStore/tables/v1a51v731472210317176/cassandra_dataset.json'
filename_spark = '/FileStore/tables/v1a51v731472210317176/spark_tweets_dataset.json'
filename_softeng = '/FileStore/tables/v1a51v731472210317176/sweng_dataset.json'
filename_database = '/FileStore/tables/v1a51v731472210317176/database_dataset.json'

In [3]:
# Perform text cleanup
'''
Example: @user RT: This IS a tweet. Hello-there! #hello #hi.
Transforms into: this is tweet hello there hello hi
'''
def clean_dataframe(input_df):
  """Return a copy of input_df whose ``text`` column is cleaned and lower-cased.

  The cleanup runs in two passes over ``text``:
    1. delete the punctuation ^ ? ! $ ( ) " ' : . and the '#' sign,
       single-character words, the literal "RT", @mentions, http links and
       digit runs, then lower-case what is left;
    2. turn + _ - , and every whitespace run into one space and trim the ends.

  Example: "@user RT: This IS a tweet. Hello-there! #hello #hi."
  becomes  "this is tweet hello there hello hi".

  Args:
    input_df: DataFrame with the columns text, label, id and hashtags.

  Returns:
    DataFrame with the cleaned ``text`` plus label, id and hashtags unchanged.
  """
  # NOTE: "RT" has no word boundary, so it is also removed inside longer
  # upper-case words; it is matched before lower-casing.
  noise_pattern = '(\^|\?|\!|\$|\(|\)|\\b\\w{1}\\b\\s?|\"|\'|RT|:|\.|@[^\s]+|http[^\s]+|#|[0-9]+)'
  separator_pattern = '(\+|_|-|\,|\s+)'
  passthrough = ['label', 'id', 'hashtags']

  # Pass 1: drop the noise, then lower-case.
  without_noise = funs.regexp_replace(input_df.text, noise_pattern, '')
  onlywords_df = input_df.select(funs.lower(without_noise).alias('text'), *passthrough)

  # Pass 2: normalise separators to single spaces and trim.
  spaced = funs.regexp_replace(onlywords_df.text, separator_pattern, ' ')
  return onlywords_df.select(funs.trim(spaced).alias('text'), *passthrough)

In [4]:
#Tokenization of dataset
'''
"this       is tweet" --> [this, is, tweet]
'''
def tokenization(input_df):
  """Add a ``words`` column with the RegexTokenizer tokens of ``text``.

  Args:
    input_df: DataFrame with a ``text`` column.

  Returns:
    input_df transformed by the tokenizer, i.e. with the extra ``words`` column.
  """
  return RegexTokenizer(inputCol="text", outputCol="words").transform(input_df)

In [5]:
'''
Make TF and IDF model from tokenized text
'''
'''
def make_tf_idf(input_df, number_features):
  hashing_tf_model = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=number_features)
  hashed_tf_df = hashing_tf_model.transform(input_df)
  idf = IDF(inputCol="rawFeatures", outputCol="features")
  idf_model = idf.fit(hashed_tf_df)
  return hashing_tf_model,idf_model
'''

In [6]:
'''
Only make TF
'''
def feature_extr_onlytf(input_df,number_features):
  """Hash the ``words`` column into a term-frequency ``features`` column.

  Args:
    input_df: DataFrame with a tokenized ``words`` column.
    number_features: number of hash buckets passed to HashingTF.

  Returns:
    input_df with the extra ``features`` column produced by HashingTF.
  """
  term_hasher = HashingTF(
      inputCol="words",
      outputCol="features",
      numFeatures=number_features)
  featurized_df = term_hasher.transform(input_df)
  return featurized_df

In [7]:
# Feature extraction for given input DataFrame and TFIDF models
'''
def feature_extraction(input_df,tf_model,idf_model):
  tf_features = tf_model.transform(input_df)
  tfidf = idf_model.transform(tf_features)
  return tfidf
'''

In [8]:
# Make LabeledPoints from DataFrame for training
'''
A LabeledPoint has two fields: the label, and the features (a SparseVector)
'''
def make_label_points(input_df):
  """Convert every row of input_df into a LabeledPoint.

  Args:
    input_df: DataFrame with ``label`` and ``features`` columns.

  Returns:
    RDD of LabeledPoint(label, features), one per input row.
  """
  def to_labeled_point(row):
    return LabeledPoint(row['label'], row['features'])

  return input_df.rdd.map(to_labeled_point)

In [9]:
def reg_params(train_labels,cv_test,threshold_start,threshold_end,threshold_step,reg_start,reg_end,reg_step):
  """Grid-search regularization and decision threshold for logistic regression.

  For every regularization value a LogisticRegressionWithLBFGS model is
  trained once on train_labels; that model is then evaluated on cv_test for
  every threshold. Both grids are walked by repeated float addition and each
  of them is visited at least once, even when start is already above end.

  Args:
    train_labels: RDD of LabeledPoint used for training.
    cv_test: DataFrame with ``features`` and ``label`` columns used for scoring.
    threshold_start, threshold_end, threshold_step: threshold grid.
    reg_start, reg_end, reg_step: regParam grid.

  Returns:
    List of [regularization, threshold, areaUnderPR, areaUnderROC] entries,
    ordered by regularization and then by threshold.
  """
  output = []
  regularization = reg_start
  while True:
    # Training does not depend on the threshold, so fit once per regParam
    # instead of once per (regParam, threshold) pair.
    model_lr = LogisticRegressionWithLBFGS.train(train_labels, regParam=regularization)

    threshold = threshold_start
    while True:
      model_lr.setThreshold(threshold)
      predictionAndLabels = cv_test.rdd.map(
          lambda lp: (float(model_lr.predict(lp.features)), float(lp.label)))
      metrics = BinaryClassificationMetrics(predictionAndLabels)
      # The metrics are read here, before the threshold is changed again.
      output.append([regularization, threshold, metrics.areaUnderPR, metrics.areaUnderROC])
      threshold += threshold_step
      if threshold > threshold_end:
        break

    regularization += reg_step
    if regularization > reg_end:
      break

  return output

In [10]:
def find_feature_num(train_set,cv_set,start_num,end_num,step):
  """Score logistic regression for a range of HashingTF feature counts.

  For every feature count in range(start_num, end_num, step) the tokenized
  text is hashed into term frequencies (TF only, no IDF), a
  LogisticRegressionWithLBFGS model is trained on the training set and
  evaluated on the cross-validation set with a 0.75 threshold.

  Args:
    train_set: DataFrame with ``text`` and ``label`` columns used for training.
    cv_set: DataFrame with ``text`` and ``label`` columns used for scoring.
    start_num: first feature count tried.
    end_num: exclusive upper bound of the feature counts.
    step: increment between feature counts.

  Returns:
    List of [feature_count, areaUnderPR, areaUnderROC] entries.
  """
  # Tokenization does not depend on the feature count: build it once.
  tokenized_train = tokenization(train_set)
  tokenized_cv = tokenization(cv_set)

  output = []
  for i in range(start_num,end_num,step):
    train_tf = feature_extr_onlytf(tokenized_train,i)
    cv_tf = feature_extr_onlytf(tokenized_cv,i)

    model_lr = LogisticRegressionWithLBFGS.train(make_label_points(train_tf))
    model_lr.setThreshold(0.75)

    # Only (prediction, label) pairs are needed by the metrics.
    prediction_and_labels = cv_tf.rdd.map(
        lambda lp: (float(model_lr.predict(lp.features)), float(lp.label)))
    metrics = BinaryClassificationMetrics(prediction_and_labels)
    output.append([i,metrics.areaUnderPR,metrics.areaUnderROC])

  return output

In [11]:
def count_posneg(input_df):
  """Count all rows, rows with label 1 and rows with label 0.

  Args:
    input_df: DataFrame with a ``label`` column.

  Returns:
    Dict with the keys "all", "positive" (label == 1) and "negative"
    (label == 0).
  """
  total = input_df.count()
  positives = input_df.filter(input_df.label==1).count()
  negatives = input_df.filter(input_df.label==0).count()
  return {"all": total, "positive": positives, "negative": negatives}

In [None]:
def combine(sv1,sv2):
  """Concatenate two sparse vectors into a single SparseVector.

  The result has size sv1.size + sv2.size; the entries of sv1 keep their
  indices and the entries of sv2 are shifted up by sv1.size.

  Args:
    sv1: first SparseVector.
    sv2: second SparseVector, appended after sv1.

  Returns:
    SparseVector holding the entries of sv1 followed by those of sv2.
  """
  offset = sv1.size
  shifted_indices = [offset + index for index in sv2.indices]
  return SparseVector(
      offset + sv2.size,
      np.concatenate([sv1.indices, shifted_indices]),
      np.concatenate([sv1.values, sv2.values]))

# Quick check of combine(): a size-2 vector followed by a size-3 vector should
# give a size-5 vector with ones at indices 0, 2 and 4.
s1 = SparseVector(2, [0], [1])
s2 = SparseVector(3, [0, 2], [1, 1])
print(combine(s1, s2))

In [None]:
def lower_word(hashtags):
  """Lower-case every hashtag and join them into one space-separated string.

  Args:
    hashtags: iterable of hashtag strings, or None when the row carries no
      hashtags value.

  Returns:
    The lower-cased hashtags joined by single spaces; '' when hashtags is
    None or empty.
  """
  if hashtags is None:
    # This runs inside an RDD map; a null hashtags value would otherwise
    # raise TypeError and fail the whole job.
    return ''
  return ' '.join(tag.lower() for tag in hashtags)

In [None]:
def feature_extraction_with_hash(input_df,number_hash,number_words):
  """Build one feature vector per tweet from its hashtags and its text.

  Hashtags and text are tokenized and hashed separately (HashingTF) and the
  two term-frequency vectors are concatenated with combine(), hashtag
  features first.

  Args:
    input_df: DataFrame with hashtags, text, label and id columns.
    number_hash: number of hash buckets for the hashtag tokens.
    number_words: number of hash buckets for the text tokens.

  Returns:
    DataFrame with the columns features, text, hashtags, label and id, where
    ``hashtags`` is the lower-cased, space-joined hashtag string.
  """
  def normalise_hashtags(row):
    return [lower_word(row['hashtags']), row['text'], row['label'], row['id']]

  def join_feature_vectors(row):
    joined = combine(row['features_tags'], row['features_words'])
    return [joined, row['text'], row['hashtags'], row['label'], row['id']]

  # Hashtags become a single lower-cased string so they can be tokenized.
  prepared_df = input_df.rdd.map(normalise_hashtags).toDF(['hashtags','text','label','id'])

  # Tokenize the text first, then the hashtags.
  with_words = RegexTokenizer(inputCol="text", outputCol="words").transform(prepared_df)
  with_tags = RegexTokenizer(inputCol='hashtags',outputCol='tags').transform(with_words)

  # Term frequencies: hashtags first, then words.
  with_tag_tf = HashingTF(
      inputCol="tags", outputCol="features_tags", numFeatures=number_hash).transform(with_tags)
  with_all_tf = HashingTF(
      inputCol="words", outputCol="features_words", numFeatures=number_words).transform(with_tag_tf)

  # Concatenate the two sparse vectors into the final ``features`` column.
  return with_all_tf.rdd.map(join_feature_vectors).toDF(['features','text','hashtags','label','id'])

## Loading dataset

In [12]:
raw_datastax_df = sqlContext.read.json(filename_datastax)
raw_porn_df = sqlContext.read.json(filename_porn)
raw_cassandra_df = sqlContext.read.json(filename_cass)
raw_spark_df = sqlContext.read.json(filename_spark)
raw_bigdata_df = sqlContext.read.json(filename_bigdata)
raw_softeng_df = sqlContext.read.json(filename_softeng)
raw_database_df = sqlContext.read.json(filename_database)

print 'Datastax tweets:\t',count_posneg(raw_datastax_df)
print 'Porn tweets:\t',count_posneg(raw_porn_df)
print 'Cassandra tweets:\t',count_posneg(raw_cassandra_df)
print 'Spark tweets:\t',count_posneg(raw_spark_df)
print 'Bigdata tweets:\t',count_posneg(raw_bigdata_df)
print 'Software eng:\t',count_posneg(raw_softeng_df)
print 'Database:\t',count_posneg(raw_database_df)

## Random sampling from sources

In [13]:
sampled_datastax_df = (raw_datastax_df.filter(raw_datastax_df.label==1)).randomSplit([0.5,0.5],23)[0]
sampled_spark_pos_df = (raw_spark_df.filter(raw_spark_df.label==1)).randomSplit([0.5,0.5],23)[0]
sampled_spark_neg_df = (raw_spark_df.filter(raw_spark_df.label==0)).randomSplit([0.5,0.5],23)[0]
sampled_bigdata_df = raw_bigdata_df.randomSplit([0.3,0.7],23)[0]
sampled_softeng_df = raw_softeng_df.randomSplit([0.3,0.7],23)[0]
sample_database_df = raw_database_df.randomSplit([0.1,0.7],23)[0]

print 'Sampled Datastax',sampled_datastax_df.count()
print 'Sampled Spark Positive',sampled_spark_pos_df.count()
print 'Sampled Spark Negative',sampled_spark_neg_df.count()
print 'Sampled Bigdata',sampled_bigdata_df.count()
print 'Sampled Software Eng',sampled_softeng_df.count()
print 'Sampled database',sample_database_df.count()

In [14]:
raw_sampled_df = sampled_bigdata_df.unionAll(sampled_datastax_df).unionAll(sampled_softeng_df).unionAll(sampled_spark_neg_df).unionAll(sampled_spark_pos_df).unionAll(raw_porn_df).unionAll(sample_database_df)
print 'Other sources sampled:',count_posneg(raw_sampled_df)

## Split into train and test dataset

In [15]:
test_data_df,raw_cass_df = raw_cassandra_df.randomSplit([0.4,0.6],24)
raw_dataset_df = raw_cass_df.unionAll(raw_sampled_df)

print 'Whole dataset:',count_posneg(raw_dataset_df)
print "Test dataset:",count_posneg(test_data_df)
train_df = raw_dataset_df
test_df = test_data_df

In [16]:
#train_df,test_df = raw_dataset_df.randomSplit([0.75,0.25],45)
#print 'Train set:',count_posneg(train_df)
#print 'Test set:',count_posneg(test_df)

## Feature extraction
Convert tweet text into labeled points for training

In [17]:
# Training side: clean the text, then hash hashtags (400 buckets) and words
# (1900 buckets) into one feature vector per tweet.
cleaned_train_df = clean_dataframe(train_df)
train_tf_df = feature_extraction_with_hash(cleaned_train_df, 400, 1900)

# Test side: same pipeline with the same bucket counts.
cleaned_test_df = clean_dataframe(test_df)
test_tf_df = feature_extraction_with_hash(cleaned_test_df, 400, 1900)

train_tf_df.show()
#tokens_train_df = tokenization(cleaned_train_df)
#tokens_test_df = tokenization(cleaned_test_df)
#train_tf_df = feature_extr_onlytf(tokens_train_df,1000).cache()
#test_tf_df = feature_extr_onlytf(tokens_test_df,1000).cache()

# The labeled points are cached because training iterates over them.
train_labels = make_label_points(train_tf_df).cache()

## Train model - Logistic regression with LBFGS

In [18]:
# Fit the final classifier on the cached training points. regParam=0.244 is the
# regularization value recorded in the note next to the dataset file names.
lr = LogisticRegressionWithLBFGS()
model = lr.train(train_labels,regParam=0.244)

## Print out all predictions from test dataset (thresholded at 0.75, so 0/1 labels rather than probabilities)

In [19]:
model.setThreshold(0.75)
predicted = test_tf_df.rdd.map(lambda l:(float(model.predict(l['features'])),float(l['label']),l['text'],l['id']))
for pred in predicted.collect():
  print pred

In [20]:
# Keep only the (prediction, label) pair of every scored tweet for the metrics.
predictionAndLabels_rdd = predicted.map(lambda scored: scored[:2])
metrics = BinaryClassificationMetrics(predictionAndLabels_rdd)

# Area under the precision-recall curve, then area under the ROC curve.
print("Area under PR = %s" % metrics.areaUnderPR)
print("Area under ROC = %s" % metrics.areaUnderROC)

In [None]:
def find_feature_num_hash(train_df,test_df,hash_start,hash_end,words_start,words_end,step):
  """Score logistic regression over a grid of hashtag / word feature counts.

  For every pair (i, j) with i in range(hash_start, hash_end, step) and j in
  range(words_start, words_end, step), features are built with
  feature_extraction_with_hash, a LogisticRegressionWithLBFGS model is trained
  with regParam=0.244 and evaluated on test_df with a 0.75 threshold.

  Args:
    train_df: DataFrame with hashtags, text, label and id columns for training.
    test_df: DataFrame with the same columns used for scoring.
    hash_start, hash_end: range of hashtag feature counts (end exclusive).
    words_start, words_end: range of word feature counts (end exclusive).
    step: increment shared by both ranges.

  Returns:
    List of [hash_count, words_count, areaUnderPR, areaUnderROC] entries.
  """
  threshold = 0.75
  output = []
  for i in range(hash_start,hash_end,step):
    for j in range(words_start,words_end,step):
      train = feature_extraction_with_hash(train_df,i,j)
      test = feature_extraction_with_hash(test_df,i,j)

      model = LogisticRegressionWithLBFGS.train(make_label_points(train),regParam=0.244)
      model.setThreshold(threshold)

      # Only (prediction, label) pairs are needed by the metrics.
      prediction_and_labels = test.rdd.map(
          lambda lp: (float(model.predict(lp.features)), float(lp.label)))
      metrics = BinaryClassificationMetrics(prediction_and_labels)
      output.append([i,j,metrics.areaUnderPR,metrics.areaUnderROC])

  return output

In [None]:
def find_reg_param_with_hash(labels_train,test_df,reg_start,reg_end,reg_step):
  """Score logistic regression for a range of regularization values.

  Starting at reg_start and adding reg_step while the value is <= reg_end, a
  LogisticRegressionWithLBFGS model is trained on labels_train and evaluated
  on test_df with a 0.75 threshold.

  Args:
    labels_train: RDD of LabeledPoint used for training.
    test_df: DataFrame with ``features`` and ``label`` columns used for scoring.
    reg_start: first regParam tried.
    reg_end: largest regParam allowed.
    reg_step: positive increment between regParam values.

  Returns:
    List of [regParam, areaUnderPR, areaUnderROC] entries.

  Raises:
    ValueError: if reg_step is not positive (the loop would never finish).
  """
  if reg_step <= 0:
    raise ValueError("reg_step must be positive, got %r" % (reg_step,))

  output = []
  i = reg_start
  # NOTE: i is advanced by repeated float addition, so rounding can decide
  # whether a value numerically equal to reg_end is still visited.
  while i<=reg_end:
    model = LogisticRegressionWithLBFGS.train(labels_train,regParam=i)
    model.setThreshold(0.75)

    # Only (prediction, label) pairs are needed by the metrics.
    prediction_and_labels = test_df.rdd.map(
        lambda lp: (float(model.predict(lp.features)), float(lp.label)))
    metrics = BinaryClassificationMetrics(prediction_and_labels)
    output.append([i,metrics.areaUnderPR,metrics.areaUnderROC])
    i+=reg_step

  return output

In [None]:
#output = find_reg_param_with_hash(train_labels,test_tf_df,0.19,0.25,0.002)
#for out in output:
#  print out

In [None]:
#output = find_feature_num_hash(cleaned_train_df,cleaned_test_df,100,800,500,2000,100)
#for out in output:
#  print out

In [21]:
#output = find_feature_num(train_df,test_df,400,1500,100)
#for out in output:
#  print out

In [22]:
#0.05 0.75
#for out in reg_params(train_labels,test_tf_df,0.7,0.81,0.01,0.03,0.1,0.004):
#  print out

## Save model into file

In [23]:
#model.save(sc,'/FileStore/tables/oaohfspx1472031253448/trained_model.lrm')