# Twitter #cassandra filter
## Using tweet text, hashtags and user description

In [1]:
from pyspark.sql import functions as funs
from pyspark.ml.feature import HashingTF, IDF, RegexTokenizer
from pyspark.ml.classification import LogisticRegression
from pyspark.mllib.classification import LabeledPoint
from pyspark.mllib.classification import LogisticRegressionWithSGD
from pyspark.mllib.linalg import SparseVector
from pyspark.sql import Row
from pyspark.mllib.classification import NaiveBayes
from pyspark.mllib.classification import SVMWithSGD
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.mllib.classification import LogisticRegressionWithLBFGS
from pyspark.mllib.classification import LogisticRegressionModel
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator
import numpy as np

## Enter the paths of the dataset files

In [2]:
# Locations of the labelled tweet datasets; each path is loaded with sqlContext.read.json further down.
# NOTE(review): the /FileStore/tables/ prefix looks like Databricks file storage -- confirm before running elsewhere.
filename_update = '/FileStore/tables/rtqt0l191472460960965/cass_data28.json'
filename_datastax = '/FileStore/tables/v1a51v731472210317176/data_stax_dataset.json'
filename_bigdata = '/FileStore/tables/v1a51v731472210317176/big_data_dataset.json'
filename_porn = '/FileStore/tables/v1a51v731472210317176/porn_tweets.json'
filename_cass = '/FileStore/tables/v1a51v731472210317176/cassandra_dataset.json'
filename_spark = '/FileStore/tables/v1a51v731472210317176/spark_tweets_dataset.json'
filename_softeng = '/FileStore/tables/v1a51v731472210317176/sweng_dataset.json'
filename_database = '/FileStore/tables/v1a51v731472210317176/database_dataset.json'

# Cleans input data frame of all unnecessary characters

In [3]:
def clean_dataframe(input_df):
  """Return a data frame (text, label, id) whose 'text' column is stripped of noise.

  Pass 1 deletes every match of the noise pattern and lower-cases the result.
  The pattern covers the punctuation marks listed in it, words made of a
  single word character, the literal 'RT', @-mentions, tokens starting with
  'http', '#' signs and digit runs.
  Pass 2 replaces '+', '_', '-', ',' and whitespace runs with one space each
  and trims leading/trailing whitespace.

  NOTE(review): 'RT' is matched anywhere and case-sensitively, before the
  text is lower-cased, so it is also removed from inside upper-case words.
  """
  noise_pattern = '(\^|\?|\!|\$|\(|\)|\\b\\w{1}\\b\\s?|\"|\'|RT|:|\.|@[^\s]+|http[^\s]+|#|[0-9]+)'
  separator_pattern = '(\+|_|-|\,|\s+)'

  # Pass 1: drop noise, then lower-case.
  without_noise = funs.regexp_replace(input_df.text, noise_pattern, '')
  onlywords_df = input_df.select(funs.lower(without_noise).alias('text'), 'label', 'id')

  # Pass 2: turn separators into spaces, then trim.
  with_spaces = funs.regexp_replace(onlywords_df.text, separator_pattern, ' ')
  return onlywords_df.select(funs.trim(with_spaces).alias('text'), 'label', 'id')

## Count the positive and negative labels in the input data frame

In [4]:
def count_posneg(input_df):
  """Return {'all', 'positive', 'negative'} row counts for input_df.

  'positive' counts rows whose label equals 1, 'negative' rows whose label
  equals 0; 'all' is the total number of rows. Each count is a separate
  count() call on the data frame.
  """
  total = input_df.count()
  positives = input_df.filter(input_df.label==1).count()
  negatives = input_df.filter(input_df.label==0).count()
  return {"all":total,"positive":positives,"negative":negatives}

## Extends input data frame with one additional column 'user_description'

In [5]:
def add_missing_column(input_df):
  """Rebuild input_df as (id, hashtags, text, label, user_description).

  The first four values are copied from each row; user_description is always
  set to the empty string.
  """
  column_names = ['id','hashtags','text','label','user_description']

  def with_blank_description(row):
    # Copy the four known fields and append an empty description.
    return [row['id'],row['hashtags'],row['text'],row['label'],'']

  return input_df.rdd.map(with_blank_description).toDF(column_names)

## Combine string with list of strings
'Some text', ['one', 'two', 'three'] ----> Some text one two three

In [6]:
def combine_hash_text(text,l):
  """Join a text and a list of strings into one space-separated string.

  Example: 'Some text', ['one', 'two', 'three'] -> 'Some text one two three'

  Args:
    text: leading string; None is treated as the empty string.
    l: list of strings appended after the text; None is treated as an
      empty list.

  Returns:
    text, one space, then the items of l joined by single spaces. With an
    empty (or None) list the result still ends with that one space.
  """
  # Null fields (e.g. a record without hashtags) must not raise TypeError.
  prefix = text if text is not None else ''
  words = l if l is not None else []
  return prefix+' '+' '.join(words)

## Read dataset into data frames

In [7]:
raw_datastax_df = sqlContext.read.json(filename_datastax)
raw_porn_df = sqlContext.read.json(filename_porn)
raw_cassandra_df = sqlContext.read.json(filename_cass)
raw_spark_df = sqlContext.read.json(filename_spark)
raw_bigdata_df = sqlContext.read.json(filename_bigdata)
raw_softeng_df = sqlContext.read.json(filename_softeng)
raw_database_df = sqlContext.read.json(filename_database)
raw_cassandra_update_df = sqlContext.read.json(filename_update)

print 'Datastax tweets:\t',count_posneg(raw_datastax_df)
print 'Porn tweets:\t',count_posneg(raw_porn_df)
print 'Cassandra tweets:\t',count_posneg(raw_cassandra_df)
print 'Spark tweets:\t',count_posneg(raw_spark_df)
print 'Bigdata tweets:\t',count_posneg(raw_bigdata_df)
print 'Software eng:\t',count_posneg(raw_softeng_df)
print 'Database:\t',count_posneg(raw_database_df)
print 'Cassandra_update:\t',count_posneg(raw_cassandra_update_df)

## Sample a subset of tweets from each dataset for a better positive/negative ratio

In [8]:
sampled_datastax_df = (raw_datastax_df.filter(raw_datastax_df.label==1)).randomSplit([0.5,0.5],23)[0]
sampled_spark_pos_df = (raw_spark_df.filter(raw_spark_df.label==1)).randomSplit([0.5,0.5],23)[0]
sampled_spark_neg_df = (raw_spark_df.filter(raw_spark_df.label==0)).randomSplit([0.5,0.5],23)[0]
sampled_bigdata_df = raw_bigdata_df.randomSplit([0.3,0.7],23)[0]
sampled_softeng_df = raw_softeng_df.randomSplit([0.3,0.7],23)[0]
sampled_database_df = raw_database_df.randomSplit([0.1,0.7],23)[0]

print 'Sampled Datastax',sampled_datastax_df.count()
print 'Sampled Spark Positive',sampled_spark_pos_df.count()
print 'Sampled Spark Negative',sampled_spark_neg_df.count()
print 'Sampled Bigdata',sampled_bigdata_df.count()
print 'Sampled Software Eng',sampled_softeng_df.count()
print 'Sampled database',sampled_database_df.count()

## Add missing columns
(only for datasets that don't have a user_description column, to avoid errors when merging)

In [9]:
# Column order shared by every frame that takes part in the merge.
_merge_columns = ('id','hashtags','text','label','user_description')

# Sampled frames: rebuilt with an empty user_description column.
sampled_datastax_df = add_missing_column(sampled_datastax_df)
sampled_database_df = add_missing_column(sampled_database_df)
sampled_bigdata_df = add_missing_column(sampled_bigdata_df)
sampled_softeng_df = add_missing_column(sampled_softeng_df)
sampled_spark_neg_df = add_missing_column(sampled_spark_neg_df)
sampled_spark_pos_df = add_missing_column(sampled_spark_pos_df)

# Unsampled frames: same treatment, followed by an explicit column selection.
sampled_porn_df = add_missing_column(raw_porn_df).select(*_merge_columns)
sampled_cass_df = add_missing_column(raw_cassandra_df).select(*_merge_columns)

# The update dataset is selected directly, so its own user_description values are kept.
sampled_cass_u_df = raw_cassandra_update_df.select(*_merge_columns)

## Merge all datasets into one big data frame

In [10]:
dataset_df = sampled_bigdata_df.unionAll(sampled_database_df).unionAll(sampled_cass_df).unionAll(sampled_datastax_df).unionAll(sampled_porn_df).unionAll(sampled_softeng_df).unionAll(sampled_spark_neg_df).unionAll(sampled_spark_pos_df).unionAll(sampled_cass_u_df)
dataset_df.show()
print count_posneg(dataset_df)

## Combine text, hashtags and user_description columns

In [11]:
def _to_combined_row(row):
  # One output row: id, label as float, and text + hashtags + user description joined by spaces.
  return [
    row['id'],
    float(row['label']),
    combine_hash_text(row['text'],row['hashtags'])+' '+row['user_description'],
  ]

dataset_combined_df = dataset_df.rdd.map(_to_combined_row).toDF(['id','label','text'])

## Clean the data frame of trash characters

In [12]:
# Run the combined text through clean_dataframe; the result keeps the text, label and id columns.
cleaned_dataset_df = clean_dataframe(dataset_combined_df)

In [13]:
# Disabled alternative, kept as bare string literals so it does not run:
# a 0.6 / 0.2 / 0.2 train / cv / test split with seed 212.
''' Split with cv'''
'''
split_ratio = [0.6,0.2,0.2]
split_seed = 212
train_df,cv_df,test_df = cleaned_dataset_df.randomSplit(split_ratio,split_seed)
train_df.cache()
cv_df.cache()
test_df.cache()
print 'Train set:',count_posneg(train_df)
print 'Cv set:',count_posneg(cv_df)
print 'Test set:',count_posneg(test_df)
'''

## Split datasets into train and test dataset

In [14]:
# 0.8 / 0.2 train / test split with fixed seed 213.
train_df,test_df = cleaned_dataset_df.randomSplit([0.8,0.2],213)
# Both parts are marked for caching; they are used again by the training and prediction cells.
train_df.cache()
test_df.cache()

In [15]:
# Shared feature-extraction stages: 'text' is tokenized into 'words',
# and 'words' is hashed into the 'features' column.
tokenizer = RegexTokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")

## Train a model for each parameter combination and return the classification metrics on the evaluation set

In [16]:
def train_models(train_df,cv_df,feature_nums,regularizations,thresholds):
  """Evaluate logistic regression over a grid of feature sizes, regParams and thresholds.

  For every combination the 'text' column is tokenized and hashed into `ft`
  features, a LogisticRegressionWithLBFGS model is trained on train_df with
  regParam `reg`, its threshold is set to `threshold`, and the predictions on
  cv_df are scored with BinaryClassificationMetrics.

  Args:
    train_df: data frame with 'text' and 'label' columns used for training.
    cv_df: data frame with 'text' and 'label' columns used for evaluation.
    feature_nums: iterable of HashingTF feature counts.
    regularizations: iterable of regParam values (used as dict keys, so
      they must be hashable).
    thresholds: iterable of classification thresholds.

  Returns:
    A list of dicts with keys 'features', 'regularization', 'threshold',
    'under ROC' and 'under PR', ordered by feature count, then threshold,
    then regularization.
  """
  output = []
  tokenizer = RegexTokenizer(inputCol="text", outputCol="words")
  hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")

  # Tokenization does not depend on any grid parameter, so it is done once.
  train_tokens_df = tokenizer.transform(train_df)
  cv_tokens_df = tokenizer.transform(cv_df)

  for ft in feature_nums:
    hashingTF.setNumFeatures(ft)
    hashed_train_df = hashingTF.transform(train_tokens_df)
    hashed_cv_df = hashingTF.transform(cv_tokens_df)
    train_labels = hashed_train_df.rdd.map(lambda row: LabeledPoint(row['label'],row['features']))

    # The fit depends only on (ft, reg), never on the threshold, so each
    # regularization is trained at most once per feature size and reused.
    # Training stays lazy so that an empty `thresholds` trains nothing.
    models_by_reg = {}
    for threshold in thresholds:
      for reg in regularizations:
        if reg not in models_by_reg:
          models_by_reg[reg] = LogisticRegressionWithLBFGS.train(train_labels,regParam=reg)
        trained = models_by_reg[reg]
        trained.setThreshold(threshold)

        # Bind the model as a default argument so the closure cannot pick up
        # a later loop iteration's model.
        predictionAndLabels_rdd = hashed_cv_df.rdd.map(
          lambda l, scorer=trained: (float(scorer.predict(l['features'])), float(l['label'])))

        # The metrics are read inside this iteration, i.e. while the
        # threshold set above is still in effect.
        metrics = BinaryClassificationMetrics(predictionAndLabels_rdd)
        entry = {'features':ft, 'regularization':reg,'threshold':threshold,'under ROC':metrics.areaUnderROC,'under PR':metrics.areaUnderPR}
        output.append(entry)

  return output

In [17]:
#results = train_models(train_df,test_df,[800,1600],[0.01,0.04,0.09,0.1,0.13,0.15],[0.7,0.75])
#for res in results:
#  print res

## Tokenize and HashTF words in text of dataframe, and then train LogisticRegressionModel

In [18]:
# Hash the training tokens into 1600 features.
hashingTF.setNumFeatures(1600)
tokenized = tokenizer.transform(train_df)
hashed = hashingTF.transform(tokenized)
# Convert each row to LabeledPoint(label, features) for the mllib trainer.
train_labels = hashed.rdd.map(lambda row: LabeledPoint(row['label'],row['features']))
lr = LogisticRegressionWithLBFGS()
model = lr.train(train_labels,regParam=0.04)
# NOTE(review): the prediction cell calls model.clearThreshold() before predicting,
# which presumably undoes this 0.7 threshold -- confirm which output is wanted.
model.setThreshold(0.7)

## Prepare, predict and print test set

In [19]:
# The threshold is cleared before predicting.
# NOTE(review): presumably this makes predict() return raw scores rather than 0/1 classes -- confirm this is intended.
model.clearThreshold()

# Same tokenize -> hash steps as for the training data.
_tokenized_test_df = tokenizer.transform(test_df)
hashed_test = hashingTF.transform(_tokenized_test_df)

def _predict_test_row(l):
  # (model output as float, true label as float, text, id)
  return (float(model.predict(l['features'])),float(l['label']),l['text'],l['id'])

predicted = hashed_test.rdd.map(_predict_test_row)

In [20]:
for pred in predicted.collect():
  print pred

In [21]:
#model.save(sc,'/FileStore/models/lr_29_8.lrm')