In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.ml.feature import Tokenizer,StopWordsRemover
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.feature import IDF
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator,ParamGridBuilder
import numpy as np
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import NaiveBayes
from pyspark.sql import SQLContext
import pyspark as ps
from pyspark.ml.classification import LinearSVC
import warnings
from nltk.stem.snowball import SnowballStemmer
import pandas
import matplotlib.pyplot as plt
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import GBTClassifier
import pandas as pd


def init_spark():
    print("initializing spark...")
    try:
        sc = ps.SparkContext('local[*]')
        sqlContext = SQLContext(sc)
        print("Just created a SparkContext")
    except ValueError:
        warnings.warn("SparkContext already exists in this scope")
    spark=SparkSession.builder.getOrCreate()

def read_file(fileName):
    print("reading csv file...")
    df=spark.read.csv(fileName,sep=",",inferSchema=True,header=False)
    return df


def pre_process(df):
    print("preprocessing...")
    df.count()
    df1=df.withColumnRenamed('_c0',"id").withColumnRenamed('_c1','label').withColumnRenamed('_c2','tweet')
    df2 = df1.withColumn('tweet', regexp_replace('tweet', '[^a-z0-9A-Z`~!@#$%&<>?., ]', ''))
    df3 = df2.withColumn('tweet', regexp_replace('tweet', '[0-9`~!@#$%&<>?,\']', ''))
    df4 = df3.withColumn('tweet', regexp_replace('tweet', 'http://*.*.com', ''))
    df5 = df4.withColumn('tweet', regexp_replace('tweet', 'www.*.com', ''))
    df6 = df5.withColumn('tweet', regexp_replace('tweet', '\.', ''))
    tokenizer=Tokenizer(inputCol="tweet",outputCol="words")
    wordData=tokenizer.transform(df6)
    remover=StopWordsRemover(inputCol="words",outputCol="word_clean")
    word_clean_data=remover.transform(wordData)
    stemmer = SnowballStemmer(language='english')
    stemmer_udf = udf(lambda tokens: [stemmer.stem(token) for token in tokens])
    count=CountVectorizer(inputCol="word_clean",outputCol="rawFeatures")
    model=count.fit(word_clean_data)
    featurizedData=model.transform(word_clean_data)
    idf=IDF(inputCol="rawFeatures",outputCol="features")
    idfModel=idf.fit(featurizedData)
    rescaledData=idfModel.transform(featurizedData)
    return rescaledData


def train_test_split(df,train=0.7,test=0.3):
    print("splitting dataset...")
    seed=0
    trainDf,testDf=df.randomSplit([train,test],seed)
    trainDf.count()
    testDf.count()
    return trainDf,testDf

In [2]:
def random_Forest(train_data,test_data):
    print("................................................................................................")
    print("Using Random Forest model with test_data...")
    d1 = {}
    d2 = {}
    rf = RandomForestClassifier()
    rf_model = rf.fit(train_data)
    
    train_pred = rf_model.transform(train_data)
    print(train_pred.groupBy('label','prediction').count().show())
    
    my_eval_rf = BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol='label', metricName='areaUnderROC')
    p1 = my_eval_rf.evaluate(train_pred)
    
    my_mc_rf = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='label', metricName='f1')
    p2 = my_mc_rf.evaluate(train_pred)
    
    my_mc_rf = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='label', metricName='accuracy')
    p3 = my_mc_rf.evaluate(train_pred)
    
    d1['ROC'] = p1
    d1['F1'] = p2
    d1['Accuracy'] = p3
    
    test_pred = rf_model.transform(test_data)
    print(test_pred.groupBy('label','prediction').count().show())
    my_eval_rf = BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol='label', metricName='areaUnderROC')
    p4 = my_eval_rf.evaluate(test_pred)
    
    my_mc_rf = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='label', metricName='f1')
    p5 = my_mc_rf.evaluate(test_pred)
    
    my_mc_rf = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='label', metricName='accuracy')
    p6 = my_mc_rf.evaluate(test_pred)
    
    d2['ROC'] = p4
    d2['F1']= p5
    d2['Accuracy'] = p6
    print("................................................................................................")
    return d1,d2

In [3]:
init_spark()
df=read_file("twitter.csv")
df=pre_process(df)
train_data,test_data=train_test_split(df)

initializing spark...
reading csv file...




preprocessing...
splitting dataset...


In [4]:
print(".........................................................................................")
import time
start=time.time()
train_summary,test_summary=random_Forest(train_data,test_data)
print(train_summary)
print(test_summary)
elapsed=time.time()-start
print("Time elapsed:",elapsed/60," mins.")
print(".........................................................................................")


.........................................................................................
................................................................................................
Using Random Forest model with test_data...
+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    0|       0.0|20737|
|    1|       0.0| 1573|
+-----+----------+-----+

None
+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    1|       0.0|  676|
|    0|       0.0| 8976|
+-----+----------+-----+

None
................................................................................................
{'ROC': 0.5, 'F1': 0.8955284560337493, 'Accuracy': 0.9294935006723443}
{'ROC': 0.5, 'F1': 0.8962148607931361, 'Accuracy': 0.9299627020306672}
Time elapsed: 10.163563191890717  mins.
.........................................................................................
