In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.ml.feature import Tokenizer,StopWordsRemover
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.feature import IDF
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator,ParamGridBuilder
import numpy as np
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import NaiveBayes
from pyspark.sql import SQLContext
import pyspark as ps
from pyspark.ml.classification import LinearSVC
import warnings
from nltk.stem.snowball import SnowballStemmer
import pandas
import matplotlib.pyplot as plt
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import GBTClassifier
import pandas as pd


def init_spark():
    print("initializing spark...")
    try:
        sc = ps.SparkContext('local[*]')
        sqlContext = SQLContext(sc)
        print("Just created a SparkContext")
    except ValueError:
        warnings.warn("SparkContext already exists in this scope")
    spark=SparkSession.builder.getOrCreate()

def read_file(fileName):
    print("reading csv file...")
    df=spark.read.csv(fileName,sep=",",inferSchema=True,header=False)
    return df


def pre_process(df):
    print("preprocessing...")
    df.count()
    df1=df.withColumnRenamed('_c0',"id").withColumnRenamed('_c1','label').withColumnRenamed('_c2','tweet')
    df2 = df1.withColumn('tweet', regexp_replace('tweet', '[^a-z0-9A-Z`~!@#$%&<>?., ]', ''))
    df3 = df2.withColumn('tweet', regexp_replace('tweet', '[0-9`~!@#$%&<>?,\']', ''))
    df4 = df3.withColumn('tweet', regexp_replace('tweet', 'http://*.*.com', ''))
    df5 = df4.withColumn('tweet', regexp_replace('tweet', 'www.*.com', ''))
    df6 = df5.withColumn('tweet', regexp_replace('tweet', '\.', ''))
    tokenizer=Tokenizer(inputCol="tweet",outputCol="words")
    wordData=tokenizer.transform(df6)
    remover=StopWordsRemover(inputCol="words",outputCol="word_clean")
    word_clean_data=remover.transform(wordData)
    stemmer = SnowballStemmer(language='english')
    stemmer_udf = udf(lambda tokens: [stemmer.stem(token) for token in tokens])
    count=CountVectorizer(inputCol="word_clean",outputCol="rawFeatures")
    model=count.fit(word_clean_data)
    featurizedData=model.transform(word_clean_data)
    idf=IDF(inputCol="rawFeatures",outputCol="features")
    idfModel=idf.fit(featurizedData)
    rescaledData=idfModel.transform(featurizedData)
    return rescaledData


def train_test_split(df,train=0.7,test=0.3):
    print("splitting dataset...")
    seed=0
    trainDf,testDf=df.randomSplit([train,test],seed)
    trainDf.count()
    testDf.count()
    return trainDf,testDf

In [2]:
def naive_Bayes(train_data,test_data,nFolds=5):
    print("................................................................................................")
    print("Using Naive Bayes model with test_data...numFolds=",nFolds)
    d1 = {}
    d2 = {}
    nb = NaiveBayes()
    
    paramGrid_nb = ParamGridBuilder() \
        .addGrid(nb.smoothing, np.linspace(0.3, 10, 10)) \
        .build()
    crossval_nb = CrossValidator(estimator=nb, estimatorParamMaps=paramGrid_nb, evaluator=BinaryClassificationEvaluator(), numFolds= nFolds)
    cvModel_nb = crossval_nb.fit(train_data)
    
    train_pred = cvModel_nb.transform(train_data)
    print(train_pred.groupBy('label','prediction').count().show())
    
    my_eval_nb = BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol='label', metricName='areaUnderROC')
    p1 = my_eval_nb.evaluate(train_pred)
    
    my_mc_nb = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='label', metricName='f1')
    p2 = my_mc_nb.evaluate(train_pred)
    
    my_mc_nb = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='label', metricName='accuracy')
    p3 = my_mc_nb.evaluate(train_pred)
    
    d1['ROC'] = p1
    d1['F1'] = p2
    d1['Accuracy'] = p3
    
    predictions_nb = cvModel_nb.transform(test_data)
    print(predictions_nb.groupBy('label','prediction').count().show())
    
    my_eval_nb = BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol='label', metricName='areaUnderROC')
    p4 = my_eval_nb.evaluate(predictions_nb)
    
    my_mc_nb = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='label', metricName='f1')
    p5 = my_mc_nb.evaluate(predictions_nb)
    
    my_mc_nb = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='label', metricName='accuracy')
    p6 = my_mc_nb.evaluate(predictions_nb)
    
    d2['ROC'] = p4
    d2['F1']= p5
    d2['Accuracy'] = p6
    print("................................................................................................")
    
    return d1,d2

In [4]:
init_spark()
df=read_file("twitter.csv")
df=pre_process(df)
train_data,test_data=train_test_split(df)

import time
d={}
e={}
#training the model and checking its accuracy
#for varying values of numFolds
for i in range(10,50,10):
    start=time.time()
    train_summary,test_summary=naive_Bayes(train_data,test_data)
    print(train_summary)
    print(test_summary)
    d[i]=train_summary
    e[i]=test_summary
    elapsed=time.time()-start
    print("Time elapsed:",elapsed/60," mins.")
print("................................................................................................")
print("................................................................................................")
print("Train summary for varying no of numFolds:")
print("Nfolds        Accuracy")
for nf,acc in d.items():
    print(nf,"        ",acc['Accuracy'])
print("................................................................................................")
print("................................................................................................")
print("Test summary for varying no of numFolds:")
print("Nfolds        Accuracy")
for nf,acc in e.items():
    print(nf,"        ",acc['Accuracy'])
print("................................................................................................")
print("................................................................................................")

initializing spark...
reading csv file...




preprocessing...
splitting dataset...
................................................................................................
Using Naive Bayes model with test_data...numFolds= 5
+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    0|       0.0|20316|
|    1|       1.0| 1546|
|    1|       0.0|   27|
|    0|       1.0|  421|
+-----+----------+-----+

None
+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    1|       0.0|  109|
|    0|       0.0| 7363|
|    0|       1.0| 1613|
|    1|       1.0|  567|
+-----+----------+-----+

None
................................................................................................
{'ROC': 0.9812667352988343, 'F1': 0.9809405056058967, 'Accuracy': 0.9799193186911699}
{'ROC': 0.8295279852123744, 'F1': 0.8604105048405976, 'Accuracy': 0.8215913800248653}
Time elapsed: 8.46548907359441  mins.
.............................................................................................

In [5]:
train_data,test_data=train_test_split(df,0.5,0.5)
#checking accuracy of the model when
#the train and test data is split 50-50

start=time.time()
train_summary,test_summary=naive_Bayes(train_data,test_data)
print(train_summary)
print(test_summary)
elapsed=time.time()-start
print("Time elapsed:",elapsed/60," mins.")

splitting dataset...
................................................................................................
Using Naive Bayes model with test_data...numFolds= 5
+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    1|       1.0| 1116|
|    0|       0.0|14667|
|    1|       0.0|   17|
|    0|       1.0|  253|
+-----+----------+-----+

None
+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    1|       0.0|  183|
|    0|       0.0|12002|
|    0|       1.0| 2791|
|    1|       1.0|  933|
+-----+----------+-----+

None
................................................................................................
{'ROC': 0.9840192411898468, 'F1': 0.9839069088040566, 'Accuracy': 0.9831807138852551}
{'ROC': 0.8236755941672499, 'F1': 0.8543911741190956, 'Accuracy': 0.8130617889245081}
Time elapsed: 10.216308871905008  mins.
