In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.ml.feature import Tokenizer,StopWordsRemover
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.feature import IDF
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator,ParamGridBuilder
import numpy as np
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import NaiveBayes
from pyspark.sql import SQLContext
import pyspark as ps
from pyspark.ml.classification import LinearSVC
import warnings
from nltk.stem.snowball import SnowballStemmer
import pandas
import matplotlib.pyplot as plt
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import GBTClassifier
import pandas as pd


def init_spark():
    print("initializing spark...")
    try:
        sc = ps.SparkContext('local[*]')
        sqlContext = SQLContext(sc)
        print("Just created a SparkContext")
    except ValueError:
        warnings.warn("SparkContext already exists in this scope")
    spark=SparkSession.builder.getOrCreate()

def read_file(fileName):
    print("reading csv file...")
    df=spark.read.csv(fileName,sep=",",inferSchema=True,header=False)
    return df


def pre_process(df):
    print("preprocessing...")
    df.count()
    df1=df.withColumnRenamed('_c0',"id").withColumnRenamed('_c1','label').withColumnRenamed('_c2','tweet')
    df2 = df1.withColumn('tweet', regexp_replace('tweet', '[^a-z0-9A-Z`~!@#$%&<>?., ]', ''))
    df3 = df2.withColumn('tweet', regexp_replace('tweet', '[0-9`~!@#$%&<>?,\']', ''))
    df4 = df3.withColumn('tweet', regexp_replace('tweet', 'http://*.*.com', ''))
    df5 = df4.withColumn('tweet', regexp_replace('tweet', 'www.*.com', ''))
    df6 = df5.withColumn('tweet', regexp_replace('tweet', '\.', ''))
    tokenizer=Tokenizer(inputCol="tweet",outputCol="words")
    wordData=tokenizer.transform(df6)
    remover=StopWordsRemover(inputCol="words",outputCol="word_clean")
    word_clean_data=remover.transform(wordData)
    stemmer = SnowballStemmer(language='english')
    stemmer_udf = udf(lambda tokens: [stemmer.stem(token) for token in tokens])
    count=CountVectorizer(inputCol="word_clean",outputCol="rawFeatures")
    model=count.fit(word_clean_data)
    featurizedData=model.transform(word_clean_data)
    idf=IDF(inputCol="rawFeatures",outputCol="features")
    idfModel=idf.fit(featurizedData)
    rescaledData=idfModel.transform(featurizedData)
    return rescaledData


def train_test_split(df,train=0.7,test=0.3):
    print("splitting dataset...")
    seed=0
    trainDf,testDf=df.randomSplit([train,test],seed)
    trainDf.count()
    testDf.count()
    return trainDf,testDf

In [2]:
def logistic_regression(train_data,test_data,maxIt=15,nFolds=8):
    print("................................................................................................")
    print("Using logistic regression model with test_data...")
    d1 = {}
    d2 = {}
    
    lr = LogisticRegression(maxIter=maxIt)
    paramGrid_lr = ParamGridBuilder().build()
    
    crossval_lr = CrossValidator(estimator=lr,estimatorParamMaps=paramGrid_lr,evaluator=BinaryClassificationEvaluator(),numFolds=nFolds)
    cvModel_lr = crossval_lr.fit(train_data)
    best_model_lr = cvModel_lr.bestModel.summary
    
    report = BinaryClassificationEvaluator(rawPredictionCol="prediction",labelCol="label",metricName="areaUnderROC")
    p1 = report.evaluate(best_model_lr.predictions)
    
    pred_lr = MulticlassClassificationEvaluator(predictionCol="prediction",labelCol="label",metricName="f1")
    p2 = pred_lr.evaluate(best_model_lr.predictions)
    
    pred_lr = MulticlassClassificationEvaluator(predictionCol="prediction",labelCol="label",metricName="accuracy")
    p3 = pred_lr.evaluate(best_model_lr.predictions)
    
    train_fit_lr = best_model_lr.predictions.select('label','prediction')
    print(train_fit_lr.groupBy('label','prediction').count().show())
    
    d1['ROC'] = p1
    d1['F1'] = p2
    d1['Accuracy'] = p3
    
    predictions_lr = cvModel_lr.transform(test_data)
    print(predictions_lr.groupBy('label','prediction').count().show())
    
    my_eval_lr = BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol='label', metricName='areaUnderROC')
    p4=my_eval_lr.evaluate(predictions_lr)
    
    my_mc_lr = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='label', metricName='accuracy')
    p5=my_mc_lr.evaluate(predictions_lr)
    
    my_mc_lr = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='label', metricName='f1')
    p6=my_mc_lr.evaluate(predictions_lr)
    
    d2['ROC'] = p4
    d2['F1']= p5
    d2['Accuracy'] = p6
    print("................................................................................................")
    return d1,d2

In [3]:
init_spark()
df=read_file("twitter.csv")
df=pre_process(df)
train_data,test_data=train_test_split(df)

initializing spark...
reading csv file...




preprocessing...
splitting dataset...


In [5]:
import time
d={}
e={}
#training the model and checking its accuracy
#for varying values of numFolds
for i in range(5,25,5):
    start=time.time()
    train_summary,test_summary=logistic_regression(train_data,test_data,20,i)
    print(train_summary)
    print(test_summary)
    d[i]=train_summary
    e[i]=test_summary
    elapsed=time.time()-start
    print("Time elapsed:",elapsed/60," mins.")

print("................................................................................................")
print("................................................................................................")
print("Train summary for varying no of numFolds:")
print("Nfolds        Accuracy")
for nf,acc in d.items():
    print(nf,"        ",acc['Accuracy'])
print("................................................................................................")
print("................................................................................................")
print("Test summary for varying no of numFolds:")
print("Nfolds        Accuracy")
for nf,acc in e.items():
    print(nf,"        ",acc['Accuracy'])
print("................................................................................................")
print("................................................................................................")

................................................................................................
Using logistic regression model with test_data...
+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  0.0|       0.0|20736|
|  1.0|       1.0| 1568|
|  1.0|       0.0|    5|
|  0.0|       1.0|    1|
+-----+----------+-----+

None
+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    1|       0.0|  278|
|    0|       0.0| 8794|
|    1|       1.0|  398|
|    0|       1.0|  182|
+-----+----------+-----+

None
................................................................................................
{'ROC': 0.9983865687373251, 'F1': 0.9997309040827688, 'Accuracy': 0.9997310623038996}
{'ROC': 0.7842405520572941, 'F1': 0.9523414836303357, 'Accuracy': 0.9506468904290225}
Time elapsed: 11.502746252218882  mins.
................................................................................................
Using logistic regression model wit

In [6]:
train_data,test_data=train_test_split(df,0.5,0.5)
#checking accuracy of the model when
#the train and test data is split 50-50

splitting dataset...


In [7]:
start=time.time()
train_summary,test_summary=logistic_regression(train_data,test_data)
print(train_summary)
print(test_summary)
elapsed=time.time()-start
print("Time elapsed:",elapsed/60," mins.")

................................................................................................
Using logistic regression model with test_data...
+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  1.0|       1.0| 1127|
|  0.0|       0.0|14919|
|  1.0|       0.0|    6|
|  0.0|       1.0|    1|
+-----+----------+-----+

None
+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    1|       0.0|  513|
|    0|       0.0|14516|
|    1|       1.0|  603|
|    0|       1.0|  277|
+-----+----------+-----+

None
................................................................................................
{'ROC': 0.9973186503363629, 'F1': 0.9995634988117358, 'Accuracy': 0.9995639444340622}
{'ROC': 0.7607987539878277, 'F1': 0.9503425733861336, 'Accuracy': 0.9476033989194182}
Time elapsed: 13.044400397936503  mins.


In [None]:
print("................................................................................................")
print("................................................................................................")
print("Train summary for varying no of numFolds:")
print("Nfolds        Accuracy")
for nf,acc in d.items():
    print(nf,"        ",acc['Accuracy'])
print("................................................................................................")
print("................................................................................................")
print("Test summary for varying no of numFolds:")
print("Nfolds        Accuracy")
for nf,acc in e.items():
    print(nf,"        ",acc['Accuracy'])
print("................................................................................................")
print("................................................................................................")