In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.ml.feature import Tokenizer,StopWordsRemover
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.feature import IDF
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator,ParamGridBuilder
import numpy as np
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import NaiveBayes
from pyspark.sql import SQLContext
import pyspark as ps
from pyspark.ml.classification import LinearSVC
import warnings
from nltk.stem.snowball import SnowballStemmer
import pandas
import matplotlib.pyplot as plt
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import GBTClassifier
import pandas as pd

In [2]:
def init_spark():
    try:
        sc = ps.SparkContext('local[*]')
        sqlContext = SQLContext(sc)
        print("Just created a SparkContext")
    except ValueError:
        warnings.warn("SparkContext already exists in this scope")
    spark=SparkSession.builder.getOrCreate()

In [3]:
def read_file(fileName):
    df=spark.read.csv(fileName,sep=",",inferSchema=True,header=False)
    return df

In [18]:
def pre_process(df):
#     df.printSchema()
#     df.show(truncate=False)
    df.count()
    
    df1=df.withColumnRenamed('_c0',"id").withColumnRenamed('_c1','label').withColumnRenamed('_c2','tweet')
#     df1.printSchema()
#     df1.show(truncate=False)
    
    df2 = df1.withColumn('tweet', regexp_replace('tweet', '[^a-z0-9A-Z`~!@#$%&<>?., ]', ''))
#     df2.show(truncate=False)
    
    df3 = df2.withColumn('tweet', regexp_replace('tweet', '[0-9`~!@#$%&<>?,\']', ''))
#     df3.show(truncate=False)
    
    df4 = df3.withColumn('tweet', regexp_replace('tweet', 'http://*.*.com', ''))
#     df4.show(truncate=False)
    
    df5 = df4.withColumn('tweet', regexp_replace('tweet', 'www.*.com', ''))
#     df5.show(truncate=False)
    
    df6 = df5.withColumn('tweet', regexp_replace('tweet', '\.', ''))
#     df6.show(truncate=False)
    
    tokenizer=Tokenizer(inputCol="tweet",outputCol="words")
    wordData=tokenizer.transform(df6)
#     wordData.show()
    
    remover=StopWordsRemover(inputCol="words",outputCol="word_clean")
    word_clean_data=remover.transform(wordData)
#     word_clean_data.show()
    
    stemmer = SnowballStemmer(language='english')
    stemmer_udf = udf(lambda tokens: [stemmer.stem(token) for token in tokens])
    count=CountVectorizer(inputCol="word_clean",outputCol="rawFeatures")
#     print(count)
    
    model=count.fit(word_clean_data)
#     print(model)
    
    featurizedData=model.transform(word_clean_data)
#     featurizedData.show()
    idf=IDF(inputCol="rawFeatures",outputCol="features")
    idfModel=idf.fit(featurizedData)
    rescaledData=idfModel.transform(featurizedData)
#     rescaledData.select("label","features").show()
    return rescaledData

In [1]:
def train_test_split(df):
    seed=0
    trainDf,testDf=df.randomSplit([0.7,0.3],seed)
    trainDf.count()
    testDf.count()
    return trainDf,testDf

In [None]:
def logistic regression(train_data,test_data):
    #complete this function
    #similarly write functions for other methods
    #where train and test data are provided as arguments

In [19]:
init_spark()
df=read_file("twitter.csv")
df=pre_process(df)
train_data,test_data=train_test_split(df)
train_data.show(5)

  import sys


+---+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
| id|label|               tweet|               words|          word_clean|         rawFeatures|            features|
+---+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
|  2|    0|user user thanks ...|[user, user, than...|[user, user, than...|(39572,[0,1,19,22...|(39572,[0,1,19,22...|
|  3|    1|user that was fuc...|[user, that, was,...|[user, fucking, w...|(39572,[1,525,126...|(39572,[1,525,126...|
|  4|    1|userthat was so s...|[userthat, was, s...|  [userthat, shitty]|(39572,[2934,3100...|(39572,[2934,3100...|
|  6|    0| huge fan fare an...|[, huge, fan, far...|[, huge, fan, far...|(39572,[0,15,107,...|(39572,[0,15,107,...|
|  7|    0| user camping tom...|[, user, camping,...|[, user, camping,...|(39572,[0,1,57,19...|(39572,[0,1,57,19...|
+---+-----+--------------------+--------------------+-----------