In [None]:
!pip install pyspark

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType

# Start (or reuse) a local Spark session for this notebook.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Natural Language Processing")
    .config("spark.executor.memory", "2gb")
    .getOrCreate()
)

# Read the CSV with Spark's built-in `csv` source (the old
# `com.databricks.spark.csv` name is only a legacy alias for it).
df = (
    spark.read.format('csv')
    .options(header='true', inferschema='true')
    .load('../input/fraud-email-dataset/fraud_email_.csv')
)

# Keep the two columns used downstream, renaming `Class` to `label`,
# and force the label to an integer column.
df = df.selectExpr("Class as label", "Text")
df = df.withColumn("label", df["label"].cast(IntegerType()))

# printSchema() writes to stdout itself and returns None, so wrapping it in
# print() would emit a stray "None" line.
df.printSchema()

# Work on the first 2000 rows only: pull them to the driver and rebuild a
# DataFrame with the same schema.
df = spark.createDataFrame(df.head(2000), df.schema)
df.show()

In [None]:
# Shape of the sampled frame, printed as a (rows, columns) tuple.
row_total = df.count()
col_total = len(df.columns)
print((row_total, col_total))

In [None]:
# Preview the first 20 rows (the default row count of show()).
df.show(n=20)

In [None]:
# Report how many rows are missing a label, drop them, then confirm the new
# row count. (Counting non-null labels before and after the drop would print
# the same number twice and show nothing about the drop.)
total_rows = df.count()
labelled_rows = df.filter(df.label.isNotNull()).count()
print("rows before drop: {} (missing label: {})".format(
    total_rows, total_rows - labelled_rows))

df = df.na.drop(subset=["label"])
print("rows after drop: {}".format(df.count()))

In [None]:
# Preview the first 20 rows of the current frame.
df.show(n=20)

In [None]:
# Class balance: number of rows per label, most frequent label first.
label_counts = df.groupBy("label").count()
label_counts.orderBy("count", ascending=False).show()

In [None]:
import pyspark.sql.functions as F
# Add `word_count`: the number of pieces obtained by splitting Text on a
# single space.
space_separated = F.split(F.col('Text'), ' ')
df = df.withColumn('word_count', F.size(space_separated))
df.show(10)

In [None]:
# Keep only rows whose label is 0 or 1. The filtered frame must be assigned
# back to `df`; filtering and collecting without assignment discards the
# result and leaves `df` unchanged. `label` is an integer column, so compare
# against integers rather than strings.
df = df.filter(df["label"].isin([0, 1]))
df.show(10)

In [None]:
# (rows, columns) of the frame at this point in the notebook.
frame_shape = (df.count(), len(df.columns))
print(frame_shape)

In [None]:
%%time
from pyspark.sql.functions import col, lower, regexp_replace, split

# Ordered (regex, replacement) rules applied by clean_text after lower-casing.
# All patterns are raw strings so that sequences such as `\b` reach the regex
# engine as written (in a normal Python string "\b" is a backspace character).
# Order matters:
#   * e-mail addresses are removed while they are still intact, before
#     `_`, `-`, `:` are turned into spaces;
#   * contractions are expanded before the generic apostrophe rules, which
#     would otherwise strip every apostrophe first and make them unmatchable.
_CLEANING_RULES = (
    # leftover encoded fragments and stray '='
    (r"=2e", ""),
    (r"=2c", ""),
    (r"=", ""),
    # specific URLs present in the corpus
    (r"news.website.http:/.*/.*502503.stm.", ""),
    (r"http://www.forcetacticalarmy.com", ""),
    # any whitespace-delimited token containing '@'
    (r"\S*@\S*", " "),
    # contractions (must precede the apostrophe rules below)
    (r"'ve", " have "),
    (r"can't", "can not "),
    (r"n't", " not "),
    (r"i'm", "i am "),
    (r"'re", " are "),
    (r"'d", " would "),
    # remaining possessives / apostrophes
    (r"'s", " "),
    (r"'", " "),
    # separators become spaces
    (r":", " "),
    (r"_", " "),
    (r"-", " "),
    # digits
    (r"\d", ""),
    # stand-alone single letters
    (r"\b[a-zA-Z]\b", ""),
    # punctuation; '|' is kept in the class because the original class
    # "[\,|\.|\&|\;|<|>]" also matched a literal pipe
    (r"[,.&;<>|]", ""),
)


def clean_text(reqText):
    """Lower-case a text column and apply the cleaning regexes in order.

    Args:
        reqText: the string column expression to clean; it is passed to
            ``lower`` and then repeatedly to ``regexp_replace``.

    Returns:
        The expression produced by chaining ``regexp_replace`` once per entry
        of ``_CLEANING_RULES`` on top of ``lower(reqText)``.
    """
    cleaned = lower(reqText)
    for pattern, replacement in _CLEANING_RULES:
        cleaned = regexp_replace(cleaned, pattern, replacement)
    return cleaned

# Build the cleaned frame: the cleaned text (still named "Text") plus label.
cleaned_text_col = clean_text(col("Text")).alias("Text")
clean_text_df = df.select(cleaned_text_col, col('label'))

# Inspect the resulting schema and the first 10 rows.
clean_text_df.printSchema()
clean_text_df.show(10)

In [None]:
# Tokenise the cleaned text. The cleaning step replaces many characters with
# spaces, so the text contains runs of consecutive spaces; splitting on a
# single ' ' would produce empty-string tokens that then flow into the
# stop-word and TF-IDF stages. Split on any whitespace run and drop whatever
# empty tokens remain (e.g. from leading/trailing whitespace).
clean_text_df = clean_text_df.withColumn(
    'words',
    F.array_remove(F.split(F.col('Text'), r'\s+'), ''),
)
clean_text_df.show()

In [None]:
# Stop words (lower-case) handed to the stop-word removal stage.
# Written as one whitespace-separated block and split into a list of strings.
stop_words = """
i me my myself we our ours ourselves
you your yours yourself yourselves he him
his himself she her hers herself it its
itself they them their theirs themselves
what which who whom this that these those
am is are was were be been being have
has had having do does did doing a an
the and but if or because as until while
of at by for with about against between
into through during before after above below
to from up down in out on off over under
again further then once here there when where
why how all any both each few more most
other some such no nor not only own same
so than too very can will just don should now
""".split()

In [None]:
%%time
from pyspark.ml.feature import StopWordsRemover
# Stage that copies `words` into `words without stop`, leaving out every
# entry of the custom stop-word list.
stopwordsRemovalFeature = StopWordsRemover(
    inputCol="words",
    outputCol="words without stop",
    stopWords=stop_words,
)

from pyspark.ml import Pipeline

# Wrap the stage in a one-step pipeline, fit it, and apply it to the frame.
stopWordRemovalPipeline = Pipeline(stages=[stopwordsRemovalFeature])
pipelineFitRemoveStopWords = stopWordRemovalPipeline.fit(clean_text_df)
clean_text_df = pipelineFitRemoveStopWords.transform(clean_text_df)


In [None]:
# Compare the raw tokens with the stop-word-filtered tokens for 20 rows.
preview_cols = ['words', 'words without stop', 'label']
clean_text_df.select(*preview_cols).show(n=20)

In [None]:
%%time
import pyspark.ml.feature as feat
# Term frequencies: hash the filtered tokens into a 500-dimensional vector.
TF_ = feat.HashingTF(
    inputCol="words without stop",
    outputCol="rawFeatures",
    numFeatures=500,
)
# Inverse document frequency: rescale the hashed counts into `features`.
IDF_ = feat.IDF(inputCol=TF_.getOutputCol(), outputCol="features")

# NOTE(review): the IDF stage is fitted on every row of clean_text_df. If a
# train/test split is taken from this frame afterwards, the held-out rows
# have already contributed to the IDF statistics.
pipelineTFIDF = Pipeline(stages=[TF_, IDF_])
pipelineFit = pipelineTFIDF.fit(clean_text_df)
clean_text_df = pipelineFit.transform(clean_text_df)


In [None]:
clean_text_df.show(20)

In [None]:
%%time
from pyspark.ml.classification import LogisticRegression

# 75 % / 25 % random split with a fixed seed.
splits = clean_text_df.randomSplit([0.75, 0.25], seed=7)
trainingDF, testDF = splits

# Logistic regression with regularisation strength 0.25; no column names are
# passed, so the estimator's default feature/label columns are used.
logreg = LogisticRegression(regParam=0.25)
logregModel = logreg.fit(trainingDF)

# Score the held-out rows and preview 20 predictions.
predictionDF = logregModel.transform(testDF)
predictionDF.select('label', 'probability', 'prediction').show(n=20)

In [None]:
%%time
from sklearn import metrics
# Bring labels and predictions to pandas in a single job so the two columns
# stay row-aligned; two separate toPandas() calls would recompute the
# predictions twice and depend on both jobs returning rows in the same order.
scored = predictionDF.select('label', 'prediction').toPandas()
actual = scored[['label']]
predicted = scored[['prediction']]

In [None]:
%%time
# Format the percentage with one decimal place. Rounding first and then
# multiplying by 100 can print float artefacts such as 93.30000000000001.
accuracy = metrics.accuracy_score(actual, predicted)
print('accuracy score: {:.1f}%'.format(accuracy * 100))