In [1]:
import pandas as pd
from pyspark.sql.functions import when
from pyspark.ml import Pipeline
from pyspark.ml.feature import StopWordsRemover
from pyspark.sql.types import IntegerType, ArrayType, BooleanType, StringType
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, RegexTokenizer,NGram,HashingTF,IDF
from pyspark.sql.functions import concat,col
from pyspark.ml.classification import LogisticRegression,LogisticRegressionModel
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit,CrossValidator

In [2]:
spark = SparkSession \
    .builder\
    .master('local')\
    .appName('twitter')\
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

In [3]:
df = spark.read.option("delimiter", ";").option("header", True).csv("Twitter.csv")

In [4]:
dfn=df.dropna()

df1 = dfn.withColumn("Note", when(dfn.sentiment =="Positive" ,1)
                                                    .when(dfn.sentiment=="Negative" ,0))

In [5]:
#Tokenzed
tokenizer = Tokenizer(inputCol="Tweet content", outputCol="words")
tokenized = tokenizer.transform(df1)
tokenized.show()

+--------+-----------+---------+--------------------+----+--------------------+
|Tweet ID|     entity|sentiment|       Tweet content|Note|               words|
+--------+-----------+---------+--------------------+----+--------------------+
|    2401|Borderlands| Positive|im getting on bor...|   1|[im, getting, on,...|
|    2401|Borderlands| Positive|I am coming to th...|   1|[i, am, coming, t...|
|    2401|Borderlands| Positive|im getting on bor...|   1|[im, getting, on,...|
|    2401|Borderlands| Positive|im coming on bord...|   1|[im, coming, on, ...|
|    2401|Borderlands| Positive|im getting on bor...|   1|[im, getting, on,...|
|    2401|Borderlands| Positive|im getting into b...|   1|[im, getting, int...|
|    2402|Borderlands| Positive|So I spent a few ...|   1|[so, i, spent, a,...|
|    2402|Borderlands| Positive|So I spent a coup...|   1|[so, i, spent, a,...|
|    2402|Borderlands| Positive|So I spent a few ...|   1|[so, i, spent, a,...|
|    2402|Borderlands| Positive|So I spe

In [6]:
#Stop Words
remover = StopWordsRemover()
remover.setInputCol("words")
remover.setOutputCol("Resultat")
df= remover.transform(tokenized)
df.show()

+--------+-----------+---------+--------------------+----+--------------------+--------------------+
|Tweet ID|     entity|sentiment|       Tweet content|Note|               words|            Resultat|
+--------+-----------+---------+--------------------+----+--------------------+--------------------+
|    2401|Borderlands| Positive|im getting on bor...|   1|[im, getting, on,...|[im, getting, bor...|
|    2401|Borderlands| Positive|I am coming to th...|   1|[i, am, coming, t...|[coming, borders,...|
|    2401|Borderlands| Positive|im getting on bor...|   1|[im, getting, on,...|[im, getting, bor...|
|    2401|Borderlands| Positive|im coming on bord...|   1|[im, coming, on, ...|[im, coming, bord...|
|    2401|Borderlands| Positive|im getting on bor...|   1|[im, getting, on,...|[im, getting, bor...|
|    2401|Borderlands| Positive|im getting into b...|   1|[im, getting, int...|[im, getting, bor...|
|    2402|Borderlands| Positive|So I spent a few ...|   1|[so, i, spent, a,...|[spent, hour

In [7]:
#Ngram
ngram = NGram(n=2)
ngram.setInputCol("Resultat")
ngram.setOutputCol("Ngram")
ngramDataFrame = ngram.transform(df)


In [8]:
#Hashing
hashingTF = HashingTF(inputCol="Ngram", outputCol="features")
hashingTF.setNumFeatures(2)
ls=hashingTF.transform(ngramDataFrame)

In [9]:
#IDF
idf = IDF(inputCol="features", outputCol="idf")
idfModel = idf.fit(ls)
rescaledData = idfModel.transform(ls)

In [10]:
#LogisticRegression
lr = LogisticRegression(featuresCol='idf',
    labelCol='Note')

In [11]:
train,test=rescaledData.randomSplit([0.8,0.2],seed=12345)
lrModel = lr.fit(train)
lrModel=lrModel.transform(test)

In [12]:
# Fit the model
# lrModel = lr.fit(rescaledData)
# lrModel=lrModel.transform(rescaledData)

In [13]:
evaluator2 = BinaryClassificationEvaluator(labelCol="Note", rawPredictionCol="idf", metricName='areaUnderROC')
evaluator2.evaluate(lrModel)

0.4497367491240839

In [14]:
lrModel.show(500)

+--------+--------------------+---------+--------------------+----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|Tweet ID|              entity|sentiment|       Tweet content|Note|               words|            Resultat|               Ngram|            features|                 idf|       rawPrediction|         probability|prediction|
+--------+--------------------+---------+--------------------+----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|    1000|      AssassinsCreed| Positive|4 Won't have any ...|   1|[4, won't, have, ...|[4, real, problem...|[4 real, real pro...|(2,[0,1],[17.0,14...|(2,[0,1],[2.13652...|[0.50453712793139...|[0.62352497919181...|       0.0|
|   10004|PlayerUnknownsBat...| Positive|"""Deleted!"" ""G...|   1|["""deleted!"", "...|["""dele

In [15]:
pipeline = Pipeline(stages=[tokenizer,remover,ngram, hashingTF, idf , lr])
model_pip = pipeline.fit(df1)

In [16]:
model_pip.write().overwrite().save("model_pip")

In [17]:
# dfn = spark.read.option("delimiter", ";").option("header", True).csv("test.csv")

In [18]:
dfn.show()

+--------+-----------+---------+--------------------+
|Tweet ID|     entity|sentiment|       Tweet content|
+--------+-----------+---------+--------------------+
|    2401|Borderlands| Positive|im getting on bor...|
|    2401|Borderlands| Positive|I am coming to th...|
|    2401|Borderlands| Positive|im getting on bor...|
|    2401|Borderlands| Positive|im coming on bord...|
|    2401|Borderlands| Positive|im getting on bor...|
|    2401|Borderlands| Positive|im getting into b...|
|    2402|Borderlands| Positive|So I spent a few ...|
|    2402|Borderlands| Positive|So I spent a coup...|
|    2402|Borderlands| Positive|So I spent a few ...|
|    2402|Borderlands| Positive|So I spent a few ...|
|    2402|Borderlands| Positive|2010 So I spent a...|
|    2402|Borderlands| Positive|                 was|
|    2404|Borderlands| Positive|that was the firs...|
|    2404|Borderlands| Positive|this was the firs...|
|    2404|Borderlands| Positive|that was the firs...|
|    2404|Borderlands| Posit

In [19]:
dff = model_pip.transform(df1)

df1.select('Note').show(500)

+----+
|Note|
+----+
|   1|
|   1|
|   1|
|   1|
|   1|
|   1|
|   1|
|   1|
|   1|
|   1|
|   1|
|   1|
|   1|
|   1|
|   1|
|   1|
|   1|
|   1|
|   0|
|   0|
|   0|
|   0|
|   0|
|   0|
|   1|
|   1|
|   1|
|   1|
|   1|
|   1|
|   0|
|   0|
|   0|
|   0|
|   0|
|   0|
|   0|
|   0|
|   0|
|   0|
|   0|
|   0|
|   1|
|   1|
|   1|
|   1|
|   1|
|   1|
|   1|
|   1|
|   1|
|   1|
|   1|
|   1|
|   1|
|   1|
|   1|
|   1|
|   1|
|   1|
|   0|
|   0|
|   0|
|   0|
|   0|
|   0|
|   0|
|   0|
|   0|
|   0|
|   0|
|   0|
|   0|
|   0|
|   0|
|   0|
|   0|
|   0|
|   1|
|   1|
|   1|
|   1|
|   1|
|   1|
|   1|
|   1|
|   1|
|   1|
|   1|
|   1|
|   0|
|   0|
|   0|
|   0|
|   0|
|   0|
|   1|
|   1|
|   1|
|   1|
|   1|
|   1|
|   1|
|   1|
|   1|
|   1|
|   1|
|   1|
|   1|
|   1|
|   1|
|   1|
|   1|
|   1|
|   1|
|   1|
|   1|
|   1|
|   1|
|   1|
|   1|
|   1|
|   1|
|   1|
|   1|
|   1|
|   0|
|   0|
|   0|
|   0|
|   0|
|   0|
|   1|
|   1|
|   1|
|   1|
|   1|
|   1|
|   0|
|   0|

In [20]:
dff.select('prediction').show((500))

+----------+
|prediction|
+----------+
|       1.0|
|       1.0|
|       1.0|
|       1.0|
|       1.0|
|       1.0|
|       0.0|
|       0.0|
|       0.0|
|       0.0|
|       0.0|
|       1.0|
|       0.0|
|       0.0|
|       0.0|
|       0.0|
|       0.0|
|       0.0|
|       0.0|
|       0.0|
|       0.0|
|       0.0|
|       0.0|
|       0.0|
|       0.0|
|       0.0|
|       0.0|
|       0.0|
|       0.0|
|       0.0|
|       0.0|
|       0.0|
|       0.0|
|       0.0|
|       0.0|
|       0.0|
|       0.0|
|       0.0|
|       0.0|
|       0.0|
|       0.0|
|       0.0|
|       0.0|
|       0.0|
|       0.0|
|       0.0|
|       0.0|
|       0.0|
|       0.0|
|       1.0|
|       1.0|
|       0.0|
|       0.0|
|       0.0|
|       1.0|
|       1.0|
|       1.0|
|       1.0|
|       1.0|
|       1.0|
|       0.0|
|       0.0|
|       0.0|
|       0.0|
|       0.0|
|       0.0|
|       1.0|
|       0.0|
|       0.0|
|       1.0|
|       0.0|
|       1.0|
|       1.0|
|       1.0|