In [1]:
pip install nltk

Collecting nltk
  Downloading nltk-3.7-py3-none-any.whl (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 1.7 MB/s eta 0:00:01     |█████████████▋                  | 634 kB 1.7 MB/s eta 0:00:01
Collecting regex>=2021.8.3
  Downloading regex-2022.3.15-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (764 kB)
[K     |████████████████████████████████| 764 kB 1.4 MB/s eta 0:00:01
Installing collected packages: regex, nltk
Successfully installed nltk-3.7 regex-2022.3.15
Note: you may need to restart the kernel to use updated packages.


In [2]:
#imports
import pandas as pd
from pyspark.sql.functions import when
from pyspark.ml import Pipeline
from pyspark.ml.feature import StopWordsRemover
from pyspark.sql.types import IntegerType, ArrayType, BooleanType, StringType
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, RegexTokenizer,NGram,HashingTF,IDF
from pyspark.sql.functions import concat,col
from pyspark.ml.classification import LogisticRegression,LogisticRegressionModel
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit,CrossValidator
import nltk.corpus 
from nltk.corpus import stopwords

In [3]:
#lancement de la session spark
spark = SparkSession \
    .builder\
    .master('local')\
    .appName('twitter')\
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

In [4]:
#creation de la df a partir du csv twitter
df = spark.read.option("delimiter", ";").option("header", True).csv("Twitter.csv")

In [5]:
#nettoyage de la df
dfn=df.dropna()

In [6]:
#Creation de la colonne Note
df1 = dfn.withColumn("Note", when(dfn.sentiment =="Positive" ,1)
                                                    .when(dfn.sentiment=="Negative" ,0))

In [7]:
#Tokenzed
tokenizer = Tokenizer(inputCol="Tweet content", outputCol="words")
tokenized = tokenizer.transform(df1)
tokenized.show()

+--------+-----------+---------+--------------------+----+--------------------+
|Tweet ID|     entity|sentiment|       Tweet content|Note|               words|
+--------+-----------+---------+--------------------+----+--------------------+
|    2401|Borderlands| Positive|im getting on bor...|   1|[im, getting, on,...|
|    2401|Borderlands| Positive|I am coming to th...|   1|[i, am, coming, t...|
|    2401|Borderlands| Positive|im getting on bor...|   1|[im, getting, on,...|
|    2401|Borderlands| Positive|im coming on bord...|   1|[im, coming, on, ...|
|    2401|Borderlands| Positive|im getting on bor...|   1|[im, getting, on,...|
|    2401|Borderlands| Positive|im getting into b...|   1|[im, getting, int...|
|    2402|Borderlands| Positive|So I spent a few ...|   1|[so, i, spent, a,...|
|    2402|Borderlands| Positive|So I spent a coup...|   1|[so, i, spent, a,...|
|    2402|Borderlands| Positive|So I spent a few ...|   1|[so, i, spent, a,...|
|    2402|Borderlands| Positive|So I spe

In [8]:
#Telechargement des stop word de la bib nltk
nltk.download('stopwords')
add_stopwords= stopwords.words('english')

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [9]:
#Stop Words
remover = StopWordsRemover(stopWords=add_stopwords)
remover.setInputCol("words")
remover.setOutputCol("Resultat")
df= remover.transform(tokenized)
df.show()

+--------+-----------+---------+--------------------+----+--------------------+--------------------+
|Tweet ID|     entity|sentiment|       Tweet content|Note|               words|            Resultat|
+--------+-----------+---------+--------------------+----+--------------------+--------------------+
|    2401|Borderlands| Positive|im getting on bor...|   1|[im, getting, on,...|[im, getting, bor...|
|    2401|Borderlands| Positive|I am coming to th...|   1|[i, am, coming, t...|[coming, borders,...|
|    2401|Borderlands| Positive|im getting on bor...|   1|[im, getting, on,...|[im, getting, bor...|
|    2401|Borderlands| Positive|im coming on bord...|   1|[im, coming, on, ...|[im, coming, bord...|
|    2401|Borderlands| Positive|im getting on bor...|   1|[im, getting, on,...|[im, getting, bor...|
|    2401|Borderlands| Positive|im getting into b...|   1|[im, getting, int...|[im, getting, bor...|
|    2402|Borderlands| Positive|So I spent a few ...|   1|[so, i, spent, a,...|[spent, hour

In [10]:
#Ngram
ngram = NGram(n=2)
ngram.setInputCol("Resultat")
ngram.setOutputCol("Ngram")
ngramDataFrame = ngram.transform(df)
ngramDataFrame=ngramDataFrame.na.drop()

In [11]:
#Hashing
hashingTF = HashingTF(inputCol="Ngram", outputCol="features")
hashing=hashingTF.transform(ngramDataFrame)

In [13]:
#IDF
idf = IDF(inputCol="features", outputCol="idf")
idfModel = idf.fit(hashing)
df_idf = idfModel.transform(hashing)

In [14]:
#LogisticRegression
lr = LogisticRegression(featuresCol='idf',
    labelCol='Note')

In [15]:
#Split sur la df
train,test=df_idf.randomSplit([0.8,0.2],seed=12345)
lrModel = lr.fit(train)
out=lrModel.transform(test)

In [16]:
#BinaryClassificationEvaluator
evaluator2 = BinaryClassificationEvaluator(labelCol="Note", rawPredictionCol="prediction", metricName='areaUnderROC')
evaluator2.evaluate(out)

0.878396819620521

In [17]:
#Initialisation de la pipeline
pipeline = Pipeline(stages=[tokenizer,remover,ngram, hashingTF, idf , lr])
#entrainement de la pip 
model_pip = pipeline.fit(df1)

In [18]:
#Save de la pipeline
model_pip.write().overwrite().save("model_pip")