In [1]:
#configuración en google colab de spark y pyspark
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
!pip install pyspark

Collecting pyspark
[?25l  Downloading https://files.pythonhosted.org/packages/f0/26/198fc8c0b98580f617cb03cb298c6056587b8f0447e20fa40c5b634ced77/pyspark-3.0.1.tar.gz (204.2MB)
[K     |████████████████████████████████| 204.2MB 64kB/s 
[?25hCollecting py4j==0.10.9
[?25l  Downloading https://files.pythonhosted.org/packages/9e/b6/6a4fb90cd235dc8e265a6a2067f2a2c99f0d91787f06aca4bcf7c23f3f80/py4j-0.10.9-py2.py3-none-any.whl (198kB)
[K     |████████████████████████████████| 204kB 47.5MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.0.1-py2.py3-none-any.whl size=204612243 sha256=239777c22434e92fdb36e494e9f264e35779d535e951d708248d5cbb37f044cf
  Stored in directory: /root/.cache/pip/wheels/5e/bd/07/031766ca628adec8435bb40f0bd83bb676ce65ff4007f8e73f
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.0.1


In [4]:
#create spark session
from pyspark.sql import SparkSession

spark=SparkSession.builder.appName('nlp').getOrCreate()

In [5]:
df=spark.createDataFrame([(1,'I really liked this movie'),
                         (2,'I would recommend this movie to my friends'),
                         (3,'movie was alright but acting was horrible'),
                         (4,'I am never watching that movie ever again')],
                        ['user_id','review'])

In [6]:
df.show(5,False)

+-------+------------------------------------------+
|user_id|review                                    |
+-------+------------------------------------------+
|1      |I really liked this movie                 |
|2      |I would recommend this movie to my friends|
|3      |movie was alright but acting was horrible |
|4      |I am never watching that movie ever again |
+-------+------------------------------------------+



In [7]:
# Tokenization

In [8]:
from pyspark.ml.feature import Tokenizer

In [9]:
tokenization=Tokenizer(inputCol='review',outputCol='tokens')

In [10]:
tokenized_df=tokenization.transform(df)

In [11]:
tokenized_df.show(4,False)

+-------+------------------------------------------+---------------------------------------------------+
|user_id|review                                    |tokens                                             |
+-------+------------------------------------------+---------------------------------------------------+
|1      |I really liked this movie                 |[i, really, liked, this, movie]                    |
|2      |I would recommend this movie to my friends|[i, would, recommend, this, movie, to, my, friends]|
|3      |movie was alright but acting was horrible |[movie, was, alright, but, acting, was, horrible]  |
|4      |I am never watching that movie ever again |[i, am, never, watching, that, movie, ever, again] |
+-------+------------------------------------------+---------------------------------------------------+



In [12]:
# stopwords removal 

In [13]:
from pyspark.ml.feature import StopWordsRemover

In [14]:
stopword_removal=StopWordsRemover(inputCol='tokens',outputCol='refined_tokens')

In [15]:
refined_df=stopword_removal.transform(tokenized_df)

In [16]:
refined_df.select(['user_id','tokens','refined_tokens']).show(10,False)

+-------+---------------------------------------------------+----------------------------------+
|user_id|tokens                                             |refined_tokens                    |
+-------+---------------------------------------------------+----------------------------------+
|1      |[i, really, liked, this, movie]                    |[really, liked, movie]            |
|2      |[i, would, recommend, this, movie, to, my, friends]|[recommend, movie, friends]       |
|3      |[movie, was, alright, but, acting, was, horrible]  |[movie, alright, acting, horrible]|
|4      |[i, am, never, watching, that, movie, ever, again] |[never, watching, movie, ever]    |
+-------+---------------------------------------------------+----------------------------------+



In [17]:
# Count Vectorizer

In [18]:
from pyspark.ml.feature import CountVectorizer

In [19]:
count_vec=CountVectorizer(inputCol='refined_tokens',outputCol='features')

In [20]:
cv_df=count_vec.fit(refined_df).transform(refined_df)

In [21]:
cv_df.select(['user_id','refined_tokens','features']).show(4,False)

+-------+----------------------------------+---------------------------------+
|user_id|refined_tokens                    |features                         |
+-------+----------------------------------+---------------------------------+
|1      |[really, liked, movie]            |(11,[0,2,3],[1.0,1.0,1.0])       |
|2      |[recommend, movie, friends]       |(11,[0,6,7],[1.0,1.0,1.0])       |
|3      |[movie, alright, acting, horrible]|(11,[0,1,5,10],[1.0,1.0,1.0,1.0])|
|4      |[never, watching, movie, ever]    |(11,[0,4,8,9],[1.0,1.0,1.0,1.0]) |
+-------+----------------------------------+---------------------------------+



In [22]:
count_vec.fit(refined_df).vocabulary

['movie',
 'recommend',
 'ever',
 'never',
 'really',
 'acting',
 'horrible',
 'liked',
 'watching',
 'alright',
 'friends']

In [23]:
#Tf-idf

In [28]:
from pyspark.ml.feature import HashingTF,IDF

In [30]:
hashing_vec=HashingTF(inputCol='refined_tokens',outputCol='tf_features',numFeatures=100)

In [31]:
hashing_df=hashing_vec.transform(refined_df)

In [32]:
hashing_df.select(['user_id','refined_tokens','tf_features']).show(4,False)

+-------+----------------------------------+-------------------------------------+
|user_id|refined_tokens                    |tf_features                          |
+-------+----------------------------------+-------------------------------------+
|1      |[really, liked, movie]            |(100,[12,39,88],[1.0,1.0,1.0])       |
|2      |[recommend, movie, friends]       |(100,[16,39,99],[1.0,1.0,1.0])       |
|3      |[movie, alright, acting, horrible]|(100,[5,23,39,66],[1.0,1.0,1.0,1.0]) |
|4      |[never, watching, movie, ever]    |(100,[39,75,81,94],[1.0,1.0,1.0,1.0])|
+-------+----------------------------------+-------------------------------------+



In [33]:
tf_idf_vec=IDF(inputCol='tf_features',outputCol='tf_idf_features')

In [34]:
tf_idf_df=tf_idf_vec.fit(hashing_df).transform(hashing_df)

In [35]:
tf_idf_df.select(['user_id','tf_idf_features']).show(4,False)

+-------+----------------------------------------------------------------------------------+
|user_id|tf_idf_features                                                                   |
+-------+----------------------------------------------------------------------------------+
|1      |(100,[12,39,88],[0.9162907318741551,0.0,0.9162907318741551])                      |
|2      |(100,[16,39,99],[0.9162907318741551,0.0,0.9162907318741551])                      |
|3      |(100,[5,23,39,66],[0.9162907318741551,0.9162907318741551,0.0,0.9162907318741551]) |
|4      |(100,[39,75,81,94],[0.0,0.9162907318741551,0.9162907318741551,0.9162907318741551])|
+-------+----------------------------------------------------------------------------------+



In [None]:
# Classification 

In [36]:
text_df=spark.read.csv('/content/gdrive/My\ Drive/github/ari20202/datasets/movie_reviews.csv',inferSchema=True,header=True,sep=',')

In [37]:
text_df.printSchema()

root
 |-- Review: string (nullable = true)
 |-- Sentiment: string (nullable = true)



In [38]:
text_df.count()

7087

In [39]:
from pyspark.sql.functions import rand 

In [40]:
text_df.orderBy(rand()).show(10,False)

+---------------------------------------------------------------------------+---------+
|Review                                                                     |Sentiment|
+---------------------------------------------------------------------------+---------+
|Then snuck into Brokeback Mountain, which is the most depressing movie I   |0        |
|man i loved brokeback mountain!                                            |1        |
|I OFFICIALLY * HATE * BROKEBACK MOUNTAIN!!!!!!!!!!!                        |0        |
|"I love the Harry Potter series if you can count that as "" a "" book      | also"   |
|I'm telling you, the Da Vinci Code is an AWESOME book!                     |1        |
|I hate Harry Potter..                                                      |0        |
|""" Brokeback Mountain "" was a beautiful movie and it haunted me for days"|1        |
|If you don't know that I love reading Harry Potter, you probably have ne   |1        |
|Harry Potter will suck you in l

In [41]:
text_df=text_df.filter(((text_df.Sentiment =='1') | (text_df.Sentiment =='0')))

In [42]:
text_df.count()

6990

In [43]:
text_df.groupBy('Sentiment').count().show()

+---------+-----+
|Sentiment|count|
+---------+-----+
|        0| 3081|
|        1| 3909|
+---------+-----+



In [44]:
text_df.printSchema()

root
 |-- Review: string (nullable = true)
 |-- Sentiment: string (nullable = true)



In [45]:
text_df = text_df.withColumn("Label", text_df.Sentiment.cast('float')).drop('Sentiment')

In [46]:
text_df.orderBy(rand()).show(10,False)

+------------------------------------------------------------------------+-----+
|Review                                                                  |Label|
+------------------------------------------------------------------------+-----+
|Ok brokeback mountain is such a horrible movie.                         |0.0  |
|Brokeback Mountain is fucking horrible..                                |0.0  |
|Ok brokeback mountain is such a horrible movie.                         |0.0  |
|I love Harry Potter.                                                    |1.0  |
|I love Brokeback Mountain....                                           |1.0  |
|I enjoy reading Harry Potter.                                           |1.0  |
|i loved the da Vinci code, even though it was the second book in the ser|1.0  |
|"Anyway, thats why I love "" Brokeback Mountain."                       |1.0  |
|Brokeback Mountain was boring.                                          |0.0  |
|we're gonna like watch Miss

In [47]:
text_df.groupBy('label').count().show()

+-----+-----+
|label|count|
+-----+-----+
|  1.0| 3909|
|  0.0| 3081|
+-----+-----+



In [48]:
# Add length to the dataframe
from pyspark.sql.functions import length

In [49]:
text_df=text_df.withColumn('length',length(text_df['Review']))

In [50]:
text_df.orderBy(rand()).show(10,False)

+------------------------------------------------------------------------+-----+------+
|Review                                                                  |Label|length|
+------------------------------------------------------------------------+-----+------+
|Brokeback Mountain is an excellent movie, I love it after watching it!  |1.0  |70    |
|by the way, the Da Vinci Code sucked, just letting you know...          |0.0  |62    |
|Brokeback Mountain is packed with beautiful sceneries and such.         |1.0  |63    |
|This quiz sucks and Harry Potter sucks ok bye..                         |0.0  |47    |
|Because I would like to make friends who like the same things I like, an|1.0  |72    |
|da vinci code sucks...                                                  |0.0  |22    |
|da vinci code sucked, IMO.                                              |0.0  |26    |
|da vinci code sucks...                                                  |0.0  |22    |
|I love Harry Potter..          

In [51]:
text_df.groupBy('Label').agg({'Length':'mean'}).show()

+-----+-----------------+
|Label|      avg(Length)|
+-----+-----------------+
|  1.0|47.61882834484523|
|  0.0|50.95845504706264|
+-----+-----------------+



In [52]:
# Data Cleaning

In [53]:
tokenization=Tokenizer(inputCol='Review',outputCol='tokens')

In [54]:
tokenized_df=tokenization.transform(text_df)

In [55]:
tokenized_df.show()

+--------------------+-----+------+--------------------+
|              Review|Label|length|              tokens|
+--------------------+-----+------+--------------------+
|The Da Vinci Code...|  1.0|    39|[the, da, vinci, ...|
|this was the firs...|  1.0|    72|[this, was, the, ...|
|i liked the Da Vi...|  1.0|    32|[i, liked, the, d...|
|i liked the Da Vi...|  1.0|    32|[i, liked, the, d...|
|I liked the Da Vi...|  1.0|    72|[i, liked, the, d...|
|that's not even a...|  1.0|    72|[that's, not, eve...|
|I loved the Da Vi...|  1.0|    72|[i, loved, the, d...|
|i thought da vinc...|  1.0|    57|[i, thought, da, ...|
|The Da Vinci Code...|  1.0|    45|[the, da, vinci, ...|
|I thought the Da ...|  1.0|    51|[i, thought, the,...|
|The Da Vinci Code...|  1.0|    68|[the, da, vinci, ...|
|The Da Vinci Code...|  1.0|    62|[the, da, vinci, ...|
|then I turn on th...|  1.0|    66|[then, i, turn, o...|
|The Da Vinci Code...|  1.0|    34|[the, da, vinci, ...|
|i love da vinci c...|  1.0|   

In [56]:
stopword_removal=StopWordsRemover(inputCol='tokens',outputCol='refined_tokens')

In [57]:
refined_text_df=stopword_removal.transform(tokenized_df)

In [58]:
refined_text_df.show()

+--------------------+-----+------+--------------------+--------------------+
|              Review|Label|length|              tokens|      refined_tokens|
+--------------------+-----+------+--------------------+--------------------+
|The Da Vinci Code...|  1.0|    39|[the, da, vinci, ...|[da, vinci, code,...|
|this was the firs...|  1.0|    72|[this, was, the, ...|[first, clive, cu...|
|i liked the Da Vi...|  1.0|    32|[i, liked, the, d...|[liked, da, vinci...|
|i liked the Da Vi...|  1.0|    32|[i, liked, the, d...|[liked, da, vinci...|
|I liked the Da Vi...|  1.0|    72|[i, liked, the, d...|[liked, da, vinci...|
|that's not even a...|  1.0|    72|[that's, not, eve...|[even, exaggerati...|
|I loved the Da Vi...|  1.0|    72|[i, loved, the, d...|[loved, da, vinci...|
|i thought da vinc...|  1.0|    57|[i, thought, da, ...|[thought, da, vin...|
|The Da Vinci Code...|  1.0|    45|[the, da, vinci, ...|[da, vinci, code,...|
|I thought the Da ...|  1.0|    51|[i, thought, the,...|[thought

In [59]:
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import *

In [60]:
len_udf = udf(lambda s: len(s), IntegerType())

refined_text_df = refined_text_df.withColumn("token_count", len_udf(col('refined_tokens')))


In [61]:
refined_text_df.orderBy(rand()).show(10)

+--------------------+-----+------+--------------------+--------------------+-----------+
|              Review|Label|length|              tokens|      refined_tokens|token_count|
+--------------------+-----+------+--------------------+--------------------+-----------+
|Brokeback Mountai...|  1.0|    31|[brokeback, mount...|[brokeback, mount...|          3|
|the story of Harr...|  1.0|    72|[the, story, of, ...|[story, harry, po...|          9|
|the people who ar...|  1.0|    67|[the, people, who...|[people, worth, k...|          8|
|dudeee i LOVED br...|  1.0|    37|[dudeee, i, loved...|[dudeee, loved, b...|          4|
|The Da Vinci Code...|  1.0|    30|[the, da, vinci, ...|[da, vinci, code,...|          4|
|DA VINCI CODE IS ...|  1.0|    26|[da, vinci, code,...|[da, vinci, code,...|          4|
|i heard da vinci ...|  0.0|    53|[i, heard, da, vi...|[heard, da, vinci...|          9|
|Which is why i sa...|  1.0|    72|[which, is, why, ...|[said, silent, hi...|          8|
|Da Vinci 

In [62]:
count_vec=CountVectorizer(inputCol='refined_tokens',outputCol='features')

In [63]:
cv_text_df=count_vec.fit(refined_text_df).transform(refined_text_df)

In [64]:
cv_text_df.select(['refined_tokens','token_count','features','Label']).show(10)

+--------------------+-----------+--------------------+-----+
|      refined_tokens|token_count|            features|Label|
+--------------------+-----------+--------------------+-----+
|[da, vinci, code,...|          5|(2302,[0,1,4,43,2...|  1.0|
|[first, clive, cu...|          9|(2302,[11,51,229,...|  1.0|
|[liked, da, vinci...|          5|(2302,[0,1,4,53,3...|  1.0|
|[liked, da, vinci...|          5|(2302,[0,1,4,53,3...|  1.0|
|[liked, da, vinci...|          8|(2302,[0,1,4,53,6...|  1.0|
|[even, exaggerati...|          6|(2302,[46,229,271...|  1.0|
|[loved, da, vinci...|          8|(2302,[0,1,22,30,...|  1.0|
|[thought, da, vin...|          7|(2302,[0,1,4,228,...|  1.0|
|[da, vinci, code,...|          6|(2302,[0,1,4,33,2...|  1.0|
|[thought, da, vin...|          7|(2302,[0,1,4,223,...|  1.0|
+--------------------+-----------+--------------------+-----+
only showing top 10 rows



In [65]:
#select data for building model
model_text_df=cv_text_df.select(['features','token_count','Label'])

In [66]:
from pyspark.ml.feature import VectorAssembler

In [67]:
df_assembler = VectorAssembler(inputCols=['features','token_count'],outputCol='features_vec')
model_text_df = df_assembler.transform(model_text_df)

In [68]:
model_text_df.printSchema()

root
 |-- features: vector (nullable = true)
 |-- token_count: integer (nullable = true)
 |-- Label: float (nullable = true)
 |-- features_vec: vector (nullable = true)



In [69]:
from pyspark.ml.classification import LogisticRegression

In [70]:
#split the data 
training_df,test_df=model_text_df.randomSplit([0.75,0.25])

In [71]:
training_df.groupBy('Label').count().show()

+-----+-----+
|Label|count|
+-----+-----+
|  1.0| 2955|
|  0.0| 2338|
+-----+-----+



In [72]:
test_df.groupBy('Label').count().show()

+-----+-----+
|Label|count|
+-----+-----+
|  1.0|  954|
|  0.0|  743|
+-----+-----+



In [73]:
log_reg=LogisticRegression(featuresCol='features_vec',labelCol='Label').fit(training_df)

In [74]:
results=log_reg.evaluate(test_df).predictions

In [76]:
results.select(['Label','probability','prediction']).show()

+-----+--------------------+----------+
|Label|         probability|prediction|
+-----+--------------------+----------+
|  1.0|[2.43303938046421...|       1.0|
|  1.0|[2.15960909585201...|       1.0|
|  0.0|[0.99999999999975...|       0.0|
|  1.0|[3.05437097306203...|       1.0|
|  1.0|[2.94912250898419...|       1.0|
|  1.0|[4.06604451841251...|       1.0|
|  1.0|[4.03440976136216...|       1.0|
|  1.0|[4.03440976136216...|       1.0|
|  1.0|[4.03440976136216...|       1.0|
|  1.0|[4.03440976136216...|       1.0|
|  1.0|[4.03440976136216...|       1.0|
|  1.0|[4.03440976136216...|       1.0|
|  1.0|[4.03440976136216...|       1.0|
|  1.0|[4.03440976136216...|       1.0|
|  1.0|[4.03440976136216...|       1.0|
|  1.0|[4.03440976136216...|       1.0|
|  1.0|[4.03440976136216...|       1.0|
|  1.0|[4.03440976136216...|       1.0|
|  1.0|[4.03440976136216...|       1.0|
|  1.0|[4.03440976136216...|       1.0|
+-----+--------------------+----------+
only showing top 20 rows



In [77]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator


In [78]:
#confusion matrix
true_postives = results[(results.Label == 1) & (results.prediction == 1)].count()
true_negatives = results[(results.Label == 0) & (results.prediction == 0)].count()
false_positives = results[(results.Label == 0) & (results.prediction == 1)].count()
false_negatives = results[(results.Label == 1) & (results.prediction == 0)].count()

In [79]:
recall = float(true_postives)/(true_postives + false_negatives)
print(recall)

0.9853249475890985


In [80]:
precision = float(true_postives) / (true_postives + false_positives)
print(precision)

0.9690721649484536


In [81]:
accuracy=float((true_postives+true_negatives) /(results.count()))
print(accuracy)

0.9740718915733647
