In [1]:
# verificar que tengan instalado la librer√≠a 'pyspark'
!pip install pyspark



In [2]:
# Start a Spark session (or reuse an already running one) under the app name 'nlp'.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName('nlp')
    .getOrCreate()
)

In [3]:
# Toy corpus: four (user_id, review) rows used to demonstrate the text transformers.
df = spark.createDataFrame(
    data=[
        (1, 'I really liked this movie'),
        (2, 'I would recommend this movie to my friends'),
        (3, 'movie was alright but acting was horrible'),
        (4, 'I am never watching that movie ever again'),
    ],
    schema=['user_id', 'review'],
)

In [4]:
# Display up to 5 rows with full (untruncated) cell contents.
df.show(n=5, truncate=False)

+-------+------------------------------------------+
|user_id|review                                    |
+-------+------------------------------------------+
|1      |I really liked this movie                 |
|2      |I would recommend this movie to my friends|
|3      |movie was alright but acting was horrible |
|4      |I am never watching that movie ever again |
+-------+------------------------------------------+



In [5]:
# Tokenization

In [6]:
from pyspark.ml.feature import Tokenizer

In [7]:
# Tokenizer stage: reads the 'review' column and writes its result to 'tokens'.
tokenization = Tokenizer(
    inputCol='review',
    outputCol='tokens',
)

In [8]:
# Run the tokenizer over the toy reviews; the result carries an extra 'tokens' column.
tokenized_df = tokenization.transform(dataset=df)

In [9]:
# Display up to 4 tokenized rows with full (untruncated) cell contents.
tokenized_df.show(n=4, truncate=False)

+-------+------------------------------------------+---------------------------------------------------+
|user_id|review                                    |tokens                                             |
+-------+------------------------------------------+---------------------------------------------------+
|1      |I really liked this movie                 |[i, really, liked, this, movie]                    |
|2      |I would recommend this movie to my friends|[i, would, recommend, this, movie, to, my, friends]|
|3      |movie was alright but acting was horrible |[movie, was, alright, but, acting, was, horrible]  |
|4      |I am never watching that movie ever again |[i, am, never, watching, that, movie, ever, again] |
+-------+------------------------------------------+---------------------------------------------------+



In [10]:
# Stop-word removal

In [11]:
from pyspark.ml.feature import StopWordsRemover

In [12]:
# Stop-word filter: reads 'tokens' and writes the filtered list to 'refined_tokens'.
# No custom word list is passed, so the remover's default list applies.
stopword_removal = StopWordsRemover(
    inputCol='tokens',
    outputCol='refined_tokens',
)

In [13]:
# Apply the stop-word filter to the tokenized toy reviews.
refined_df = stopword_removal.transform(dataset=tokenized_df)

In [14]:
# Compare the tokens before and after stop-word removal (up to 10 rows, untruncated).
refined_df.select('user_id', 'tokens', 'refined_tokens').show(n=10, truncate=False)

+-------+---------------------------------------------------+----------------------------------+
|user_id|tokens                                             |refined_tokens                    |
+-------+---------------------------------------------------+----------------------------------+
|1      |[i, really, liked, this, movie]                    |[really, liked, movie]            |
|2      |[i, would, recommend, this, movie, to, my, friends]|[recommend, movie, friends]       |
|3      |[movie, was, alright, but, acting, was, horrible]  |[movie, alright, acting, horrible]|
|4      |[i, am, never, watching, that, movie, ever, again] |[never, watching, movie, ever]    |
+-------+---------------------------------------------------+----------------------------------+



In [15]:
# Movie reviews

In [16]:
# Load the movie-review dataset from a comma-separated file whose first row is the
# header; column types are inferred from the data.
text_df = spark.read.csv(
    path='../datasets/movie_reviews.csv',
    sep=',',
    header=True,
    inferSchema=True,
)

In [17]:
# Print the column names and the types inferred while reading the CSV.
# NOTE(review): 'Sentiment' is reported as string even though the schema was inferred,
# which suggests the column holds some non-numeric values — confirm before casting.
text_df.printSchema()

root
 |-- Review: string (nullable = true)
 |-- Sentiment: string (nullable = true)



In [18]:
# Number of rows loaded from the CSV.
text_df.count()

7087

In [19]:
# Data Cleaning

In [20]:
# Tokenizer for the loaded dataset: reads the capitalised 'Review' column and writes 'tokens'.
# This rebinds the `tokenization` name used earlier for the toy example.
tokenization = Tokenizer(
    inputCol='Review',
    outputCol='tokens',
)

In [21]:
# Tokenize the movie reviews; this rebinds `tokenized_df`, replacing the toy-example frame.
tokenized_df = tokenization.transform(dataset=text_df)

In [22]:
# Preview the tokenized reviews with the default show() settings
# (long values appear truncated in the rendered table).
tokenized_df.show()

+--------------------+---------+--------------------+
|              Review|Sentiment|              tokens|
+--------------------+---------+--------------------+
|The Da Vinci Code...|        1|[the, da, vinci, ...|
|this was the firs...|        1|[this, was, the, ...|
|i liked the Da Vi...|        1|[i, liked, the, d...|
|i liked the Da Vi...|        1|[i, liked, the, d...|
|I liked the Da Vi...|        1|[i, liked, the, d...|
|that's not even a...|        1|[that's, not, eve...|
|I loved the Da Vi...|        1|[i, loved, the, d...|
|i thought da vinc...|        1|[i, thought, da, ...|
|The Da Vinci Code...|        1|[the, da, vinci, ...|
|I thought the Da ...|        1|[i, thought, the,...|
|The Da Vinci Code...|        1|[the, da, vinci, ...|
|The Da Vinci Code...|        1|[the, da, vinci, ...|
|then I turn on th...|        1|[then, i, turn, o...|
|The Da Vinci Code...|        1|[the, da, vinci, ...|
|i love da vinci c...|        1|[i, love, da, vin...|
|i loved da vinci ...|      

In [23]:
# Stop-word filter for the movie reviews: reads 'tokens', writes 'refined_tokens'.
# Same configuration as the toy example; the name `stopword_removal` is rebound.
stopword_removal = StopWordsRemover(
    inputCol='tokens',
    outputCol='refined_tokens',
)

In [24]:
# Remove stop words from the tokenized movie reviews.
refined_text_df = stopword_removal.transform(dataset=tokenized_df)

In [25]:
# Preview the reviews with both the raw and the stop-word-filtered token columns.
refined_text_df.show()

+--------------------+---------+--------------------+--------------------+
|              Review|Sentiment|              tokens|      refined_tokens|
+--------------------+---------+--------------------+--------------------+
|The Da Vinci Code...|        1|[the, da, vinci, ...|[da, vinci, code,...|
|this was the firs...|        1|[this, was, the, ...|[first, clive, cu...|
|i liked the Da Vi...|        1|[i, liked, the, d...|[liked, da, vinci...|
|i liked the Da Vi...|        1|[i, liked, the, d...|[liked, da, vinci...|
|I liked the Da Vi...|        1|[i, liked, the, d...|[liked, da, vinci...|
|that's not even a...|        1|[that's, not, eve...|[even, exaggerati...|
|I loved the Da Vi...|        1|[i, loved, the, d...|[loved, da, vinci...|
|i thought da vinc...|        1|[i, thought, da, ...|[thought, da, vin...|
|The Da Vinci Code...|        1|[the, da, vinci, ...|[da, vinci, code,...|
|I thought the Da ...|        1|[i, thought, the,...|[thought, da, vin...|
|The Da Vinci Code...|   

In [26]:
# `udf` and `IntegerType` are used to build the token-length UDF in the next cell.
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType
# The wildcard import supplies `col` and `rand`, which later cells use.
# NOTE(review): it also brings in functions whose names match Python built-ins
# (e.g. `sum`, `max`), shadowing them in this notebook — prefer explicit imports.
from pyspark.sql.functions import *

In [27]:
# Python UDF returning the length of its argument as an integer.
# It is no longer used in this cell; it stays defined in case later cells reference it.
len_udf = udf(lambda s: len(s), IntegerType())

# Add 'token_count': the number of tokens left after stop-word removal.
# The built-in `size` column function (available through the wildcard import of
# pyspark.sql.functions) replaces the Python UDF, so Spark evaluates the length
# natively instead of passing every row through a Python worker.
# NOTE(review): a null array would give a sentinel/null from `size` where the UDF
# would raise; 'refined_tokens' comes from StopWordsRemover, so nulls are
# presumably not expected — confirm.
refined_text_df = refined_text_df.withColumn("token_count", size(col('refined_tokens')))


In [28]:
# Show 10 rows in shuffled order as a sanity check of the new 'token_count' column.
# A fixed seed keeps the preview stable when the notebook is re-run from a fresh
# kernel; change or drop the seed to look at a different sample.
refined_text_df.orderBy(rand(seed=42)).show(10)

+--------------------+---------+--------------------+--------------------+-----------+
|              Review|Sentiment|              tokens|      refined_tokens|token_count|
+--------------------+---------+--------------------+--------------------+-----------+
|The Da Vinci Code...|        0|[the, da, vinci, ...|[da, vinci, code,...|          6|
|by the way, the D...|        0|[by, the, way,, t...|[way,, da, vinci,...|          7|
|i love being a se...|        1|[i, love, being, ...|[love, sentry, mi...|          6|
|friday hung out w...|        0|[friday, hung, ou...|[friday, hung, ke...|          9|
|I want to be here...|        1|[i, want, to, be,...|[want, love, harr...|          7|
|Mission Impossibl...|        1|[mission, impossi...|[mission, impossi...|          4|
|So as felicia's m...|        1|[so, as, felicia'...|[felicia's, mom, ...|          7|
|I want to be here...|        1|[i, want, to, be,...|[want, love, harr...|          7|
|The Da Vinci Code...|        1|[the, da, v