In [None]:
# Google Colab setup for Spark and PySpark: mount Google Drive at
# /content/gdrive so the notebook can reach files stored there.
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
#instalar java y spark
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/spark-3.5.2/spark-3.5.2-bin-hadoop3.tgz
!tar xf spark-3.5.2-bin-hadoop3.tgz
!pip install -q findspark

import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.5.2-bin-hadoop3"

import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()
sc = spark.sparkContext

In [None]:
from pyspark.sql import SparkSession

# Option 1 for creating the Spark session and context:
# build (or reuse) a local session, then take the context from it.
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)
sc = spark.sparkContext

# Option 2 for creating the Spark session and context:
#sc = SparkContext.getOrCreate()
#spark=SparkSession.builder.appName('nlp').getOrCreate()

In [None]:
#myrdd = sc.wholeTextFiles('../datasets/papers_sample_pdf/*.txt')
#df = myrdd.toDF(schema=['filename','content'])
#df.show(5)

In [None]:
# Toy corpus: one (user_id, content) tuple per review.
reviews = [
    (1, 'I really liked this movie'),
    (2, 'I would recommend this movie to my friends'),
    (3, 'movie was alright but acting was horrible'),
    (4, 'I am never watching that movie ever again'),
]
df = spark.createDataFrame(reviews, schema=['user_id', 'content'])

In [None]:
# Print the schema of `df` to check its column names and types.
df.printSchema()

In [None]:
# Tokenization: derive a `tokens` column from the `content` column.
from pyspark.ml.feature import Tokenizer

tokenization = Tokenizer(inputCol='content', outputCol='tokens')
tokenized_df = tokenization.transform(df)

# Inspect the resulting schema and the first rows.
tokenized_df.printSchema()
tokenized_df.show(n=5)


In [None]:
# Stop-word removal: derive `refined_tokens` from `tokens`.
from pyspark.ml.feature import StopWordsRemover

stopword_removal = StopWordsRemover(
    inputCol='tokens',
    outputCol='refined_tokens',
)
refined_df = stopword_removal.transform(tokenized_df)

# Show both columns side by side without truncating the cell contents.
refined_df.select('tokens', 'refined_tokens').show(n=10, truncate=False)

In [None]:
# Display the column names of `refined_df` as the cell output.
refined_df.columns

In [None]:
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import *

In [None]:
from pyspark.sql.functions import col, size

# Kept only so any other cell that still references `len_udf` keeps working;
# the token count below no longer uses it.
len_udf = udf(lambda s: len(s), IntegerType())

# Count the tokens with Spark's built-in `size` column function instead of a
# Python UDF, so the length is computed by Spark itself rather than by calling
# back into Python for every row.
# NOTE(review): `size` does not raise on a null array the way `len(None)` would;
# `refined_tokens` is presumably never null here - confirm if inputs change.
refined_count_df = refined_df.withColumn("token_count", size(col('refined_tokens')))


In [None]:
from pyspark.sql.functions import rand

# Show the rows in a shuffled order. The seed is fixed so that re-running the
# notebook is intended to give the same ordering instead of a new one each time.
refined_count_df.orderBy(rand(seed=42)).show(10)

In [None]:
# Count Vectorizer: turn `refined_tokens` into a `features` column and
# extract the learned vocabulary (bag of words).
from pyspark.ml.feature import CountVectorizer

count_vec = CountVectorizer(inputCol='refined_tokens', outputCol='features')

# Fit once and reuse the fitted model for both the transform and the
# vocabulary, instead of fitting the same estimator twice on the same data.
cv_model = count_vec.fit(refined_df)
cv_df = cv_model.transform(refined_df)
cv_df.select(['refined_tokens', 'features']).show(4, False)

bow = cv_model.vocabulary
print(bow)


In [None]:
# TF with HashingTF
from pyspark.ml.feature import HashingTF

# One option: use the size of the bag of words as numFeatures.
l = len(bow)
hashing_vec = HashingTF(
    inputCol='refined_tokens',
    outputCol='tf_features',
    numFeatures=l,
)
#hashing_vec=HashingTF(inputCol='refined_tokens',outputCol='tf_features',numFeatures=11)
# Exercise: compare and interpret the output with and without numFeatures:
#hashing_vec=HashingTF(inputCol='refined_tokens',outputCol='tf_features')

hashing_df = hashing_vec.transform(refined_df)
hashing_df.show(n=4)



In [None]:
# TF-IDF: fit an IDF model on the hashed term frequencies, then apply it
# to add a `tf_idf_features` column.
from pyspark.ml.feature import IDF

tf_idf_vec = IDF(inputCol='tf_features', outputCol='tf_idf_features')
idf_model = tf_idf_vec.fit(hashing_df)
tf_idf_df = idf_model.transform(hashing_df)

# Show the first rows without truncating the vector columns.
tf_idf_df.show(n=4, truncate=False)