In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import DataFrame

In [2]:
# Reuse the already-running Spark session if there is one, otherwise start one.
spark = SparkSession.builder.getOrCreate()

# Point the "fs.gs.impl" setting at the Google Cloud Storage Hadoop connector
# class so that gs:// paths can be resolved.
spark.conf.set(
    "fs.gs.impl",
    "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem",
)

In [4]:
# Load the book descriptions (columns: book_id, description) from the project bucket.
df_cluster = spark.read.parquet(
    "gs://msca-bdp-project-goodreads/clustering_desc_v1.parquet"
)

In [25]:
#df_cluster.select('description').count()

                                                                                

1573789

In [5]:
#df_cluster.show()

[Stage 4:>                                                          (0 + 1) / 1]

+--------+--------------------+
| book_id|         description|
+--------+--------------------+
|26331592|"Some people say,...|
| 2307748|Many of the world...|
|13022011|When The Clyde Ra...|
|13249442|Koskinen had retu...|
|13171109|Paul and Angela K...|
|   59992|Wolverine's vacat...|
| 1186942|For anyone lookin...|
| 1123543|Experience the Ma...|
| 1123542|The noted artist ...|
|  370426|One of the more i...|
|25175834|Dirty
A poem by B...|
| 1123549|"I am Johannes Ve...|
|  370429|Henry David Thore...|
|35577588|Exploding from th...|
|17833502|Rand figures out ...|
|16137105|Latoa County was ...|
| 6676766|Ghostly diners, v...|
| 6676765|From Brierley Hil...|
|  340853|In this luminous ...|
|  340854|In the soothing d...|
+--------+--------------------+
only showing top 20 rows



                                                                                

In [5]:
#df_cluster.printSchema()

root
 |-- book_id: string (nullable = true)
 |-- description: string (nullable = true)



In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col
from pyspark.sql.types import StringType

def remove_urls(text):
    """Strip ``http://``, ``https://`` and ``www.`` style URLs from ``text``.

    Parameters
    ----------
    text : str or None
        Raw description text. ``None`` is accepted because the source
        ``description`` column is nullable and a Spark UDF receives ``None``
        for a null cell.

    Returns
    -------
    str or None
        ``text`` with every URL match replaced by the empty string, or
        ``None`` when ``text`` is ``None``.
    """
    # Null descriptions must pass through untouched; re.sub() on None raises
    # TypeError and would fail the whole Spark task.
    if text is None:
        return None

    # Imported locally so the function stays self-contained when it is
    # shipped to executors as a UDF.
    import re
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub('', text)

def remove_symbols(text):
    """Delete a fixed set of punctuation characters and all ASCII digits.

    The characters removed are ``. , ! @ - ' " * ? ~`` and ``0``-``9``.
    Everything else, including whitespace, is left as is.

    Parameters
    ----------
    text : str or None
        Text to clean. ``None`` is accepted because the upstream column is
        nullable and a Spark UDF receives ``None`` for a null cell.

    Returns
    -------
    str or None
        ``text`` without the listed characters, or ``None`` when ``text`` is
        ``None``.
    """
    # Null input must pass through; calling .replace()/.translate() on None
    # raises AttributeError and would fail the whole Spark task.
    if text is None:
        return None

    symbols_to_remove = '.,!@-\'"*?~1234567890'
    # One translate() pass deletes every listed character, instead of one
    # full-string replace() per symbol.
    return text.translate(str.maketrans('', '', symbols_to_remove))

# Wrap the plain-Python cleaners as Spark UDFs that return strings.
remove_urls_udf = udf(remove_urls, StringType())
remove_symbols_udf = udf(remove_symbols, StringType())

# Raw and intermediate text columns that are discarded once cleaning is done.
to_drop = ['desc_wo_urls', 'description']

# description -> desc_wo_urls -> desc_wo_symbols; only the last one is kept.
# Note: df_cluster is overwritten and `description` is dropped, so this cell
# cannot be re-run without reloading df_cluster first.
df_cluster = (
    df_cluster
    .withColumn("desc_wo_urls", remove_urls_udf(col("description")))
    .withColumn("desc_wo_symbols", remove_symbols_udf(col("desc_wo_urls")))
    .drop(*to_drop)
)

In [6]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF
from pyspark.ml import Pipeline
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

# Text preparation stages: tokenise the cleaned description, then filter stop words.
tokenizer = Tokenizer(outputCol="words", inputCol="desc_wo_symbols")
stop_words_remover = StopWordsRemover(outputCol="filtered_words", inputCol="words")

# Bundle both stages so they are fitted and applied together; the result
# replaces df_cluster and adds the `words` and `filtered_words` columns.
pipeline = Pipeline(stages=[tokenizer, stop_words_remover])
pipeline_model = pipeline.fit(df_cluster)
df_cluster = pipeline_model.transform(df_cluster)

# Term-frequency vectors via feature hashing of the filtered tokens.
hashing_tf = HashingTF(outputCol="hashing_tf_features", inputCol="filtered_words")
tf_data = hashing_tf.transform(df_cluster)

# Re-weight the term frequencies by inverse document frequency -> `features`.
# `hashing_tf` and `idf` are reused as stages of the clustering pipeline in a
# later cell, so these names must stay defined.
idf = IDF(outputCol="features", inputCol="hashing_tf_features")
idf_model = idf.fit(tf_data)
vectorized_data = idf_model.transform(tf_data)

                                                                                

In [None]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, RegexTokenizer, CountVectorizer
from pyspark.ml.clustering import KMeans
from pyspark.ml import Pipeline
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
import re
from pyspark.ml.evaluation import ClusteringEvaluator

# 5. Unsupervised Clustering (KMeans)
# A fixed seed keeps the cluster ids stable across runs. The notebook later
# maps specific cluster ids to hand-picked genre names, which is only
# meaningful if the same id refers to the same cluster every time.
KMEANS_SEED = 42

# NOTE(review): the recorded groupby output lists cluster ids 0-8 and the
# genre mapping covers ids 0-7, neither of which k=7 can produce -- confirm
# the intended k before trusting the genre labels.
kmeans = KMeans(featuresCol="features", k=7, predictionCol="cluster", seed=KMEANS_SEED)

# TF -> IDF -> KMeans fitted as one pipeline on the tokenised descriptions.
pipeline = Pipeline(stages=[hashing_tf, idf, kmeans])
model = pipeline.fit(df_cluster)
clustered_data = model.transform(df_cluster)

# Silhouette score of the resulting assignment.
evaluator = ClusteringEvaluator(featuresCol="features", predictionCol="cluster", metricName="silhouette")
silhouette = evaluator.evaluate(clustered_data)

23/11/26 22:33:47 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 4.1 MiB
23/11/26 22:34:52 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 4.1 MiB
23/11/26 22:34:53 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 4.1 MiB
23/11/26 22:34:53 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 4.1 MiB
23/11/26 22:34:53 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 4.1 MiB
23/11/26 22:34:54 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 4.1 MiB
23/11/26 22:34:54 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 4.1 MiB
23/11/26 22:34:57 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 4.1 MiB
23/11/26 22:34:58 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary w

In [18]:
#print(f"Silhouette Score: {silhouette}")

Silhouette Score: 0.1160237163747209


In [10]:
from pyspark.sql.functions import count

#clustered_data.select('cluster').distinct().show()

In [19]:
#clustered_data.groupby("cluster").agg(count("*").alias("cluster_count")).show()

23/11/26 23:03:59 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 22.1 MiB
23/11/26 23:04:50 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 22.1 MiB
23/11/26 23:04:51 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 22.1 MiB


+-------+-------------+
|cluster|cluster_count|
+-------+-------------+
|      1|          475|
|      6|            8|
|      3|          522|
|      5|       321466|
|      4|           17|
|      8|            1|
|      7|          146|
|      2|         5517|
|      0|      1245640|
+-------+-------------+



23/11/26 23:04:51 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 22.1 MiB


In [None]:
from pyspark.sql.functions import when

# Attach a hand-assigned genre label to every row based on its cluster id.
# The trailing otherwise() is the safety net: without it, any cluster id that
# is not listed below would silently get a NULL Genre.
cluster_id = clustered_data["cluster"]
clustered_data = clustered_data.withColumn(
    "Genre",
    when(cluster_id == 0, "Romance")
    .when(cluster_id == 1, "Literature & Education")
    .when(cluster_id == 2, "Religion & Inspirational")
    .when(cluster_id == 3, "Science Fiction & Fantasy")
    .when(cluster_id == 4, "Crime & Mystery")
    .when(cluster_id == 5, "Romance")
    .when(cluster_id == 6, "Biography & Memoir")
    .when(cluster_id == 7, "Others")
    .otherwise("Others"),
)

In [None]:
#0 - superhero/sci-fi
#1 - history/education
#2 - thriller/crime/mystery
#3 - biography
#4 - religion/spiritual
#5 - self-help
#6 - comedy
#7 - other (adventure/cooking)

# Crime/Thriller/Mystery/Action
# History/Education/Literature
# Spiritual/Self Help
# Comedy/Satire
# Romance
# Others
# Autobiography/Biography/Memoir
# Sci-Fi

In [10]:
# Keep only the book identifier and its assigned genre label for export.
final_cluster_df = clustered_data.select(['book_id', 'Genre'])

In [11]:
# Export the (book_id, Genre) pairs to BigQuery, replacing any existing table.
# The GCS bucket is used by the connector as temporary staging space.
(
    final_cluster_df.write
    .format("bigquery")
    .option("temporaryGcsBucket", "msca-bdp-project-goodreads")
    .option("table", "msca-bdp-student-ap.Goodreads_Project.genres_cluster")
    .mode("overwrite")
    .save()
)

23/11/26 21:30:36 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 20.3 MiB
                                                                                

In [20]:
# Read the exported genre table back from BigQuery.
book_genres = (
    spark.read
    .format("bigquery")
    .option("table", "msca-bdp-student-ap.Goodreads_Project.genres_cluster")
    .load()
)

In [24]:
# List every distinct genre label, printing the full value of each.
(
    book_genres
    .select('Genre')
    .distinct()
    .show(truncate=False)
)

+----------------------+
|Genre                 |
+----------------------+
|History/Education     |
|Religion/Spiritual    |
|Biograph              |
|Thriller/Crime/Mystery|
|Self Help             |
|Cooking/Other Hobbies |
|Comedy                |
|Sci-Fi                |
+----------------------+



In [26]:
from pyspark.sql.functions import when

# Translate the old genre labels stored in BigQuery to the new naming scheme.
# Any value that is not one of the old labels (for example a row that already
# carries a new-style label) is kept as is; without the otherwise() it would
# be overwritten with NULL.
genre = book_genres["Genre"]
book_genres = book_genres.withColumn(
    "Genre",
    when(genre == 'Comedy', "Romance")
    .when(genre == 'History/Education', "Literature & Education")
    .when(genre == 'Religion/Spiritual', "Religion & Inspirational")
    .when(genre == 'Sci-Fi', "Science Fiction & Fantasy")
    .when(genre == 'Thriller/Crime/Mystery', "Crime & Mystery")
    .when(genre == 'Self Help', "Romance")
    .when(genre == 'Biograph', "Biography & Memoir")
    .when(genre == 'Cooking/Other Hobbies', "Others")
    .otherwise(genre),
)