# Code cải thiện
- 1 : Thay thế TF-IDF bằng Word2Vec

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, Word2Vec
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


In [2]:
# Sentiment classification: Word2Vec document embeddings + logistic regression.
# Uses the names imported in the notebook's import cell (SparkSession, col,
# Tokenizer, StopWordsRemover, Word2Vec, LogisticRegression, Pipeline,
# MulticlassClassificationEvaluator).

# 1. Initialize Spark Session
spark = SparkSession.builder.appName("SentimentAnalysis").getOrCreate()

# 2. Load Data
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
data_path = "D:/Study/NLP/Lab/week4/sentiments.csv"
df = spark.read.csv(data_path, header=True, inferSchema=True)
# Normalize sentiment labels: -1 -> 0, 1 -> 1.
df = df.withColumn("label", (col("sentiment").cast("integer") + 1) / 2)
# Drop rows with a null sentiment. count() is a Spark action that re-executes
# the (uncached) plan, so each DataFrame is counted exactly once and the
# numbers are reused in the message below.
initial_row_count = df.count()
df = df.dropna(subset=["sentiment"])
final_row_count = df.count()
print(f"Loaded {initial_row_count} rows initially, dropped {initial_row_count - final_row_count} null rows, final count: {final_row_count}")

# Split the data into training and test sets (seeded for reproducibility).
trainingData, testData = df.randomSplit([0.8, 0.2], seed=42)

# 3. Build Preprocessing Pipeline
# Tokenizer: splits the "text" column into tokens.
tokenizer = Tokenizer(inputCol="text", outputCol="words")

# StopWordsRemover: removes common stop words from the token list.
stopwordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered_words")

# Word2Vec: learns word embeddings and averages them into one document vector.
# The seed is fixed so that the learned embeddings, and therefore the metrics
# printed below, are repeatable across runs, consistent with the seeded split.
word2Vec = Word2Vec(
    vectorSize=100,
    minCount=0,
    seed=42,
    inputCol="filtered_words",
    outputCol="features",
)

# 4. Train the Model
# LogisticRegression: the classifier fitted on the document vectors.
lr = LogisticRegression(maxIter=10, regParam=0.001, featuresCol="features", labelCol="label")

# Assemble every stage into a single Pipeline and fit it on the training split.
pipeline = Pipeline(stages=[tokenizer, stopwordsRemover, word2Vec, lr])
model = pipeline.fit(trainingData)

# 5. Evaluate the Model
# Apply the fitted pipeline to the held-out split to obtain predictions.
predictions = model.transform(testData)

# One MulticlassClassificationEvaluator per reported metric.
evaluator_accuracy = MulticlassClassificationEvaluator(metricName="accuracy", labelCol="label", predictionCol="prediction")
accuracy = evaluator_accuracy.evaluate(predictions)
print(f"Accuracy: {accuracy:.4f}")

evaluator_f1 = MulticlassClassificationEvaluator(metricName="f1", labelCol="label", predictionCol="prediction")
f1 = evaluator_f1.evaluate(predictions)
print(f"F1 Score: {f1:.4f}")

evaluator_precision = MulticlassClassificationEvaluator(metricName="weightedPrecision", labelCol="label", predictionCol="prediction")
precision = evaluator_precision.evaluate(predictions)
print(f"Weighted Precision: {precision:.4f}")

evaluator_recall = MulticlassClassificationEvaluator(metricName="weightedRecall", labelCol="label", predictionCol="prediction")
recall = evaluator_recall.evaluate(predictions)
print(f"Weighted Recall: {recall:.4f}")

# Release Spark resources; the next cell creates its own session.
spark.stop()

Loaded 5792 rows initially, dropped 1 null rows, final count: 5791
Accuracy: 0.6411
F1 Score: 0.5710
Weighted Precision: 0.6222
Weighted Recall: 0.6411


- 2 : Thay thế LogisticRegression bằng NaiveBayes

In [3]:
from pyspark.ml.classification import LogisticRegression, NaiveBayes
# Sentiment classification: TF-IDF features + Naive Bayes.
# Uses names imported earlier in the notebook plus NaiveBayes from the import
# line directly above this block.

# 1. Initialize Spark Session
spark = SparkSession.builder.appName("SentimentAnalysis").getOrCreate()

# 2. Load Data
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
data_path = "D:/Study/NLP/Lab/week4/sentiments.csv"
df = spark.read.csv(data_path, header=True, inferSchema=True)
# Normalize sentiment labels: -1 -> 0, 1 -> 1.
df = df.withColumn("label", (col("sentiment").cast("integer") + 1) / 2)
# Drop rows with a null sentiment. count() is a Spark action that re-executes
# the (uncached) plan, so each DataFrame is counted exactly once and the
# numbers are reused in the message below.
initial_row_count = df.count()
df = df.dropna(subset=["sentiment"])
final_row_count = df.count()
print(f"Loaded {initial_row_count} rows initially, dropped {initial_row_count - final_row_count} null rows, final count: {final_row_count}")

# Split the data into training and test sets (seeded for reproducibility).
trainingData, testData = df.randomSplit([0.8, 0.2], seed=42)

# 3. Build Preprocessing Pipeline
# Tokenizer: splits the "text" column into tokens.
tokenizer = Tokenizer(inputCol="text", outputCol="words")

# StopWordsRemover: removes common stop words from the token list.
stopwordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered_words")

# HashingTF: hashes tokens into a fixed-size term-frequency vector.
hashingTF = HashingTF(inputCol="filtered_words", outputCol="raw_features", numFeatures=10000)

# IDF: rescales the term-frequency vectors produced by HashingTF.
idf = IDF(inputCol="raw_features", outputCol="features")

# 4. Train the Model
# NaiveBayes: the classifier fitted on the TF-IDF features.
nb = NaiveBayes(featuresCol="features", labelCol="label")

# Assemble every stage into a single Pipeline and fit it on the training split.
pipeline = Pipeline(stages=[tokenizer, stopwordsRemover, hashingTF, idf, nb])
model = pipeline.fit(trainingData)

# 5. Evaluate the Model
# Apply the fitted pipeline to the held-out split to obtain predictions.
predictions = model.transform(testData)

# One MulticlassClassificationEvaluator per reported metric.
evaluator_accuracy = MulticlassClassificationEvaluator(metricName="accuracy", labelCol="label", predictionCol="prediction")
accuracy = evaluator_accuracy.evaluate(predictions)
print(f"Accuracy: {accuracy:.4f}")

evaluator_f1 = MulticlassClassificationEvaluator(metricName="f1", labelCol="label", predictionCol="prediction")
f1 = evaluator_f1.evaluate(predictions)
print(f"F1 Score: {f1:.4f}")

evaluator_precision = MulticlassClassificationEvaluator(metricName="weightedPrecision", labelCol="label", predictionCol="prediction")
precision = evaluator_precision.evaluate(predictions)
print(f"Weighted Precision: {precision:.4f}")

evaluator_recall = MulticlassClassificationEvaluator(metricName="weightedRecall", labelCol="label", predictionCol="prediction")
recall = evaluator_recall.evaluate(predictions)
print(f"Weighted Recall: {recall:.4f}")

# Release Spark resources; the next cell creates its own session.
spark.stop()

Loaded 5792 rows initially, dropped 1 null rows, final count: 5791
Accuracy: 0.6844
F1 Score: 0.6842
Weighted Precision: 0.6841
Weighted Recall: 0.6844


- 3 : Thay thế LogisticRegression bằng Neural Networks

In [None]:
from pyspark.ml.classification import LogisticRegression, MultilayerPerceptronClassifier
# Sentiment classification: TF-IDF features + multilayer perceptron.
# Uses names imported earlier in the notebook plus
# MultilayerPerceptronClassifier from the import line directly above this block.

# 1. Initialize Spark Session
spark = SparkSession.builder.appName("SentimentAnalysis").getOrCreate()

# 2. Load Data
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
data_path = "D:/Study/NLP/Lab/week4/sentiments.csv"
df = spark.read.csv(data_path, header=True, inferSchema=True)
# Normalize sentiment labels: -1 -> 0, 1 -> 1.
df = df.withColumn("label", (col("sentiment").cast("integer") + 1) / 2)
# Drop rows with a null sentiment. count() is a Spark action that re-executes
# the (uncached) plan, so each DataFrame is counted exactly once and the
# numbers are reused in the message below.
initial_row_count = df.count()
df = df.dropna(subset=["sentiment"])
final_row_count = df.count()
print(f"Loaded {initial_row_count} rows initially, dropped {initial_row_count - final_row_count} null rows, final count: {final_row_count}")

# Split the data into training and test sets (seeded for reproducibility).
trainingData, testData = df.randomSplit([0.8, 0.2], seed=42)

# 3. Build Preprocessing Pipeline
# Tokenizer: splits the "text" column into tokens.
tokenizer = Tokenizer(inputCol="text", outputCol="words")

# StopWordsRemover: removes common stop words from the token list.
stopwordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered_words")

# Size of the hashed feature space. It is also the width of the network's
# input layer, so both places read this single value and cannot drift apart.
num_features = 10000

# HashingTF: hashes tokens into a fixed-size term-frequency vector.
hashingTF = HashingTF(inputCol="filtered_words", outputCol="raw_features", numFeatures=num_features)

# IDF: rescales the term-frequency vectors produced by HashingTF.
idf = IDF(inputCol="raw_features", outputCol="features")

# 4. Train the Model
# Network architecture: [input, hidden1, hidden2, ..., output].
#   input  = num_features (must equal HashingTF's numFeatures)
#   output = 2 (the label is either 0 or 1)
# The hidden layer sizes (64, 32) are a starting point and can be tuned.
layers = [num_features, 64, 32, 2]

# MultilayerPerceptronClassifier: the classifier fitted on the TF-IDF features.
mlp = MultilayerPerceptronClassifier(
    layers=layers,
    featuresCol="features",
    labelCol="label",
    maxIter=100,  # number of training iterations
    seed=42
)

# Assemble every stage into a single Pipeline.
pipeline = Pipeline(stages=[tokenizer, stopwordsRemover, hashingTF, idf, mlp])

# Training: fit the pipeline on the training split.
print("Starting model training (MLP)... This may take a while.")
model = pipeline.fit(trainingData)
print("Model training complete.")

# 5. Evaluate the Model
# Apply the fitted pipeline to the held-out split to obtain predictions.
predictions = model.transform(testData)

# One MulticlassClassificationEvaluator per reported metric.
evaluator_accuracy = MulticlassClassificationEvaluator(metricName="accuracy", labelCol="label", predictionCol="prediction")
accuracy = evaluator_accuracy.evaluate(predictions)
print(f"Accuracy: {accuracy:.4f}")

evaluator_f1 = MulticlassClassificationEvaluator(metricName="f1", labelCol="label", predictionCol="prediction")
f1 = evaluator_f1.evaluate(predictions)
print(f"F1 Score: {f1:.4f}")

evaluator_precision = MulticlassClassificationEvaluator(metricName="weightedPrecision", labelCol="label", predictionCol="prediction")
precision = evaluator_precision.evaluate(predictions)
print(f"Weighted Precision: {precision:.4f}")

evaluator_recall = MulticlassClassificationEvaluator(metricName="weightedRecall", labelCol="label", predictionCol="prediction")
recall = evaluator_recall.evaluate(predictions)
print(f"Weighted Recall: {recall:.4f}")

# Release Spark resources.
spark.stop()

Loaded 5792 rows initially, dropped 1 null rows, final count: 5791
Starting model training (MLP)... This may take a while.
Model training complete.
Accuracy: 0.7755
F1 Score: 0.7736
Weighted Precision: 0.7730
Weighted Recall: 0.7755
