In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF, StringIndexer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Step 1: Start Spark session
spark = SparkSession.builder.appName("TwitterHateTweetDetection").getOrCreate()

# Everything after session creation runs inside try/finally so the session is
# stopped even when loading, training, or evaluation raises.
try:
    # Step 2: Load dataset (CSV with a header row; column types are inferred)
    df = spark.read.csv("twitter.csv", header=True, inferSchema=True)

    # Step 3: Keep only the needed columns, casting label to int
    df = df.selectExpr("id", "cast(label as int) as label", "tweet")

    # Rows with a null label or tweet are unusable downstream (Tokenizer
    # cannot process a null string), so drop them before splitting.
    df = df.na.drop(subset=["label", "tweet"])

    # Step 4: Text preprocessing stages
    #   tweet -> words (tokens) -> filtered (stop words removed) -> features (term counts)
    tokenizer = Tokenizer(inputCol="tweet", outputCol="words")
    remover = StopWordsRemover(inputCol="words", outputCol="filtered")
    vectorizer = CountVectorizer(inputCol="filtered", outputCol="features")

    # Step 5: Model - Logistic Regression on the count vectors
    lr = LogisticRegression(featuresCol="features", labelCol="label")

    # Step 6: Build Pipeline (stages run in the order listed)
    pipeline = Pipeline(stages=[tokenizer, remover, vectorizer, lr])

    # Step 7: Split into training/testing sets (80/20, fixed seed)
    train_data, test_data = df.randomSplit([0.8, 0.2], seed=42)

    # Step 8: Train model on the training split only
    model = pipeline.fit(train_data)

    # Step 9: Predict on test data and preview a few rows
    predictions = model.transform(test_data)
    predictions.select("tweet", "label", "prediction", "probability").show(5, truncate=False)

    # Step 10: Evaluation
    # The metric is area under the ROC curve (the evaluator's default, stated
    # explicitly here); it is not classification accuracy.
    evaluator = BinaryClassificationEvaluator(
        labelCol="label",
        rawPredictionCol="rawPrediction",
        metricName="areaUnderROC",
    )
    auc = evaluator.evaluate(predictions)
    print(f"Model Accuracy (AUC): {auc:.2f}")
finally:
    # Step 11: Stop Spark
    spark.stop()


+---------------------------------------------------------------------------------------+-----+----------+-----------------------------+
|tweet                                                                                  |label|prediction|probability                  |
+---------------------------------------------------------------------------------------+-----+----------+-----------------------------+
|  bihday your majesty                                                                  |0    |0.0       |[1.0,0.0]                    |
| @user camping tomorrow @user @user @user @user @user @user @user dannyâ¦             |0    |0.0       |[1.0,0.0]                    |
|we won!!! love the land!!! #allin #cavs #champions #cleveland #clevelandcavaliers  â¦ |0    |0.0       |[1.0,0.0]                    |
|@user #cnn calls #michigan middle school 'build the wall' chant '' #tcot               |1    |1.0       |[4.5455657176155325E-171,1.0]|
|as we all know, essential oils are not m

Viva-Style Concepts
Concept	Explanation
Tokenizer	Splits tweets into individual words
StopWordsRemover	Removes common words like "is", "the", "and"
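CountVectorizer / TF-IDF	Converts tokenized text into numeric feature vectors (the script above uses CountVectorizer term counts only; IDF is imported but not applied)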
Logistic Regression	Binary classifier for hate vs. not hate
BinaryClassificationEvaluator	Measures model performance (area under the ROC curve by default, or area under the precision-recall curve)