In [0]:
#a) Preprocessing 
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier, RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

spark = SparkSession.builder.appName("IncomeClassification").getOrCreate()
df = spark.read.csv("dbfs:/FileStore/tables/adult_modified.csv", header=True, inferSchema=True)

print(df.dtypes)

dfc = df.dropna(subset=["workclass", "marital-status", "race", "sex", "income"])
num = ["age", "education", "hours-per-week"]

dfp = dfc
for i in num:
    mean_value = dfc.selectExpr(f"mean(`{i}`) as mean").collect()[0]["mean"]
    dfp = dfp.fillna({i: mean_value})

dfp = dfp.withColumn("age", col("age").cast("int"))
dfp = dfp.withColumn("education", col("education").cast("int"))
dfp = dfp.withColumn("hours-per-week", col("hours-per-week").cast("int"))
dfp = dfp.dropna()

# Encode String Indexer
cat = ["workclass", "marital-status", "race", "sex", "income"]
for j in cat:
    indexer = StringIndexer(inputCol=j, outputCol=j+"_index").fit(dfp)
    dfp = indexer.transform(dfp)

# Vector assembler
feature_cols = ["age", "education", "hours-per-week", "workclass_index", "marital-status_index", "race_index", "sex_index"]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
df_features = assembler.transform(dfp)

train_data, test_data = df_features.randomSplit([0.8, 0.2], seed=42)

[('age', 'string'), ('workclass', 'string'), ('education', 'int'), ('marital-status', 'string'), ('race', 'string'), ('sex', 'string'), ('hours-per-week', 'int'), ('income', 'string')]


In [0]:
# Decision Tree Classifier
dt = DecisionTreeClassifier(labelCol="income_index", featuresCol="features")
dt_model = dt.fit(train_data)
dt_predictions = dt_model.transform(test_data)

evaluator_accuracy = MulticlassClassificationEvaluator(labelCol="income_index", metricName="accuracy")
evaluator_precision = MulticlassClassificationEvaluator(labelCol="income_index", metricName="weightedPrecision")
evaluator_recall = MulticlassClassificationEvaluator(labelCol="income_index", metricName="weightedRecall")

dt_accuracy = evaluator_accuracy.evaluate(dt_predictions)
dt_precision = evaluator_precision.evaluate(dt_predictions)
dt_recall = evaluator_recall.evaluate(dt_predictions)

print("Decision Tree Model EvalAuation:")
print(f"Accuracy: {dt_accuracy:.4f}")
print(f"Precision: {dt_precision:.4f}")
print(f"Recall: {dt_recall:.4f}\n")

Decision Tree Model EvalAuation:
Accuracy: 0.8125
Precision: 0.7992
Recall: 0.8125



In [0]:
# Random Forest Classifier
rf = RandomForestClassifier(labelCol="income_index", featuresCol="features", numTrees=100)
rf_model = rf.fit(train_data)
rf_predictions = rf_model.transform(test_data)

rf_accuracy = evaluator_accuracy.evaluate(rf_predictions)
rf_precision = evaluator_precision.evaluate(rf_predictions)
rf_recall = evaluator_recall.evaluate(rf_predictions)

print("Random Forest Model Evaluation:")
print(f"Accuracy: {rf_accuracy:.4f}")
print(f"Precision: {rf_precision:.4f}")
print(f"Recall: {rf_recall:.4f}\n")

Random Forest Model Evaluation:
Accuracy: 0.8078
Precision: 0.7937
Recall: 0.8078

