In [0]:
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
import org.apache.spark.ml.classification.{RandomForestClassifier, RandomForestClassificationModel}
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.sql.functions._

In [1]:
// Load the labelled dataset: a tab-delimited text file with a header row.
// Column types are inferred by Spark rather than declared up front.
val labeledDf = spark.read
  .option("header", "true")
  .option("inferSchema", "true")
  .option("delimiter", "\t")
  .csv("file:///team5/data/LabeledFile.csv")

In [2]:
// Report how many distinct values each categorical column holds.
// Every column triggers its own distinct().count() action on labeledDf.
println("Analyzing categorical features:")
val categoricalCols = Array(
  "marque",
  "usage",
  "Type_renouvellement_police",
  "fractionnement",
  "IsToutRisque"
)
for (columnName <- categoricalCols) {
  val distinctValues = labeledDf.select(columnName).distinct().count()
  println(s"Column $columnName has $distinctValues unique values")
}


In [3]:
// Numeric input columns. Unlike the categorical columns, these are passed to the
// feature vector without an indexing step.
val numericCols = Array(
  "Prime",
  "Sinistre",
  "puissance",
  "age_objet_assuree",
  "valeur_venale",
  "valeur_neuve",
  "Charge_utile",
  "anciennete",
  "classe",
  "age_client"
)

In [4]:
// Build one StringIndexer per categorical column; each writes its result to
// "<column>_indexed". handleInvalid = "keep" asks the indexer to assign invalid
// values their own index instead of failing the transform.
val indexers = for (columnName <- categoricalCols) yield {
  new StringIndexer()
    .setHandleInvalid("keep")
    .setInputCol(columnName)
    .setOutputCol(s"${columnName}_indexed")
}

In [5]:
// Scratch cell (marked "to delete"): fits only the categorical indexers on the
// full dataset and prints the distinct raw-value / index pairs for four of the
// categorical columns, as a sanity check of the encoding.

val pipeline = new Pipeline().setStages(indexers)
val transformedDf = pipeline.fit(labeledDf).transform(labeledDf)

transformedDf
  .select(
    "usage", "usage_indexed",
    "Type_renouvellement_police", "Type_renouvellement_police_indexed",
    "fractionnement", "fractionnement_indexed",
    "IsToutRisque", "IsToutRisque_indexed"
  )
  .distinct()
  .show()

In [6]:
// Assemble the model input: numeric columns first, then the indexed categorical
// columns, all packed into a single "features" vector column.
val assembler = {
  val indexedCategoricalCols = categoricalCols.map(name => s"${name}_indexed")
  new VectorAssembler()
    .setOutputCol("features")
    .setHandleInvalid("keep")
    .setInputCols(numericCols ++ indexedCategoricalCols)
}

In [7]:
// Index the "Risky" target column into the numeric "label" column that the
// classifier and the evaluators read.
// NOTE(review): with handleInvalid = "keep", label values that are null or were
// not seen during fitting get an extra index instead of raising an error --
// confirm that an additional label class is acceptable for the target column.
val labelIndexer = new StringIndexer()
  .setHandleInvalid("keep")
  .setOutputCol("label")
  .setInputCol("Risky")

In [8]:
// Randomly partition the labelled rows into a training part (weight 0.8) and a
// test part (weight 0.2). The fixed seed keeps the split reproducible.
val Array(trainingData, testData) = {
  val splitWeights = Array(0.8, 0.2)
  labeledDf.randomSplit(splitWeights, seed = 1234)
}

In [9]:
// Peek at the training split: 20 rows with long cell values truncated, which is
// what the no-argument show() prints.
trainingData.show(numRows = 20, truncate = true)

In [10]:
// Random Forest classifier reading the assembled "features" vector and the
// indexed "label" column.
val rf = new RandomForestClassifier()
  .setFeaturesCol("features")
  .setLabelCol("label")
  .setSeed(1234)       // fixed seed for repeatable training runs
  .setMaxBins(200)     // raised from 32 to 200
  .setMaxDepth(10)
  .setNumTrees(100)


In [11]:
// Create the pipeline.
// Stage order: the categorical indexers and the label indexer come first, then the
// assembler (which reads the "*_indexed" columns they produce), and finally the
// random forest (which reads the "features" and "label" columns).
val pipeline = new Pipeline()
  .setStages(indexers ++ Array(labelIndexer, assembler, rf))


In [12]:
// Train model: fit all pipeline stages on the training split only, so the test
// split plays no part in fitting.
val model = pipeline.fit(trainingData)


In [13]:
// Make predictions on test data: run the held-out split through the fitted pipeline.
// The result is read below via its "prediction", "label" and "features" columns.
val predictions = model.transform(testData)


In [14]:
// Show the first five scored rows, with column contents printed in full
// (no truncation) so the feature vectors stay readable.
predictions
  .select("prediction", "label", "features")
  .show(numRows = 5, truncate = false)

In [15]:
// Overall accuracy of the predictions on the test split.
val evaluator = new MulticlassClassificationEvaluator()
  .setMetricName("accuracy")
  .setPredictionCol("prediction")
  .setLabelCol("label")

val accuracy = evaluator.evaluate(predictions)
println(s"\nAccuracy = $accuracy")

In [16]:
// Weighted F1, precision and recall on the test predictions, followed by a
// confusion matrix.
//
// Every metric gets its OWN evaluator instance. setMetricName mutates the
// evaluator it is called on and returns that same instance, so deriving the
// precision and recall evaluators from f1Evaluator would leave all three names
// pointing at one object configured for "weightedRecall" -- the value reported
// as precision would then actually be the recall.

// Builds an independent evaluator for the given metric over the
// "label" / "prediction" columns.
def newMetricEvaluator(metricName: String): MulticlassClassificationEvaluator =
  new MulticlassClassificationEvaluator()
    .setLabelCol("label")
    .setPredictionCol("prediction")
    .setMetricName(metricName)

// Calculate F1 Score
val f1Evaluator = newMetricEvaluator("f1")

val f1Score = f1Evaluator.evaluate(predictions)
println(s"F1 Score = ${f1Score}")

// Calculate additional metrics for a more complete evaluation
val precisionEvaluator = newMetricEvaluator("weightedPrecision")
val recallEvaluator = newMetricEvaluator("weightedRecall")

val precision = precisionEvaluator.evaluate(predictions)
val recall = recallEvaluator.evaluate(predictions)

println("\nDetailed Metrics:")
println(f"Precision = ${precision}%.4f")
println(f"Recall = ${recall}%.4f")
println(f"F1 Score = ${f1Score}%.4f")

// Show confusion matrix: one row per (label, prediction) pair with its row count
println("\nConfusion Matrix:")
predictions.groupBy("label", "prediction")
  .count()
  .orderBy("label", "prediction")
  .show()

In [17]:
// Get the Random Forest model from the fitted pipeline and report feature importances.
// The last stage is matched on its type instead of being cast blindly, so an
// unexpected pipeline layout fails with a descriptive message.
val rfModel = model.stages.last match {
  case forest: RandomForestClassificationModel => forest
  case other =>
    throw new IllegalStateException(
      s"Expected the last pipeline stage to be a RandomForestClassificationModel, found ${other.getClass.getName}")
}
val featureImportances = rfModel.featureImportances

// Feature names in the same order as the assembler inputs:
// numeric columns first, then the indexed categorical columns.
val featureNames = numericCols ++ categoricalCols.map(_ + "_indexed")

// zip silently truncates to the shorter side; refuse to print misaligned
// name / importance pairs.
require(
  featureNames.length == featureImportances.size,
  s"Expected ${featureNames.length} feature importances but the model reports ${featureImportances.size}")

// Pair each name with its importance once and reuse the pairs below.
val namedImportances = featureNames.zip(featureImportances.toArray)

// Print feature importances, most important first
println("\nFeature Importances:")
namedImportances
  .sortBy { case (_, importance) => -importance }
  .foreach { case (feature, importance) =>
    println(f"Feature: $feature, Importance: $importance%.4f")
  }

// Save feature importances to a DataFrame for better visualization.
// col(...) comes from org.apache.spark.sql.functions._, so this cell does not
// rely on spark.implicits._ (needed for the $"..." syntax) being in scope.
val importanceDF = spark.createDataFrame(namedImportances.toSeq)
  .toDF("feature", "importance")
  .orderBy(col("importance").desc)

importanceDF.show(false)

// Print model parameters
println("\nModel Parameters:")
println(s"Number of trees: ${rfModel.getNumTrees}")
println(s"Max depth: ${rfModel.getMaxDepth}")
println(s"Max bins: ${rfModel.getMaxBins}")