In [0]:
%file

ls /user/majesteye/DS05_INSURANCE_DATASET/input

In [1]:
%spark
// Set the default filesystem of the SparkContext's Hadoop configuration to the
// HDFS namenode at namenode:9000.
// NOTE(review): presumably this is what lets the scheme-less "/user/..." paths
// used when loading the input files resolve against HDFS — confirm.
sc.hadoopConfiguration.set("fs.defaultFS", "hdfs://namenode:9000")

In [2]:
%spark
import org.apache.spark.storage.StorageLevel

In [3]:
%spark
// Load the three tab-separated input files of the insurance dataset.

// Directory that holds every input file read below.
val inputDir = "/user/majesteye/DS05_INSURANCE_DATASET/input"

/** Reads one tab-separated file from `inputDir` into a DataFrame.
  *
  * TSV files are read through the "csv" source with a tab delimiter. The first
  * line is used as the header and column types are inferred from the data.
  * A new reader is built on every call, so the three loads share no reader state.
  *
  * @param fileName name of the file inside `inputDir`, e.g. "client_features.tsv"
  * @return the content of the file as a DataFrame
  */
def readTsv(fileName: String): org.apache.spark.sql.DataFrame =
  spark.read
    .format("csv") // TSV goes through the "csv" format
    .option("header", "true")
    .option("inferSchema", "true")
    .option("delimiter", "\t") // tab delimiter for .tsv files
    .load(s"$inputDir/$fileName")

val clientFeaturesDF = readTsv("client_features.tsv")
val clientsPolicesDF = readTsv("clients_polices.tsv")
val clientsSinistreDF = readTsv("clients_sinistre.tsv")

// Optional: print a sample of each DataFrame to check the load.
clientFeaturesDF.show()
clientsPolicesDF.show()
clientsSinistreDF.show()


In [4]:
%spark
// Mark the three input DataFrames for persistence at the MEMORY_AND_DISK
// storage level, in the same order as they were loaded.
Seq(clientFeaturesDF, clientsPolicesDF, clientsSinistreDF)
  .foreach(_.persist(StorageLevel.MEMORY_AND_DISK))

In [5]:
%spark
// Step 1: combine client features with client policies.
// The join is an "outer" join on the shared (N_SOUSCRIP, year) columns.
val joinedDF1 = {
  val clientYearKeys = Seq("N_SOUSCRIP", "year")
  clientFeaturesDF.join(clientsPolicesDF, clientYearKeys, "outer")
}

In [6]:
%spark
// Persist the features/policies join at the MEMORY_AND_DISK storage level;
// joinedDF1 is read again by the preview and by the join with the claims data.
joinedDF1.persist(StorageLevel.MEMORY_AND_DISK)

In [7]:
%spark
// Preview the first 5 rows of the join, restricted to the join keys and the
// policy columns N_POLICE, Prime and IsToutRisque.
joinedDF1.select("N_SOUSCRIP", "year", "N_POLICE", "Prime", "IsToutRisque").show(5)

In [8]:
%spark
// Step 2: attach the claims data (clientsSinistreDF) to the joined result.
// Only five columns of joinedDF1 are carried into this join, which is an
// "outer" join on (N_SOUSCRIP, year, N_POLICE).
val finalDF = {
  val policyColumns =
    joinedDF1.select("N_SOUSCRIP", "year", "N_POLICE", "Prime", "IsToutRisque")
  policyColumns.join(clientsSinistreDF, Seq("N_SOUSCRIP", "year", "N_POLICE"), "outer")
}


In [9]:
%spark
// Step 3: Show a sample of the final dataset: the first 5 rows, restricted to
// the N_SOUSCRIP, year, IsToutRisque and N_POLICE columns.
finalDF.select("N_SOUSCRIP", "year", "IsToutRisque", "N_POLICE").show(5)

In [10]:
%spark
import org.apache.spark.sql.functions._
import org.apache.spark.sql.expressions.Window

// Step 1: "outer" join of client features and client policies on
// (N_SOUSCRIP, year).
val joinedDF1 =
  clientFeaturesDF.join(
    clientsPolicesDF,
    Seq("N_SOUSCRIP", "year"),
    "outer"
  )

// Step 2: "outer" join of that result with the claims data on
// (N_SOUSCRIP, year, N_POLICE). No projection is applied first, so every
// column of joinedDF1 takes part in the join.
val finalDF =
  joinedDF1.join(
    clientsSinistreDF,
    Seq("N_SOUSCRIP", "year", "N_POLICE"),
    "outer"
  )

// Step 3: window over each N_SOUSCRIP, with its rows ordered from the most
// recent year to the oldest.
val windowSpec = Window
  .partitionBy(col("N_SOUSCRIP"))
  .orderBy(desc("year"))

// Step 4: rank the rows inside each window and keep only rank 1, i.e. the rows
// carrying the highest year of their N_SOUSCRIP. rank() gives tied rows the
// same rank, so several rows can remain for one N_SOUSCRIP.
val finalWithMaxYearDF = finalDF
  .withColumn("rank", rank().over(windowSpec))
  .where(col("rank") === 1)

// Step 5: remove the helper "rank" column.
// cleanedDF holds only the rows with the highest year for each N_SOUSCRIP.
val cleanedDF = finalWithMaxYearDF.drop("rank")


In [11]:
%spark
// Print the first 5 rows of cleanedDF to check the latest-year selection.
cleanedDF.show(5)

In [12]:
%spark
