In [0]:
%file
ls /user/majesteye/DS05_INSURANCE_DATASET/input


In [1]:
%spark
// Set the default Hadoop filesystem URI (fs.defaultFS) on the Spark context's Hadoop configuration
sc.hadoopConfiguration.set("fs.defaultFS", "hdfs://namenode:9000")

In [2]:
// Load every CSV file of the input directory into a single DataFrame.
// The first line of each file is a header, column types are inferred, fields are comma-separated.
val df = spark.read
    .options(Map(
        "header" -> "true",
        "inferSchema" -> "true",
        "delimiter" -> ","
    ))
    .csv("/user/majesteye/DS05_INSURANCE_DATASET/input/*.csv")


In [3]:
%spark

// Register df as the temporary view DF_TBL so that the %sql paragraphs can query it
df.createOrReplaceTempView("DF_TBL")

In [4]:
//Displaying the Data

In [5]:
// Print the column names and the types inferred when the CSV files were loaded
df.printSchema()

In [6]:
// Preview the first 20 rows of the dataset
df.show(20)


In [7]:
// Display the number of columns

In [8]:
// Width of the dataset: how many columns the loaded DataFrame exposes
val columnCount = df.columns.size
println("Number of columns: " + columnCount)

In [9]:
// Display the number of rows

In [10]:
// Height of the dataset: triggers a Spark job that counts every row
val rowCount: Long = df.count()
println("Number of rows: " + rowCount)

In [11]:
// Valeur vénale (market value) exploration: column "VV"

In [12]:
import org.apache.spark.sql.functions._

// Name of the column profiled in this cell
val columnToAnalyze = "VV"

// How many different values the column holds once null entries are dropped
val distinctNonNullValues = df.select(columnToAnalyze)
  .where(col(columnToAnalyze).isNotNull)
  .distinct()
  .count()

// Number of null entries and their share of all rows, in percent
val nullCount = df.where(col(columnToAnalyze).isNull).count()
val nullPercentage = (nullCount.toDouble / df.count()) * 100

// Occurrences of each non-null value, most frequent first
val nonNullValues = df.where(col(columnToAnalyze).isNotNull)
  .groupBy(columnToAnalyze)
  .count()
  .sort(desc("count"))

// Summary lines first, then the frequency table (default number of rows, untruncated cells)
println(s"Column: $columnToAnalyze")
println(s"Distinct Values (excluding nulls): $distinctNonNullValues")
println(f"Null Values: $nullCount ($nullPercentage%.2f%%)")

println("\nDistinct Values (excluding nulls) and their Frequencies:")
nonNullValues.show(truncate = false)


In [13]:
%sql
-- Frequency of each non-null VV value and its share (in percent) of all non-null rows
WITH freq AS (
    SELECT VV, COUNT(*) AS n
    FROM DF_TBL
    WHERE VV IS NOT NULL
    GROUP BY VV
)
SELECT
    VV,
    n AS count,
    ROUND(n * 100.0 / SUM(n) OVER (), 2) AS percentage
FROM freq
ORDER BY VV

In [14]:
// Type D'activité Exploration "ACT"

In [15]:
import org.apache.spark.sql.functions._

// Name of the column profiled in this cell
val columnToAnalyze = "ACT"

// How many different values the column holds once null entries are dropped
val distinctNonNullValues = df.select(columnToAnalyze)
  .where(col(columnToAnalyze).isNotNull)
  .distinct()
  .count()

// Number of null entries and their share of all rows, in percent
val nullCount = df.where(col(columnToAnalyze).isNull).count()
val nullPercentage = (nullCount.toDouble / df.count()) * 100

// Occurrences of each non-null value, most frequent first
val nonNullValues = df.where(col(columnToAnalyze).isNotNull)
  .groupBy(columnToAnalyze)
  .count()
  .sort(desc("count"))

// Summary lines first, then the complete frequency table
println(s"Column: $columnToAnalyze")
println(s"Distinct Values (excluding nulls): $distinctNonNullValues")
println(f"Null Values: $nullCount ($nullPercentage%.2f%%)")

println("\nDistinct Values (excluding nulls) and their Frequencies:")
// Int.MaxValue lifts the default row limit so every distinct value is printed
nonNullValues.show(numRows = Int.MaxValue, truncate = false)


In [16]:
%sql
-- Frequency of each non-null ACT value and its share (in percent) of all non-null rows
WITH freq AS (
    SELECT ACT, COUNT(*) AS n
    FROM DF_TBL
    WHERE ACT IS NOT NULL
    GROUP BY ACT
)
SELECT
    ACT,
    n AS count,
    ROUND(n * 100.0 / SUM(n) OVER (), 2) AS percentage
FROM freq
ORDER BY ACT

In [17]:
//Age de souscripteur "AGE"

In [18]:
import org.apache.spark.sql.functions._

// Name of the column profiled in this cell
val columnToAnalyze = "AGE"

// How many different values the column holds once null entries are dropped
val distinctNonNullValues = df.select(columnToAnalyze)
  .where(col(columnToAnalyze).isNotNull)
  .distinct()
  .count()

// Number of null entries and their share of all rows, in percent
val nullCount = df.where(col(columnToAnalyze).isNull).count()
val nullPercentage = (nullCount.toDouble / df.count()) * 100

// Occurrences of each non-null value, most frequent first
val nonNullValues = df.where(col(columnToAnalyze).isNotNull)
  .groupBy(columnToAnalyze)
  .count()
  .sort(desc("count"))

// Summary lines first, then the complete frequency table
println(s"Column: $columnToAnalyze")
println(s"Distinct Values (excluding nulls): $distinctNonNullValues")
println(f"Null Values: $nullCount ($nullPercentage%.2f%%)")

println("\nDistinct Values (excluding nulls) and their Frequencies:")
// Int.MaxValue lifts the default row limit so every distinct value is printed
nonNullValues.show(numRows = Int.MaxValue, truncate = false)


In [19]:
%sql
-- Frequency of each non-null AGE value and its share (in percent) of all non-null rows
WITH freq AS (
    SELECT AGE, COUNT(*) AS n
    FROM DF_TBL
    WHERE AGE IS NOT NULL
    GROUP BY AGE
)
SELECT
    AGE,
    n AS count,
    ROUND(n * 100.0 / SUM(n) OVER (), 2) AS percentage
FROM freq
ORDER BY AGE

In [20]:
//Age de Vehicule Exploration "AGO"

In [21]:
import org.apache.spark.sql.functions._

// Name of the column profiled in this cell
val columnToAnalyze = "AGO"

// How many different values the column holds once null entries are dropped
val distinctNonNullValues = df.select(columnToAnalyze)
  .where(col(columnToAnalyze).isNotNull)
  .distinct()
  .count()

// Number of null entries and their share of all rows, in percent
val nullCount = df.where(col(columnToAnalyze).isNull).count()
val nullPercentage = (nullCount.toDouble / df.count()) * 100

// Occurrences of each non-null value, most frequent first
val nonNullValues = df.where(col(columnToAnalyze).isNotNull)
  .groupBy(columnToAnalyze)
  .count()
  .sort(desc("count"))

// Summary lines first, then the complete frequency table
println(s"Column: $columnToAnalyze")
println(s"Distinct Values (excluding nulls): $distinctNonNullValues")
println(f"Null Values: $nullCount ($nullPercentage%.2f%%)")

println("\nDistinct Values (excluding nulls) and their Frequencies:")
// Int.MaxValue lifts the default row limit so every distinct value is printed
nonNullValues.show(numRows = Int.MaxValue, truncate = false)

In [22]:
%sql
-- Frequency of each non-null AGO value and its share (in percent) of all non-null rows
WITH freq AS (
    SELECT AGO, COUNT(*) AS n
    FROM DF_TBL
    WHERE AGO IS NOT NULL
    GROUP BY AGO
)
SELECT
    AGO,
    n AS count,
    ROUND(n * 100.0 / SUM(n) OVER (), 2) AS percentage
FROM freq
ORDER BY AGO

In [23]:
// Années d'Ancienneté "ANC"

In [24]:
import org.apache.spark.sql.functions._

// Name of the column profiled in this cell
val columnToAnalyze = "ANC"

if (!df.columns.contains(columnToAnalyze)) {
  // Nothing to profile when the dataset has no such column
  println(s"Column '$columnToAnalyze' does not exist in the dataset.")
} else {
  // Rows that hold an actual (non-null) value for the column
  val populatedRows = df.where(col(columnToAnalyze).isNotNull)

  // How many different non-null values the column holds
  val distinctNonNullValues = populatedRows.select(columnToAnalyze).distinct().count()

  // Number of null entries and their share of all rows, in percent
  val nullCount = df.where(col(columnToAnalyze).isNull).count()
  val nullPercentage = (nullCount.toDouble / df.count()) * 100

  // Occurrences of each non-null value, most frequent first
  val nonNullValues = populatedRows
    .groupBy(columnToAnalyze)
    .count()
    .sort(desc("count"))

  // Summary lines first, then the complete frequency table
  println(s"Column: $columnToAnalyze")
  println(s"Distinct Values (excluding nulls): $distinctNonNullValues")
  println(f"Null Values: $nullCount ($nullPercentage%.2f%%)")

  println("\nDistinct Values (excluding nulls) and their Frequencies:")
  // Int.MaxValue lifts the default row limit so every distinct value is printed
  nonNullValues.show(numRows = Int.MaxValue, truncate = false)
}


In [25]:
%sql
-- Frequency of each non-null ANC value and its share (in percent) of all non-null rows
WITH freq AS (
    SELECT ANC, COUNT(*) AS n
    FROM DF_TBL
    WHERE ANC IS NOT NULL
    GROUP BY ANC
)
SELECT
    ANC,
    n AS count,
    ROUND(n * 100.0 / SUM(n) OVER (), 2) AS percentage
FROM freq
ORDER BY ANC

In [26]:
// Type de renouvellement de police (policy renewal type): column "C"

In [27]:
import org.apache.spark.sql.functions._

// Name of the column profiled in this cell
val columnToAnalyze = "C"

if (!df.columns.contains(columnToAnalyze)) {
  // Nothing to profile when the dataset has no such column
  println(s"Column '$columnToAnalyze' does not exist in the dataset.")
} else {
  // Rows that hold an actual (non-null) value for the column
  val populatedRows = df.where(col(columnToAnalyze).isNotNull)

  // How many different non-null values the column holds
  val distinctNonNullValues = populatedRows.select(columnToAnalyze).distinct().count()

  // Number of null entries and their share of all rows, in percent
  val nullCount = df.where(col(columnToAnalyze).isNull).count()
  val nullPercentage = (nullCount.toDouble / df.count()) * 100

  // Occurrences of each non-null value, most frequent first
  val nonNullValues = populatedRows
    .groupBy(columnToAnalyze)
    .count()
    .sort(desc("count"))

  // Summary lines first, then the complete frequency table
  println(s"Column: $columnToAnalyze")
  println(s"Distinct Values (excluding nulls): $distinctNonNullValues")
  println(f"Null Values: $nullCount ($nullPercentage%.2f%%)")

  println("\nDistinct Values (excluding nulls) and their Frequencies:")
  // Int.MaxValue lifts the default row limit so every distinct value is printed
  nonNullValues.show(numRows = Int.MaxValue, truncate = false)
}


In [28]:
%sql
-- Frequency of each non-null C value and its share (in percent) of all non-null rows
WITH freq AS (
    SELECT C, COUNT(*) AS n
    FROM DF_TBL
    WHERE C IS NOT NULL
    GROUP BY C
)
SELECT
    C,
    n AS count,
    ROUND(n * 100.0 / SUM(n) OVER (), 2) AS percentage
FROM freq
ORDER BY C

In [29]:
import org.apache.spark.sql.functions._
import org.apache.spark.sql.expressions.Window 

// Split of RISKY = Y / N inside each VV value.
// Rows with a null VV, or a RISKY flag other than "Y"/"N" (null included), are left out.
// "percentage" is the share of the (VV, RISKY) pair among all kept rows with the same VV.
val vvRiskyDistribution = df
  .where(col("VV").isNotNull && col("RISKY").isin("Y", "N"))
  .groupBy(col("VV"), col("RISKY"))
  .count()
  .withColumn("total_per_vv", sum(col("count")).over(Window.partitionBy(col("VV"))))
  .select(
    col("VV"),
    col("RISKY"),
    col("count"),
    round((col("count") / col("total_per_vv")) * 100, 2).alias("percentage")
  )
  .orderBy(col("VV"), col("RISKY"))

vvRiskyDistribution.show()

In [30]:
import org.apache.spark.sql.functions._

// For every column of the dataset, report how many different non-null values it holds
for (column <- df.columns) {
  val distinctCount = df.select(column)
    .where(col(column).isNotNull)
    .distinct()
    .count()
  println(s"Column: $column, Distinct values (excluding nulls): $distinctCount")
}



In [31]:
// Columns whose distinct non-null values are listed in full
val columnsToShow = Seq("ACT", "CRS", "GOV")

for (column <- columnsToShow) {
  // Pull the distinct non-null values to the driver and unwrap them from their rows
  val distinctValues = df
    .where(col(column).isNotNull)
    .select(column)
    .distinct()
    .collect()
    .map(row => row.get(0))
  println(s"Distinct values for column $column: ${distinctValues.mkString("[", ", ", "]")}")
  println()
}


In [32]:
// Columns whose distinct non-null values are listed in full
val columnsToShow = Seq(
  "VN", "AGE", "AGO", "ANC", "C", "CIV", "CLS", "CU", "DG", "EN", "FRC", "NFC", "PLA", "PSS", "RISKY",
  "SX", "USG", "VV", "DLG", "MRQ", "CEN"
)
for (column <- columnsToShow) {
  // Pull the distinct non-null values to the driver and unwrap them from their rows
  val distinctRows = df.where(col(column).isNotNull).select(column).distinct().collect()
  val distinctValues = distinctRows.map(row => row.get(0))
  println(s"Distinct values for column $column: ${distinctValues.mkString("[", ", ", "]")}")
}

In [33]:
//Null Value Analysis

In [34]:
import org.apache.spark.sql.functions.{col, count, lit, when}

// Null-value report for every column of df.
// One aggregation job computes the total row count together with the null count of
// each column, instead of re-counting the whole dataset and re-scanning it per column.
// when(...) without otherwise yields null for non-matching rows, and count skips nulls,
// so each expression counts exactly the rows where the column is null.
val nullCountExprs = df.columns.map(c => count(when(col(c).isNull, lit(1))).alias(c))
val nullCountsRow = df.agg(count(lit(1)).alias("total_rows"), nullCountExprs: _*).first()
val totalRows = nullCountsRow.getLong(0)

df.columns.zipWithIndex.foreach { case (column, idx) =>
  // Position 0 of the result row is the total row count; column i sits at position i + 1
  val nullCount = nullCountsRow.getLong(idx + 1)
  // An empty dataset has no nulls; avoid printing NaN from a 0/0 division
  val nullPercentage = if (totalRows == 0L) 0.0 else (nullCount.toDouble / totalRows) * 100
  // Two-decimal formatting, consistent with the per-column exploration cells
  println(f"Column: $column, Null values: $nullCount ($nullPercentage%.2f%%)")
}

In [35]:
// Number of rows for each value of the RISKY column
df.groupBy("RISKY").count().show()

In [36]:
//Analyze Tendency (mean, stddev, etc.)

In [37]:
//Numerical Data

In [38]:
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._

// List of numeric columns for analysis
val numericColumns = Seq("VN", "AGE", "AGO", "ANC", "CLS", "CU", "FRC", "NFC", "PLA", "PSS", "VV")

// Basic statistics table produced by describe for all numeric columns
df.describe(numericColumns: _*).show()

// For each numeric column, print mean, standard deviation, min, max, kurtosis and skewness.
// All six statistics come from one aggregation job per column rather than six separate
// jobs, so the dataset is scanned once per column.
numericColumns.foreach { column =>
  val target = col(column)
  val stats = df.agg(
    avg(target),      // 0: mean
    stddev(target),   // 1: standard deviation
    min(target),      // 2: minimum
    max(target),      // 3: maximum
    kurtosis(target), // 4: kurtosis
    skewness(target)  // 5: skewness
  ).first()

  // Values are read untyped (Any), exactly as they come back from the aggregation
  val meanValue = stats.get(0)
  val stddevValue = stats.get(1)
  val minValue = stats.get(2)
  val maxValue = stats.get(3)
  val kurtosisValue = stats.get(4)
  val skewnessValue = stats.get(5)

  // Print out the calculated statistics for each column
  println(s"Column: $column, Mean: $meanValue, StdDev: $stddevValue, Min: $minValue, Max: $maxValue, Kurtosis: $kurtosisValue, Skewness: $skewnessValue")
}


In [39]:
//Categorical Data

In [40]:
import org.apache.spark.sql.functions._

// List of categorical columns for analysis
val categoricalColumns = Seq("C", "CIV", "DG", "EN", "RISKY", "SX", "USG", "CRS", "GOV", "ACT")

// For each categorical column, print its distinct values and how often each one occurs.
// A single groupBy/count job per column returns every distinct value together with its
// frequency. The previous per-value filter (col === value) never matched the null value,
// so nulls were always reported with a count of 0, and it ran one job per distinct value.
categoricalColumns.foreach { column =>

  // (value, occurrences) pairs; the null group, when present, carries its real row count
  val valueCounts = df.groupBy(column)
    .count()
    .collect()
    .map(row => (row.get(0), row.getLong(1)))
  val distinctValues = valueCounts.map(_._1)

  // Print a header for the column
  println(s"\n----- Distinct values for column: $column -----")

  // Print the distinct values for the column
  println(s"Distinct values: ${distinctValues.mkString("[", ", ", "]")}\n")

  // Print the frequency of each distinct value
  valueCounts.foreach { case (value, valueCount) =>
    println(f"Value: $value%-20s Count: $valueCount")
  }

  // Add a separator for better readability
  println("\n" + "="*50 + "\n")
}


In [41]:
%spark
