<h1> Ilhem Final EDA </h1>

In [1]:
%file
ls /user/majesteye/DS05_INSURANCE_DATASET/input

In [2]:
%spark
// Set the Hadoop `fs.defaultFS` key on this SparkContext's Hadoop configuration
// to the HDFS namenode URI. Presumably this is what lets the later cells use
// scheme-less paths such as /user/majesteye/... — verify against the cluster setup.
sc.hadoopConfiguration.set("fs.defaultFS", "hdfs://namenode:9000")

<h1> Exploring data of file 1 : client_features.tsv </h1>

In [4]:

%spark
// Load the client-features file as a DataFrame: tab-separated values, the
// first line holds the column names, and column types are inferred by Spark.
val DF_CF = spark.read
  .options(Map(
    "header"      -> "true",
    "inferSchema" -> "true",
    "delimiter"   -> "\t"
  ))
  .csv("/user/majesteye/DS05_INSURANCE_DATASET/input/client_features.tsv")

// First look at the data: inferred schema, the first 10 rows, and the row count.
DF_CF.printSchema()
DF_CF.show(numRows = 10)
DF_CF.count()

In [5]:
%spark
// Summary of the dataset: describe() statistics for the client-features
// columns, printed as two separate tables (one per column group below).
Seq(
  Seq("N_SOUSCRIP", "year", "age_client", "anciennete", "civilite", "delegation", "gouvernorat"),
  Seq("classe", "Type_renouvellement_police", "activite", "sexe", "direction_regionale", "centre")
).foreach { columnGroup =>
  DF_CF.select(columnGroup.head, columnGroup.tail: _*).describe().show()
}


In [6]:
%spark
// Count the missing values in every column of DF_CF.
// A value is "missing" when it is null or, for string columns, the empty string.
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.StringType

// One aggregate expression per column, each aliased with the column's own name.
// The empty-string test is applied to string columns only: comparing a numeric
// column with "" relies on a lenient string-to-number cast, which is expected to
// raise an error when ANSI mode is enabled. For non-string columns the
// comparison could never match anyway, so the counts are unchanged.
val nullCounts = DF_CF.schema.fields.map { field =>
  val value = col(field.name)
  val isMissing = field.dataType match {
    case StringType => value.isNull || value === ""
    case _          => value.isNull
  }
  // when(...) without otherwise yields null for non-missing rows, and count()
  // skips nulls, so only the missing rows are counted.
  count(when(isMissing, field.name)).alias(field.name)
}

// Single pass over the data: one output row with the missing count per column.
DF_CF.select(nullCounts: _*).show()



In [7]:
%spark
// Count the distinct values of every column of DF_CF.
// Each column triggers its own Spark job; null counts as one distinct value.
val distinctCounts = for (name <- DF_CF.columns) yield {
  name -> DF_CF.select(name).dropDuplicates().count()
}

// Report one line per column.
for ((name, nDistinct) <- distinctCounts) {
  println(s"Column '$name' has $nDistinct distinct values.")
}



In [8]:
%spark
// List the distinct values found in the "sexe" column.
DF_CF.select("sexe").dropDuplicates().show()


In [9]:
%spark
// List the distinct values found in the "civilite" column.
DF_CF.select("civilite").dropDuplicates().show()

In [10]:

%spark
// List the distinct values found in the "age_client" column (up to 400 rows).
DF_CF.select("age_client").dropDuplicates().show(numRows = 400)

<h3> We notice that the dataset is incoherent and inconsistent:<br>
- A lot of null values <br>
- The primary key (N_SOUSCRIP, year) is repeated many times <br>
- Incoherent data: sexe contains values such as (J, CP) and civilite contains values such as (entreprise, org, etablissement, gov, ...) <br>
- Inconsistent data (age = 128, 124) <br>
Therefore we need to clean the data and delete the duplicates (keep a single row per client ID, the one with the latest year) <br>
</h3>



In [12]:
%spark
// Clean DF_CF and keep exactly one row per client (N_SOUSCRIP): the most recent year.
import org.apache.spark.sql.functions._
import org.apache.spark.sql.expressions.Window

// Normalise out-of-domain values:
//  - sexe:       anything other than "M" / "F" (nulls included) becomes "Unknown"
//  - civilite:   anything other than "Mr" / "Mme" (nulls included) becomes "Other"
//  - age_client: values outside [18, 100] become null (when without otherwise
//                yields null for unmatched rows)
val cleanedDF = DF_CF
  .withColumn("sexe", when(col("sexe").isin("M", "F"), col("sexe")).otherwise("Unknown"))
  .withColumn("civilite", when(col("civilite").isin("Mr", "Mme"), col("civilite")).otherwise("Other"))
  .withColumn("age_client", when(col("age_client").between(18, 100), col("age_client")))

// Remove rows whose key columns are null.
val filteredDF = cleanedDF.na.drop(Seq("N_SOUSCRIP", "year"))

// De-duplicate per N_SOUSCRIP, keeping the row with the latest year.
// Several rows can share the same (N_SOUSCRIP, year); ordering by year alone
// would let row_number() pick an arbitrary one of them, possibly a different one
// on each run. The remaining columns are therefore used as tie-breakers so the
// selected row is deterministic.
val dedupOrdering = col("year").desc +:
  filteredDF.columns.filterNot(Set("N_SOUSCRIP", "year")).map(col)
val windowSpec = Window.partitionBy("N_SOUSCRIP").orderBy(dedupOrdering: _*)
val DF_CF_Final = filteredDF.withColumn("row_num", row_number().over(windowSpec))
  .filter(col("row_num") === 1)
  .drop("row_num")

// Show cleaned data
DF_CF_Final.show()




In [13]:
%spark
// Number of rows left after cleaning and de-duplication (one row per N_SOUSCRIP).
DF_CF_Final.count()

<h1> Exploring data of file 2 : clients_polices.tsv </h1>

In [15]:
%spark
// Load the second dataset (client policies): tab-separated values, the first
// line holds the column names, and column types are inferred by Spark.
val DF_CP = spark.read
  .options(Map(
    "header"      -> "true",
    "inferSchema" -> "true",
    "delimiter"   -> "\t"
  ))
  .csv("/user/majesteye/DS05_INSURANCE_DATASET/input/clients_polices.tsv")

// Inferred schema.
println("Schema of the second dataset:")
DF_CP.printSchema()

// First 10 rows.
println("Sample rows from the dataset:")
DF_CP.show(numRows = 10)

// Total row count.
println("Number of rows from the dataset:")
DF_CP.count()


In [16]:
%spark
// Summary of the dataset: describe() statistics over the columns of DF_CP.
DF_CP.describe().show()


In [17]:
%spark
// Missing values per column of DF_CP.
// A value is "missing" when it is null or, for string columns, the empty string.
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.StringType

// One aggregate expression per column, each aliased with the column's own name.
// The empty-string test is applied to string columns only: comparing a numeric
// column with "" relies on a lenient string-to-number cast, which is expected to
// raise an error when ANSI mode is enabled. For non-string columns the
// comparison could never match anyway, so the counts are unchanged.
val nullCounts = DF_CP.schema.fields.map { field =>
  val value = col(field.name)
  val isMissing = field.dataType match {
    case StringType => value.isNull || value === ""
    case _          => value.isNull
  }
  // when(...) without otherwise yields null for non-missing rows, and count()
  // skips nulls, so only the missing rows are counted.
  count(when(isMissing, field.name)).alias(field.name)
}

println("Count of missing values per column:")
DF_CP.select(nullCounts: _*).show()

In [18]:
%spark
// Count the distinct values of every column of DF_CP.
// Each column triggers its own Spark job; null counts as one distinct value.
println("Distinct value counts for each column:")
val distinctCounts = for (name <- DF_CP.columns) yield {
  name -> DF_CP.select(name).dropDuplicates().count()
}

// Report one line per column.
for ((name, nDistinct) <- distinctCounts) {
  println(s"Column '$name' has $nDistinct distinct values.")
}


In [19]:
import org.apache.spark.sql.functions._

// Find fully duplicated rows of DF_CP: group on every column, count the rows
// in each group, and keep only the groups that occur more than once.
val allColumns = DF_CP.columns.map(col)
val duplicatesDF = DF_CP
  .groupBy(allColumns: _*)
  .count()
  .where(col("count").gt(1))

// Print the duplicated rows (with their occurrence count) without truncating cell values.
println("Duplicate rows:")
duplicatesDF.show(20, truncate = false)


In [20]:
%spark
