In [0]:
%file

ls /user/majesteye/DS05_INSURANCE_DATASET/input

In [1]:
%spark
// Set Hadoop's default filesystem to the HDFS namenode, so the scheme-less paths
// loaded later in this notebook are looked up on hdfs://namenode:9000.
sc.hadoopConfiguration.set("fs.defaultFS", "hdfs://namenode:9000")

<h1 align="center" style="color:blue;">General Exploration of "client_features" </h1>

In [3]:
%spark
// Load client_features. The file is tab-separated, so it goes through the "csv"
// source with a tab delimiter; the first line is the header and column types are inferred.
val cfdf = spark.read
  .format("csv")
  .options(Map(
    "header" -> "true",
    "inferSchema" -> "true",
    "delimiter" -> "\t"
  ))
  .load("/user/majesteye/DS05_INSURANCE_DATASET/input/client_features.tsv")


In [4]:
%spark
// Quick look at the first rows of cfdf (Spark's default row limit).
cfdf.show()

In [5]:
%spark
// Print the column names and the types that inferSchema assigned to them.
cfdf.printSchema()

In [6]:
%spark
// Larger sample: print up to 100 rows of cfdf.
cfdf.show(100)

In [7]:
// Total number of rows in cfdf.
cfdf.count()

In [8]:
// Check the distinct values in DF

In [9]:
// For every column of cfdf, print how many distinct values it holds.
// Each column triggers its own distinct().count() job.
for (columnName <- cfdf.columns) {
  val nDistinct = cfdf.select(columnName).distinct().count()
  println(s"Distinct Count for $columnName: $nDistinct")
}

In [10]:
// Shape of cfdf: number of rows, number of columns, and number of distinct N_SOUSCRIP values.
Seq(
  s"Row count: ${cfdf.count()}",
  s"Column count: ${cfdf.columns.length}",
  s"Distinct N_SOUSCRIP count: ${cfdf.select("N_SOUSCRIP").distinct().count()}"
).foreach(println)



### **Primary Keys:**
- <span style="color:blue;"> **N_SOUSCRIP**</span>
- <span style="color:blue;"> **year**</span>

### **Interesting Labels:**
- <span style="color:blue;"> **anciennete**</span>  
- <span style="color:blue;"> **classe**</span>  
- <span style="color:blue;"> **Type_renouvellement_police**</span>


In [12]:
// How many duplicate rows do we have?

In [13]:
// Rows of cfdf that occur more than once: group on every column and keep the
// groups whose row count exceeds 1. The result carries an extra `count` column.
val duplicateRows = {
  val everyColumn = cfdf.columns.map(col)
  cfdf
    .groupBy(everyColumn: _*)
    .count()
    .where("count > 1")
}

In [14]:
// Display the duplicated rows together with how many times each one occurs.
duplicateRows.show()

In [15]:
// Are there missing values ?

In [16]:
// Count the null values in each column of cfdf: every column is mapped to a
// 1/0 null indicator that is summed, and the sum keeps the column's name.
{
  val nullCounters = cfdf.columns.map { name =>
    sum(when(col(name).isNull, 1).otherwise(0)).alias(name)
  }
  cfdf.select(nullCounters: _*).show()
}

In [17]:
// Frequency Count
// Group and aggregate

In [18]:
// Frequency of each `anciennete` value, most frequent first (prints up to 99 rows).
cfdf
  .groupBy("anciennete")
  .count()
  .sort(col("count").desc)
  .show(99)

In [19]:
// Frequency of each `classe` value, most frequent first (prints up to 13 rows).
cfdf
  .groupBy("classe")
  .count()
  .sort(col("count").desc)
  .show(13)

In [20]:
// Frequency of each `Type_renouvellement_police` value, most frequent first (prints up to 4 rows).
cfdf
  .groupBy("Type_renouvellement_police")
  .count()
  .sort(col("count").desc)
  .show(4)

In [21]:
// Explore the primary key

In [22]:
// The 10 N_SOUSCRIP values that appear on the most rows of cfdf.
cfdf
  .groupBy("N_SOUSCRIP")
  .count()
  .sort(col("count").desc)
  .show(10)

In [23]:
// All cfdf rows of subscriber 394490 for year 2022 (up to 100 printed).
// The sort on `year` is kept from the original even though the filter fixes a single year.
cfdf
  .where(col("N_SOUSCRIP") === "394490" && col("year") === "2022")
  .orderBy(col("year").desc)
  .show(100)

In [24]:
// All cfdf rows of subscriber 394490 for year 2021 (up to 100 printed).
// The sort on `year` is kept from the original even though the filter fixes a single year.
cfdf
  .where(col("N_SOUSCRIP") === "394490" && col("year") === "2021")
  .orderBy(col("year").desc)
  .show(100)

In [25]:
// All cfdf rows of subscriber 394490 for year 2020 (up to 100 printed).
// The sort on `year` is kept from the original even though the filter fixes a single year.
cfdf
  .where(col("N_SOUSCRIP") === "394490" && col("year") === "2020")
  .orderBy(col("year").desc)
  .show(100)



> <span style="color:red;">// As we can see, our interesting labels don't change  
> // So we can keep just one row per client per year </span>
 

In [27]:
// Number of cfdf rows per `year`, largest first (prints up to 10 rows).
cfdf
  .groupBy("year")
  .count()
  .sort(col("count").desc)
  .show(10)

In [28]:
// Rows of year 2021 sorted by `classe`, highest first (up to 100 printed).
cfdf
  .where(col("year") === "2021")
  .orderBy(col("classe").desc)
  .show(100)

In [29]:
// Compute summary statistics {Mean (Average), stddev, variance, min, max} 
// of Interesting Labels

In [30]:
// describe() computes the summary statistics for cfdf's columns; only the two
// columns of interest are kept, the label column is renamed to "Statistics",
// and the rows are sorted by that label.
val summary = cfdf.describe()
val structuredSummary = summary
  .select(col("summary").as("Statistics"), col("anciennete"), col("classe"))
  .orderBy("Statistics")
structuredSummary.show(truncate = false)

In [31]:
import org.apache.spark.sql.functions._

// Variance of `classe` and `anciennete` over the whole of cfdf (single-row result).
val stats = cfdf.agg(
  variance(col("classe")).alias("variance_classe"),
  variance(col("anciennete")).alias("variance_anciennete")
)

stats.show()

In [32]:
//  The distribution of the interesting labels:
//  is there any bias in the dataset with respect to class (classe), seniority (anciennete)
//  and type of policy renewal (Type_renouvellement_police)?

In [33]:
// classe (classe)

In [34]:

// Row count per `classe`, largest group first.
// NOTE(review): despite its name, this val holds the `classe` distribution, not a gender one.
val genderDistribution = cfdf
  .groupBy("classe")
  .count()
  .orderBy(col("count").desc)
genderDistribution.show()




In [35]:

// Share (in percent) of cfdf rows falling in each `classe`, largest share first.
// NOTE(review): despite its name, this val holds `classe` percentages, not gender ones.
val genderPercentage = {
  val totalRows = cfdf.count()
  cfdf
    .groupBy("classe")
    .agg((count("*") / totalRows * 100).alias("percentage"))
    .orderBy(col("percentage").desc)
}
genderPercentage.show()

In [36]:
// Register cfdf as the temporary view `dataset` so the %sql cells below can query it.
cfdf.createOrReplaceTempView("dataset")

In [37]:
%sql
-- Row count and share (in percent of all `dataset` rows) for each classe, largest first.
SELECT
    d.classe,
    COUNT(*) AS count,
    (COUNT(*) * 100.0 / (SELECT COUNT(*) FROM dataset)) AS percentage
FROM dataset AS d
GROUP BY d.classe
ORDER BY count DESC;

In [38]:
// type of policy renewal  (Type_renouvellement_police)

In [39]:


// Row count per `Type_renouvellement_police`, largest group first.
// NOTE(review): despite its name, this val holds the renewal-type distribution, not a gender one.
val genderDistribution = cfdf
  .groupBy("Type_renouvellement_police")
  .count()
  .orderBy(col("count").desc)
genderDistribution.show()



In [40]:

// Share (in percent) of cfdf rows per `Type_renouvellement_police`, largest share first.
// NOTE(review): despite its name, this val holds renewal-type percentages, not gender ones.
val genderPercentage = {
  val totalRows = cfdf.count()
  cfdf
    .groupBy("Type_renouvellement_police")
    .agg((count("*") / totalRows * 100).alias("percentage"))
    .orderBy(col("percentage").desc)
}
genderPercentage.show()

In [41]:
%sql
-- Row count and share (in percent of all `dataset` rows) for each renewal type, largest first.
SELECT
    d.Type_renouvellement_police,
    COUNT(*) AS count,
    (COUNT(*) * 100.0 / (SELECT COUNT(*) FROM dataset)) AS percentage
FROM dataset AS d
GROUP BY d.Type_renouvellement_police
ORDER BY count DESC;

In [42]:
// Relation between anciennete and classe

In [43]:
%sql
-- Joint distribution of anciennete and classe: row count and share of all `dataset` rows.
SELECT
    d.anciennete,
    d.classe,
    COUNT(*) AS count,
    (COUNT(*) * 100.0 / (SELECT COUNT(*) FROM dataset)) AS percentage
FROM dataset AS d
GROUP BY d.anciennete, d.classe
ORDER BY count DESC;

In [44]:
// Relation between Type_renouvellement_police and classe

In [45]:
%sql
-- Joint distribution of renewal type and classe: row count and share of all `dataset` rows.
SELECT
    d.Type_renouvellement_police,
    d.classe,
    COUNT(*) AS count,
    (COUNT(*) * 100.0 / (SELECT COUNT(*) FROM dataset)) AS percentage
FROM dataset AS d
GROUP BY d.Type_renouvellement_police, d.classe
ORDER BY count DESC;

In [46]:
// Relation between Type_renouvellement_police and anciennete

In [47]:
%sql
-- Joint distribution of renewal type and anciennete: row count and share of all `dataset` rows.
SELECT
    d.Type_renouvellement_police,
    d.anciennete,
    COUNT(*) AS count,
    (COUNT(*) * 100.0 / (SELECT COUNT(*) FROM dataset)) AS percentage
FROM dataset AS d
GROUP BY d.Type_renouvellement_police, d.anciennete
ORDER BY count DESC;

<h1 align="center" style="color:orange;">General Exploration of "clients_polices" </h1>

In [49]:
%spark
// Same default-filesystem setting as in the first Spark cell of the notebook,
// repeated so this section can be run on its own.
sc.hadoopConfiguration.set("fs.defaultFS", "hdfs://namenode:9000")

In [50]:
%spark
// Load clients_polices. The file is tab-separated, so it is read with the "csv"
// source and a tab delimiter; the header row names the columns and types are inferred.
val cpdf = {
  val tsvReader = spark.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .option("delimiter", "\t")
  tsvReader.load("/user/majesteye/DS05_INSURANCE_DATASET/input/clients_polices.tsv")
}

In [51]:
// Quick look at the first rows of cpdf (Spark's default row limit).
cpdf.show()

In [52]:
// Print the column names and the types that inferSchema assigned to them.
cpdf.printSchema()

In [53]:
// Larger sample: print up to 100 rows of cpdf.
cpdf.show(100)

In [54]:
// Total number of rows in cpdf.
cpdf.count()

In [55]:
// Check the distinct values in DF

In [56]:
// For every column of cpdf, print how many distinct values it holds.
// Each column triggers its own distinct().count() job.
for (columnName <- cpdf.columns) {
  val nDistinct = cpdf.select(columnName).distinct().count()
  println(s"Distinct Count for $columnName: $nDistinct")
}



> <span style="color:red;">// Just a remark here:
the number of distinct N_SOUSCRIP values in cfdf = the number of distinct N_SOUSCRIP values in cpdf </span>


In [58]:
// Shape of cpdf: number of rows, number of columns, and number of distinct N_SOUSCRIP values.
Seq(
  s"Row count: ${cpdf.count()}",
  s"Column count: ${cpdf.columns.length}",
  s"Distinct N_SOUSCRIP count: ${cpdf.select("N_SOUSCRIP").distinct().count()}"
).foreach(println)

In [59]:
// Shape of cpdf again, this time with the number of distinct N_POLICE values.
List(
  s"Row count: ${cpdf.count()}",
  s"Column count: ${cpdf.columns.length}",
  s"Distinct N_POLICE count: ${cpdf.select("N_POLICE").distinct().count()}"
).foreach(line => println(line))



### **Primary Keys:**
- <span style="color:blue;"> **N_SOUSCRIP**</span>
- <span style="color:blue;"> **year**</span>
- <span style="color:blue;"> **N_POLICE**</span>

### **Interesting Labels:**
- <span style="color:blue;"> **IsToutRisque**</span>  



In [61]:
// How many duplicate rows do we have?

In [62]:
// Rows of cpdf that occur more than once: group on every column and keep the
// groups whose row count exceeds 1. The result carries an extra `count` column.
val duplicateRows = {
  val everyColumn = cpdf.columns.map(col)
  cpdf
    .groupBy(everyColumn: _*)
    .count()
    .where("count > 1")
}

In [63]:
// Display the duplicated rows together with how many times each one occurs.
duplicateRows.show()

In [64]:
// Are there missing values ?

In [65]:
// Count the null values in each column of cpdf: every column is mapped to a
// 1/0 null indicator that is summed, and the sum keeps the column's name.
{
  val nullTallies = for (name <- cpdf.columns) yield
    sum(when(col(name).isNull, 1).otherwise(0)).alias(name)
  cpdf.select(nullTallies: _*).show()
}

In [66]:
// Frequency Count
// Group and aggregate

In [67]:
// Frequency of each `IsToutRisque` value, most frequent first.
cpdf
  .groupBy("IsToutRisque")
  .count()
  .sort(col("count").desc)
  .show()

In [68]:
// Explore the primary key

In [69]:
// The 10 N_SOUSCRIP values that appear on the most rows of cpdf.
cpdf
  .groupBy("N_SOUSCRIP")
  .count()
  .sort(col("count").desc)
  .show(10)

In [70]:
// All cpdf rows of subscriber 642214 for year 2022 (up to 100 printed).
// The sort on `year` is kept from the original even though the filter fixes a single year.
cpdf
  .where(col("N_SOUSCRIP") === "642214" && col("year") === "2022")
  .orderBy(col("year").desc)
  .show(100)

In [71]:
// The 10 N_OBJET_ASS values that appear on the most rows of cpdf.
cpdf
  .groupBy("N_OBJET_ASS")
  .count()
  .sort(col("count").desc)
  .show(10)

In [72]:
// The 10 N_POLICE values that appear on the most rows of cpdf.
cpdf
  .groupBy("N_POLICE")
  .count()
  .sort(col("count").desc)
  .show(10)

In [73]:


// Rows of one specific subscriber / policy / year / insured-object combination
// (up to 100 printed), used to check whether the label varies within that key.
// NOTE(review): the N_POLICE literal is in scientific notation, which suggests the
// column was inferred as a floating-point type — confirm against printSchema().
cpdf
  .where($"N_SOUSCRIP" === "413736")
  .where($"N_POLICE" === "1.01046416E8")
  .where($"year" === "2022")
  .where($"N_OBJET_ASS" === "2140TU207")
  .orderBy(desc("year"))
  .show(100)



> <span style="color:red;">// As we can see, our interesting labels do change here 
> // So we need to keep one row per client per year while adding a rule that avoids bias.  </span>

In [75]:
//  The distribution of Interesting labels:
//  Is there any bias in the dataset with respect to IsToutRisque ??

In [76]:

// Row count per `IsToutRisque` value, largest group first.
// NOTE(review): despite its name, this val holds the IsToutRisque distribution, not a gender one.
val genderDistribution = cpdf
  .groupBy("IsToutRisque")
  .count()
  .orderBy(col("count").desc)
genderDistribution.show()



In [77]:

// Share (in percent) of cpdf rows per `IsToutRisque` value, largest share first.
// The ordering matches the other percentage cells of this notebook, which all sort
// by descending percentage; without it the row order is unspecified.
// NOTE(review): despite its name, this val holds IsToutRisque percentages, not gender ones.
val genderPercentage = cpdf.groupBy("IsToutRisque")
  .agg(
    (count("*") / cpdf.count() * 100).alias("percentage")
  )
  .orderBy(desc("percentage"))
genderPercentage.show()

In [78]:
// Register cpdf as the temporary view `dataset1` so the %sql cells below can query it.
cpdf.createOrReplaceTempView("dataset1")

In [79]:
%sql
-- Row count and share (in percent) for each IsToutRisque value, largest first.
-- The denominator must be the row count of dataset1 (policies), the table being
-- aggregated; dividing by the row count of `dataset` (client features) gives
-- shares that do not sum to 100.
SELECT 
    IsToutRisque, 
    COUNT(*) AS count,
    (COUNT(*) * 100.0 / (SELECT COUNT(*) FROM dataset1)) AS percentage
FROM dataset1
GROUP BY IsToutRisque

ORDER BY count DESC;

In [80]:
//


In [81]:
%sql
-- Joint distribution of IsToutRisque (policies, dataset1) and classe (client features, dataset).
-- Joining on N_SOUSCRIP alone pairs every policy row with every feature row of that
-- subscriber (all years and repeated rows), which inflates the counts. The feature side
-- is therefore reduced to distinct (N_SOUSCRIP, year, classe) rows and matched on both
-- N_SOUSCRIP and year.
-- NOTE(review): this assumes classe is constant within a subscriber-year, as the
-- exploration of `dataset` earlier in the notebook suggests — confirm.
-- The percentage is taken over the total number of joined rows so the shares sum to 100.
SELECT 
    dataset1.IsToutRisque, 
    features.classe, 
    COUNT(*) AS count,
    (COUNT(*) * 100.0 / SUM(COUNT(*)) OVER ()) AS percentage
FROM dataset1
JOIN (SELECT DISTINCT N_SOUSCRIP, year, classe FROM dataset) AS features
  ON dataset1.N_SOUSCRIP = features.N_SOUSCRIP
 AND dataset1.year = features.year
GROUP BY dataset1.IsToutRisque, features.classe
ORDER BY count DESC;


In [82]:
%sql
-- Joint distribution of IsToutRisque (policies, dataset1) and Type_renouvellement_police
-- (client features, dataset).
-- Joining on N_SOUSCRIP alone pairs every policy row with every feature row of that
-- subscriber (all years and repeated rows), which inflates the counts. The feature side
-- is therefore reduced to distinct (N_SOUSCRIP, year, Type_renouvellement_police) rows
-- and matched on both N_SOUSCRIP and year.
-- NOTE(review): this assumes the renewal type is constant within a subscriber-year, as
-- the exploration of `dataset` earlier in the notebook suggests — confirm.
-- The percentage is taken over the total number of joined rows so the shares sum to 100.
SELECT 
    dataset1.IsToutRisque, 
    features.Type_renouvellement_police, 
    COUNT(*) AS count,
    (COUNT(*) * 100.0 / SUM(COUNT(*)) OVER ()) AS percentage
FROM dataset1
JOIN (SELECT DISTINCT N_SOUSCRIP, year, Type_renouvellement_police FROM dataset) AS features
  ON dataset1.N_SOUSCRIP = features.N_SOUSCRIP
 AND dataset1.year = features.year
GROUP BY dataset1.IsToutRisque, features.Type_renouvellement_police
ORDER BY count DESC;
