In [0]:
%sh
# List the local directory that the Spark cells below load flatFile.csv from.
ls /team5/data/

In [1]:
%spark
// Set Hadoop's default filesystem to the HDFS namenode, so paths given without
// an explicit scheme resolve against hdfs://namenode:9000.
sc.hadoopConfiguration.set("fs.defaultFS", "hdfs://namenode:9000")

In [2]:
%sh
# Look for flatFile.csv at the root of the local filesystem.
# NOTE(review): the Spark cells load file:///team5/data/flatFile.csv, and a plain
# `ls` only inspects the local filesystem (not HDFS) — confirm this path is intended.
ls /flatFile.csv

<h1 align="center">General Schema</h1>

In [4]:
// Load the tab-separated "flatFile.csv" from the local filesystem, treating the
// first line as the header and letting Spark infer each column's type.
val df = spark.read
  .options(Map(
    "header"      -> "true",
    "inferSchema" -> "true",
    "delimiter"   -> "\t"
  ))
  .csv("file:///team5/data/flatFile.csv")

// Print the inferred schema, then preview the first 10 rows.
df.printSchema()
df.show(10)

In [5]:
// Report how many columns df has, read from the size of its schema
val columnCount: Int = df.schema.length
println(s"Number of columns: $columnCount")

In [6]:
// Report how many rows df has (count() triggers a Spark job over the data)
val rowCount: Long = df.count()
println(s"Number of rows: $rowCount")

In [7]:
// Register df as the temporary view "df_TBL" (replacing any existing view with
// that name) so it can be queried with Spark SQL.
df.createOrReplaceTempView("df_TBL")

<h1 align="center">flatFile_deduplicated</h1>

In [9]:
// Read the same tab-separated file (header row, inferred types) and keep only
// distinct rows, i.e. drop rows that are exact duplicates across all columns.
val dfwd = spark.read
  .format("csv")
  .option("header", "true")
  .option("inferSchema", "true")
  .option("delimiter", "\t")
  .load("file:///team5/data/flatFile.csv")
  .distinct()

In [10]:
// Report how many rows remain in the deduplicated DataFrame dfwd.
// Note: this reuses the name rowCount from the earlier count of df.
val rowCount: Long = dfwd.count()
println(s"Number of rows: $rowCount")

<h1 align="center">EDA Interesting Labels</h1>

In [12]:
// Project df down to the ten columns of interest as df2. Columns are looked up
// on df itself; two of them are renamed in the projection.
val df2 = df.select(
  df("puissance"),                                                   // Engine power
  df("valeur_venale"),                                               // Current market value of the vehicle
  df("valeur_neuve"),                                                // Purchase value of the vehicle
  df("usage"),                                                       // Usage of the vehicle
  df("age_objet_assuree"),                                           // Age of the insured object
  df("anciennete"),                                                  // Seniority of the policyholder
  df("age_client").as("Age_client"),                                 // Age of the client, exposed as "Age_client"
  df("classe"),                                                      // Classification of the vehicle/policy
  df("Type_renouvellement_police").as("type_renouvellement_police"), // Renewal type, exposed in lower case
  df("IsToutRisque")                                                 // Full risk insurance indicator
)

// Confirm the projection: schema first, then the first 10 rows
df2.printSchema()
df2.show(10)

In [13]:
// Import necessary functions from Spark SQL
import org.apache.spark.sql.functions._

// Build a (Column, MissingCount) DataFrame holding the number of null values in
// each column of df2.
//
// All columns are counted in ONE aggregation job. The previous version ran a
// separate filter(...).count() job per column, i.e. one full scan of the data
// for every column of df2.
val missingValuesDF = {
  val columnNames = df2.columns

  // when(...) without otherwise(...) yields null whenever the condition is false,
  // and count(...) ignores nulls, so each aggregate counts exactly the rows in
  // which that column IS null. count(...) returns 0 (not null) on empty input.
  val nullCounters = columnNames.map(name => count(when(col(name).isNull, 1)))

  // A select made only of aggregates is a global aggregation: exactly one row,
  // with one Long per column, in the same order as columnNames.
  val nullCounts = df2.select(nullCounters: _*).head()

  columnNames.zipWithIndex
    .map { case (name, idx) => (name, nullCounts.getLong(idx)) }
    .toSeq
    .toDF("Column", "MissingCount") // Convert the results to a DataFrame with column names
}

// Sort the results by the number of missing values in descending order for better readability
val sortedMissingValuesDF = missingValuesDF.orderBy(desc("MissingCount"))

// Display the schema of the missing values DataFrame to confirm the structure
sortedMissingValuesDF.printSchema()

// Show the final results
sortedMissingValuesDF.show()


In [14]:
// Row totals for df2: every row vs. the rows left once duplicates are dropped
val totalRows = df2.count()
val distinctRows = df2.dropDuplicates().count()

// The difference is the number of rows that repeat an earlier row
val duplicateCount = totalRows - distinctRows

// Report the three figures
println(s"Total rows in df2: $totalRows")
println(s"Distinct rows in df2: $distinctRows")
println(s"Duplicate rows in df2: $duplicateCount")

// When duplicates exist, list each repeated row together with how often it occurs
if (duplicateCount <= 0) {
  println("No duplicate rows found in df2.")
} else {
  val repeatedRows = df2
    .groupBy(df2.columns.map(name => col(name)): _*) // one group per unique row
    .count()                                         // occurrences of each unique row
    .where(col("count") > 1)                         // keep only rows seen more than once
    .sort(col("count").desc)                         // most frequent first

  repeatedRows.show(truncate = false)
}

In [15]:
// Keep a single copy of every distinct row of df2
val df2WithoutDuplicates = df2.distinct()

// Compare the row counts before and after de-duplication
println(s"Total rows before removing duplicates: ${df2.count()}")
println(s"Total rows after removing duplicates: ${df2WithoutDuplicates.count()}")

// Preview the de-duplicated rows (first 20) without truncating cell contents
df2WithoutDuplicates.show(20, truncate = false)

### Data Processing and Cleaning Steps:

1. **DataFrame Creation (df2)**: 
   - We created a new DataFrame **`df2`** that contains a selection of relevant columns for classifying clients as "risky" or "non-risky". These columns include features like **puissance**, **valeur_venale**, **valeur_neuve**, **age_objet_assuree**, **anciennete**, **classe**, **type_renouvellement_police**, **IsToutRisque**, etc.

2. **Handling Missing Values**:
   - We identified and counted the missing (null) values in each of these important columns using the **`isNull`** column predicate. This helped us understand where data cleaning is needed.

3. **Removing Duplicates**:
   - We checked for and removed duplicate rows in **`df2`** by using the **`dropDuplicates()`** function to ensure that the DataFrame only contained unique rows.

   - After removing duplicates, here are the results for **`df2`**:
     - **Total Rows before removing duplicates**: 4,105,377
     - **Total Rows after removing duplicates**: 84,369
     - **Duplicate Count**: 4,021,008

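   - Removing these duplicates leaves only unique combinations of the selected columns, making the DataFrame ready for further analysis. Note that rows are compared on the columns selected in **`df2`** only, so distinct records in the original file that share the same values in these columns are counted as duplicates here.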

Note: The DataFrame with duplicates removed is referred to as **`df2WithoutDuplicates`**.
