In [0]:
%sh
# List the contents of the /team5/data/ directory.
ls /team5/data/

In [1]:
%spark
// Point Hadoop's default filesystem for this SparkContext at the HDFS namenode.
// Paths given with an explicit scheme (such as the file:// URI used for the CSV
// load in this notebook) are not affected by this default.
sc.hadoopConfiguration.set("fs.defaultFS", "hdfs://namenode:9000")

<h1>Exploring the data in LabeledFile.csv</h1>

In [3]:
%spark
// Load LabeledFile.csv from the local filesystem as a DataFrame.
// The file has a header row and tab-separated fields; column types are inferred.
val DF = spark.read
  .options(Map(
    "header" -> "true",
    "inferSchema" -> "true",
    "delimiter" -> "\t"
  ))
  .csv("file:///team5/data/LabeledFile.csv")

// Show the resulting column names and inferred types.
DF.printSchema()

In [4]:
%spark
// Preview the first rows of the dataset, printing one table per column group.
Seq(
  Seq("N_SOUSCRIP", "N_POLICE", "N_OBJET_ASS", "year", "Prime", "Sinistre", "marque"),
  Seq("puissance", "carrosserie", "energie", "age_objet_assuree", "valeur_venale", "valeur_neuve"),
  Seq("Charge_utile", "usage", "place", "gouvernorat", "anciennete", "activite", "classe", "delegation"),
  Seq("age_client", "civilite", "sexe", "centre", "direction_regionale", "type_vehicule"),
  Seq("Type_renouvellement_police", "fractionnement", "nombre_fractions", "IsToutRisque", "Risky")
).foreach { columns =>
  DF.select(columns.head, columns.tail: _*).show()
}

In [5]:
%spark
// Total number of rows in the dataset.
DF.count()

In [6]:
%spark
// Summary statistics of the data, printed as one describe() table per column group.
Seq(
  Seq("N_SOUSCRIP", "N_POLICE", "N_OBJET_ASS", "year", "Prime", "Sinistre", "marque"),
  Seq("puissance", "carrosserie", "energie", "age_objet_assuree", "valeur_venale", "valeur_neuve"),
  Seq("Charge_utile", "usage", "place", "gouvernorat", "anciennete", "activite", "classe", "delegation"),
  Seq("age_client", "civilite", "sexe", "centre", "direction_regionale", "type_vehicule"),
  Seq("Type_renouvellement_police", "fractionnement", "nombre_fractions", "IsToutRisque", "Risky")
).foreach { columns =>
  DF.select(columns.head, columns.tail: _*).describe().show()
}

In [7]:
%spark
import org.apache.spark.sql.functions._

// Column groups used to split the null-count report into two narrower tables.
val group1 = Seq(
  "N_SOUSCRIP", "N_POLICE", "N_OBJET_ASS", "year", "Prime", "Sinistre", "marque",
  "puissance", "carrosserie", "energie", "age_objet_assuree", "valeur_venale", "valeur_neuve",
  "Charge_utile", "usage", "place", "gouvernorat", "anciennete"
)
val group2 = Seq(
  "activite", "classe", "delegation",
  "age_client", "civilite", "sexe", "centre", "direction_regionale", "type_vehicule",
  "Type_renouvellement_police", "fractionnement", "nombre_fractions", "IsToutRisque",
  "Risky"
)

/** Prints, for every column in `group`, how many rows of `df` hold a null or an
  * empty-string value in that column.
  *
  * @param group names of the columns to inspect; each becomes one output column
  * @param df    DataFrame to scan
  */
def countNullsForGroup(group: Seq[String], df: org.apache.spark.sql.DataFrame): Unit = {
  val missingPerColumn = for (name <- group) yield {
    val isMissing = col(name).isNull.or(col(name).equalTo(""))
    // when(...) without otherwise(...) is null for non-matching rows, and count skips nulls.
    count(when(isMissing, name)).as(name)
  }
  df.select(missingPerColumn: _*).show(truncate = false)
}

// Print the null/empty-value report for each column group under its own heading.
Seq(
  "Group 1 Null Counts:" -> group1,
  "Group 2 Null Counts:" -> group2
).foreach { case (heading, columns) =>
  println(heading)
  countNullsForGroup(columns, DF)
}


In [8]:
%spark
// Count the distinct values of every column; each count() call is a separate Spark action.

val distinctCounts = for (name <- DF.columns) yield {
  name -> DF.select(col(name)).distinct().count()
}

// Report one line per column.
for ((name, n) <- distinctCounts) {
  println(s"Column '$name' has $n distinct values.")
}

In [9]:
%spark
import org.apache.spark.sql.types._

// Extract numerical columns.
// Match on NumericType rather than a hand-picked list of types, so that Long, Short,
// Byte and Decimal columns are included as well as Integer, Float and Double
// (CSV schema inference can produce LongType for large integer values).
val numericalColumns = DF.schema.fields.collect {
  case StructField(name, _: NumericType, _, _) => name
}


In [10]:
import spark.implicits._
import org.apache.spark.sql.functions._


// Compute pairwise correlations between all numerical columns.
// Correlation is symmetric, so each unordered pair is computed only once (every
// DF.stat.corr call is a separate Spark job) and reused for both (a, b) and (b, a).
val correlationMatrix = {
  // Correlation for every index pair (i, j) with i < j.
  val upperTriangle = numericalColumns.indices.flatMap { i =>
    (i + 1 until numericalColumns.length).map { j =>
      (i, j) -> DF.stat.corr(numericalColumns(i), numericalColumns(j))
    }
  }.toMap

  numericalColumns.indices.flatMap { i =>
    numericalColumns.indices.map { j =>
      val col1 = numericalColumns(i)
      val col2 = numericalColumns(j)
      // A column paired with itself is reported as 1.0 without running a job.
      val corrValue =
        if (col1 == col2) 1.0
        else upperTriangle((math.min(i, j), math.max(i, j)))
      (col1, col2, corrValue) // Create tuple
    }
  }
}

// Convert Seq to DataFrame
val correlationDF = correlationMatrix.toDF("Column1", "Column2", "Correlation")

// Display results. show() prints only 20 rows by default, which would cut off most
// of the matrix, so request every row explicitly.
correlationDF.show(correlationMatrix.size, truncate = false)



In [11]:
import org.apache.spark.sql.types._

// Extract non-numerical columns.
// A column is non-numerical when its type is not a NumericType; checking against
// NumericType (instead of only Double/Integer/Float) keeps Long, Short, Byte and
// Decimal columns from being reported here as non-numeric.
val nonNumericalColumns = DF.schema.fields
  .filter { field =>
    field.dataType match {
      case _: NumericType => false
      case _              => true
    }
  }
  .map(_.name)

// Print the non-numerical columns
println(nonNumericalColumns.mkString(", "))


In [12]:
// Print the distinct values of each listed column, one table per column
// (show() prints at most 20 rows per table by default).
Seq(
  "N_OBJET_ASS",
  "marque",
  "carrosserie",
  "energie",
  "usage",
  "gouvernorat",
  "activite",
  "delegation",
  "civilite",
  "sexe",
  "centre",
  "direction_regionale",
  "type_vehicule",
  "Type_renouvellement_police",
  "fractionnement",
  "nombre_fractions",
  "IsToutRisque"
).foreach { name =>
  DF.select(name).distinct().show()
}

In [13]:
// Register the DataFrame as the temporary view "dataset" so it can be queried with SQL.
DF.createOrReplaceTempView("dataset")

In [14]:
%sql
-- Number of rows per value of the Risky column, most frequent value first.
-- COUNT(*) counts rows, so a NULL Risky group reports its real size;
-- COUNT(Risky) skips NULLs and would report that group as 0.
SELECT
    Risky, COUNT(*) AS count
FROM dataset
GROUP BY Risky
ORDER BY count DESC;

In [15]:
%%sql
