# BIG DATA PROCESSING PROJECT

In [46]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import desc, col, isnan, when, count, mean ,stddev, expr

from pyspark.sql.types import StructType, StructField, StringType, DoubleType

###  Create a SparkSession

In [10]:

# Get the existing Spark session if there is one; otherwise create it
# under this application name.
spark = (
    SparkSession.builder
    .appName("Happy Countries 2023")
    .getOrCreate()
)

In [11]:
# Echo the session object so the notebook renders it as this cell's output.
spark

## Data loading and cleaning

### Read CSV File into DataFrame

In [13]:
# First load: take column names from the header row; no schema is supplied here.
csv_file_path = "./data/WHR_2023.csv"
df = spark.read.option("header", True).csv(csv_file_path)

In [14]:
# Display schema of DataFrame.
# No schema was supplied on read, so every column is reported as string (see output).
df.printSchema()

root
 |-- country: string (nullable = true)
 |-- region: string (nullable = true)
 |-- happiness_score: string (nullable = true)
 |-- gdp_per_capita: string (nullable = true)
 |-- social_support: string (nullable = true)
 |-- healthy_life_expectancy: string (nullable = true)
 |-- freedom_to_make_life_choices: string (nullable = true)
 |-- generosity: string (nullable = true)
 |-- perceptions_of_corruption: string (nullable = true)



### Read CSV File into DataFrame with an explicit schema (`df_schema`)

In [15]:
# Explicit schema: the two label columns are strings, every score/factor column
# is a double. All fields are nullable.
df_schema = StructType(
    [StructField(label, StringType(), True) for label in ("Country", "Region")]
    + [
        StructField(metric, DoubleType(), True)
        for metric in (
            "Happiness_Score",
            "GDP_per_Capita",
            "Social_Support",
            "Healthy_Life_Expectancy",
            "Freedom_to_Make_Life_Choices",
            "Generosity",
            "Perceptions_of_Corruption",
        )
    ]
)

In [16]:
# Re-read the same file, this time applying df_schema so the numeric columns
# are typed as double instead of string.
df = spark.read.schema(df_schema).option("header", True).csv(csv_file_path)

In [17]:
# Display schema of DataFrame.
# With df_schema applied, the score and factor columns are now reported as double.
df.printSchema()

root
 |-- Country: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- Happiness_Score: double (nullable = true)
 |-- GDP_per_Capita: double (nullable = true)
 |-- Social_Support: double (nullable = true)
 |-- Healthy_Life_Expectancy: double (nullable = true)
 |-- Freedom_to_Make_Life_Choices: double (nullable = true)
 |-- Generosity: double (nullable = true)
 |-- Perceptions_of_Corruption: double (nullable = true)



In [18]:
# Display the first 5 rows of the DataFrame
df.show(5)

+-----------+--------------------+---------------+--------------+--------------+-----------------------+----------------------------+----------+-------------------------+
|    Country|              Region|Happiness_Score|GDP_per_Capita|Social_Support|Healthy_Life_Expectancy|Freedom_to_Make_Life_Choices|Generosity|Perceptions_of_Corruption|
+-----------+--------------------+---------------+--------------+--------------+-----------------------+----------------------------+----------+-------------------------+
|    Finland|      Western Europe|          7.804|         1.888|         1.585|                  0.535|                       0.772|     0.126|                    0.535|
|    Denmark|      Western Europe|          7.586|         1.949|         1.548|                  0.537|                       0.734|     0.208|                    0.525|
|    Iceland|      Western Europe|           7.53|         1.926|          1.62|                  0.559|                       0.738|      0.25| 

### Calculating basic statistics

In [19]:
# Print summary statistics for every column.
# In the output, Healthy_Life_Expectancy has a count of 136 while the other columns have 137.
df.describe().show()

24/11/09 11:09:11 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+-------+-----------+--------------------+------------------+------------------+------------------+-----------------------+----------------------------+-------------------+-------------------------+
|summary|    Country|              Region|   Happiness_Score|    GDP_per_Capita|    Social_Support|Healthy_Life_Expectancy|Freedom_to_Make_Life_Choices|         Generosity|Perceptions_of_Corruption|
+-------+-----------+--------------------+------------------+------------------+------------------+-----------------------+----------------------------+-------------------+-------------------------+
|  count|        137|                 137|               137|               137|               137|                    136|                         137|                137|                      137|
|   mean|       NULL|                NULL| 5.539795620437957|1.4069854014598542|1.1562116788321168|     0.3661764705882354|          0.5399999999999998|0.14847445255474462|      0.14589781021897807|
| std

### Null value analysis

In [20]:
# Count missing values per column.
# NULL is checked on every column. NaN is checked only on float/double columns:
# isnan() is a floating-point test, and applying it to the string columns
# (Country, Region) would depend on an implicit string-to-double cast.
float_columns = {name for name, dtype in df.dtypes if dtype in ("float", "double")}

null_counts = df.select([
    count(
        when(
            (col(c).isNull() | isnan(col(c))) if c in float_columns else col(c).isNull(),
            c,
        )
    ).alias(c)
    for c in df.columns
])
null_counts.show()

+-------+------+---------------+--------------+--------------+-----------------------+----------------------------+----------+-------------------------+
|Country|Region|Happiness_Score|GDP_per_Capita|Social_Support|Healthy_Life_Expectancy|Freedom_to_Make_Life_Choices|Generosity|Perceptions_of_Corruption|
+-------+------+---------------+--------------+--------------+-----------------------+----------------------------+----------+-------------------------+
|      0|     0|              0|             0|             0|                      1|                           0|         0|                        0|
+-------+------+---------------+--------------+--------------+-----------------------+----------------------------+----------+-------------------------+



In [21]:
# Drop the rows that have no Healthy_Life_Expectancy value; the null analysis
# above found missing values in that column only.
df_cleaned = df.na.drop(subset=["Healthy_Life_Expectancy"])

In [22]:
# Row count after dropping the row with a missing Healthy_Life_Expectancy
# (describe() reported 137 rows before the drop).
df_cleaned.count()

136

## Exploratory Data Analysis (EDA)

### Distribution of Happiness

In [23]:
# Mean, standard deviation and approximate quartiles of Happiness_Score,
# computed as one global aggregation over the cleaned rows.
happiness_stats = df_cleaned.agg(
    mean("Happiness_Score").alias("mean"),
    stddev("Happiness_Score").alias("stddev"),
    *[
        expr(sql).alias(label)
        for label, sql in (
            ("Q1", "percentile_approx(Happiness_Score, 0.25)"),
            ("median", "percentile_approx(Happiness_Score, 0.5)"),
            ("Q3", "percentile_approx(Happiness_Score, 0.75)"),
        )
    ],
)

happiness_stats.show()

+-----------------+------------------+-----+------+-----+
|             mean|            stddev|   Q1|median|   Q3|
+-----------------+------------------+-----+------+-----+
|5.544441176470589|1.1428405336717136|4.638| 5.684|6.334|
+-----------------+------------------+-----+------+-----+



## Correlation of factors with happiness

To measure how strongly each factor is related to Happiness_Score, we compute the correlation between Happiness_Score and each factor column using PySpark's `corr` function. The factor columns are GDP_per_Capita, Social_Support, Healthy_Life_Expectancy, Freedom_to_Make_Life_Choices, Generosity and Perceptions_of_Corruption.

In [24]:
from pyspark.sql.functions import corr
from pyspark.sql import Row

In [25]:
# Factor columns to correlate with Happiness_Score, in the order they are reported.
factor_columns = [
    "GDP_per_Capita",
    "Social_Support",
    "Healthy_Life_Expectancy",
    "Freedom_to_Make_Life_Choices",
    "Generosity",
    "Perceptions_of_Corruption",
]

In [26]:
# Holds one Row(Factor, Correlation) per factor; later turned into correlation_df.
correlation_data = []

In [27]:
# Calculate the correlation of each factor with Happiness_Score.
# All correlations are computed in one aggregation (one output column per factor)
# instead of launching a separate Spark job per factor.
correlations = df_cleaned.select(
    [corr("Happiness_Score", factor).alias(factor) for factor in factor_columns]
).first()

# Rebuild the list inside this cell so that re-running it does not append
# duplicate rows to a list created elsewhere.
correlation_data = []
for factor in factor_columns:
    correlation = correlations[factor]
    print(f"Correlation between Happiness_Score and {factor}: {correlation}")
    correlation_data.append(Row(Factor=factor,Correlation=correlation))

Correlation between Happiness_Score and GDP_per_Capita: 0.7838113277907759
Correlation between Happiness_Score and Social_Support: 0.8381606979596413
Correlation between Happiness_Score and Healthy_Life_Expectancy: 0.7466992875668792
Correlation between Happiness_Score and Freedom_to_Make_Life_Choices: 0.6619437392392364
Correlation between Happiness_Score and Generosity: 0.0393946164734588
Correlation between Happiness_Score and Perceptions_of_Corruption: 0.4705684381401544


In [28]:
# Turn the collected rows into a DataFrame sorted by correlation, highest first.
correlation_df = spark.createDataFrame(correlation_data).orderBy(desc("Correlation"))
correlation_df.show()

+--------------------+------------------+
|              Factor|       Correlation|
+--------------------+------------------+
|      Social_Support|0.8381606979596413|
|      GDP_per_Capita|0.7838113277907759|
|Healthy_Life_Expe...|0.7466992875668792|
|Freedom_to_Make_L...|0.6619437392392364|
|Perceptions_of_Co...|0.4705684381401544|
|          Generosity|0.0393946164734588|
+--------------------+------------------+



In [29]:
# Show only the first row of the sorted table: the factor with the highest correlation.
correlation_df.show(1)

+--------------+------------------+
|        Factor|       Correlation|
+--------------+------------------+
|Social_Support|0.8381606979596413|
+--------------+------------------+
only showing top 1 row



### Which is the “happiest” country in 2023 according to the data?

In [52]:
# Find the country with the highest happiness score.
# F.max is Spark's aggregate function. The bare name `max` is never imported from
# pyspark.sql.functions in this notebook, so it would resolve to Python's builtin,
# which cannot aggregate a Column and fails on a fresh kernel.
max_happy = (
    df_cleaned.groupBy("Country")
    .agg(F.max(col("Happiness_Score")).alias("Max_Happiness_Score"))
    .orderBy(col("Max_Happiness_Score").desc())
    .limit(1)  # keep only the top-ranked country
)

In [53]:
# Print the single-row result: the top country and its happiness score.
max_happy.show()

+-------+-------------------+
|Country|Max_Happiness_Score|
+-------+-------------------+
|Finland|              7.804|
+-------+-------------------+



## Visualization

## Predictive Model 

In [54]:
# Stop the Spark session; this is the last cell of the notebook.
spark.stop()
