In [1]:
!pip3 install seaborn
!pip3 install matplotlib

Collecting seaborn
  Downloading seaborn-0.13.2-py3-none-any.whl (294 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.9/294.9 KB[0m [31m154.5 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting matplotlib!=3.6.1,>=3.4
  Downloading matplotlib-3.9.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (8.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.3/8.3 MB[0m [31m39.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting contourpy>=1.0.1
  Downloading contourpy-1.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (322 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m322.0/322.0 KB[0m [31m27.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting cycler>=0.10
  Downloading cycler-0.12.1-py3-none-any.whl (8.3 kB)
Collecting kiwisolver>=1.3.1
  Downloading kiwisolver-1.4.7-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum as spark_sum
import matplotlib.pyplot as plt
import seaborn as sns
import psycopg2
import os

In [3]:
class DBConfig:
    """PostgreSQL connection settings taken from environment variables.

    Every setting falls back to a local-development default when its
    environment variable is not set. The values are class attributes, so
    they are resolved once, when this class body is executed.
    """

    HOST = os.getenv("DB_HOST", "localhost")
    PORT = os.getenv("DB_PORT", "5432")
    DATABASE = os.getenv("DB_NAME", "postgres")
    USER = os.getenv("DB_USER", "postgres")
    PASSWORD = os.getenv("DB_PASSWORD", "password")

    def get_url(self):
        """Return the JDBC URL for the configured host, port and database."""
        return "jdbc:postgresql://{}:{}/{}".format(self.HOST, self.PORT, self.DATABASE)

    def get_conn(self):
        """Open and return a new psycopg2 connection using the configured settings."""
        connect_kwargs = {
            "host": self.HOST,
            "port": self.PORT,
            "dbname": self.DATABASE,
            "user": self.USER,
            "password": self.PASSWORD,
        }
        return psycopg2.connect(**connect_kwargs)

In [4]:
# Module-level Spark session used by load_data(); getOrCreate() hands back an
# already-running session when there is one instead of starting another.
spark = (
    SparkSession.builder
    .appName("FlightDelaysAnalysis")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/10/20 20:44:51 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [5]:
def load_data():
    """Read the flight table over JDBC and keep rows that carry delay data.

    Uses the module-level ``spark`` session and the connection settings from
    ``DBConfig`` to read the ``on_time_performance`` table.

    Returns:
        A Spark DataFrame with ``Year``, ``FlightDate`` and the five
        delay-cause columns, limited to rows where at least one of the
        delay columns is not null.
    """
    config = DBConfig()
    delay_columns = [
        "CarrierDelay",
        "WeatherDelay",
        "NASDelay",
        "SecurityDelay",
        "LateAircraftDelay",
    ]

    # Load the table from PostgreSQL
    flights = spark.read.jdbc(
        url=config.get_url(),
        table="on_time_performance",
        properties={
            "user": config.USER,
            "password": config.PASSWORD,
            "driver": "org.postgresql.Driver",
        },
    )

    # Keep only the columns the analysis needs
    flights = flights.select("Year", "FlightDate", *delay_columns)

    # OR the per-column "is not null" checks together, left to right, so a
    # row survives as soon as any single delay cause has a value.
    has_any_delay = col(delay_columns[0]).isNotNull()
    for column_name in delay_columns[1:]:
        has_any_delay = has_any_delay | col(column_name).isNotNull()

    return flights.filter(has_any_delay)

In [6]:
def analyze_data(df):
    """Sum each delay cause per year and return the result as a pandas DataFrame.

    Args:
        df: Spark DataFrame containing ``Year`` and the five delay columns.

    Returns:
        A pandas DataFrame sorted by ``Year`` with one ``Total<Cause>Delay``
        column per delay cause. The first rows are also printed as a preview.
    """
    # Source column -> name of its yearly total (insertion order fixes the
    # column order of the result)
    total_aliases = {
        "CarrierDelay": "TotalCarrierDelay",
        "WeatherDelay": "TotalWeatherDelay",
        "NASDelay": "TotalNASDelay",
        "SecurityDelay": "TotalSecurityDelay",
        "LateAircraftDelay": "TotalLateAircraftDelay",
    }
    yearly_sums = [
        spark_sum(source).alias(alias) for source, alias in total_aliases.items()
    ]

    # Aggregate in Spark, then hand the small per-year result to pandas
    # for plotting.
    delay_totals_pd = (
        df.groupBy("Year")
        .agg(*yearly_sums)
        .toPandas()
        .sort_values("Year")
    )

    print(delay_totals_pd.head())  # Show a preview of the results
    return delay_totals_pd

In [7]:
def plot_data(delay_totals_pd):
    """Draw the yearly delay totals as a stacked bar chart and as a line chart.

    Args:
        delay_totals_pd: pandas DataFrame with a ``Year`` column and the five
            ``Total<Cause>Delay`` columns produced by ``analyze_data``.

    Both charts are drawn on explicitly created axes. Calling
    ``DataFrame.plot`` without ``ax=`` makes pandas open a figure of its own,
    which previously left the figure from ``plt.figure(figsize=...)`` empty
    and ignored the requested size for the bar chart.
    """
    sns.set(style="whitegrid")

    delay_columns = [
        "TotalCarrierDelay",
        "TotalWeatherDelay",
        "TotalNASDelay",
        "TotalSecurityDelay",
        "TotalLateAircraftDelay",
    ]
    bar_colors = ['blue', 'orange', 'green', 'red', 'purple']

    # Chart 1: breakdown of delays by cause for each year
    _, bar_ax = plt.subplots(figsize=(10, 6))
    delay_totals_pd.plot(
        x="Year",
        y=delay_columns,
        kind="bar",
        stacked=True,
        color=bar_colors,
        ax=bar_ax,  # draw on our axes so the figsize above is honoured
    )
    bar_ax.set_title('Total Delays by Cause (Stacked) Over Years')
    bar_ax.set_ylabel('Total Delay (Minutes)')
    bar_ax.set_xlabel('Year')
    plt.show()

    # Chart 2: evolution of each delay type across the years
    _, line_ax = plt.subplots(figsize=(10, 6))
    for delay_type in delay_columns:
        sns.lineplot(
            x="Year",
            y=delay_type,
            data=delay_totals_pd,
            label=delay_type,
            ax=line_ax,
        )

    # Set the labels after the loop: each lineplot call relabels the y axis
    # with its own column name.
    line_ax.set_title('Evolution of Delays by Cause Over Time')
    line_ax.set_ylabel('Total Delay (Minutes)')
    line_ax.set_xlabel('Year')
    line_ax.legend(loc="upper left")
    plt.show()

In [8]:
def main():
    """Run the whole analysis: load the data, aggregate the delays, plot them."""
    # Each stage feeds the next: Spark DataFrame -> pandas yearly totals -> charts
    plot_data(analyze_data(load_data()))

In [9]:
# Execute the full pipeline defined above: load, aggregate, then plot.
main()

AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `Year` cannot be resolved. Did you mean one of the following? [`NASDelay`, `FlightDate`, `WeatherDelay`, `CarrierDelay`, `SecurityDelay`].;
'Aggregate ['Year], ['Year, sum(CarrierDelay#20) AS TotalCarrierDelay#63L, sum(WeatherDelay#21) AS TotalWeatherDelay#65L, sum(NASDelay#22) AS TotalNASDelay#67L, sum(SecurityDelay#23) AS TotalSecurityDelay#69L, sum(LateAircraftDelay#24) AS TotalLateAircraftDelay#71L]
+- Filter ((((isnotnull(CarrierDelay#20) OR isnotnull(WeatherDelay#21)) OR isnotnull(NASDelay#22)) OR isnotnull(SecurityDelay#23)) OR isnotnull(LateAircraftDelay#24))
   +- Project [FlightDate#0, CarrierDelay#20, WeatherDelay#21, NASDelay#22, SecurityDelay#23, LateAircraftDelay#24]
      +- Relation [flightdate#0,year#1,month#2,dayofmonth#3,dayofweek#4,originairportid#5,destairportid#6,operating_airline#7,crsdeptime#8,deptime#9,crsarrtime#10,arrtime#11,wheelsoff#12,wheelson#13,cancelled#14,crselapsedtime#15,actualelapsedtime#16,airtime#17,flights#18,distance#19,carrierdelay#20,weatherdelay#21,nasdelay#22,securitydelay#23,lateaircraftdelay#24] JDBCRelation(on_time_performance) [numPartitions=1]
