In [3]:
# Libraries
# Standard libraries
from datetime import datetime
from functools import reduce
 
# Third-party libraries (note: ftplib below is part of the standard library)
from ftplib import FTP
import pandas as pd

# CONNEXION au site :
Data information:
https://www.ncei.noaa.gov/
https://www.ncei.noaa.gov/products/land-based-station/global-historical-climatology-network-daily

In [4]:

def get_ncdc_ftp_con():
  """
    Open an FTP session to 'ftp.ncdc.noaa.gov' via the ftplib module.

    Arguments:
        (none)

    Returns:
      ftplib.FTP - the connected FTP object, after login
  """

  # connect to the host, then log in with empty user name and password
  # (no credentials are supplied for this server)
  connection = FTP('ftp.ncdc.noaa.gov')
  connection.login('', '')

  # hand the open connection back to the caller
  return connection

# Copier la liste des dépôts :

https://www.ncei.noaa.gov/pub/data/ghcn/

In [5]:
def get_file_from_ftp(ftp: FTP, file_to_download: str, file_store: str):
  """
    Copy a single file from an FTP server to a local path.

    The outcome is reported on stdout. Ordinary failures (FTP errors,
    I/O errors, ...) are reported and NOT raised to the caller.

    Arguments:
        ftp - ftplib.FTP connection
        file_to_download - path of the file on the ftp server
        file_store - local file path the downloaded bytes are written to

    Returns:
        None
  """

  try:
    # the context manager guarantees the local file is flushed and closed
    # whether the transfer succeeds or fails
    with open(file_store, 'wb') as local_file:
      ftp.retrbinary("RETR " + file_to_download, local_file.write)
    print("Succsessfully downloaded file: " + file_to_download + " into: " + file_store)
  except Exception:
    # best-effort: report and carry on with the next file; unlike a bare
    # `except:`, KeyboardInterrupt / SystemExit still propagate so a long
    # download can be aborted by the user
    print("Unsuccessfull download!")

# Download weather data for a specific range of years

https://www.ncei.noaa.gov/pub/data/ghcn/daily/by_year/

In [6]:
# Local directory holding the downloaded weather files; bound to two names.
# NOTE(review): machine-specific absolute path. It ends with "/" because the
# functions in this notebook build file names by plain string concatenation.
weather_by_year_local_path = "C:/Users/u32118508/OneDrive - UPEC/Telechargement/Advance Analytics/data_acquisition/data_acquisition/weather/"
folder = weather_by_year_local_path

def get_weather_by_year(ftp: FTP, from_year: int, to_year: int, by_year_ftp_path: str = "pub/data/ghcn/daily/by_year/", local_path: str = folder) :
  """
    Download the yearly weather archives for an inclusive range of years.

    Arguments:
        ftp - ftplib.FTP connection
        from_year  - int, first year of the range (included)
        to_year - int, last year of the range (included)
        by_year_ftp_path - folder on the ftp that holds the yearly archives
        local_path - local folder the archives are written into

    Returns:
        None
  """

  # one archive per year, named "<year>.csv.gz" both on the server and locally
  archive_names = [str(year) + ".csv.gz" for year in range(from_year, to_year + 1)]

  # fetch every archive from the ftp folder into the local folder;
  # both paths are built by plain concatenation with the archive name
  for archive_name in archive_names:
    get_file_from_ftp(ftp, by_year_ftp_path + archive_name, local_path + archive_name)

# Importer les données des stations

In [7]:

def import_station_data(path_file: str) -> pd.DataFrame:
  """
    Import the weather station metadata file (fixed-width text).
    Mainly used for obtaining the geolocation of the different stations.

    Arguments:
        path_file - location where the raw downloaded file is stored.

    Returns:
        DataFrame with one row per station and the columns
        "ID", "LATITUDE" and "LONGITUDE".
  """

  # the file is fixed width, so the layout has to be spelled out:
  # first the variable names, then the position of each variable
  headings = ["ID", "LATITUDE", "LONGITUDE", "ELEVATION", "STATE", "NAME", "GSN FLAG", "HCN/CRN FLAG", "WMO ID"]

  # The GHCN-Daily readme gives 1-based, inclusive columns
  # (ID 1-11, LATITUDE 13-20, LONGITUDE 22-30, ELEVATION 32-37, STATE 39-40,
  #  NAME 42-71, GSN FLAG 73-75, HCN/CRN FLAG 77-79, WMO ID 81-85).
  # pandas expects 0-based half-open intervals, i.e. (start - 1, end).
  # Using the 1-based start directly drops the first character of each field,
  # which removes the minus sign of latitudes <= -10 and longitudes <= -100.
  colspecs = [(0, 11), (12, 20), (21, 30), (31, 37), (38, 40), (41, 71), (72, 75), (76, 79), (80, 85)]

  stations_data = pd.read_fwf(path_file, names=headings, header=None, colspecs=colspecs)

  # only the merge key and the geolocation are needed from this source
  return stations_data.filter(["ID", "LATITUDE", "LONGITUDE"], axis=1)

In [8]:
def weather_import_yearly(year: int, weather_files_dir: str = folder):
  """
    Import a single yearly weather dataset from its gzip-compressed csv archive.

    Arguments:
      year - integer specifying the year for which weather data is imported
      weather_files_dir - the directory containing the downloaded yearly weather
                          archives; the file read is "<weather_files_dir><year>.csv.gz"
                          (plain concatenation, so the directory must end with a separator)

    Returns:
        DataFrame with the columns 'ID', 'YEAR_MONTH_DAY', 'ELEMENT' and
        'DATA_VALUE' for every row of the yearly file
  """

  # the archive has no header row, so all eight column names are supplied here
  colnames = ['ID', 'YEAR_MONTH_DAY', 'ELEMENT', 'DATA_VALUE', 'M_FLAG', 'Q_FLAG', 'S_FLAG', 'OBS_TIME']

  # only these four are used downstream; they are the first four columns of
  # the file, so the resulting column order is unchanged
  needed = ['ID', 'YEAR_MONTH_DAY', 'ELEMENT', 'DATA_VALUE']

  # usecols skips the unused columns at parse time instead of loading all
  # eight and dropping four afterwards
  return pd.read_csv(
    weather_files_dir + str(year) + ".csv.gz",
    compression='gzip',
    names=colnames,
    usecols=needed,
    low_memory=False,
  )

# Filtre par pays

In [9]:
def weather_select_country(weather_df: pd.DataFrame, country: str):
  """
    Keep only the rows of a weather dataframe that belong to one country.

    Arguments:
        weather_df - raw imported weather data for any given year
        country - two letter country code, "MO" for Morocco

    Returns:
        DataFrame with the rows whose 'ID' starts with the country code
  """

  # the first two characters of the station ID are compared with the code
  id_prefix = weather_df['ID'].str.slice(stop=2)
  belongs_to_country = id_prefix == country

  return weather_df.loc[belongs_to_country]

# Traitement de données

In [10]:
def weather_dp_flow(years: list, country: str, station_data_file: str = folder+"/ghcnd-stations.txt", weather_files_dir: str = folder):
  """
    This function outlines the weather data preparation process:
    import each year, keep one country, pivot the measurements and
    attach the stations' geolocation.

    Arguments:
        years - years for which data is prepared in the final weather dataset
        country - two letter code of country, "MO" for Morocco
        station_data_file - the location of the downloaded file which contains the weather stations
        weather_files_dir - the directory containing the downloaded yearly weather archives

    Returns:
        DataFrame containing pivoted information where each row
        is unique by station and date of measurement taken,
        containing only stations within the selected country

    Raises:
        ValueError - if `years` is empty
  """

  years = list(years)
  if not years:
    raise ValueError("years must contain at least one year")

  # import every year and filter it by country straight away, so only the
  # (much smaller) per-country frames are kept around
  yearly_frames = [
    weather_select_country(weather_import_yearly(year, weather_files_dir), country)
    for year in years
  ]

  # one concat call for all years; pairwise concatenation would copy the
  # accumulated rows again for every additional year
  weather_all = pd.concat(yearly_frames)

  # 1) take entire weather data
  # 2) pivot weather variables: one column per ELEMENT
  #    (pivot_table aggregates duplicate ID/date/ELEMENT entries with its default, the mean)
  # 3) merge stations' geolocation; a left join keeps rows whose station is
  #    missing from the station file
  return (
    weather_all
    .pivot_table(index=['ID', 'YEAR_MONTH_DAY'], columns='ELEMENT', values='DATA_VALUE')
    .reset_index()
    .merge(import_station_data(station_data_file), on=["ID"], how="left")
  )

# Statistiques descriptives

In [11]:
def describe_set(data):
  """
      A function to produce custom dataset descriptives (may be further expanded)
 
      Arguments: 
      data - DataFrame, datset to run descriptives for
 
      Returns: DataFrame with descriptive statitics for an input set
  """
  
  # 1) Obtain percentage of nas
  nas = data.isnull().sum() / data.shape[0] * 100.00
  
  # 2) Construct df with additional descriptive stats
  output = pd.DataFrame({'Variable': data.columns,
                         'Missing_Perc': nas}). \
  assign(
         Missing_count=data.isnull().sum(),
         N_Uniques=data.nunique(),
         Total_Obs=data.shape[0],
         Type=data.dtypes,
         Min=data.min(numeric_only=True),
         Median=data.median(numeric_only=True),
         Mean=data.mean(numeric_only=True),
         Max=data.max(numeric_only=True),
         STD=data.std(numeric_only=True)
        ) . \
  sort_values(by=['Type', 'Variable']) 
  
  # 3) Clean up
  output = output[['Variable', 'Type', 'Total_Obs', 'N_Uniques', 'Missing_Perc', 'Missing_count', 'Min', 'Median', 'Mean', 'Max', 'STD']]
  output.reset_index(drop=True, inplace=True)
  