In [None]:
# The objective of this notebook is to provide the transform_GMMC function

# def transform_GMMC(data: pd.DataFrame) -> pd.DataFrame:
#     """Transform the dataset into the desired structure and filters."""
#     logging.info(f"Transforming data, initial size: {data.shape}")
#     # Example transformation: adding more realistic transformations
#     # This is a placeholder. Actual transformations depend on the specific needs.
#     df = data
#     logging.info(f"Transformed data size: {df.shape}")
#     return df

In [None]:
from etl import extract_csv
import pandas as pd

# Location of the raw GMMC extract; '../' is prepended so the path is
# resolved from the parent of the current working directory.
path = 'data_storage/GMMC/raw_data/GMMC-2020-M.csv'
total_path = f"../{path}"
df = extract_csv(total_path)

In [None]:
# Show every column of wide frames instead of truncating them. The option
# must be set before the preview is rendered to have any effect on it.
pd.set_option('display.max_columns', None)
# A notebook cell only renders its final expression automatically, so the
# preview is displayed explicitly; the shape is then shown as the cell output.
display(df.head(3))
df.shape

In [None]:
# Map the dotted source headers onto human-readable column labels.
readable_labels = {
    '@id': 'ID',
    'sample.samplingPoint': 'Sampling Point',
    'sample.samplingPoint.notation': 'Sampling Point Notation',
    'sample.samplingPoint.label': 'Sampling Point Label',
    'sample.sampleDateTime': 'Sample Date and Time',
    'determinand.label': 'Determinand Label',
    'determinand.definition': 'Determinand Definition',
    'determinand.notation': 'Determinand Notation',
    'resultQualifier.notation': 'Result Qualifier Notation',
    'result': 'Result',
    'codedResultInterpretation.interpretation': 'Result Interpretation',
    'determinand.unit.label': 'Unit',
    'sample.sampledMaterialType.label': 'Sample Material Type',
    'sample.isComplianceSample': 'Is Compliance Sample',
    'sample.purpose.label': 'Sample Purpose',
    'sample.samplingPoint.easting': 'Easting',
    'sample.samplingPoint.northing': 'Northing',
}
# Applied in place so the shared `df` carries the new labels from here on.
df.rename(columns=readable_labels, inplace=True)

df.head(3)

In [None]:
# Count the null entries in each column.
missing_values = df.isna().sum()
missing_values

In [None]:
# Drop the 'Result Interpretation' and 'Result Qualifier Notation' columns.
# errors='ignore' keeps this cell re-runnable: once the columns are gone, a
# second execution is a no-op instead of raising KeyError.
df.drop(
    columns=['Result Interpretation', 'Result Qualifier Notation'],
    inplace=True,
    errors='ignore',
)
# Recount the missing values per column after the drop.
missing_values = df.isnull().sum()
missing_values

In [None]:
# Columns currently present in the frame. Deriving the list from `df` rather
# than hard-coding it keeps it in step with columns dropped by earlier cells,
# so `df[text_columns]` never refers to a column that no longer exists.
features = df.columns.tolist()

# Columns treated as numerical.
numerical_columns = ['Determinand Notation', 'Result', 'Easting', 'Northing']

# Text columns are all remaining columns that are not numerical.
text_columns = [col for col in features if col not in numerical_columns]
text_columns

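The column 'Sampling Point' carries the same information as 'Sampling Point Notation': the former holds the full URI (e.g. http://environment.data.gov.uk/water-quality/id/sampling-point/NW-1086)
while the latter holds only its final segment (e.g. NW-1086), so 'Sampling Point' is redundant.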

In [None]:
# Remove the redundant 'Sampling Point' column. errors='ignore' makes the
# cell safe to re-run after the column has already been dropped.
df.drop(columns=['Sampling Point'], inplace=True, errors='ignore')


In [None]:
# Peek at the first three values of the ID column.
df.loc[:, ['ID']].head(3)

In [None]:

# IDs arrive as full URLs such as
# http://environment.data.gov.uk/water-quality/data/measurement/NW-5286182-0135;
# keep only the segment after the final slash.
df['ID'] = df['ID'].str.rsplit('/', n=1).str[-1]
df.loc[:, ['ID']].head(3)

In [None]:
# First three rows of the columns listed in `numerical_columns`.
df.loc[:, numerical_columns].head(3)

In [None]:
# First three values of the sample timestamp column.
df.loc[:, ['Sample Date and Time']].head(3)

In [None]:

# Parse the 'Sample Date and Time' column into pandas datetimes and store the
# result back in the same column.
df['Sample Date and Time'] = df['Sample Date and Time'].pipe(pd.to_datetime)
df.loc[:, ['Sample Date and Time']].head(3)


In [None]:
# First three rows of the frame after the steps above.
df.iloc[:3]

In [None]:
import pandas as pd
import logging
from etl import extract_csv

# Configure the root logger: INFO level, with timestamp, level name and message.
LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)

def transform_GMMC(data: pd.DataFrame) -> pd.DataFrame:
    """
    Transform the dataset into the desired structure and filters.

    The input frame is not modified: every step works on a new dataframe, so
    the caller keeps its raw data and the function can safely be called more
    than once on the same input.

    Steps performed:
    1. Rename the dotted source headers to readable labels.
    2. Drop 'Result Interpretation', 'Result Qualifier Notation' and
       'Sampling Point'.
    3. Reduce 'ID' to the text after its last '/'.
    4. Convert 'Sample Date and Time' to datetime.

    Parameters:
    data (pd.DataFrame): Input dataframe to be transformed, using the source
        column names (e.g. '@id', 'sample.sampleDateTime').

    Returns:
    pd.DataFrame: A new, transformed dataframe.

    Raises:
    KeyError: If, after renaming, one of the columns to drop, 'ID' or
        'Sample Date and Time' is not present.
    """
    logging.info(f"Transforming data, initial size: {data.shape}")

    # Source header -> readable label.
    column_labels = {
        '@id': 'ID',
        'sample.samplingPoint': 'Sampling Point',
        'sample.samplingPoint.notation': 'Sampling Point Notation',
        'sample.samplingPoint.label': 'Sampling Point Label',
        'sample.sampleDateTime': 'Sample Date and Time',
        'determinand.label': 'Determinand Label',
        'determinand.definition': 'Determinand Definition',
        'determinand.notation': 'Determinand Notation',
        'resultQualifier.notation': 'Result Qualifier Notation',
        'result': 'Result',
        'codedResultInterpretation.interpretation': 'Result Interpretation',
        'determinand.unit.label': 'Unit',
        'sample.sampledMaterialType.label': 'Sample Material Type',
        'sample.isComplianceSample': 'Is Compliance Sample',
        'sample.purpose.label': 'Sample Purpose',
        'sample.samplingPoint.easting': 'Easting',
        'sample.samplingPoint.northing': 'Northing',
    }
    unneeded_columns = ['Result Interpretation', 'Result Qualifier Notation', 'Sampling Point']

    # rename/drop without inplace=True return new frames, which leaves the
    # caller's `data` untouched.
    transformed = data.rename(columns=column_labels).drop(columns=unneeded_columns)

    # IDs look like
    # http://environment.data.gov.uk/water-quality/data/measurement/NW-5286182-0135;
    # keep only the last path segment.
    transformed['ID'] = transformed['ID'].str.split('/').str[-1]

    # Convert the Sample Date and Time to datetime
    transformed['Sample Date and Time'] = pd.to_datetime(transformed['Sample Date and Time'])

    logging.info(f"Transformed data size: {transformed.shape}")
    return transformed

def main(path: str = 'data_storage/GMMC/raw_data/GMMC-2020-M.csv'):
    """
    Load a raw GMMC CSV, transform it and log a short preview.

    Parameters:
    path (str): Location of the CSV file. '../' is prepended before loading,
        so the path is resolved from the parent of the working directory.
        Defaults to the 2020 GMMC extract.

    Any exception raised while loading or transforming is logged (with its
    traceback) rather than propagated.
    """
    total_path = '../' + path

    try:
        # Extract the CSV data
        df = extract_csv(total_path)

        # Perform the transformation
        transformed_df = transform_GMMC(df)

        # Display the first few rows of the transformed dataframe (for debugging purposes)
        logging.info(f"First few rows of transformed data:\n{transformed_df.head(3)}")

    except Exception as e:
        # logging.exception records the same message at ERROR level and adds
        # the traceback, so the failing step can be located from the log.
        logging.exception(f"An error occurred: {e}")

if __name__ == "__main__":
    main()
