In [16]:

import pandas as pd
import numpy as np
from datetime import datetime
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import jaccard_score
from Levenshtein import distance as lev_distance

In [17]:
# Step 2: Load CSV Files
merged_cassandra = pd.read_csv('merged/merged_cassandra.csv')
merged_mongo = pd.read_csv('merged/merged_mongo.csv')
merged_postgres = pd.read_csv('merged/merged_postgres.csv')
merged_web = pd.read_csv('merged/merged_web.csv')
merged_xml = pd.read_csv('merged/merged_xml.csv')
merged_mysql = pd.read_csv('merged/merged_mysql.csv')


merged_cassandra.head(), merged_mongo.head(), merged_postgres.head(
), merged_web.head(), merged_xml.head(), merged_mysql.head()

(                               ClientID    ClientName        Address  \
 0  ea614ade-9cee-43ba-bb90-319f7079f8dc    Customer_4    Rua 4, Faro   
 1  ea614ade-9cee-43ba-bb90-319f7079f8dc    Customer_4    Rua 4, Faro   
 2  ea614ade-9cee-43ba-bb90-319f7079f8dc    Customer_4    Rua 4, Faro   
 3  73688418-9121-4b41-bc92-0a14a9b060ed  Customer_127  Rua 127, Faro   
 4  73688418-9121-4b41-bc92-0a14a9b060ed  Customer_127  Rua 127, Faro   
 
    ContactNumber     AreaType                  AreaDescription Classification  \
 0    35191000004  Residential     Residential area for housing        General   
 1    35191000004  Residential     Residential area for housing        General   
 2    35191000004  Residential     Residential area for housing        General   
 3    35192000060   Industrial  Industrial zones with factories        Premium   
 4    35192000060   Industrial  Industrial zones with factories        Premium   
 
   ClassificationDescription     AccountStatus  \
 0     General l

In [18]:

def normalize_dates(df, date_columns):
    for col in date_columns:
        if col in df.columns:
            df[col] = pd.to_datetime(df[col], errors='coerce', dayfirst=True)
    return df



date_columns_cassandra = ['StartDate',
                          'EndDate', 'EffectiveDate', 'LastUpdated']
date_columns_mongo = ['ContractStartDate',
                      'ContractEndDate', 'PublicationDate', 'EffectiveDate']
date_columns_postgres = ['LastCollectionDate',
                         'NextCollectionDate', 'DisposalDate', 'BillingDate']
date_columns_web = ['Time', 'InstallationDate']
date_columns_xml = ['Date']
date_columns_mysql = ['startdate', 'enddate', 'usagedate',
                      'participationstartdate', 'lastinspectiondate', 'maintenancedate', 'testdate']


merged_cassandra = normalize_dates(merged_cassandra, date_columns_cassandra)
merged_mongo = normalize_dates(merged_mongo, date_columns_mongo)
merged_postgres = normalize_dates(merged_postgres, date_columns_postgres)
merged_web = normalize_dates(merged_web, date_columns_web)
merged_xml = normalize_dates(merged_xml, date_columns_xml)
merged_mysql = normalize_dates(merged_mysql, date_columns_mysql)

  df[col] = pd.to_datetime(df[col], errors='coerce', dayfirst=True)
  df[col] = pd.to_datetime(df[col], errors='coerce', dayfirst=True)
  df[col] = pd.to_datetime(df[col], errors='coerce', dayfirst=True)
  df[col] = pd.to_datetime(df[col], errors='coerce', dayfirst=True)
  df[col] = pd.to_datetime(df[col], errors='coerce', dayfirst=True)
  df[col] = pd.to_datetime(df[col], errors='coerce', dayfirst=True)
  df[col] = pd.to_datetime(df[col], errors='coerce', dayfirst=True)
  df[col] = pd.to_datetime(df[col], errors='coerce', dayfirst=True)
  df[col] = pd.to_datetime(df[col], errors='coerce', dayfirst=True)
  df[col] = pd.to_datetime(df[col], errors='coerce', dayfirst=True)
  df[col] = pd.to_datetime(df[col], errors='coerce', dayfirst=True)
  df[col] = pd.to_datetime(df[col], errors='coerce', dayfirst=True)
  df[col] = pd.to_datetime(df[col], errors='coerce', dayfirst=True)
  df[col] = pd.to_datetime(df[col], errors='coerce', dayfirst=True)
  df[col] = pd.to_datetime(df[col], errors='coer

In [19]:
# Step 4: Normalize Contact Information
def normalize_contacts(df, contact_columns):
    for col in contact_columns:
        if col in df.columns:
            # Ensure contact is treated as a string
            df[col] = df[col].astype(str)
    return df


# Define the contact columns for each dataset
contact_columns_cassandra = ['ContactNumber']
contact_columns_mongo = ['ContactInfo']
contact_columns_postgres = ['MobileInfo']
# If sensor IDs need normalization, adjust as needed
contact_columns_web = ['SensorID']
contact_columns_xml = ['Contact']
contact_columns_mysql = ['contactinfo']

# Apply normalization
merged_cassandra = normalize_contacts(
    merged_cassandra, contact_columns_cassandra)
merged_mongo = normalize_contacts(merged_mongo, contact_columns_mongo)
merged_postgres = normalize_contacts(merged_postgres, contact_columns_postgres)
merged_web = normalize_contacts(merged_web, contact_columns_web)
merged_xml = normalize_contacts(merged_xml, contact_columns_xml)
merged_mysql = normalize_contacts(merged_mysql, contact_columns_mysql)

In [20]:
# Step 5: Normalize Other Columns (Optional)
def normalize_numerical(df, numerical_columns):
    for col in numerical_columns:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')
    return df


# Example numerical columns (you can adjust this list based on your data)
numerical_columns = ['BaseFee', 'TotalAmount',
                     'QuantityInKg', 'VolumeUsed', 'ProjectedSavings']

# Apply normalization to numerical columns
merged_cassandra = normalize_numerical(merged_cassandra, numerical_columns)
merged_mongo = normalize_numerical(merged_mongo, numerical_columns)
merged_postgres = normalize_numerical(merged_postgres, numerical_columns)
merged_web = normalize_numerical(merged_web, numerical_columns)
merged_xml = normalize_numerical(merged_xml, numerical_columns)
merged_mysql = normalize_numerical(merged_mysql, numerical_columns)

In [21]:
# Step 6: Jaccard Similarity on Column Names
def jaccard_similarity(list1, list2):
    set1, set2 = set(list1), set(list2)
    return len(set1.intersection(set2)) / len(set1.union(set2))


# Get the column names for each dataset
columns_cassandra = merged_cassandra.columns.tolist()
columns_mongo = merged_mongo.columns.tolist()
columns_postgres = merged_postgres.columns.tolist()
columns_web = merged_web.columns.tolist()
columns_xml = merged_xml.columns.tolist()
columns_mysql = merged_mysql.columns.tolist()

# Compute the Jaccard similarity between column names
jaccard_cassandra_mongo = jaccard_similarity(columns_cassandra, columns_mongo)
jaccard_cassandra_postgres = jaccard_similarity(
    columns_cassandra, columns_postgres)
jaccard_cassandra_web = jaccard_similarity(columns_cassandra, columns_web)
jaccard_cassandra_xml = jaccard_similarity(columns_cassandra, columns_xml)
jaccard_cassandra_mysql = jaccard_similarity(columns_cassandra, columns_mysql)

# Display results
print("Jaccard Similarity Scores:")
print(f"Cassandra vs Mongo: {jaccard_cassandra_mongo}")
print(f"Cassandra vs Postgres: {jaccard_cassandra_postgres}")
print(f"Cassandra vs Web: {jaccard_cassandra_web}")
print(f"Cassandra vs XML: {jaccard_cassandra_xml}")
print(f"Cassandra vs MySQL: {jaccard_cassandra_mysql}")

Jaccard Similarity Scores:
Cassandra vs Mongo: 0.12244897959183673
Cassandra vs Postgres: 0.05714285714285714
Cassandra vs Web: 0.0
Cassandra vs XML: 0.1
Cassandra vs MySQL: 0.0


In [22]:
# Step 7: Levenshtein Similarity on Column Names
def levenshtein_similarity(str1, str2):
    return 1 - (lev_distance(str1, str2) / max(len(str1), len(str2)))


# Compute Levenshtein similarity for some example column pairs
levenshtein_cassandra_mongo = [levenshtein_similarity(
    c, m) for c in columns_cassandra for m in columns_mongo]
levenshtein_cassandra_postgres = [levenshtein_similarity(
    c, p) for c in columns_cassandra for p in columns_postgres]

# Example results for a few column names
print("Levenshtein Similarity Scores:")
print(f"Levenshtein Scores between Cassandra and Mongo columns: {levenshtein_cassandra_mongo[:10]}")
print(f"Levenshtein Scores between Cassandra and Postgres columns: {levenshtein_cassandra_postgres[:10]}")

Levenshtein Similarity Scores:
Levenshtein Scores between Cassandra and Mongo columns: [0.125, 0.125, 0.16666666666666663, 0.0, 0.2727272727272727, 0.36363636363636365, 0.4, 0.17647058823529416, 0.19999999999999996, 0.16666666666666663]
Levenshtein Scores between Cassandra and Postgres columns: [1.0, 0.125, 0.09090909090909094, 0.0, 0.19999999999999996, 0.0, 0.21052631578947367, 0.2777777777777778, 0.2777777777777778, 0.11111111111111116]


In [23]:
# Step 8: Analyze and Integrate Best Matches (Example)

# Create a dictionary to store the best matches for each pair of datasets
best_matches = {
    'Cassandra_Mongo': (jaccard_cassandra_mongo, levenshtein_cassandra_mongo),
    'Cassandra_Postgres': (jaccard_cassandra_postgres, levenshtein_cassandra_postgres),
}

# Print the best matches
for pair, scores in best_matches.items():
    print(f"Best matches for {pair}:")
    print(f"Jaccard Similarity: {scores[0]}")
    print(f"Levenshtein Similarity: {scores[1]}")

Best matches for Cassandra_Mongo:
Jaccard Similarity: 0.12244897959183673
Levenshtein Similarity: [0.125, 0.125, 0.16666666666666663, 0.0, 0.2727272727272727, 0.36363636363636365, 0.4, 0.17647058823529416, 0.19999999999999996, 0.16666666666666663, 0.375, 0.21052631578947367, 0.19999999999999996, 0.19999999999999996, 0.2666666666666667, 0.23076923076923073, 0.07692307692307687, 0.30000000000000004, 0.375, 0.0, 0.125, 0.14814814814814814, 0.25, 0.2857142857142857, 0.10344827586206895, 0.0, 0.09375, 0.08571428571428574, 0.125, 0.09999999999999998, 0.1428571428571429, 0.11428571428571432, 0.2222222222222222, 0.18181818181818177, 0.125, 0.09999999999999998, 0.4, 0.16666666666666663, 0.0, 0.18181818181818177, 0.09090909090909094, 0.19999999999999996, 0.23529411764705888, 0.2666666666666667, 0.2222222222222222, 0.30000000000000004, 0.21052631578947367, 0.5, 0.09999999999999998, 0.33333333333333337, 0.3076923076923077, 0.11538461538461542, 0.09999999999999998, 0.09999999999999998, 0.3000000000

In [24]:
# Step 1: Define the Global Schema (G)
global_schema = {
    'ClientID': str,
    'Name': str,
    'Address': str,
    'ContactInfo': str,
    'ProgramName': str,
    'StartDate': str,
    'EndDate': str,
    'Status': str,
    'PolicyName': str,
    'BillingAmount': float,
    'LastInspectionDate': str
}

# Example global schema (adjust as needed based on your use case)

In [25]:
# Step 2: LAV Transformation Logic
def lav_transform(df, schema_mapping):
    """
    Transforms the local dataset into the global schema view using a provided mapping.
    """
    transformed_data = {}

    for global_col, local_col in schema_mapping.items():
        if local_col in df.columns:
            transformed_data[global_col] = df[local_col]
        else:
            # Set as None if the local column doesn't exist
            transformed_data[global_col] = None

    return pd.DataFrame([transformed_data])


# Define schema mappings for LAV
schema_mapping_cassandra = {
    'ClientID': 'ClientID', 'Name': 'ClientName', 'Address': 'Address', 'ContactInfo': 'ContactNumber',
    'ProgramName': 'PolicyName', 'StartDate': 'StartDate', 'EndDate': 'EndDate', 'Status': 'ContractStatus',
    'PolicyName': 'PolicyName', 'BillingAmount': 'BaseFee', 'LastInspectionDate': 'LastUpdated'
}

# Apply LAV transformation to Cassandra
lav_cassandra = lav_transform(merged_cassandra, schema_mapping_cassandra)
lav_cassandra.head()  # Display the transformed data

Unnamed: 0,ClientID,Name,Address,ContactInfo,ProgramName,StartDate,EndDate,Status,PolicyName,BillingAmount,LastInspectionDate
0,0 ea614ade-9cee-43ba-bb90-319f7079f8dc 1 ...,0 Customer_4 1 Customer_4 2 ...,"0 Rua 4, Faro 1 Rua 4, Faro 2 ...",0 35191000004 1 35191000004 2 3519...,0 Policy 3 1 Policy 1 2 Policy 2 3...,0 2024-01-26 1 2024-01-26 2 2024-01-2...,0 2026-01-25 1 2026-01-25 2 2026-01-2...,0 Terminated 1 Terminated 2 Termin...,0 Policy 3 1 Policy 1 2 Policy 2 3...,0 100.0 1 100.0 2 100.0 3 100....,0 2025-01-25 17:03:24.432 1 2025-01-25 1...


In [26]:
def reset_index_with_check(df):
    """
    Reset the index of the dataframe and ensure the index is unique.
    """
    df_reset = df.reset_index(
        drop=True)  # Reset the index and drop the old one
    if not df_reset.index.is_unique:
        print("Warning: The index is not unique in this dataframe!")
    return df_reset


# Example schema mappings for LAV transformations (adjust these to match your data)
schema_mapping_mongo = {
    '_id': 'ClientID',  # Renaming '_id' to 'ClientID' since '_id' is the actual column name
    'Name': 'Name',
    'Address': 'Address',
    'ContactInfo': 'ContactInfo',
    'ProgramName': 'ProgramName',
    'StartDate': 'StartDate',
    'EndDate': 'EndDate',
    'Status': 'Status',
    'PolicyName': 'PolicyName',
    'BillingAmount': 'ProjectedSavings',
    'LastInspectionDate': 'PublicationDate'
}

schema_mapping_postgres = {
    'ClientID': 'ClientID',
    'Name': 'Name',
    'Address': 'FullAddress',
    'ContactInfo': 'MobileInfo',
    'ProgramName': 'WasteType',
    'StartDate': 'LastCollectionDate',
    'EndDate': 'NextCollectionDate',
    'Status': 'Status',
    'PolicyName': 'WasteType',
    'BillingAmount': 'TotalAmount',
    'LastInspectionDate': 'BillingDate'
}

# Updated schema mapping for Web dataset based on the column names you provided
schema_mapping_web = {
    'ReportID': 'ClientID',  # Mapping ReportID to ClientID
    'Name': 'SensorType',
    'Address': 'Location',
    'ContactInfo': 'ReportURL',
    'ProgramName': 'Temperature',
    'StartDate': 'Time',
    'EndDate': 'Time',
    'Status': 'Validation',
    'PolicyName': 'pH',
    'BillingAmount': 'Turbidity',
    'LastInspectionDate': 'Time'
}

In [27]:
# Apply LAV transformation for Mongo, Postgres, and Web datasets
lav_mongo = lav_transform(merged_mongo, schema_mapping_mongo)
lav_postgres = lav_transform(merged_postgres, schema_mapping_postgres)
lav_web = lav_transform(merged_web, schema_mapping_web)

In [28]:
lav_mongo.columns

Index(['_id', 'Name', 'Address', 'ContactInfo', 'ProgramName', 'StartDate',
       'EndDate', 'Status', 'PolicyName', 'BillingAmount',
       'LastInspectionDate'],
      dtype='object')

In [29]:

# Reset the index of the dataframe and ensure the index is unique.

# Step 3: GAV Transformation Logic
def reset_index_with_check(df):
    """
    Reset the index of the dataframe and ensure the index is unique.
    """
    df_reset = df.reset_index(
        drop=True)  # Reset the index and drop the old one
    if not df_reset.index.is_unique:
        print("Warning: The index is not unique in this dataframe!")
    return df_reset

# Step 3: GAV Transformation Logic


def gav_transform(local_dfs):
    """
    Generates a global schema view by combining data from all local sources.
    The local data sources should be merged into a single view.
    """
    # Reset index for each DataFrame to avoid invalid index errors during concatenation
    local_dfs_reset = [reset_index_with_check(df) for df in local_dfs]

    # Concatenate the dataframes with ignore_index=True to ensure a unique index for the combined data
    combined_data = pd.concat(local_dfs_reset, ignore_index=True, sort=False)

    # Fill missing data with NaN (the standard placeholder in pandas)
    return combined_data.fillna(np.nan)


# Example schema mappings for LAV transformations (adjust these to match your data)
schema_mapping_mongo = {
    '_id': 'ClientID',  # Renaming '_id' to 'ClientID' since '_id' is the actual column name
    'Name': 'Name',
    'Address': 'Address',
    'ContactInfo': 'ContactInfo',
    'ProgramName': 'ProgramName',
    'StartDate': 'StartDate',
    'EndDate': 'EndDate',
    'Status': 'Status',
    'PolicyName': 'PolicyName',
    # Renaming 'BillingAmount' to 'ProjectedSavings'
    'BillingAmount': 'ProjectedSavings',
    # Renaming 'LastInspectionDate' to 'PublicationDate'
    'LastInspectionDate': 'PublicationDate'
}

schema_mapping_postgres = {
    'ClientID': 'ClientID',
    'Name': 'Name',
    'Address': 'FullAddress',  # Renaming 'Address' to 'FullAddress'
    'ContactInfo': 'MobileInfo',  # Renaming 'ContactInfo' to 'MobileInfo'
    'ProgramName': 'WasteType',
    'StartDate': 'LastCollectionDate',  # Renaming 'StartDate' to 'LastCollectionDate'
    'EndDate': 'NextCollectionDate',  # Renaming 'EndDate' to 'NextCollectionDate'
    'Status': 'Status',
    'PolicyName': 'WasteType',  # Keeping 'WasteType' as 'PolicyName'
    'BillingAmount': 'TotalAmount',  # Renaming 'BillingAmount' to 'TotalAmount'
    # Renaming 'LastInspectionDate' to 'BillingDate'
    'LastInspectionDate': 'BillingDate'
}

# Updated schema mapping for Web dataset based on the column names you provided
schema_mapping_web = {
    'ReportID': 'ClientID',  # Mapping ReportID to ClientID
    'Name': 'SensorType',
    'Address': 'Location',
    'ContactInfo': 'ReportURL',
    'ProgramName': 'Temperature',  # Mapping 'ProgramName' to 'Temperature'
    'StartDate': 'Time',
    'EndDate': 'Time',
    'Status': 'Validation',
    'PolicyName': 'pH',  # Mapping 'PolicyName' to 'pH'
    'BillingAmount': 'Turbidity',  # Mapping 'BillingAmount' to 'Turbidity'
    'LastInspectionDate': 'Time'  # Mapping 'LastInspectionDate' to 'Time'
}

# Assuming the merged datasets have already been loaded as DataFrames
# Apply LAV transformation for Mongo, Postgres, and Web datasets
lav_mongo = lav_transform(merged_mongo, schema_mapping_mongo)
lav_postgres = lav_transform(merged_postgres, schema_mapping_postgres)
lav_web = lav_transform(merged_web, schema_mapping_web)


# Ensure lav_cassandra is defined and has a unique index
# Apply the same transformation logic to lav_cassandra as done with other datasets:
# Example (adjust schema_mapping for cassandra):
lav_cassandra = lav_transform(
    merged_cassandra, schema_mapping_cassandra)  # Assuming you have this

# Step 3: GAV Transformation Logic
local_dfs = [lav_cassandra, lav_mongo, lav_postgres,
             lav_web]  # Ensure lav_cassandra is defined too
gav_view = gav_transform(local_dfs)

# Display the GAV view's first few rows
print(gav_view.head())

                                            ClientID  \
0  0     ea614ade-9cee-43ba-bb90-319f7079f8dc
1  ...   
1                                                NaN   
2  0     1
1     1
2     1
3     1
4     1
5     ...   
3                                                NaN   

                                                Name  \
0  0       Customer_4
1       Customer_4
2       ...   
1  0      Customer_1
1      Customer_2
2      Cus...   
2  0     Customer_1
1     Customer_1
2     Custom...   
3  0       Turbidity
1       Turbidity
2       Tu...   

                                             Address  \
0  0       Rua 4, Faro
1       Rua 4, Faro
2     ...   
1  0      rua 1, Faro
1      rua 2, Faro
2      r...   
2  0     rua 1, Faro
1     rua 1, Faro
2     rua ...   
3  0     Reservoir_5
1     Reservoir_3
2     Rese...   

                                         ContactInfo  \
0  0     35191000004
1     35191000004
2     3519...   
1  0     35191000001
1     35191000002
2     

  return combined_data.fillna(np.nan)


In [30]:
def glav_transform(global_df, local_dfs):
    """
    Transforms data based on both global and local schemas.
    Global schema is mapped over local data.
    """
    # Merge the global schema with local data
    merged_data = pd.concat([global_df] + local_dfs,
                            ignore_index=True, sort=False)

    # Optional: If you don't want to fill missing values, just return the merged data
    return merged_data



# Apply GLAV transformation by merging global schema with local data
# Combine global and local data
glav_view = glav_transform(gav_view, local_dfs)

# Display the GLAV view's first few rows
print(glav_view.head())

                                            ClientID  \
0  0     ea614ade-9cee-43ba-bb90-319f7079f8dc
1  ...   
1                                                NaN   
2  0     1
1     1
2     1
3     1
4     1
5     ...   
3                                                NaN   
4  0     ea614ade-9cee-43ba-bb90-319f7079f8dc
1  ...   

                                                Name  \
0  0       Customer_4
1       Customer_4
2       ...   
1  0      Customer_1
1      Customer_2
2      Cus...   
2  0     Customer_1
1     Customer_1
2     Custom...   
3  0       Turbidity
1       Turbidity
2       Tu...   
4  0       Customer_4
1       Customer_4
2       ...   

                                             Address  \
0  0       Rua 4, Faro
1       Rua 4, Faro
2     ...   
1  0      rua 1, Faro
1      rua 2, Faro
2      r...   
2  0     rua 1, Faro
1     rua 1, Faro
2     rua ...   
3  0     Reservoir_5
1     Reservoir_3
2     Rese...   
4  0       Rua 4, Faro
1       Rua 4, Faro
2  

In [31]:
lav_cassandra.columns

Index(['ClientID', 'Name', 'Address', 'ContactInfo', 'ProgramName',
       'StartDate', 'EndDate', 'Status', 'PolicyName', 'BillingAmount',
       'LastInspectionDate'],
      dtype='object')

In [32]:
# Query 1: Retrieve all client information from LAV Cassandra
lav_cassandra[['ClientID', 'Name', 'Address', 'ContactInfo']].head()

Unnamed: 0,ClientID,Name,Address,ContactInfo
0,0 ea614ade-9cee-43ba-bb90-319f7079f8dc 1 ...,0 Customer_4 1 Customer_4 2 ...,"0 Rua 4, Faro 1 Rua 4, Faro 2 ...",0 35191000004 1 35191000004 2 3519...


In [33]:
# Query 2: Retrieve policies with start and end dates
lav_cassandra[['ClientID', 'PolicyName', 'StartDate', 'EndDate']].head()

Unnamed: 0,ClientID,PolicyName,StartDate,EndDate
0,0 ea614ade-9cee-43ba-bb90-319f7079f8dc 1 ...,0 Policy 3 1 Policy 1 2 Policy 2 3...,0 2024-01-26 1 2024-01-26 2 2024-01-2...,0 2026-01-25 1 2026-01-25 2 2026-01-2...


In [34]:
gav_view.columns

Index(['ClientID', 'Name', 'Address', 'ContactInfo', 'ProgramName',
       'StartDate', 'EndDate', 'Status', 'PolicyName', 'BillingAmount',
       'LastInspectionDate', '_id', 'ReportID'],
      dtype='object')

In [35]:
# Query 1: Retrieve all client information from GAV view
gav_view[['ClientID', 'Name', 'Address', 'ContactInfo']].head()

Unnamed: 0,ClientID,Name,Address,ContactInfo
0,0 ea614ade-9cee-43ba-bb90-319f7079f8dc 1 ...,0 Customer_4 1 Customer_4 2 ...,"0 Rua 4, Faro 1 Rua 4, Faro 2 ...",0 35191000004 1 35191000004 2 3519...
1,,0 Customer_1 1 Customer_2 2 Cus...,"0 rua 1, Faro 1 rua 2, Faro 2 r...",0 35191000001 1 35191000002 2 3519...
2,0 1 1 1 2 1 3 1 4 1 5 ...,0 Customer_1 1 Customer_1 2 Custom...,"0 rua 1, Faro 1 rua 1, Faro 2 rua ...",0 35191000001 1 35191000001 2 3519...
3,,0 Turbidity 1 Turbidity 2 Tu...,0 Reservoir_5 1 Reservoir_3 2 Rese...,0 report_1.pdf 1 report_2.pdf 2 ...


In [36]:
# Query 2: Retrieve policies with billing amount
gav_view[['PolicyName', 'BillingAmount']].head()

Unnamed: 0,PolicyName,BillingAmount
0,0 Policy 3 1 Policy 1 2 Policy 2 3...,0 100.0 1 100.0 2 100.0 3 100....
1,0 Policy_1 1 Policy_2 2 Policy_...,0 1.5 1 3.0 2 4.5 3 6.0 4 ...
2,0 Organic 1 Organic 2 Org...,0 5.0 1 5.0 2 5.0 3 5.0 4 ...
3,0 7.42 1 8.40 2 8.40 3 7.70 4 ...,0 5.30 1 5.68 2 5.68 3 2.67 4 ...


In [37]:
# Query 3: Retrieve start and end dates for programs
gav_view[['ProgramName', 'StartDate', 'EndDate']].head()

Unnamed: 0,ProgramName,StartDate,EndDate
0,0 Policy 3 1 Policy 1 2 Policy 2 3...,0 2024-01-26 1 2024-01-26 2 2024-01-2...,0 2026-01-25 1 2026-01-25 2 2026-01-2...
1,0 Program_1 1 Program_2 2 Progr...,0 2023-01-04 1 2023-01-07 2 2023-0...,0 2025-01-31 1 2025-03-02 2 2025-0...
2,0 Organic 1 Organic 2 Org...,0 2025-01-24 1 2025-01-24 2 2025-01-2...,0 2025-01-25 1 2025-01-25 2 2025-01-2...
3,0 26.86 1 10.25 2 10.25 3 34.9...,0 2024-01-15 1 2024-08-27 2 2024-08-2...,0 2024-01-15 1 2024-08-27 2 2024-08-2...


In [38]:
# Query 1: Retrieve clients with policy details and program names from GLAV view
glav_view[['ClientID', 'Name', 'PolicyName', 'ProgramName']].head()

Unnamed: 0,ClientID,Name,PolicyName,ProgramName
0,0 ea614ade-9cee-43ba-bb90-319f7079f8dc 1 ...,0 Customer_4 1 Customer_4 2 ...,0 Policy 3 1 Policy 1 2 Policy 2 3...,0 Policy 3 1 Policy 1 2 Policy 2 3...
1,,0 Customer_1 1 Customer_2 2 Cus...,0 Policy_1 1 Policy_2 2 Policy_...,0 Program_1 1 Program_2 2 Progr...
2,0 1 1 1 2 1 3 1 4 1 5 ...,0 Customer_1 1 Customer_1 2 Custom...,0 Organic 1 Organic 2 Org...,0 Organic 1 Organic 2 Org...
3,,0 Turbidity 1 Turbidity 2 Tu...,0 7.42 1 8.40 2 8.40 3 7.70 4 ...,0 26.86 1 10.25 2 10.25 3 34.9...
4,0 ea614ade-9cee-43ba-bb90-319f7079f8dc 1 ...,0 Customer_4 1 Customer_4 2 ...,0 Policy 3 1 Policy 1 2 Policy 2 3...,0 Policy 3 1 Policy 1 2 Policy 2 3...


In [39]:
glav_view.columns

Index(['ClientID', 'Name', 'Address', 'ContactInfo', 'ProgramName',
       'StartDate', 'EndDate', 'Status', 'PolicyName', 'BillingAmount',
       'LastInspectionDate', '_id', 'ReportID'],
      dtype='object')

In [40]:
# Query 2: Retrieve clients with billing and Policy name
glav_view[['ClientID', 'BillingAmount', 'PolicyName']].head()

Unnamed: 0,ClientID,BillingAmount,PolicyName
0,0 ea614ade-9cee-43ba-bb90-319f7079f8dc 1 ...,0 100.0 1 100.0 2 100.0 3 100....,0 Policy 3 1 Policy 1 2 Policy 2 3...
1,,0 1.5 1 3.0 2 4.5 3 6.0 4 ...,0 Policy_1 1 Policy_2 2 Policy_...
2,0 1 1 1 2 1 3 1 4 1 5 ...,0 5.0 1 5.0 2 5.0 3 5.0 4 ...,0 Organic 1 Organic 2 Org...
3,,0 5.30 1 5.68 2 5.68 3 2.67 4 ...,0 7.42 1 8.40 2 8.40 3 7.70 4 ...
4,0 ea614ade-9cee-43ba-bb90-319f7079f8dc 1 ...,0 100.0 1 100.0 2 100.0 3 100....,0 Policy 3 1 Policy 1 2 Policy 2 3...


In [41]:
# Query 3: Retrieve clients with last inspection date
glav_view[['ClientID', 'LastInspectionDate']].head()

Unnamed: 0,ClientID,LastInspectionDate
0,0 ea614ade-9cee-43ba-bb90-319f7079f8dc 1 ...,0 2025-01-25 17:03:24.432 1 2025-01-25 1...
1,,0 2022-06-01 1 2022-11-01 2 Na...
2,0 1 1 1 2 1 3 1 4 1 5 ...,0 2025-01-24 1 2025-01-24 2 2025-01-2...
3,,0 2024-01-15 1 2024-08-27 2 2024-08-2...
4,0 ea614ade-9cee-43ba-bb90-319f7079f8dc 1 ...,0 2025-01-25 17:03:24.432 1 2025-01-25 1...
