In [19]:
import time
import json
import base64
import datetime
import requests
import threading
import pandas as pd
from tqdm import tqdm
from pandas import json_normalize


class demographicsDataPrecisely:
    """Client that queries the Precisely Data Graph GraphQL endpoint by Precisely ID.

    For every row of an input DataFrame it issues three GraphQL queries
    (psyteGeodemographics, coastalRisk, floodRisk) and collects the raw JSON
    responses. A bearer token is fetched on construction and regenerated once
    the locally tracked 59-minute lifetime has elapsed.
    """

    # Seconds to wait on any single HTTP call before giving up. Without a
    # timeout, `requests` waits indefinitely and one stalled connection would
    # hang the whole batch.
    request_timeout = 60

    def __init__(self, client_id, client_secret):
        """Store the credentials and immediately obtain a first auth token.

        Args:
            client_id: Precisely API client ID.
            client_secret: Precisely API client secret.

        Raises:
            Exception: if the initial token request does not return HTTP 200.
        """
        self.client_id = client_id
        self.client_secret = client_secret
        self.auth_token = None
        self.token_expiry_time = None
        self.url = "https://api.cloud.precisely.com/data-graph/graphql"
        self.refresh_token()

    def generate_auth_token(self):
        """Request a new access token using HTTP Basic client credentials.

        Returns:
            The ``access_token`` value from the token endpoint's JSON reply.

        Raises:
            Exception: if the endpoint answers with a status other than 200.
        """
        url = "https://api.cloud.precisely.com/auth/v2/token"
        payload = 'grant_type=client_credentials&scope=default'
        auth_string = f"{self.client_id}:{self.client_secret}"
        encoded_auth = base64.b64encode(auth_string.encode()).decode()
        headers = {
            'Content-Type': 'application/x-www-form-urlencoded',
            'Authorization': f'Basic {encoded_auth}'
        }
        response = requests.post(url, headers=headers, data=payload, timeout=self.request_timeout)
        if response.status_code == 200:
            return response.json()['access_token']
        raise Exception(f"Token generation failed with status code {response.status_code}: {response.text}")

    def refresh_token(self):
        """Fetch a new token, reset the local expiry clock and rebuild the request headers."""
        print("Generating new auth token...")
        self.auth_token = self.generate_auth_token()
        # The token is treated as expired after 59 minutes of local time.
        self.token_expiry_time = datetime.datetime.now() + datetime.timedelta(minutes=59)
        self.headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {self.auth_token}"
        }
        print("New token generated and set.")

    def check_token_expiry(self):
        """Regenerate the token if the locally tracked expiry time has passed."""
        if datetime.datetime.now() >= self.token_expiry_time:
            print("Auth token expired. Regenerating...")
            time.sleep(10)  # Pause for 10 seconds before requesting the new token
            self.refresh_token()
            print("Token refreshed. Resuming operations...")

    def process_dataframe(self, df):
        """Query all three datasets for every row of ``df``.

        Args:
            df: DataFrame with a ``PBKEY`` column holding the Precisely ID.

        Returns:
            A list with one dict per row, holding ``precisely_id`` and the raw
            JSON responses under ``psyte_response``, ``coastal_risk_response``
            and ``flood_risk_response``.

        Raises:
            Exception: propagated from ``get_response`` when a query fails.
        """
        results = []
        for index, row in tqdm(df.iterrows(), total=len(df), desc="Processing Precisely IDs"):
            self.check_token_expiry()

            precisely_id = row['PBKEY']

            psyte_query = self.generate_psyteGeodemographics_query(precisely_id)
            psyte_response = self.get_response(psyte_query)

            coastal_risk_query = self.generate_coastalRisk_query(precisely_id)
            coastal_risk_response = self.get_response(coastal_risk_query)

            flood_risk_query = self.generate_floodRisk_query(precisely_id)
            flood_risk_response = self.get_response(flood_risk_query)

            results.append({
                "precisely_id": precisely_id,
                "psyte_response": psyte_response,
                "coastal_risk_response": coastal_risk_response,
                "flood_risk_response": flood_risk_response
            })
        return results

    @staticmethod
    def _graphql_string(value):
        """Return ``value`` as a quoted, escaped GraphQL string literal.

        JSON string escaping is used so that quotes, backslashes and control
        characters inside the value cannot terminate the literal early.
        """
        return json.dumps(str(value))

    def generate_psyteGeodemographics_query(self, precisely_id):
        """Build the GraphQL query text for the psyteGeodemographics dataset of one Precisely ID."""
        quoted_id = self._graphql_string(precisely_id)
        return f"""
        query addressByPreciselyID {{
          getById(id: {quoted_id}, queryType: PRECISELY_ID) {{
            addresses {{
              data {{
                psyteGeodemographics {{
                  data {{
                    PSYTECategoryCode
                    PSYTEGroupCode
                    PSYTESegmentCode {{
                      description
                    }}
                    censusBlock
                    censusBlockGroup
                    censusBlockPopulation
                    censusBlockHouseholds
                    householdIncomeVariable {{
                      value
                      description
                    }}
                    propertyValueVariable {{
                      value
                      description
                    }}
                    propertyTenureVariable {{
                      value
                      description
                    }}
                    propertyTypeVariable {{
                      value
                      description
                    }}
                    urbanRuralVariable {{
                      value
                      description
                    }}
                  }}
                }}
              }}
            }}
          }}
        }}
        """

    def generate_coastalRisk_query(self, precisely_id):
        """Build the GraphQL query text for the coastalRisk dataset of one Precisely ID."""
        quoted_id = self._graphql_string(precisely_id)
        return f"""
        query coastalRisk {{
            getById(id: {quoted_id}, queryType: PRECISELY_ID) {{
                addresses {{
                    data {{
                        coastalRisk {{
                            data {{
                                preciselyID
                                waterbodyName
                                nearestWaterbodyCounty
                                nearestWaterbodyState
                                nearestWaterbodyType {{
                                    value
                                    description
                                }}
                                nearestWaterbodyAdjacentName
                                nearestWaterbodyAdjacentType
                                distanceToNearestCoastFeet
                                windpoolDescription
                            }}
                        }}
                    }}
                }}
            }}
        }}
        """

    def generate_floodRisk_query(self, precisely_id):
        """Build the GraphQL query text for the floodRisk dataset of one Precisely ID."""
        quoted_id = self._graphql_string(precisely_id)
        return f"""
        query floodRisk {{
          getById(id: {quoted_id}, queryType: PRECISELY_ID) {{
            addresses {{
              data {{
                floodRisk {{
                  data {{
                    preciselyID
                    floodID
                    femaMapPanelIdentifier
                    floodZoneMapType
                    stateFIPS
                    floodZoneBaseFloodElevationFeet
                    floodZone
                    additionalInformation
                    baseFloodElevationFeet
                    communityNumber
                    communityStatus
                    mapEffectiveDate
                    letterOfMapRevisionDate
                    letterOfMapRevisionCaseNumber
                    floodHazardBoundaryMapInitialDate
                    floodInsuranceRateMapInitialDate
                    addressLocationElevationFeet
                    year100FloodZoneDistanceFeet
                    year500FloodZoneDistanceFeet
                    elevationProfileToClosestWaterbodyFeet
                    distanceToNearestWaterbodyFeet
                    nameOfNearestWaterbody
                  }}
                }}
              }}
            }}
          }}
        }}
        """

    def get_response(self, query):
        """POST a GraphQL query and return the decoded JSON response.

        If the server answers 401 the token is regenerated and the request is
        sent one more time, so a token rejected earlier than the local expiry
        estimate does not abort the batch.

        Args:
            query: GraphQL query text.

        Returns:
            The parsed JSON body of an HTTP 200 response.

        Raises:
            Exception: if the final response status is not 200.
        """
        payload = {"query": query}
        response = requests.post(self.url, json=payload, headers=self.headers, timeout=self.request_timeout)

        if response.status_code == 401:
            # refresh_token() rebuilds self.headers with the new bearer token.
            self.refresh_token()
            response = requests.post(self.url, json=payload, headers=self.headers, timeout=self.request_timeout)

        if response.status_code == 200:
            return response.json()
        raise Exception(f"Query failed with status code {response.status_code}: {response.text}")
        

class dataProcessorForDemographics:
    """Flatten the raw per-ID API responses into one combined DataFrame.

    ``results`` is a list of dicts with the keys ``precisely_id``,
    ``psyte_response``, ``coastal_risk_response`` and ``flood_risk_response``.
    Missing or malformed response sections are tolerated: the affected columns
    are left empty for that ID instead of aborting the whole batch.
    """

    def __init__(self, results):
        """Keep the raw results; the combined frame is built lazily on first use."""
        self.results = results
        self.combined_df = None

    @staticmethod
    def _section_data(response, section):
        """Return ``response['data']['getById']['addresses']['data'][0][section]['data']`` or None.

        Any missing key, empty address list or null intermediate value yields
        None instead of raising.
        """
        node = response
        for key in ('data', 'getById', 'addresses', 'data', 0, section, 'data'):
            try:
                node = node[key]
            except (KeyError, IndexError, TypeError):
                return None
        return node

    def extract_data(self, result):
        """Pull the three dataset payloads out of one result dict.

        Returns:
            Tuple ``(precisely_id, psyte_data, coastal_data, flood_data)``; a
            payload is None when its response does not contain that section.

        Raises:
            KeyError: if ``result`` has no ``precisely_id`` key.
        """
        precisely_id = result['precisely_id']
        psyte_data = self._section_data(result.get('psyte_response'), 'psyteGeodemographics')
        coastal_data = self._section_data(result.get('coastal_risk_response'), 'coastalRisk')
        flood_data = self._section_data(result.get('flood_risk_response'), 'floodRisk')
        return precisely_id, psyte_data, coastal_data, flood_data

    def flatten_and_prefix(self, data, prefix):
        """Flatten nested JSON into columns named ``<prefix>_<path>``.

        Returns an empty DataFrame when ``data`` is None, because
        ``json_normalize`` does not accept None.
        """
        if data is None:
            return pd.DataFrame()
        flat_data = json_normalize(data)
        return flat_data.add_prefix(f'{prefix}_')

    def process_single_result(self, result):
        """Build the combined frame for one Precisely ID.

        The flattened psyte, coastal and flood frames are joined side by side.
        When none of them carries data, a single row holding only
        ``precisely_id`` is returned so the ID still appears in the output.
        """
        precisely_id, psyte, coastal, flood = self.extract_data(result)
        candidate_frames = (
            self.flatten_and_prefix(psyte, 'psyte'),
            self.flatten_and_prefix(coastal, 'coastal'),
            self.flatten_and_prefix(flood, 'flood'),
        )
        frames = [frame for frame in candidate_frames if not frame.empty]
        if frames:
            combined_row = pd.concat(frames, axis=1)
        else:
            # A scalar assigned to a 0-row frame creates no rows, so start
            # from a one-row frame to keep the ID.
            combined_row = pd.DataFrame(index=[0])
        combined_row['precisely_id'] = precisely_id

        return combined_row

    def create_combined_dataframe(self):
        """Concatenate all per-ID frames and move ``precisely_id`` to the first column."""
        combined_data = [self.process_single_result(result) for result in self.results]
        if not combined_data:
            # pd.concat raises on an empty list; produce an empty frame instead.
            self.combined_df = pd.DataFrame(columns=['precisely_id'])
            return
        self.combined_df = pd.concat(combined_data, ignore_index=True)
        cols = self.combined_df.columns.tolist()
        cols = ['precisely_id'] + [col for col in cols if col != 'precisely_id']
        self.combined_df = self.combined_df[cols]

    def get_dataframe(self):
        """Return the combined DataFrame, building it on first access."""
        if self.combined_df is None:
            self.create_combined_dataframe()
        return self.combined_df

    def save_to_csv(self, filename='combined_precisely_data.csv'):
        """Write the combined DataFrame to ``filename`` without the index column."""
        if self.combined_df is None:
            self.create_combined_dataframe()
        self.combined_df.to_csv(filename, index=False)
        print(f"Data saved to {filename}")

    def print_info(self):
        """Print ``DataFrame.info()`` and the list of column names."""
        if self.combined_df is None:
            self.create_combined_dataframe()
        print("\nDataframe Info:")
        self.combined_df.info()
        print("\nColumn Names:")
        print(self.combined_df.columns.tolist())


class propertyDataPrecisely:
    """Client that enriches address rows with property, parcel and building data.

    Addresses are looked up through the Precisely Data Graph GraphQL endpoint
    (``getByAddress``). A daemon thread started by the constructor replaces
    the auth token shortly before the locally tracked 59-minute lifetime ends.
    """

    # Seconds to wait on any single HTTP call before giving up. Without a
    # timeout, `requests` waits indefinitely on a stalled connection.
    request_timeout = 60

    def __init__(self, client_id, client_secret, sample_percentage=100):
        """
        Initialize the API class with client credentials and sample percentage.

        Fetches a first token and starts the background refresh thread.

        Raises:
            Exception: if the initial token request does not return HTTP 200.
        """
        self.client_id = client_id
        self.client_secret = client_secret
        self.auth_token = self.get_new_token()
        self.token_expiry_time = time.time() + (59 * 60)
        self.url = "https://api.cloud.precisely.com/data-graph/graphql/"
        self.sample_percentage = sample_percentage

        self.auto_refresh_token()

    def get_new_token(self):
        """
        Retrieve a new authentication token from Precisely API.

        Returns:
            The ``access_token`` value from the token endpoint's JSON reply.

        Raises:
            Exception: if the endpoint answers with a status other than 200.
        """
        url = "https://api.cloud.precisely.com/auth/v2/token"
        payload = 'grant_type=client_credentials&scope=default'
        auth_string = f"{self.client_id}:{self.client_secret}"
        encoded_auth = base64.b64encode(auth_string.encode()).decode()

        headers = {
            'Content-Type': 'application/x-www-form-urlencoded',
            'Authorization': f'Basic {encoded_auth}'
        }

        response = requests.post(url, headers=headers, data=payload, timeout=self.request_timeout)
        if response.status_code == 200:
            token_data = response.json()
            print("New auth token retrieved.")
            return token_data['access_token']
        else:
            raise Exception(f"Failed to retrieve token: {response.status_code}, {response.text}")

    def refresh_token(self):
        """
        Replace the current token with a new one and reset the 59-minute expiry clock.
        """
        print("Refreshing authentication token...")
        self.auth_token = self.get_new_token()
        self.token_expiry_time = time.time() + (59 * 60)
        print("New token has been released.")

    def auto_refresh_token(self):
        """
        Start a daemon thread that refreshes the token when under a minute of its lifetime remains.

        The loop wakes every 60 seconds. A failed refresh is reported and
        retried on the next wake-up instead of terminating the thread.
        """
        def refresh_loop():
            while True:
                time_remaining = self.token_expiry_time - time.time()
                if time_remaining < 60:
                    time.sleep(10)
                    try:
                        self.refresh_token()
                    except Exception as exc:
                        # An uncaught exception would end this thread silently
                        # and no further refreshes would happen.
                        print(f"Background token refresh failed: {exc}")
                time.sleep(60)

        threading.Thread(target=refresh_loop, daemon=True).start()

    def fetch_data(self, query):
        """
        Function to fetch data from API with error handling and retries.

        Makes up to three attempts, pausing two seconds after each failure.
        The Authorization header is rebuilt for every attempt so that a token
        refreshed after a 401 is actually used by the retry.

        Returns:
            The parsed JSON response, or None if all attempts fail.
        """
        for attempt in range(3):
            headers = {
                "Content-Type": "application/json",
                "Authorization": f"Bearer {self.auth_token}"
            }
            try:
                response = requests.post(
                    self.url, json={"query": query}, headers=headers, timeout=self.request_timeout
                )
            except requests.RequestException as exc:
                # Connection errors and timeouts count as a failed attempt.
                print(f"Request failed: {exc}")
                time.sleep(2)
                continue
            if response.status_code == 200:
                try:
                    return response.json()
                except ValueError as e:
                    print(f"Error parsing JSON: {e}")
            elif response.status_code == 401:
                print("Authentication expired. Refreshing token.")
                self.refresh_token()
            else:
                print(f"Error: {response.status_code}, {response.text}")
            time.sleep(2)
        return None

    @staticmethod
    def safe_get(data, *keys):
        """
        Safely retrieve nested keys or return None.

        Dict levels are looked up by key and list levels by integer index.
        A missing key, an out-of-range or non-integer list index, or a value
        that is neither dict nor list yields None rather than an exception.
        """
        for key in keys:
            if isinstance(data, dict):
                if key not in data:
                    return None
                data = data[key]
            elif isinstance(data, list):
                # Lists are addressed by position only; `key in data` would
                # test value membership, not index validity.
                if not isinstance(key, int) or not -len(data) <= key < len(data):
                    return None
                data = data[key]
            else:
                return None
        return data

    def get_data(self, query, *path):
        """
        Generic function to retrieve data using a GraphQL query.

        Returns the value found at ``path`` in the response, or None.
        """
        response = self.fetch_data(query)
        return self.safe_get(response, *path)

    def build_address(self, row):
        """
        Format the address consistently from the DataFrame row.

        Reads the columns ADD_NUMBER, STREETNAME, CITY, STATE and ZIPCODE.
        """
        return f"{row['ADD_NUMBER']} {row['STREETNAME']}, {row['CITY']}, {row['STATE']} {row['ZIPCODE']}"

    @staticmethod
    def _graphql_string(value):
        """Return ``value`` as a quoted, escaped GraphQL string literal.

        JSON string escaping is used so that quotes or backslashes inside an
        address cannot terminate the literal early.
        """
        return json.dumps(str(value))

    def enhance_data(self, df):
        """
        Enhance the DataFrame by fetching additional data from the Precisely API.

        Adds the twelve result columns to ``df`` in place (initialised to
        None), fills them row by row from three API lookups per address, and
        returns the same DataFrame. Lookups that return nothing leave the
        row's columns as None.
        """
        new_columns = [
            "LivingSquareFootage", "BedroomCount", "BathroomCount", "SaleAmount",
            "ParcelID", "ParcelArea", "Elevation", "Geometry",
            "BuildingID", "MaxElevation", "MinElevation", "BuildingArea"
        ]
        for col in new_columns:
            df[col] = None

        for index, row in tqdm(df.iterrows(), total=df.shape[0], desc="Enhancing Data"):
            address = self.build_address(row)

            property_data = self.get_property_data(address)
            if property_data:
                df.loc[index, "LivingSquareFootage"] = property_data.get("livingSquareFootage")
                df.loc[index, "BedroomCount"] = property_data.get("bedroomCount")
                df.loc[index, "BathroomCount"] = self.safe_get(property_data, "bathroomCount", "value")
                df.loc[index, "SaleAmount"] = property_data.get("saleAmount")

            parcel_data = self.get_parcel_data(address)
            if parcel_data:
                df.loc[index, "ParcelID"] = parcel_data.get("parcelID")
                df.loc[index, "ParcelArea"] = parcel_data.get("parcelArea")
                df.loc[index, "Elevation"] = parcel_data.get("elevation")
                df.loc[index, "Geometry"] = parcel_data.get("geometry")

            building_data = self.get_building_data(address)
            if building_data:
                df.loc[index, "BuildingID"] = building_data.get("buildingID")
                df.loc[index, "MaxElevation"] = building_data.get("maximumElevation")
                df.loc[index, "MinElevation"] = building_data.get("minimumElevation")
                df.loc[index, "BuildingArea"] = building_data.get("buildingArea")

        return df

    def get_property_data(self, address):
        """Return the first ``propertyAttributes`` record for ``address``, or None."""
        quoted_address = self._graphql_string(address)
        query = f"""
        query {{
            getByAddress(address: {quoted_address}) {{
                propertyAttributes {{
                    data {{
                        livingSquareFootage
                        bedroomCount
                        bathroomCount {{
                            value
                        }}
                        saleAmount
                    }}
                }}
            }}
        }}
        """
        return self.get_data(query, "data", "getByAddress", "propertyAttributes", "data", 0)

    def get_parcel_data(self, address):
        """Return the first ``parcels`` record for ``address``, or None."""
        quoted_address = self._graphql_string(address)
        query = f"""
        query {{
            getByAddress(address: {quoted_address}) {{
                parcels {{
                    data {{
                        parcelID
                        parcelArea
                        elevation
                        geometry
                    }}
                }}
            }}
        }}
        """
        return self.get_data(query, "data", "getByAddress", "parcels", "data", 0)

    def get_building_data(self, address):
        """Return the first ``buildings`` record for ``address``, or None."""
        quoted_address = self._graphql_string(address)
        query = f"""
        query {{
            getByAddress(address: {quoted_address}) {{
                buildings {{
                    data {{
                        buildingID
                        maximumElevation
                        minimumElevation
                        buildingArea
                    }}
                }}
            }}
        }}
        """
        return self.get_data(query, "data", "getByAddress", "buildings", "data", 0)

    def sample_data(self, df):
        """
        Sample a percentage of the DataFrame.

        The row count is ``len(df) * sample_percentage / 100`` rounded down,
        drawn with a fixed random seed (42).

        Raises:
            ValueError: if ``sample_percentage`` is outside 0-100.
        """
        if not 0 <= self.sample_percentage <= 100:
            raise ValueError(
                f"sample_percentage must be between 0 and 100, got {self.sample_percentage}"
            )
        # Multiply before dividing: int(100 * (29 / 100)) is 28 because of
        # float rounding, whereas int(100 * 29 / 100) is 29.
        sample_size = int(len(df) * self.sample_percentage / 100)
        return df.sample(n=sample_size, random_state=42)

### Extract data from the flood risk, coastal risk, and PSYTE Geodemographics datasets

In [None]:
if __name__ == "__main__":
    # Driver for the demographics / risk extraction: load the input rows,
    # query the API for every PBKEY, flatten the responses and save them.
    import getpass

    DATA_PATH = '../data/filtered_data.csv'
    # Row range of DATA_PATH to process (half-open, positional). The work was
    # split between teammates; set the pair to your own range:
    #   Rishika   : 0, 20000
    #   Vaishali  : 20000, 40000
    #   Mirudula  : 40000, 60000
    #   Manish    : 60000, 75000
    #   Govardhan : 75000, None
    # NOTE(review): the earlier commented-out slices read 20001:40000 and
    # 4000:60000, which skip row 20000 and overlap the first two ranges -
    # presumably typos; confirm the intended split.
    ROW_START, ROW_STOP = 0, None  # default: the whole file

    # Load the data first so a missing file fails before credentials are requested.
    df = pd.read_csv(DATA_PATH).iloc[ROW_START:ROW_STOP]

    client_id = input("Enter your client ID: ")
    # getpass keeps the secret out of the saved notebook output.
    client_secret = getpass.getpass("Enter your client secret: ")

    precisely_api = demographicsDataPrecisely(client_id, client_secret)
    results = precisely_api.process_dataframe(df)
    print("Processing complete. Results:")
    processor = dataProcessorForDemographics(results)
    combined_df = processor.get_dataframe()
    print("Combined Dataframe:")
    print(combined_df.head())
    processor.save_to_csv()
    processor.print_info()

### Extract data from the building, parcel, and property dataset APIs

In [None]:
if __name__ == "__main__":
    # Driver for the property / parcel / building enrichment: load the input
    # rows, sample a percentage of them, enrich each address via the API and
    # write the result to CSV.
    import getpass
    from pathlib import Path

    DATA_PATH = '../data/filtered_data.csv'
    # NOTE(review): the input is read from ../data but the output goes to
    # ./data - confirm that the two different directories are intended.
    OUTPUT_PATH = Path("./data/enriched_data.csv")
    # Row range of DATA_PATH to process (half-open, positional). The work was
    # split between teammates; set the pair to your own range:
    #   Rishika   : 0, 20000
    #   Vaishali  : 20000, 40000
    #   Mirudula  : 40000, 60000
    #   Manish    : 60000, 75000
    #   Govardhan : 75000, None
    # NOTE(review): the earlier commented-out slices read 20001:40000 and
    # 4000:60000, which skip row 20000 and overlap the first two ranges -
    # presumably typos; confirm the intended split.
    ROW_START, ROW_STOP = 0, None  # default: the whole file

    # Load the data first so a missing file fails before credentials are requested.
    df = pd.read_csv(DATA_PATH).iloc[ROW_START:ROW_STOP]

    client_id = input("Enter your client ID: ")
    # getpass keeps the secret out of the saved notebook output.
    client_secret = getpass.getpass("Enter your client secret: ")
    sample_percentage = int(input("Enter the percentage of data to sample (1-100): "))

    api = propertyDataPrecisely(client_id, client_secret, sample_percentage)
    sampled_df = api.sample_data(df)
    enhanced_df = api.enhance_data(sampled_df)

    # Create the output directory up front so the save cannot fail on a
    # missing folder after the long enrichment run.
    OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
    enhanced_df.to_csv(OUTPUT_PATH, index=False)
    print(f"Data enrichment completed and saved to {OUTPUT_PATH}.")