In [None]:
  import pandas as pd
  from openpyxl import load_workbook
  import os
  from zipfile import BadZipFile  # Import the error class

  # USA SECTORS MAPPING
  category_mapping = {
      'Agriculture': [
          'Agriculture, forestry, fishing and hunting',
      ],
      'Industry': [
          'Mining, quarrying, and oil and gas extraction',
          'Utilities',
          'Construction',
          'Manufacturing',
      ],
      'Services': [
          'Trade',
          'Information',
          'Transportation and warehousing',
          'Finance, insurance, real estate, rental, and leasing',
          'Professional and business services',
          'Educational services, health care, and social assistance',
          'Arts, entertainment, recreation, accommodation, and food services',
          'Government and government enterprises',
          'Other services (except government and government enterprises)'
      ]
  }

  # Function to classify sectors
  def classify_sector(description):
      for category, keywords in category_mapping.items():
          for keyword in keywords:
              if keyword in description:
                  return category
      return None  # Return None if no category is matched

  # Path to the input directory
  input_directory = '/content/drive/MyDrive/Summer work/Sources for GDP Datasets'

  # Iterate through files in the input directory
  for filename in os.listdir(input_directory):
      if 'Usa' in filename and filename.endswith('.xlsx'):
          input_file_path = os.path.join(input_directory, filename)

          try:
              # Try to read the Excel file
              df = pd.read_excel(input_file_path, engine='openpyxl')

              # Ensure the DataFrame contains the expected columns
              if 'Description' in df.columns and 'GDP' in df.columns:
                  # Convert the GDP column to numeric, coerce errors to NaN
                  df['GDP'] = pd.to_numeric(df['GDP'], errors='coerce')

                  # Initialize a dictionary for aggregating GDP by category
                  aggregation = {'Agriculture': 0, 'Industry': 0, 'Services': 0}

                  # Process each row
                  for _, row in df.iterrows():
                      description = row['Description']
                      gdp = row['GDP']
                      category = classify_sector(description)

                      # If a category was found and GDP is not NaN, add the GDP to the respective category
                      if category and not pd.isna(gdp):
                          aggregation[category] += gdp

                  # Prepare the final DataFrame
                  final_df = pd.DataFrame(list(aggregation.items()), columns=['Category', 'Aggregated GDP'])

                  # Calculate the total GDP
                  total_gdp = final_df['Aggregated GDP'].sum()

                  # Create a DataFrame for the total row
                  total_row = pd.DataFrame({'Category': ['Total'], 'Aggregated GDP': [total_gdp]})

                  # Concatenate the total row to the final DataFrame
                  final_df = pd.concat([final_df, total_row], ignore_index=True)

                  # Calculate the proportion of total GDP for each category
                  final_df['Proportion of Total GDP'] = (final_df['Aggregated GDP'] / total_gdp) * 100

                  # Load the existing workbook to check for the existing sheet
                  workbook = load_workbook(input_file_path)

                  # Check if the sheet already exists
                  if 'Aggregated GDP' in workbook.sheetnames:
                      # Remove the existing sheet
                      del workbook['Aggregated GDP']

                  # Write the results to a new tab in the original Excel file
                  with pd.ExcelWriter(input_file_path, engine='openpyxl', mode='a') as writer:
                      final_df.to_excel(writer, sheet_name='Aggregated Data', index=False)

                  print(f"Aggregation complete for {filename} and saved to a new tab in the Excel file.")
              else:
                  print(f"Error: The required columns 'Description' and 'GDP' are not present in the file {filename}.")

          except BadZipFile:
              print(f"Skipped file {filename}: File is not a valid zip (Excel) file.")
          except Exception as e:
              print(f"Skipped file {filename}: An error occurred: {e}")


Aggregation complete for Elbert Usa.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Mason Usa.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Chicot Usa.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Marin Usa.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Amherst Usa.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Moffat Usa.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Hawaii Usa.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Iberville Usa.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Amite Usa.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Harrison Usa.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Montrose Usa.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Wright Usa.xlsx and saved to a new tab in the Excel file.

In [None]:
import pandas as pd
from openpyxl import load_workbook
import os
from zipfile import BadZipFile  # Import the error class

# BRAZIL SECTORS MAPPING
category_mapping = {
    'Agriculture': [
        'Agriculture, including support for agriculture and post-harvest activities',
        'Livestock, including support for livestock',
        'Forestry, fishing, and aquaculture'
    ],
    'Industry': [
        'Extractive industries',
        'Manufacturing industries',
        'Electricity, gas, water, sewage, waste management, and remediation activities',
        'Construction'
    ],
    'Services': [
        'Wholesale and retail trade; repair of motor vehicles and motorcycles',
        'Transportation, storage, and postal services',
        'Accommodation and food services',
        'Information and communication',
        'Financial and insurance activities and related services',
        'Real estate activities',
        'Professional, scientific, and technical activities; administrative and support services',
        'Public administration, defense, education, and public health and social security',
        'Private education and health services',
        'Arts, culture, sports, and recreation and other service activities',
        'Domestic services'
    ]
}

# Function to classify sectors
def classify_sector(description):
    for category, keywords in category_mapping.items():
        for keyword in keywords:
            if keyword in description:
                return category
    return None  # Return None if no category is matched

# Path to the input directory
input_directory = '/content/drive/MyDrive/Summer work/Sources for GDP Datasets'

# Iterate through files in the input directory
for filename in os.listdir(input_directory):
    # Check if "Bra" or "BRA" exists as a standalone term in the filename (case-insensitive)
    if 'bra' in filename.lower().split() and filename.endswith('.xlsx'):
        input_file_path = os.path.join(input_directory, filename)

        try:
            # Try to read the Excel file
            df = pd.read_excel(input_file_path, engine='openpyxl')

            # Ensure the DataFrame contains the expected columns
            if df.columns[0] == 'Description' and df.columns[1] == 'GDP':
                # Convert the GDP column to numeric, coerce errors to NaN
                df['GDP'] = pd.to_numeric(df['GDP'], errors='coerce')

                # Initialize a dictionary for aggregating GDP by category
                aggregation = {'Agriculture': 0, 'Industry': 0, 'Services': 0}

                # Process each row
                for _, row in df.iterrows():
                    description = row['Description']
                    gdp = row['GDP']
                    category = classify_sector(description)

                    # If a category was found and GDP is not NaN, add the GDP to the respective category
                    if category and not pd.isna(gdp):
                        aggregation[category] += gdp

                # Prepare the final DataFrame
                final_df = pd.DataFrame(list(aggregation.items()), columns=['Category', 'Aggregated GDP'])

                # Calculate the total GDP
                total_gdp = final_df['Aggregated GDP'].sum()

                # Create a DataFrame for the total row
                total_row = pd.DataFrame({'Category': ['Total'], 'Aggregated GDP': [total_gdp]})

                # Concatenate the total row to the final DataFrame
                final_df = pd.concat([final_df, total_row], ignore_index=True)

                # Calculate the proportion of total GDP for each category
                final_df['Proportion of Total GDP'] = (final_df['Aggregated GDP'] / total_gdp) * 100

                # Load the existing workbook to check for the existing sheet
                workbook = load_workbook(input_file_path)

                # Check if the sheet already exists
                if 'Aggregated GDP' in workbook.sheetnames:
                    # Remove the existing sheet
                    del workbook['Aggregated GDP']

                # Write the results to a new tab in the original Excel file
                with pd.ExcelWriter(input_file_path, engine='openpyxl', mode='a') as writer:
                    final_df.to_excel(writer, sheet_name='Aggregated Data', index=False)

                print(f"Aggregation complete for {filename} and saved to a new tab in the Excel file.")
            else:
                print(f"Error: The required columns 'Description' and 'GDP' are not present in the file {filename}.")

        except BadZipFile:
            print(f"Skipped file {filename}: File is not a valid zip (Excel) file.")
        except Exception as e:
            print(f"Skipped file {filename}: An error occurred: {e}")


In [None]:
import pandas as pd
from openpyxl import load_workbook
import os
from zipfile import BadZipFile  # Import the error class
import re  # Import the regex module

# BRAZIL SECTORS MAPPING
category_mapping = {
    'Agriculture': [
        'Agriculture, including support for agriculture and post-harvest activities',
        'Livestock, including support for livestock',
        'Forestry, fishing, and aquaculture'
    ],
    'Industry': [
        'Extractive industries',
        'Manufacturing industries',
        'Electricity, gas, water, sewage, waste management, and remediation activities',
        'Construction'
    ],
    'Services': [
        'Wholesale and retail trade; repair of motor vehicles and motorcycles',
        'Transportation, storage, and postal services',
        'Accommodation and food services',
        'Information and communication',
        'Financial and insurance activities and related services',
        'Real estate activities',
        'Professional, scientific, and technical activities; administrative and support services',
        'Public administration, defense, education, and public health and social security',
        'Private education and health services',
        'Arts, culture, sports, and recreation and other service activities',
        'Domestic services'
    ]
}

# Function to classify sectors
def classify_sector(description):
    for category, keywords in category_mapping.items():
        for keyword in keywords:
            if keyword in description:
                return category
    return None  # Return None if no category is matched

# Path to the input directory
input_directory = '/content/drive/MyDrive/Summer work/Sources for GDP Datasets'

# Iterate through files in the input directory
for filename in os.listdir(input_directory):
    # Use regex to match "Bra" or "BRA" as a standalone term in the filename
    if re.search(r'\b[Bb][Rr][Aa]\b', filename) and filename.endswith('.xlsx'):
        input_file_path = os.path.join(input_directory, filename)

        try:
            # Try to read the Excel file
            df = pd.read_excel(input_file_path, engine='openpyxl')

            # Ensure the DataFrame contains the expected columns
            if df.columns[0] == 'Sector' and df.columns[1] == 'Units':
                # Convert the GDP column to numeric, coerce errors to NaN
                df['Units'] = pd.to_numeric(df['Units'], errors='coerce')

                # Initialize a dictionary for aggregating GDP by category
                aggregation = {'Agriculture': 0, 'Industry': 0, 'Services': 0}

                # Process each row
                for _, row in df.iterrows():
                    description = row['Sector']
                    gdp = row['Units']
                    category = classify_sector(description)

                    # If a category was found and GDP is not NaN, add the GDP to the respective category
                    if category and not pd.isna(gdp):
                        aggregation[category] += gdp

                # Prepare the final DataFrame
                final_df = pd.DataFrame(list(aggregation.items()), columns=['Category', 'Aggregated GDP'])

                # Calculate the total GDP
                total_gdp = final_df['Aggregated GDP'].sum()

                # Create a DataFrame for the total row
                total_row = pd.DataFrame({'Category': ['Total'], 'Aggregated GDP': [total_gdp]})

                # Concatenate the total row to the final DataFrame
                final_df = pd.concat([final_df, total_row], ignore_index=True)

                # Calculate the proportion of total GDP for each category
                final_df['Proportion of Total GDP'] = (final_df['Aggregated GDP'] / total_gdp) * 100

                # Load the existing workbook to check for the existing sheet
                workbook = load_workbook(input_file_path)

                # Check if the sheet already exists
                if 'Aggregated GDP' in workbook.sheetnames:
                    # Remove the existing sheet
                    del workbook['Aggregated GDP']

                # Write the results to a new tab in the original Excel file
                with pd.ExcelWriter(input_file_path, engine='openpyxl', mode='a') as writer:
                    final_df.to_excel(writer, sheet_name='Aggregated Data', index=False)

                print(f"Aggregation complete for {filename} and saved to a new tab in the Excel file.")
            else:
                print(f"Error: The required columns 'Description' and 'GDP' are not present in the file {filename}.")

        except BadZipFile:
            print(f"Skipped file {filename}: File is not a valid zip (Excel) file.")
        except Exception as e:
            print(f"Skipped file {filename}: An error occurred: {e}")


Error: The required columns 'Description' and 'GDP' are not present in the file Baiao Bra.xlsx.
Error: The required columns 'Description' and 'GDP' are not present in the file Virginia Bra.xlsx.
Error: The required columns 'Description' and 'GDP' are not present in the file Seara Bra.xlsx.
Aggregation complete for Sonora Bra.xlsx and saved to a new tab in the Excel file.
Error: The required columns 'Description' and 'GDP' are not present in the file Rondon Bra.xlsx.
Error: The required columns 'Description' and 'GDP' are not present in the file Distrito Federal BRA.xlsx.
Error: The required columns 'Description' and 'GDP' are not present in the file Rio Do Oeste Bra.xlsx.
Error: The required columns 'Description' and 'GDP' are not present in the file Parnaiba Bra.xlsx.
Error: The required columns 'Description' and 'GDP' are not present in the file Minas Gerais BRA.xlsx.
Error: The required columns 'Description' and 'GDP' are not present in the file Parana Bra.xlsx.
Error: The required 

In [None]:
import pandas as pd
from openpyxl import load_workbook
import os
from zipfile import BadZipFile
import re

# Updated NORWAY SECTORS MAPPING with new sectors added
category_mapping = {
    'Agriculture': [
        'Agriculture and forestry',
        'Fishing and aquaculture',
        'Agriculture, including support for agriculture and post-harvest activities',
        'Livestock, including support for livestock',
        'Forestry, fishing, and aquaculture'
    ],
    'Industry': [
        'Service activities incidental to oil and gas',
        'Manufacturing',
        'Food products, beverages and tobacco',
        'Textiles, wearing apparel, leather',
        'Wood, wood products and paper products',
        'Printing and reproduction of recorded media',
        'Refined petroleum, chemical and pharmaceutical products',
        'Rubber, plastic and mineral products',
        'Basic metals',
        'Machinery and other equipment n.e.c',
        'Building of ships, oil platforms and moduls and other transport equipment',
        'Water supply, sewerage, waste',
        'Construction',
        'Extractive industries',
        'Mining and quarrying',
        'Oil and gas extraction including services',
        'Oil and gas extraction',
        'Electricity, gas and steam'
    ],
    'Services': [
        'Wholesale and retail trade, repair of motor vehicles',
        'Transport via pipelines',
        'Ocean transport',
        'Transport activities excl. ocean transport',
        'Postal and courier activities',
        'Accommodation and food service activities',
        'Information and communication',
        'Financial and insurance activities',
        'Real estate activities',
        'Imputed rents of owner-occupied dwellings',
        'Professional, scientific and and technical activities',
        'Administrative and support service activities',
        'Public administration and defence',
        'Education',
        'Health and social work',
        'Arts, entertainment and other service activities',
        'Transportation, storage, and postal services',
        'Accommodation and food services',
        'Public administration, defense, education, and public health and social security',
        'Private education and health services',
        'Arts, culture, sports, and recreation and other service activities',
        'Domestic services'
    ]
}

# Function to classify sectors
def classify_sector(description):
    for category, keywords in category_mapping.items():
        for keyword in keywords:
            if keyword in description:
                return category
    return None

# Path to the input directory
input_directory = '/content/drive/MyDrive/Summer work/Sources for GDP Datasets'

# Iterate through files in the input directory
for filename in os.listdir(input_directory):
    # Use regex to match "Nor" or "NOR" as a standalone term in the filename
    if re.search(r'\b[Nn][Oo][Rr]\b', filename) and filename.endswith('.xlsx'):
        input_file_path = os.path.join(input_directory, filename)

        try:
            # Read the Excel file without headers, starting directly at row 2
            df = pd.read_excel(input_file_path, engine='openpyxl', header=None, skiprows=1)

            # Inspect the shape of the dataframe to understand its structure
            print(f"File: {filename} - Shape of DataFrame: {df.shape}")

            # Ensure we only select the first two columns
            df = df.iloc[:, [0, 1]]  # Select the first two columns (Sector and Units)

            # Rename columns to 'Sector' and 'Units' for easier access
            df.columns = ['Sector', 'Units']

            # Convert the Units column to numeric, coerce errors to NaN
            df['Units'] = pd.to_numeric(df['Units'], errors='coerce')

            # Initialize a dictionary for aggregating GDP by category
            aggregation = {'Agriculture': 0, 'Industry': 0, 'Services': 0}

            # Process each row
            for _, row in df.iterrows():
                description = row['Sector']
                gdp = row['Units']
                category = classify_sector(description)

                # If a category was found and GDP is not NaN, add the GDP to the respective category
                if category and not pd.isna(gdp):
                    aggregation[category] += gdp

            # Prepare the final DataFrame
            final_df = pd.DataFrame(list(aggregation.items()), columns=['Category', 'Aggregated GDP'])

            # Calculate the total GDP
            total_gdp = final_df['Aggregated GDP'].sum()

            # Create a DataFrame for the total row
            total_row = pd.DataFrame({'Category': ['Total'], 'Aggregated GDP': [total_gdp]})

            # Concatenate the total row to the final DataFrame
            final_df = pd.concat([final_df, total_row], ignore_index=True)

            # Calculate the proportion of total GDP for each category
            final_df['Proportion of Total GDP'] = (final_df['Aggregated GDP'] / total_gdp) * 100

            # Load the existing workbook to check for the existing sheet
            workbook = load_workbook(input_file_path)

            # Check if the sheet already exists
            if 'Aggregated GDP' in workbook.sheetnames:
                # Remove the existing sheet
                del workbook['Aggregated GDP']

            # Write the results to a new tab in the original Excel file
            with pd.ExcelWriter(input_file_path, engine='openpyxl', mode='a') as writer:
                final_df.to_excel(writer, sheet_name='Aggregated Data', index=False)

            print(f"Aggregation complete for {filename} and saved to a new tab in the Excel file.")
        except BadZipFile:
            print(f"Skipped file {filename}: File is not a valid zip (Excel) file.")
        except Exception as e:
            print(f"Skipped file {filename}: An error occurred: {e}")


File: Svalbard NOR.xlsx - Shape of DataFrame: (41, 19)
Aggregation complete for Svalbard NOR.xlsx and saved to a new tab in the Excel file.
File: Vestfold og Telemark NOR.xlsx - Shape of DataFrame: (33, 19)
Aggregation complete for Vestfold og Telemark NOR.xlsx and saved to a new tab in the Excel file.
File: Froland Nor Nor.xlsx - Shape of DataFrame: (41, 19)
Aggregation complete for Froland Nor Nor.xlsx and saved to a new tab in the Excel file.
File: Møre og Romsdal NOR.xlsx - Shape of DataFrame: (41, 19)
Aggregation complete for Møre og Romsdal NOR.xlsx and saved to a new tab in the Excel file.
File: Iveland Nor Nor.xlsx - Shape of DataFrame: (41, 19)
Aggregation complete for Iveland Nor Nor.xlsx and saved to a new tab in the Excel file.
File: Oslo Nor Nor.xlsx - Shape of DataFrame: (36, 19)
Aggregation complete for Oslo Nor Nor.xlsx and saved to a new tab in the Excel file.
File: Agder NOR.xlsx - Shape of DataFrame: (38, 19)
Aggregation complete for Agder NOR.xlsx and saved to a new

In [2]:
#Germany

import pandas as pd
from openpyxl import load_workbook
import os
from zipfile import BadZipFile
import re

# Updated sector mappings with new categories for Agriculture, Industry, Services
category_mapping = {
    'Agriculture': [
        'Agriculture, forestry & fishing'
    ],
    'Industry': [
        'Industry',
        'Mining & quarrying',
        'Manufacturing',
        'Construction',
        'Electricity, water, waste'
    ],
    'Services': [
        'Trade, transportation, food',
        'Real estate',
        'Public administration',
        'Health & social work',
        'Administrative & support',
        'Education',
        'Information, communication, finance',
        'Other services',
        'Professional, scientific & technical'
    ]
}

# Function to classify sectors based on the updated mapping
def classify_sector(description):
    for category, keywords in category_mapping.items():
        for keyword in keywords:
            if keyword in description:
                return category
    return None

# Path to the input directory
input_directory = '/content/drive/MyDrive/Summer work/Sources for GDP Datasets'

# Iterate through files in the input directory
for filename in os.listdir(input_directory):
    # Use regex to match "NOR" or "DEU" in the filename
    if re.search(r'\b(DEU|Deu)\b', filename) and filename.endswith('.xlsx'):
        input_file_path = os.path.join(input_directory, filename)

        try:
            # Read the Excel file without headers, starting directly at row 2
            df = pd.read_excel(input_file_path, engine='openpyxl', header=None, skiprows=1)

            # Print the shape of the dataframe for debugging purposes
            print(f"File: {filename} - Shape of DataFrame: {df.shape}")

            # Ensure we only select the first two columns
            df = df.iloc[:, [0, 1]]  # Select the first two columns (Sector and Units)

            # Rename columns to 'Sector' and 'Units' for easier access
            df.columns = ['Sector', 'Units']

            # Convert the Units column to numeric, coerce errors to NaN
            df['Units'] = pd.to_numeric(df['Units'], errors='coerce')

            # Initialize a dictionary for aggregating GDP by category
            aggregation = {'Agriculture': 0, 'Industry': 0, 'Services': 0}

            # Process each row
            for _, row in df.iterrows():
                description = row['Sector']
                gdp = row['Units']
                category = classify_sector(description)

                # If a category was found and GDP is not NaN, add the GDP to the respective category
                if category and not pd.isna(gdp):
                    aggregation[category] += gdp

            # Prepare the final DataFrame
            final_df = pd.DataFrame(list(aggregation.items()), columns=['Category', 'Aggregated GDP'])

            # Calculate the total GDP
            total_gdp = final_df['Aggregated GDP'].sum()

            # Create a DataFrame for the total row
            total_row = pd.DataFrame({'Category': ['Total'], 'Aggregated GDP': [total_gdp]})

            # Concatenate the total row to the final DataFrame
            final_df = pd.concat([final_df, total_row], ignore_index=True)

            # Calculate the proportion of total GDP for each category
            final_df['Proportion of Total GDP'] = (final_df['Aggregated GDP'] / total_gdp) * 100

            # Load the existing workbook to check for the existing sheet
            workbook = load_workbook(input_file_path)

            # Check if the sheet already exists
            if 'Aggregated Data' in workbook.sheetnames:
                # Remove the existing sheet
                del workbook['Aggregated Data']

            # Write the results to a new tab in the original Excel file
            with pd.ExcelWriter(input_file_path, engine='openpyxl', mode='a') as writer:
                # Write the DataFrame starting from cell A2
                final_df.to_excel(writer, sheet_name='Aggregated Data', index=False, startrow=1)

            print(f"Aggregation complete for {filename} and saved to a new tab in the Excel file.")
        except BadZipFile:
            print(f"Skipped file {filename}: File is not a valid zip (Excel) file.")
        except Exception as e:
            print(f"Skipped file {filename}: An error occurred: {e}")


File: Saarland Deu.xlsx - Shape of DataFrame: (8, 19)
Skipped file Saarland Deu.xlsx: An error occurred: Sheet 'Aggregated Data' already exists and if_sheet_exists is set to 'error'.
File: Hamburg Deu.xlsx - Shape of DataFrame: (18, 19)
Skipped file Hamburg Deu.xlsx: An error occurred: argument of type 'float' is not iterable
File: Brandenburg Deu.xlsx - Shape of DataFrame: (14, 19)
Aggregation complete for Brandenburg Deu.xlsx and saved to a new tab in the Excel file.
File: Sachsen Anhalt Deu.xlsx - Shape of DataFrame: (11, 19)
Skipped file Sachsen Anhalt Deu.xlsx: An error occurred: argument of type 'float' is not iterable
File: Saxony-Anhalt DEU.xlsx - Shape of DataFrame: (17, 18)
Skipped file Saxony-Anhalt DEU.xlsx: An error occurred: argument of type 'float' is not iterable
File: North Rhine-Westphalia DEU.xlsx - Shape of DataFrame: (16, 18)
Aggregation complete for North Rhine-Westphalia DEU.xlsx and saved to a new tab in the Excel file.
File: Berlin Slv DEU.xlsx - Shape of DataF

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Units'] = pd.to_numeric(df['Units'], errors='coerce')


File: Hessen DEU.xlsx - Shape of DataFrame: (21, 18)
Skipped file Hessen DEU.xlsx: An error occurred: argument of type 'float' is not iterable
File: Saxony DEU.xlsx - Shape of DataFrame: (7, 18)
Aggregation complete for Saxony DEU.xlsx and saved to a new tab in the Excel file.
File: Lower Saxony DEU.xlsx - Shape of DataFrame: (16, 18)
Aggregation complete for Lower Saxony DEU.xlsx and saved to a new tab in the Excel file.
File: Schleswig Holstein Deu.xlsx - Shape of DataFrame: (11, 19)
Aggregation complete for Schleswig Holstein Deu.xlsx and saved to a new tab in the Excel file.
File: Mecklenburg-Western DEU.xlsx - Shape of DataFrame: (21, 18)
Skipped file Mecklenburg-Western DEU.xlsx: An error occurred: argument of type 'float' is not iterable
File: Mecklenburg Vorpommern Deu.xlsx - Shape of DataFrame: (12, 19)
Skipped file Mecklenburg Vorpommern Deu.xlsx: An error occurred: argument of type 'float' is not iterable


In [None]:
import pandas as pd
from openpyxl import load_workbook
import os
from zipfile import BadZipFile
import re

# Updated category mapping based on new sectors
category_mapping = {
    'Agriculture': [
        'Agriculture, Forestry, and Fishing'
    ],
    'Industry': [
        'Mining and Quarrying, Manufacturing, Electricity, Gas, Steam, Air Conditioning and Water Supply, Sewerage, Waste Management, and Remediation Activities',
        'Construction'
    ],
    'Services': [
        'Wholesale and Retail Trade, Repair of Motor Vehicles and Motorcycles, Transportation and Storage, Accommodation, and Food Service Activities',
        'Information and Communication',
        'Financial and Insurance Activities',
        'Real Estate Activities',
        'Professional, Scientific, and Technical Activities, Administrative, and Support Service Activities',
        'Public Administration and Defence, Compulsory Social Security, Education, Human Health, and Social Work Activities',
        'Arts, Entertainment, Recreation, Other Service Activities, Activities of Households as Employers, Undifferentiated Goods and Services Producing Activities of Households for Own Use, Activities of Extraterritorial Organisations and Bodies'
    ]
}

# Function to classify sectors
def classify_sector(description):
    for category, keywords in category_mapping.items():
        for keyword in keywords:
            if keyword in description:
                return category
    return None

# Path to the input directory
input_directory = '/content/drive/MyDrive/Summer work/Sources for GDP Datasets'

# Iterate through files in the input directory
for filename in os.listdir(input_directory):
    # Use regex to match "GRC" or "Grc" as a standalone term in the filename
    if re.search(r'\b[Gg][Rr][Cc]\b', filename) and filename.endswith('.xlsx'):
        input_file_path = os.path.join(input_directory, filename)

        try:
            # Read the Excel file without skipping any rows
            df = pd.read_excel(input_file_path, engine='openpyxl', header=None)

            # Inspect the shape of the dataframe to understand its structure
            print(f"File: {filename} - Shape of DataFrame: {df.shape}")

            # Ensure we only select the first two columns starting from row 1 (row 2 in the file)
            df = df.iloc[1:, [0, 1]]  # Select the first two columns (Sector and Units), skipping the first row

            # Rename columns to 'Sector' and 'Units' for easier access
            df.columns = ['Sector', 'Units']

            # Convert the Units column to numeric, coerce errors to NaN
            df['Units'] = pd.to_numeric(df['Units'], errors='coerce')

            # Initialize a dictionary for aggregating GDP by category
            aggregation = {'Agriculture': 0, 'Industry': 0, 'Services': 0}

            # Process each row
            for _, row in df.iterrows():
                description = row['Sector']
                gdp = row['Units']
                category = classify_sector(description)

                # If a category was found and GDP is not NaN, add the GDP to the respective category
                if category and not pd.isna(gdp):
                    aggregation[category] += gdp

            # Prepare the final DataFrame
            final_df = pd.DataFrame(list(aggregation.items()), columns=['Category', 'Aggregated GDP Data'])

            # Calculate the total GDP
            total_gdp = final_df['Aggregated GDP Data'].sum()

            # Create a DataFrame for the total row
            total_row = pd.DataFrame({'Category': ['Total'], 'Aggregated GDP Data': [total_gdp]})

            # Concatenate the total row to the final DataFrame
            final_df = pd.concat([final_df, total_row], ignore_index=True)

            # Calculate the proportion of total GDP for each category
            final_df['Proportion of Total GDP'] = (final_df['Aggregated GDP Data'] / total_gdp) * 100

            # Load the existing workbook to check for the existing sheet
            workbook = load_workbook(input_file_path)

            # Check if the sheet already exists
            if 'Aggregated GDP' in workbook.sheetnames:
                # Remove the existing sheet
                del workbook['Aggregated GDP']

            # Write the results to a new tab in the original Excel file
            with pd.ExcelWriter(input_file_path, engine='openpyxl', mode='a') as writer:
                final_df.to_excel(writer, sheet_name='Aggregated GDP Data', index=False)

            print(f"Aggregation complete for {filename} and saved to a new tab in the Excel file.")
        except BadZipFile:
            print(f"Skipped file {filename}: File is not a valid zip (Excel) file.")
        except Exception as e:
            print(f"Skipped file {filename}: An error occurred: {e}")


File: Thessalias Grc.xlsx - Shape of DataFrame: (12, 19)
Aggregation complete for Thessalias Grc.xlsx and saved to a new tab in the Excel file.
File: Peloponnisoy Grc.xlsx - Shape of DataFrame: (12, 19)
Aggregation complete for Peloponnisoy Grc.xlsx and saved to a new tab in the Excel file.
File: Western Macedonia GRC.xlsx - Shape of DataFrame: (13, 19)
Aggregation complete for Western Macedonia GRC.xlsx and saved to a new tab in the Excel file.
File: Southern Aegean GRC.xlsx - Shape of DataFrame: (12, 19)
Aggregation complete for Southern Aegean GRC.xlsx and saved to a new tab in the Excel file.
File: Stereas Elladas Grc.xlsx - Shape of DataFrame: (12, 19)
Aggregation complete for Stereas Elladas Grc.xlsx and saved to a new tab in the Excel file.
File: Aegean Islands, Crete GRC.xlsx - Shape of DataFrame: (12, 19)
Aggregation complete for Aegean Islands, Crete GRC.xlsx and saved to a new tab in the Excel file.
File: Attica GRC.xlsx - Shape of DataFrame: (12, 19)
Aggregation complete fo

In [None]:
import pandas as pd
from openpyxl import load_workbook
import os
from zipfile import BadZipFile
import re

# CANADA SECTORS MAPPING
category_mapping = {
    'Agriculture': [
        'Agriculture, forestry, fishing and hunting'
    ],
    'Industry': [
        'Manufacturing',
        'Construction',
        'Utilities',
        'Mining, quarrying, and oil and gas extraction'
    ],
    'Services': [
        'Real estate and rental and leasing',
        'Public administration',
        'Health care and social assistance',
        'Retail trade',
        'Educational services',
        'Finance and insurance',
        'Transportation and warehousing',
        'Wholesale trade',
        'Professional, scientific and technical services',
        'Information and cultural industries',
        'Administrative and support, waste management and remediation services',
        'Other services (except public administration)',
        'Accommodation and food services',
        'Arts, entertainment and recreation',
        'Management of companies and enterprises'
    ]
}

# Function to classify sectors
def classify_sector(description):
    for category, keywords in category_mapping.items():
        for keyword in keywords:
            if keyword in description:
                return category
    return None

# Path to the input directory
input_directory = '/content/drive/MyDrive/Summer work/Sources for GDP Datasets'

# Iterate through files in the input directory
for filename in os.listdir(input_directory):
    if re.search(r'\bCAN\b', filename, re.IGNORECASE) and filename.endswith('.xlsx'):
        input_file_path = os.path.join(input_directory, filename)

        try:
            # Read the first two columns of the Excel file without headers
            df = pd.read_excel(input_file_path, engine='openpyxl', header=None, usecols=[0, 1])

            # Rename columns for clarity
            df.columns = ['Sector', 'GDP']

            # Convert the GDP column to numeric, coercing errors to NaN
            df['GDP'] = pd.to_numeric(df['GDP'], errors='coerce')

            # Initialize a dictionary for aggregating GDP by category
            aggregation = {'Agriculture': 0, 'Industry': 0, 'Services': 0}

            # Process each row
            for _, row in df.iterrows():
                description = row['Sector']
                gdp = row['GDP']
                category = classify_sector(description)

                # If a category was found and GDP is not NaN, add the GDP to the respective category
                if category and not pd.isna(gdp):
                    aggregation[category] += gdp

            # Prepare the final DataFrame
            final_df = pd.DataFrame(list(aggregation.items()), columns=['Category', 'Aggregated GDP'])

            # Calculate the total GDP
            total_gdp = final_df['Aggregated GDP'].sum()

            # Create a DataFrame for the total row
            total_row = pd.DataFrame({'Category': ['Total'], 'Aggregated GDP': [total_gdp]})

            # Concatenate the total row to the final DataFrame
            final_df = pd.concat([final_df, total_row], ignore_index=True)

            # Calculate the proportion of total GDP for each category
            final_df['Proportion of Total GDP'] = (final_df['Aggregated GDP'] / total_gdp) * 100

            # Load the existing workbook to check for the existing sheet
            workbook = load_workbook(input_file_path)

            # Check if the sheet already exists
            if 'Aggregated Data' in workbook.sheetnames:
                del workbook['Aggregated Data']

            # Write the results to a new tab in the original Excel file
            with pd.ExcelWriter(input_file_path, engine='openpyxl', mode='a') as writer:
                final_df.to_excel(writer, sheet_name='Aggregated Data', index=False)

            print(f"Aggregation complete for {filename} and saved to a new tab in the Excel file.")

        except BadZipFile:
            print(f"Skipped file {filename}: File is not a valid zip (Excel) file.")
        except Exception as e:
            print(f"Skipped file {filename}: An error occurred: {e}")


Aggregation complete for Southeast Sud Est CAN.xlsx and saved to a new tab in the Excel file.
Skipped file Northwest Territories Terri CAN.xlsx: An error occurred: argument of type 'numpy.float64' is not iterable
Skipped file Can Tur.xlsx: An error occurred: Sheet 'Aggregated Data' already exists and if_sheet_exists is set to 'error'.
Skipped file Ontario CAN.xlsx: An error occurred: argument of type 'float' is not iterable
Aggregation complete for New Brunswick CAN.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Nova Scotia CAN.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Quebec CAN.xlsx and saved to a new tab in the Excel file.
Skipped file Yukon CAN.xlsx: An error occurred: argument of type 'float' is not iterable
Skipped file Alberta CAN.xlsx: An error occurred: argument of type 'numpy.float64' is not iterable
Aggregation complete for British Columbia CAN.xlsx and saved to a new tab in the Excel file.
Skipped file Sakatchewan CAN.x

In [None]:
import pandas as pd
from openpyxl import load_workbook
import os
from zipfile import BadZipFile  # Import the error class
import re  # Import the regex module

# CHILE SECTORS MAPPING
category_mapping = {
    'Agriculture': [
        'Agriculture/livestock and forestry',
        'Fishery'
    ],
    'Industry': [
        'Mining',
        'Manufacturing industry',
        'Electricity, gas, water and waste management',
        'Construction'
    ],
    'Services': [
        'Wholesale and retail trade',
        'Hotels and restaurants',
        'Transport, information and communications',
        'Financial and business services',
        'Dwelling services and real estate',
        'Personal services',
        'Public administration'
    ]
}

# Function to classify sectors
def classify_sector(description):
    for category, keywords in category_mapping.items():
        for keyword in keywords:
            if keyword in description:
                return category
    return None  # Return None if no category is matched

# Path to the input directory
input_directory = '/content/drive/MyDrive/Summer work/Sources for GDP Datasets'

# Iterate through files in the input directory
for filename in os.listdir(input_directory):
    # Use regex to match "Chl" or "CHL" as a standalone term in the filename
    if re.search(r'\b[Cc][Hh][Ll]\b', filename) and filename.endswith('.xlsx'):
        input_file_path = os.path.join(input_directory, filename)

        try:
            # Try to read the Excel file
            df = pd.read_excel(input_file_path, engine='openpyxl')

            # Ensure the DataFrame contains the expected columns
            if df.columns[0] == 'Sector' and df.columns[1] == 'GDP':
                # Convert the GDP column to numeric, coerce errors to NaN
                df['GDP'] = pd.to_numeric(df['GDP'], errors='coerce')

                # Initialize a dictionary for aggregating GDP by category
                aggregation = {'Agriculture': 0, 'Industry': 0, 'Services': 0}

                # Process each row
                for _, row in df.iterrows():
                    description = row['Sector']
                    gdp = row['GDP']
                    category = classify_sector(description)

                    # If a category was found and GDP is not NaN, add the GDP to the respective category
                    if category and not pd.isna(gdp):
                        aggregation[category] += gdp

                # Prepare the final DataFrame
                final_df = pd.DataFrame(list(aggregation.items()), columns=['Category', 'Aggregated GDP'])

                # Calculate the total GDP
                total_gdp = final_df['Aggregated GDP'].sum()

                # Create a DataFrame for the total row
                total_row = pd.DataFrame({'Category': ['Total'], 'Aggregated GDP': [total_gdp]})

                # Concatenate the total row to the final DataFrame
                final_df = pd.concat([final_df, total_row], ignore_index=True)

                # Calculate the proportion of total GDP for each category
                final_df['Proportion of Total GDP'] = (final_df['Aggregated GDP'] / total_gdp) * 100

                # Load the existing workbook to check for the existing sheet
                workbook = load_workbook(input_file_path)

                # Check if the sheet already exists
                if 'Aggregated GDP' in workbook.sheetnames:
                    # Remove the existing sheet
                    del workbook['Aggregated GDP']

                # Write the results to a new tab in the original Excel file
                with pd.ExcelWriter(input_file_path, engine='openpyxl', mode='a') as writer:
                    final_df.to_excel(writer, sheet_name='Aggregated Data', index=False)

                print(f"Aggregation complete for {filename} and saved to a new tab in the Excel file.")
            else:
                print(f"Error: The required columns 'Sector' and 'GDP' are not present in the file {filename}.")

        except BadZipFile:
            print(f"Skipped file {filename}: File is not a valid zip (Excel) file.")
        except Exception as e:
            print(f"Skipped file {filename}: An error occurred: {e}")


Aggregation complete for Provincia De Antofagasta Chl.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Provincia De Parinacota Chl.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Atacama CHL.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Metropolitan Region CHL.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Maule CHL.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Magallanes and the Chilean Antarctic CHL.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Libertador General Bernardo O Higgins CHL.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Coquimbo CHL.xlsx and saved to a new tab in the Excel file.


In [None]:
import pandas as pd
from openpyxl import load_workbook
import os
from zipfile import BadZipFile
import re

# MEXICO SECTORS MAPPING
category_mapping = {
    'Agriculture': [
        'Agriculture, Animal Husbandry and Exploitation, Forestry, Fishing and Hunting'
    ],
    'Industry': [
        'Mining',
        'Generation, Transmission and Distribution of Electricity, Water and Gas Pipeline to the Consumer',
        'Building',
        'Manufacturing Industries'
    ],
    'Services': [
        'Wholesale Trade',
        'Retail Trade',
        'Transportation and Storage',
        'Mass Media Information',
        'Financial and Insurance Services',
        'Real Estate and Rental Services of Furniture and Intangible Assets',
        'Services Professionals, Scientists and Technicals',
        'Support Services to Business and Waste Management and Waste and Remediation Services',
        'Educational Services',
        'Health and Social Assistance Services',
        'Leisure and Cultural Services, Sports, and other Recreational Services',
        'Temporary Accommodation and Food Preparation and Drinks',
        'Other Services except Government Activities'
    ]
}

# Function to classify sectors
def classify_sector(description):
    for category, keywords in category_mapping.items():
        for keyword in keywords:
            if keyword in description:
                return category
    return None  # Return None if no category is matched

# Path to the input directory
input_directory = '/content/drive/MyDrive/Summer work/Sources for GDP Datasets'

# Iterate through files in the input directory
for filename in os.listdir(input_directory):
    # Use regex to match "Mex" or "MEX" in the filename
    if re.search(r'\b[Mm][Ee][Xx]\b', filename) and filename.endswith('.xlsx'):
        input_file_path = os.path.join(input_directory, filename)

        try:
            # Read the Excel file without headers and select only the first two columns
            df = pd.read_excel(input_file_path, engine='openpyxl', header=None, usecols=[0, 1])

            # Rename columns for consistency
            df.columns = ['Sector', 'GDP']

            # Convert the GDP column to numeric, coerce errors to NaN
            df['GDP'] = pd.to_numeric(df['GDP'], errors='coerce')

            # Initialize a dictionary for aggregating GDP by category
            aggregation = {'Agriculture': 0, 'Industry': 0, 'Services': 0}

            # Process each row
            for _, row in df.iterrows():
                description = row['Sector']
                gdp = row['GDP']
                category = classify_sector(description)

                # If a category was found and GDP is not NaN, add the GDP to the respective category
                if category and not pd.isna(gdp):
                    aggregation[category] += gdp

            # Prepare the final DataFrame
            final_df = pd.DataFrame(list(aggregation.items()), columns=['Category', 'Aggregated GDP'])

            # Calculate the total GDP
            total_gdp = final_df['Aggregated GDP'].sum()

            # Create a DataFrame for the total row
            total_row = pd.DataFrame({'Category': ['Total'], 'Aggregated GDP': [total_gdp]})

            # Concatenate the total row to the final DataFrame
            final_df = pd.concat([final_df, total_row], ignore_index=True)

            # Calculate the proportion of total GDP for each category
            final_df['Proportion of Total GDP'] = (final_df['Aggregated GDP'] / total_gdp) * 100

            # Load the existing workbook to check for the existing sheet
            workbook = load_workbook(input_file_path)

            # Check if the sheet already exists
            if 'Aggregated GDP' in workbook.sheetnames:
                # Remove the existing sheet
                del workbook['Aggregated GDP']

            # Write the results to a new tab in the original Excel file
            with pd.ExcelWriter(input_file_path, engine='openpyxl', mode='a') as writer:
                final_df.to_excel(writer, sheet_name='Aggregated Data', index=False)

            print(f"Aggregation complete for {filename} and saved to a new tab in the Excel file.")
        except BadZipFile:
            print(f"Skipped file {filename}: File is not a valid zip (Excel) file.")
        except Exception as e:
            print(f"Skipped file {filename}: An error occurred: {e}")


Aggregation complete for Heroica Villa Tezoatlan De Segura Y Luna Cuna De La Independencia De Oaxaca Mex.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Ojuelos De Jalisco Mex.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Quintana Roo Mex.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Aguascalientes Mex.xlsx and saved to a new tab in the Excel file.
Skipped file Guanajuato Mex.xlsx: An error occurred: argument of type 'float' is not iterable
Aggregation complete for Chihuahua Mex.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Campeche Mex.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Queretaro Mex.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Tlaxcala Mex.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Zacatecas Mex.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Guerrero Mex.xlsx and save

In [None]:
import pandas as pd
from openpyxl import load_workbook
import os

# GBR SECTORS MAPPING
category_mapping = {
    'Agriculture': [
        'Agriculture, forestry and fishing'
    ],
    'Industry': [
        'Production sector',
    ],
    'Services': [
        'Services sector',
    ]
}

# Function to classify sectors
def classify_sector(description):
    for category, keywords in category_mapping.items():
        for keyword in keywords:
            if keyword in description:
                return category
    return None  # Return None if no category is matched

# Path to the input directory
input_directory = '/content/drive/MyDrive/Summer work/Sources for GDP Datasets'

# Iterate through files in the input directory
for filename in os.listdir(input_directory):
    if 'Gbr' in filename and filename.endswith('.xlsx'):  # Check for 'Gbr' in filename
        input_file_path = os.path.join(input_directory, filename)

        # Read the Excel file
        df = pd.read_excel(input_file_path)

        # Ensure the DataFrame contains the expected columns 'description' and '2021'
        if 'description' in df.columns and '2021' in df.columns:
            # Convert the '2021' column to numeric, coerce errors to NaN
            df['2021'] = pd.to_numeric(df['2021'], errors='coerce')

            # Initialize a dictionary for aggregating GDP by category
            aggregation = {'Agriculture': 0, 'Industry': 0, 'Services': 0}

            # Process each row
            for _, row in df.iterrows():
                description = row['description']
                gdp = row['2021']
                category = classify_sector(description)

                # If a category was found and GDP is not NaN, add the GDP to the respective category
                if category and not pd.isna(gdp):
                    aggregation[category] += gdp

            # Prepare the final DataFrame
            final_df = pd.DataFrame(list(aggregation.items()), columns=['description', '2021'])

            # Calculate the total GDP
            total_gdp = final_df['2021'].sum()

            # Create a DataFrame for the total row
            total_row = pd.DataFrame({'description': ['Total'], '2021': [total_gdp]})

            # Concatenate the total row to the final DataFrame
            final_df = pd.concat([final_df, total_row], ignore_index=True)

            # Calculate the proportion of total GDP for each category
            final_df['Proportion of Total GDP'] = (final_df['2021'] / total_gdp) * 100

            # Load the existing workbook to check for the existing sheets
            workbook = load_workbook(input_file_path)

            # Check if the "Aggregated GDP Final" sheet exists and remove it if found
            if 'Aggregated GDP Final' in workbook.sheetnames:
                del workbook['Aggregated GDP Final']

            # Write the results to a new tab in the original Excel file
            with pd.ExcelWriter(input_file_path, engine='openpyxl', mode='a') as writer:
                final_df.to_excel(writer, sheet_name='Aggregated GDP', index=False)

            print(f"Aggregation complete for {filename} and saved to a new tab in the Excel file.")
        else:
            print(f"Error: The required columns 'description' and '2021' are not present in the file {filename}.")



Aggregation complete for North East Lincolnshire Gbr.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Bath and North East Somerset Gbr.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Nottingham Gbr.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Leicester Gbr.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Plymouth Gbr.xlsx and saved to a new tab in the Excel file.
Aggregation complete for York Gbr.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Rutland Gbr.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Blackpool Gbr.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Warrington Gbr.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Blackburn With Darwen Gbr.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Peterborough Gbr.xlsx and saved to a new tab in the Excel file.
Aggregation 

In [None]:
import pandas as pd
from openpyxl import load_workbook
import os

# RUS SECTORS MAPPING
category_mapping = {
    'Agriculture': [
        'Agriculture, forestry and fishing'
    ],
    'Industry': [
        'Mining and quarrying',
        'Manufacturing',
        'Construction',
        'Electricity, gas, steam and air conditioning supply',
        'Water supply; sewerage, waste management and remediation activities'
    ],
    'Services': [
        'Transportation and storage',
        'Wholesale and retail trade; repair of motor vehicles and motorcycles',
        'Public administration and defence; compulsory social security',
        'Human health and social work activities',
        'Education',
        'Real estate activities',
        'Information and communication',
        'Administrative and support service activities',
        'Professional, scientific and technical activities',
        'Accommodation and food service activities',
        'Arts, entertainment and recreation',
        'Other service activities',
        'Financial and insurance activities',
        'Activities of households as employers'
    ]
}

# Function to classify sectors
def classify_sector(description):
    for category, keywords in category_mapping.items():
        for keyword in keywords:
            if keyword in description:
                return category
    return None  # Return None if no category is matched

# Path to the input directory
input_directory = '/content/drive/MyDrive/Summer work/Sources for GDP Datasets'

# Iterate through files in the input directory
for filename in os.listdir(input_directory):
    if 'Rus' in filename and filename.endswith('.xlsx'):  # Check for 'Rus' in filename
        input_file_path = os.path.join(input_directory, filename)

        # Read the Excel file without headers (since data has no headers)
        df = pd.read_excel(input_file_path, header=None)

        # Initialize a dictionary for aggregating GDP by category
        aggregation = {'Agriculture': 0, 'Industry': 0, 'Services': 0}

        # Process rows in pairs: odd rows are descriptions, even rows are GDP values
        for i in range(0, len(df) - 1, 2):  # Iterate over rows two at a time
            description = df.iloc[i, 0]  # Sector name (odd row)
            gdp = pd.to_numeric(df.iloc[i + 1, 0], errors='coerce')  # GDP value (even row)

            # Classify sector and aggregate GDP if classification and GDP value are valid
            category = classify_sector(description)
            if category and not pd.isna(gdp):
                aggregation[category] += gdp

        # Prepare the final DataFrame
        final_df = pd.DataFrame(list(aggregation.items()), columns=['Category', 'Aggregated GDP'])

        # Calculate the total GDP
        total_gdp = final_df['Aggregated GDP'].sum()

        # Create a DataFrame for the total row
        total_row = pd.DataFrame({'Category': ['Total'], 'Aggregated GDP': [total_gdp]})

        # Concatenate the total row to the final DataFrame
        final_df = pd.concat([final_df, total_row], ignore_index=True)

        # Calculate the proportion of total GDP for each category
        final_df['Proportion of Total GDP'] = (final_df['Aggregated GDP'] / total_gdp) * 100

        # Load the existing workbook to check for the existing sheet
        workbook = load_workbook(input_file_path)

        # Check if the sheet already exists
        if 'Aggregated GDP' in workbook.sheetnames:
            # Remove the existing sheet
            del workbook['Aggregated GDP']

        # Write the results to a new tab in the original Excel file
        with pd.ExcelWriter(input_file_path, engine='openpyxl', mode='a') as writer:
            final_df.to_excel(writer, sheet_name='Aggregated GDP', index=False)

        print(f"Aggregation complete for {filename} and saved to a new tab in the Excel file.")


Aggregation complete for Russell Usa.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Rush Usa.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Rusk Usa.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Vologda Rus.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Khabarovsk Rus.xlsx and saved to a new tab in the Excel file.
Aggregation complete for 55107_Rusk_WI_USA.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Gorodskoi Okrug Belgorod Rus.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Gorodskoi Okrug Kirov Rus.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Kursk Rus.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Gorodskoi Okrug Irkutsk Rus.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Gorodskoi Okrug Kaluga Rus.xlsx and saved to a new tab in the Excel file.
Aggregation c

In [None]:
import pandas as pd
from openpyxl import load_workbook
import os
import re  # Import the regular expressions module

# GBR SECTORS MAPPING
category_mapping = {
    'Agriculture': [
        'Agriculture, forestry and fishing'
    ],
    'Industry': [
        'Mining and quarrying',
        'Manufacturing',
        'Electricity, gas, steam and air conditioning supply',
        'Water supply, sewerage, waste management and remediation activities',
        'Construction'
    ],
    'Services': [
        'Wholesale and retail trade services, repair of vehicles and motorcycles',
        'Transportation and storage',
        'Accommodation and food service activities',
        'Information and communication',
        'Financial and insurance activities',
        'Real estate activities',
        'Professional, scientific and technical activities',
        'Administrative and support service activities',
        'Public administration and defence',
        'Compulsory social security',
        'Education',
        'Human health and social work activities',
        'Arts, entertainment and recreation',
        'Repair of household goods and other services'
    ]
}

# Function to classify sectors
def classify_sector(description):
    # Ensure description is a string; if NaN, return an empty string
    description = str(description) if pd.notna(description) else ''

    for category, keywords in category_mapping.items():
        for keyword in keywords:
            if keyword in description:
                return category
    return None  # Return None if no category is matched

# Path to the input directory
input_directory = '/content/drive/MyDrive/Summer work/Sources for GDP Datasets'

# Iterate through files in the input directory
for filename in os.listdir(input_directory):
    # Check for 'Ita' or 'ITA' as standalone words using regex
    if re.search(r'\b[Ii]ta\b', filename) and filename.endswith('.xlsx'):
        input_file_path = os.path.join(input_directory, filename)

        # Read the Excel file without headers
        df = pd.read_excel(input_file_path, header=None)

        # Ensure the DataFrame has at least 2 columns
        if df.shape[1] >= 2:
            # Select only the first two columns for processing
            df = df.iloc[:, :2]  # Keep only the first two columns
            df.columns = ['description', '2021']  # Rename columns for clarity

            # Convert the '2021' column to numeric, coercing errors to NaN
            df['2021'] = pd.to_numeric(df['2021'], errors='coerce')

            # Initialize a dictionary for aggregating GDP by category
            aggregation = {'Agriculture': 0, 'Industry': 0, 'Services': 0}

            # Process each row
            for _, row in df.iterrows():
                description = row['description']
                gdp = row['2021']
                category = classify_sector(description)

                # If a category was found and GDP is not NaN, add the GDP to the respective category
                if category and not pd.isna(gdp):
                    aggregation[category] += gdp

            # Prepare the final DataFrame
            final_df = pd.DataFrame(list(aggregation.items()), columns=['Category', 'Aggregated GDP'])

            # Calculate the total GDP
            total_gdp = final_df['Aggregated GDP'].sum()

            # Create a DataFrame for the total row
            total_row = pd.DataFrame({'Category': ['Total'], 'Aggregated GDP': [total_gdp]})

            # Concatenate the total row to the final DataFrame
            final_df = pd.concat([final_df, total_row], ignore_index=True)

            # Calculate the proportion of total GDP for each category
            final_df['Proportion of Total GDP'] = (final_df['Aggregated GDP'] / total_gdp) * 100

            # Load the existing workbook to check for the existing sheet
            workbook = load_workbook(input_file_path)

            # Check if the sheet already exists
            if 'Aggregated GDP' in workbook.sheetnames:
                # Remove the existing sheet
                del workbook['Aggregated GDP']

            # Write the results to a new tab in the original Excel file
            with pd.ExcelWriter(input_file_path, engine='openpyxl', mode='a') as writer:
                final_df.to_excel(writer, sheet_name='Aggregated GDP ITA', index=False)

            print(f"Aggregation complete for {filename} and saved to a new tab in the Excel file.")
        else:
            print(f"Error: The file {filename} does not have enough columns.")

Aggregation complete for Piemonte Ita.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Veneto Ita.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Abruzzo Ita.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Basilicata Ita.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Friuli Venezia Giulia Ita.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Emilia Romagna Ita.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Calabria Ita.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Campania Ita.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Puglia Ita.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Valle D'Aosta Ita.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Umbria Ita.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Lazio Ita.xlsx and sa

In [None]:
import pandas as pd
from openpyxl import load_workbook
import os
from zipfile import BadZipFile  # Import the error class

# IDN SECTORS MAPPING
category_mapping = {
    'Agriculture': [
        '1. Agriculture, Forestry and Fishery'
    ],
    'Industry': [
        '2. Mining and Quarrying',
        '3. Manufacturing',
        '4. Electricity and Gas Supply',
        '5. Water Supply, Sewerage, Waste Management and Remediation Activities',
        '6. Construction'
    ],
    'Services': [
        '7. Wholesale and Retail Trades, Repair of Motor Vehicles and Motorcycles',
        '8. Transport and Storage',
        '9. Accommodation and Food Service Activities',
        '10. Information and Communication',
        '11. Financial and Insurance Services',
        'a. Financial Intermediary Services',
        '12. Real Estate Activities',
        '13. Business Services',
        '14. Public Administration and Defence; Compulsory Social Security',
        '15. Education',
        '16. Human Health and Social Work Activities',
        '17. Other Services Activities'
    ]
}

# Function to classify sectors
def classify_sector(description):
    for category, keywords in category_mapping.items():
        for keyword in keywords:
            if keyword in description:
                return category
    return None  # Return None if no category is matched

# Path to the input directory
input_directory = '/content/drive/MyDrive/Summer work/Sources for GDP Datasets'

# Iterate through files in the input directory
for filename in os.listdir(input_directory):
    if 'Idn' in filename and filename.endswith('.xlsx'):  # Filter for IDN files
        input_file_path = os.path.join(input_directory, filename)

        try:
            # Read the Excel file without headers, using column positions (first column for sectors, second for values)
            df = pd.read_excel(input_file_path, engine='openpyxl', header=None)

            # Check if the DataFrame has at least 2 columns (and ignore any extra columns)
            if df.shape[1] >= 2:
                # Restrict the DataFrame to the first two columns
                df = df.iloc[:, :2]  # Select only the first two columns
                df.columns = ['Description', 'GDP']  # Rename the columns

                # Convert the GDP column to numeric, coerce errors to NaN
                df['GDP'] = pd.to_numeric(df['GDP'], errors='coerce')

                # Initialize a dictionary for aggregating GDP by category
                aggregation = {'Agriculture': 0, 'Industry': 0, 'Services': 0}

                # Process each row
                for _, row in df.iterrows():
                    description = row['Description']
                    gdp = row['GDP']
                    category = classify_sector(description)

                    # If a category was found and GDP is not NaN, add the GDP to the respective category
                    if category and not pd.isna(gdp):
                        aggregation[category] += gdp

                # Prepare the final DataFrame
                final_df = pd.DataFrame(list(aggregation.items()), columns=['Category', 'Aggregated GDP'])

                # Calculate the total GDP
                total_gdp = final_df['Aggregated GDP'].sum()

                # Create a DataFrame for the total row
                total_row = pd.DataFrame({'Category': ['Total'], 'Aggregated GDP': [total_gdp]})

                # Concatenate the total row to the final DataFrame
                final_df = pd.concat([final_df, total_row], ignore_index=True)

                # Calculate the proportion of total GDP for each category
                final_df['Proportion of Total GDP'] = (final_df['Aggregated GDP'] / total_gdp) * 100

                # Load the existing workbook to check for the existing sheet
                workbook = load_workbook(input_file_path)

                # Check if the sheet already exists
                if 'Aggregated GDP' in workbook.sheetnames:
                    # Remove the existing sheet
                    del workbook['Aggregated GDP']

                # Write the results to a new tab in the original Excel file
                with pd.ExcelWriter(input_file_path, engine='openpyxl', mode='a') as writer:
                    final_df.to_excel(writer, sheet_name='Aggregated Data', index=False)

                print(f"Aggregation complete for {filename} and saved to a new tab in the Excel file.")
            else:
                print(f"Error: The file {filename} does not have the required two columns.")

        except BadZipFile:
            print(f"Skipped file {filename}: File is not a valid zip (Excel) file.")
        except Exception as e:
            print(f"Skipped file {filename}: An error occurred: {e}")


Aggregation complete for Gorontalo Idn.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Bangka Idn.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Maluku Barat Daya Idn.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Kota Jambi Idn.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Buru Idn.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Lampung Barat Idn.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Bengkulu Utara Idn.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Aceh Jaya Idn.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Bangli Idn.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Kapuas Idn.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Bantaeng Idn.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Berau Idn.xlsx and saved t

In [None]:
import pandas as pd
from openpyxl import load_workbook
import os
from zipfile import BadZipFile  # Import the error class

# CHN SECTORS MAPPING
category_mapping = {
    'Agriculture': [
        'Processing of food from agricultural products'
    ],
    'Industry': [
        'Manufacture of special purpose machinery',
        'Manufacture of non-metallic mineral products',
        'Manufacture of communication equipment, computer and other electronic equipment',
        'Manufacture of chemical raw material and chemical products',
        'Manufacture and processing of non-ferrous metals',
        'Manufacture of general purpose machinery',
        'Manufacture of electrical machinery and equipment',
        'Production and supply of electric power and heat power',
        'Manufacture of metal products'
    ],
    'Services': [
        # Add services mappings here if needed, currently none of your sectors fall under services
    ]
}

# Additional keywords to classify as 'Industry'
industry_keywords = ['Manufacture', 'Processing', 'Production', 'Smelting']

# Function to classify sectors
def classify_sector(description):
    description_lower = description.lower()  # Convert description to lowercase for case-insensitive matching

    # Handle the exception for "Processing of food from agricultural products" first
    if 'processing of food from agricultural products' in description_lower:
        return 'Agriculture'

    # Check if the description contains any of the industry keywords
    for keyword in industry_keywords:
        if keyword.lower() in description_lower:
            return 'Industry'

    # Fall back to the defined category mapping
    for category, keywords in category_mapping.items():
        for keyword in keywords:
            if keyword.lower() in description_lower:
                return category
    return None  # Return None if no category is matched

# Path to the input directory
input_directory = '/content/drive/MyDrive/Summer work/Sources for GDP Datasets'

# Iterate through files in the input directory
for filename in os.listdir(input_directory):
    if 'CHN' in filename and filename.endswith('.xlsx'):  # Filter for CHN files
        input_file_path = os.path.join(input_directory, filename)

        try:
            # Read the Excel file without headers, using column positions (first column for sectors, second for values)
            df = pd.read_excel(input_file_path, engine='openpyxl', header=None)

            # Check if the DataFrame has at least 2 columns (and ignore any extra columns)
            if df.shape[1] >= 2:
                # Restrict the DataFrame to the first two columns
                df = df.iloc[:, :2]  # Select only the first two columns
                df.columns = ['Description', 'GDP']  # Rename the columns

                # Convert the GDP column to numeric, coerce errors to NaN
                df['GDP'] = pd.to_numeric(df['GDP'], errors='coerce')

                # Initialize a dictionary for aggregating GDP by category
                aggregation = {'Agriculture': 0, 'Industry': 0, 'Services': 0}

                # Process each row
                for _, row in df.iterrows():
                    description = row['Description']
                    gdp = row['GDP']
                    category = classify_sector(description)

                    # If a category was found and GDP is not NaN, add the GDP to the respective category
                    if category and not pd.isna(gdp):
                        aggregation[category] += gdp

                # Prepare the final DataFrame
                final_df = pd.DataFrame(list(aggregation.items()), columns=['Category', 'Aggregated GDP'])

                # Calculate the total GDP
                total_gdp = final_df['Aggregated GDP'].sum()

                # Create a DataFrame for the total row
                total_row = pd.DataFrame({'Category': ['Total'], 'Aggregated GDP': [total_gdp]})

                # Concatenate the total row to the final DataFrame
                final_df = pd.concat([final_df, total_row], ignore_index=True)

                # Calculate the proportion of total GDP for each category
                final_df['Proportion of Total GDP'] = (final_df['Aggregated GDP'] / total_gdp) * 100

                # Load the existing workbook to check for the existing sheet
                workbook = load_workbook(input_file_path)

                # Check if the sheet already exists
                if 'Aggregated GDP' in workbook.sheetnames:
                    # Remove the existing sheet
                    del workbook['Aggregated GDP']

                # Write the results to a new tab in the original Excel file
                with pd.ExcelWriter(input_file_path, engine='openpyxl', mode='a') as writer:
                    final_df.to_excel(writer, sheet_name='Aggregated Data Final', index=False)

                print(f"Aggregation complete for {filename} and saved to a new tab in the Excel file.")
            else:
                print(f"Error: The file {filename} does not have the required two columns.")

        except BadZipFile:
            print(f"Skipped file {filename}: File is not a valid zip (Excel) file.")
        except Exception as e:
            print(f"Skipped file {filename}: An error occurred: {e}")


Aggregation complete for Jiangxi CHN.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Gansu CHN.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Guizhou CHN.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Hebei CHN.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Guangdong CHN.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Hubei CHN.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Hunan CHN.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Jiangsu CHN.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Inner Mongolia CHN.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Shaanxi CHN.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Jilin CHN.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Tibet CHN.xlsx and saved to a new tab in the Excel f

In [None]:
import pandas as pd
from openpyxl import load_workbook
import os
from zipfile import BadZipFile  # Import the error class

# IND SECTOR MAPPINGS
category_mapping = {
    'Agriculture': [
        'Crops',
        'Livestock',
        'Forestry & logging',
        'Fishing and aquaculture'
    ],
    'Industry': [
        'Mining & quarrying',
        'Manufacturing',
        'Electricity, gas, water supply & other utility services',
        'Construction'
    ],
    'Services': [
        'Trade, repair, hotels and restaurants',
        'Trade & repair services',
        'Hotels & restaurants',
        'Transport, storage, communication & services related to broadcasting',
        'Railways',
        'Road transport',
        'Water transport',
        'Air transport',
        'Services incidental to transport',
        'Storage',
        'Communication & services related to broadcasting',
        'Financial services',
        'Real estate, ownership of dwelling & professional services',
        'Public administration & defence',
        'Other services'
    ]
}

# Function to classify sectors
def classify_sector(description):
    for category, keywords in category_mapping.items():
        for keyword in keywords:
            # Use exact match to avoid partial matches
            if keyword in description.split(', '):
                return category
    return None  # Return None if no category is matched

# Path to the input directory
input_directory = '/content/drive/MyDrive/Summer work/Sources for GDP Datasets'

# Iterate through files in the input directory
for filename in os.listdir(input_directory):
    if 'IND' in filename and filename.endswith('.xlsx'):
        input_file_path = os.path.join(input_directory, filename)

        try:
            # Try to read the Excel file
            df = pd.read_excel(input_file_path, engine='openpyxl', header=None)

            # Extract sector names from column 2 starting from row 5
            sector_names = df.iloc[4:, 1].tolist()  # Column 2, row index 4 onwards
            # Extract values from column 3 starting from row 5
            values = pd.to_numeric(df.iloc[4:, 2], errors='coerce')  # Column 3, row index 4 onwards

            # Initialize a dictionary for aggregating GDP by category
            aggregation = {'Agriculture': 0, 'Industry': 0, 'Services': 0}

            # Process each row
            for description, gdp in zip(sector_names, values):
                category = classify_sector(description)

                # If a category was found and GDP is not NaN, add the GDP to the respective category
                if category and not pd.isna(gdp):
                    aggregation[category] += gdp

            # Prepare the final DataFrame
            final_df = pd.DataFrame(list(aggregation.items()), columns=['Category', 'Aggregated GDP'])

            # Calculate the total GDP
            total_gdp = final_df['Aggregated GDP'].sum()

            # Create a DataFrame for the total row
            total_row = pd.DataFrame({'Category': ['Total'], 'Aggregated GDP': [total_gdp]})

            # Concatenate the total row to the final DataFrame
            final_df = pd.concat([final_df, total_row], ignore_index=True)

            # Calculate the proportion of total GDP for each category
            final_df['Proportion of Total GDP'] = (final_df['Aggregated GDP'] / total_gdp) * 100

            # Load the existing workbook
            workbook = load_workbook(input_file_path)

            # Check if the sheet already exists and remove it
            if 'Aggregated Data' in workbook.sheetnames:
                del workbook['Aggregated Data']

            # Write the results to a new tab in the original Excel file
            with pd.ExcelWriter(input_file_path, engine='openpyxl', mode='a') as writer:
                final_df.to_excel(writer, sheet_name='Aggregated Data Updated', index=False)

            print(f"Aggregation complete for {filename} and saved to a new tab in the Excel file.")

        except BadZipFile:
            print(f"Skipped file {filename}: File is not a valid zip (Excel) file.")
        except Exception as e:
            print(f"Skipped file {filename}: An error occurred: {e}")


Skipped file Andhra Pradesh IND.xlsx: An error occurred: 'float' object has no attribute 'split'
Aggregation complete for Tamil Nadu IND.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Delhi IND.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Andaman & Nicobar Islands IND.xlsx and saved to a new tab in the Excel file.
Skipped file Meghalaya IND.xlsx: An error occurred: 'float' object has no attribute 'split'
Aggregation complete for Punjab IND.xlsx and saved to a new tab in the Excel file.
Skipped file Mizoram IND.xlsx: An error occurred: 'float' object has no attribute 'split'
Aggregation complete for Odisha IND.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Nagaland IND.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Rajasthan IND.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Puducherry IND.xlsx and saved to a new tab in the Excel file.
Aggregation complete for S

In [None]:
import pandas as pd
from openpyxl import load_workbook
import os
from zipfile import BadZipFile  # Import the error class

# TUR Sector Mappings
category_mapping = {
    'Agriculture': [
        'Agriculture, forestry and fishing',
    ],
    'Industry': [
        'Industry',
        'Manufacturing',
        'Construction',
    ],
    'Services': [
        'Services',
        'Information and communication',
        'Financial and insurance activities',
        'Real estate activities',
        'Professional, administrative and support service activities',
        'Public administration, education, human health and social work activities',
        'Other service activities'
    ]
}

# Function to classify sectors
def classify_sector(description):
    for category, keywords in category_mapping.items():
        for keyword in keywords:
            if keyword in description:
                return category
    return None  # Return None if no category is matched

# Path to the input directory
input_directory = '/content/drive/MyDrive/Summer work/Sources for GDP Datasets'

# Iterate through files in the input directory
for filename in os.listdir(input_directory):
    if 'Tur' in filename and filename.endswith('.xlsx'):  # Updated to filter for 'Tur'
        input_file_path = os.path.join(input_directory, filename)

        try:
            # Try to read the Excel file without headers
            df = pd.read_excel(input_file_path, header=None, engine='openpyxl')

            # Ensure the DataFrame has enough rows and columns
            if df.shape[0] > 1 and df.shape[1] > 1:
                # Extract descriptions and GDP values from the DataFrame
                descriptions = df.iloc[1:, 0]  # Sector names (Column 1)
                gdp_values = pd.to_numeric(df.iloc[1:, 1], errors='coerce')  # Associated values (Column 2)

                # Initialize a dictionary for aggregating GDP by category
                aggregation = {'Agriculture': 0, 'Industry': 0, 'Services': 0}

                # Process each row
                for description, gdp in zip(descriptions, gdp_values):
                    category = classify_sector(description)

                    # If a category was found and GDP is not NaN, add the GDP to the respective category
                    if category and not pd.isna(gdp):
                        aggregation[category] += gdp

                # Prepare the final DataFrame
                final_df = pd.DataFrame(list(aggregation.items()), columns=['Category', 'Aggregated GDP'])

                # Calculate the total GDP
                total_gdp = final_df['Aggregated GDP'].sum()

                # Create a DataFrame for the total row
                total_row = pd.DataFrame({'Category': ['Total'], 'Aggregated GDP': [total_gdp]})

                # Concatenate the total row to the final DataFrame
                final_df = pd.concat([final_df, total_row], ignore_index=True)

                # Calculate the proportion of total GDP for each category
                final_df['Proportion of Total GDP'] = (final_df['Aggregated GDP'] / total_gdp) * 100

                # Load the existing workbook to check for the existing sheet
                workbook = load_workbook(input_file_path)

                # Check if the sheet already exists
                if 'Aggregated GDP' in workbook.sheetnames:
                    # Remove the existing sheet
                    del workbook['Aggregated GDP']

                # Write the results to a new tab in the original Excel file
                with pd.ExcelWriter(input_file_path, engine='openpyxl', mode='a') as writer:
                    final_df.to_excel(writer, sheet_name='Aggregated Data', index=False)

                print(f"Aggregation complete for {filename} and saved to a new tab in the Excel file.")
            else:
                print(f"Error: The required structure is not present in the file {filename}.")

        except BadZipFile:
            print(f"Skipped file {filename}: File is not a valid zip (Excel) file.")
        except Exception as e:
            print(f"Skipped file {filename}: An error occurred: {e}")


Aggregation complete for Zonguldak Merkez Tur.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Kastamonu Merkez Tur.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Aydincik Tur.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Karesi Tur.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Hayrat Tur.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Manyas Tur.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Kirikhan Tur.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Antakya Tur.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Kocarli Tur.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Izmit Tur.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Abana Tur.xlsx and saved to a new tab in the Excel file.
Aggregation complete for Agli Tur.xlsx and saved to a new tab i