# Import Dependencies

In [48]:
import numpy as np
import pandas as pd

# Convert Multiple Excel sheets to CSV

In [49]:
# Export every sheet of the tracker workbook to its own CSV under ../data/.
excel_file = '../data/Moonshot Tracker Results - Auto.xlsx'

# sheet_name=None parses the whole workbook in one pass and returns a dict
# mapping each sheet name to its DataFrame.
all_sheets = pd.read_excel(excel_file, sheet_name=None)
sheets = all_sheets.keys()

# Reuse the frames that were already parsed above instead of re-opening and
# re-reading the workbook once per sheet.
for sheet_name, sheet in all_sheets.items():
    sheet.to_csv(f"../data/{sheet_name}.csv", index=False)

In [50]:
# Load the two exported sheets this notebook works with: Projects first,
# then Outputs.
projects_raw, outputs = (
    pd.read_csv("../data/Projects.csv"),
    pd.read_csv("../data/Outputs.csv"),
)

# Data Relationship

![image info](../data/data_relationship.jpeg)

In [51]:
# List the column labels of the Projects frame before it is narrowed down.
projects_raw.columns

Index(['Project ID', 'Project Title', 'Budget', 'Country', 'Country Code',
       'Link', 'Donors', 'VF or Non-VF', 'Output Count',
       'Direct Beneficiaries', 'GHG Emissions Reduction', 'description',
       'sdgs', 'solution', 'Gender Marker'],
      dtype='object')

In [52]:
# Columns of the Projects frame that are kept for the analysis.
project_target_column = [
    'Project ID',
    'Project Title',
    'Budget',
    'Donors',
    'VF or Non-VF',
    'Output Count',
    'Direct Beneficiaries',
    'description',
]

In [53]:
# Keep only the columns named in project_target_column.
# Note: this rebinds projects_raw, so the dropped columns are only available
# again after re-running the read_csv cell.
projects_raw = projects_raw[project_target_column]

In [54]:
# List the column labels of the Outputs frame before it is narrowed down.
outputs.columns

Index(['Project ID', 'Title', 'Link', 'Budget', 'Beneficiary Category',
       'Indicator', 'Baseline', 'Target', 'Notes', 'Donors',
       'Gender (% female)', 'VF or Non-VF', 'Tag', 'SEH Taxonomy',
       'RISE Taxonomy', 'Flagship', 'Technology', 'Output ID', 'Country Code',
       'Category', 'name 2', 'm49', 'continent-region', 'sub-region',
       'sids-region', 'un-member', 'undp-sids', 'un-region', 'Country Name',
       'Region', 'Economy', 'LDC', 'SIDS', 'LLDC', 'HDI', 'Status',
       'Direct Conversion Factor', 'Direct Beneficiaries', 'Output Category',
       'Beneficiary Category Pre', 'Description'],
      dtype='object')

In [55]:
# Columns of the Outputs frame that are kept for the analysis.
outputs_target_column = [
    'Project ID',
    'Title',
    'Beneficiary Category',
    'Gender (% female)',
    'continent-region',
    'Country Name',
    'Region',
    'Direct Beneficiaries',
    'Output Category',
    'Description',
]

In [56]:
# Keep only the columns named in outputs_target_column.
# Note: this rebinds outputs, so the dropped columns are only available again
# after re-running the read_csv cell.
outputs = outputs[outputs_target_column]

# Functions for Extracting Information
Our Goal: Generate text summaries of the UNDP portfolio for each country

Approach: 
1. Assign 'continent-region', 'Country Name', and 'Region' to the project dataframe.
2. Create a function to get the projects for a specified 'continent-region', 'Country Name', or 'Region'.
3. Create a function to get the outputs of each project.
4. Create a function to call the OpenAI API and generate a summary for the UNDP portfolio for each country using a suitable prompt.

## 1. Assign 'continent-region', 'Country Name', and 'Region' to the project dataframe.

In [57]:
# Take 'Project ID' together with the continent, country and region columns
# from the outputs frame; 'Project ID' is the key used to join onto projects.
output_country = outputs[['Project ID', 'continent-region', 'Country Name','Region']]

In [58]:
# Collapse repeated (project, continent, country, region) rows and renumber
# the surviving rows 0..n-1.
output_country = output_country.drop_duplicates(ignore_index=True)

In [61]:
# Attach continent / country / region to each project row. The join is an
# inner join on 'Project ID' (pandas' default), so projects with no matching
# row in output_country are not carried over.
projects = projects_raw.merge(output_country, how='inner', on='Project ID')

In [63]:
# Preview the first rows of the merged projects frame.
projects.head()

Unnamed: 0,Project ID,Project Title,Budget,Donors,VF or Non-VF,Output Count,Direct Beneficiaries,description,continent-region,Country Name,Region
0,117913,Résilience des communautés et des écosystèmes,3459600.0,['UNITED NATIONS DEVELOPMENT PRO'],Non-VF,3,3697.5,Il s’agit de la promotion du développement Loc...,Africa,Togo,RBA
1,134793,Accès aux énergies renouvelables en milieu rur...,1431552.0,['UNITED NATIONS DEVELOPMENT PRO'],Non-VF,3,31525.0,Accélération de l’accès aux énergies renouvela...,Africa,Togo,RBA
2,91204,Apoyo a la Modernización de La Gestión Ambiental,4202031.0,UNDP (TRAC 4000) CLIMATE PROMISE (28708) - NDC...,Non-VF,1,0.0,,Americas,Panama,RBLAC
3,133871,Beyond Recovery COVID19 Energy,473000.0,['UNITED NATIONS DEVELOPMENT PRO'],Non-VF,2,17500.0,"Acceso universal, a través de la implementació...",Americas,Panama,RBLAC
4,6613,Africa Mini-grids Program,1363947.0,,VF,3,4936.0,,Africa,Zambia,RBA
