# In [5]:
import pandas as pd
import re

# Read the two input tables from CSV files in the working directory
nobel_data = pd.read_csv('nobel_prize_processed.csv')
country_data = pd.read_csv('gdp_by_country.csv')

# Coerce 'birth_country' to strings so the regex-based clean-up below can
# run on every row.
# NOTE(review): missing values may become the literal text 'nan' here - confirm
# that is acceptable for the unmatched-rows report.
nobel_data = nobel_data.astype({'birth_country': str})

# First parenthesised fragment of a name, e.g. "Prussia (Germany)" -> "Germany"
_PARENTHESISED_NAME = re.compile(r'\((.*?)\)')

# Manual corrections for common naming discrepancies (presumably chosen to
# match the spelling used by the country table this data is merged with).
_COUNTRY_ALIASES = {
    'United States of America': 'United States',
    'Scotland': 'United Kingdom',
    'Northern Ireland': 'United Kingdom',
    'Trinidad': 'Trinidad and Tobago',
    'Czech Republic': 'Czech Republic (Czechia)',
    'Guadeloupe Island': 'France',
    "People's Republic of China": 'China',
    # Add more custom mappings if necessary
}


# Function to standardize country names
def standardize_country_name(name):
    """Return a standardized form of a country name.

    If *name* contains parentheses, the text inside the first pair is used
    as the candidate name; otherwise the whole string is. The candidate is
    stripped of surrounding whitespace and then passed through the manual
    alias table, so a parenthesised name such as
    "Austria-Hungary (Czech Republic)" receives the same correction as the
    bare name "Czech Republic".

    Args:
        name: The raw country name. Non-string values (e.g. ``None`` or a
            float NaN) are returned unchanged.

    Returns:
        The standardized name, or *name* itself when it is not a string.
    """
    if not isinstance(name, str):
        # Nothing to parse; leave missing values for the caller to handle.
        return name

    match = _PARENTHESISED_NAME.search(name)
    candidate = (match.group(1) if match else name).strip()

    # Apply the aliases to the extracted name too, not only to bare names.
    return _COUNTRY_ALIASES.get(candidate, candidate)

# Apply the standardization function to the 'birth_country' column
nobel_data['birth_country'] = nobel_data['birth_country'].apply(standardize_country_name)

# Reshape the per-category prize columns into long form: one row per
# (year, birth_country, category) holding that category's value as 'count'.
nobel_data_long = pd.melt(
    nobel_data,
    id_vars=['year', 'birth_country'],
    value_vars=['Chemistry', 'Economics', 'Literature', 'Medicine', 'Peace', 'Physics'],
    var_name='category',
    value_name='count',
)

# Keep only the rows whose count is positive
nobel_data_long = nobel_data_long[nobel_data_long['count'] > 0]

# Collapse to one row per (year, birth_country). Each column is aggregated
# explicitly: a blanket .sum() would also "sum" the category strings, running
# the names together (e.g. 'ChemistryPhysics'). Counts are added up and the
# category names are kept readable as a comma-separated list. Column names
# and order (year, birth_country, category, count) are unchanged.
nobel_data_long = (
    nobel_data_long
    .groupby(['year', 'birth_country'])
    .agg(category=('category', ', '.join), count=('count', 'sum'))
    .reset_index()
)

# Inner-join the aggregated prize rows with the country table on country name;
# prize rows without a matching country are left out of the merged result.
merged_data = nobel_data_long.merge(
    country_data,
    how='inner',
    left_on='birth_country',
    right_on='country',
)

# Collect the prize rows whose birth country is absent from the country table
has_known_country = nobel_data_long['birth_country'].isin(country_data['country'])
unmatched_data = nobel_data_long[~has_known_country]

# Write both results to CSV without the index column:
# first the merged dataset ...
merged_data.to_csv('merged_data.csv', index=False)

# ... then the unmatched dataset
unmatched_data.to_csv('unmatched_data.csv', index=False)