In [2]:
import pandas as pd
from collections import Counter
import re

def preprocess_text(text):
    text = text.lower()  
    text = re.sub(r'[^\w\s]', '', text)  
    words = text.split() 
    return words

def get_word_frequencies(text):
    words = preprocess_text(text)
    word_counts = Counter(words)
    return word_counts


In [4]:
df_gdp = pd.read_csv('GDP by Country - Sheet1.csv')  
df_population = pd.read_csv('Population by Country - Sheet1.csv') 

In [5]:
df_gdp.info

<bound method DataFrame.info of        #              Country GDP\n(nominal, 2022)    GDP\n(abbrev.)  \
0      1        United States  $25,462,700,000,000  $25.463 trillion   
1      2                China  $17,963,200,000,000  $17.963 trillion   
2      3                Japan   $4,231,140,000,000   $4.231 trillion   
3      4              Germany   $4,072,190,000,000   $4.072 trillion   
4      5                India   $3,385,090,000,000   $3.385 trillion   
..   ...                  ...                  ...               ...   
172  173  Sao Tome & Principe         $546,680,342      $547 million   
173  174           Micronesia         $427,094,119      $427 million   
174  175     Marshall Islands         $279,667,900      $280 million   
175  176             Kiribati         $223,352,943      $223 million   
176  177               Tuvalu          $60,349,391       $60 million   

    GDP\ngrowth Population\n(2022) GDP\nper capita Share of\nWorld GDP  
0         2.06%        341,534

In [6]:
df_population.info


<bound method DataFrame.info of        # Country (or dependency) Population\n(2024) Yearly\nChange  \
0      1                   India      1,450,935,791         0.89 %   
1      2                   China      1,419,321,278        -0.23 %   
2      3           United States        345,426,571         0.57 %   
3      4               Indonesia        283,487,931         0.82 %   
4      5                Pakistan        251,269,164         1.52 %   
..   ...                     ...                ...            ...   
229  230              Montserrat              4,389        -0.70 %   
230  231        Falkland Islands              3,470        -0.20 %   
231  232                 Tokelau              2,506         4.55 %   
232  233                    Niue              1,819         0.11 %   
233  234                Holy See                496         0.00 %   

    Net\nChange Density\n(P/Km²) Land Area\n(Km²) Migrants\n(net)  \
0    12,866,195              488        2,973,190        -

In [12]:
df_gdp.columns

Index(['#', 'Country', 'GDP\n(nominal, 2022)', 'GDP\n(abbrev.)', 'GDP\ngrowth',
       'Population\n(2022)', 'GDP\nper capita', 'Share of\nWorld GDP'],
      dtype='object')

In [13]:
df_population.columns

Index(['#', 'Country (or dependency)', 'Population\n(2024)', 'Yearly\nChange',
       'Net\nChange', 'Density\n(P/Km²)', 'Land Area\n(Km²)',
       'Migrants\n(net)', 'Fert.\nRate', 'Med.\nAge', 'Urban\nPop %',
       'World\nShare'],
      dtype='object')

In [14]:
df_population.rename(columns={'Country (or dependency)': 'Country'}, inplace=True)

In [15]:
df_population.columns

Index(['#', 'Country', 'Population\n(2024)', 'Yearly\nChange', 'Net\nChange',
       'Density\n(P/Km²)', 'Land Area\n(Km²)', 'Migrants\n(net)',
       'Fert.\nRate', 'Med.\nAge', 'Urban\nPop %', 'World\nShare'],
      dtype='object')

In [16]:
unique_gdp_countries_count = df_gdp['Country'].nunique()
print(f'Number of unique countries in df_gdp: {unique_gdp_countries_count}')

Number of unique countries in df_gdp: 177


In [17]:
unique_population_countries_count = df_population['Country'].nunique()
print(f'Number of unique countries in df_population: {unique_population_countries_count}')

Number of unique countries in df_population: 234


In [18]:
merged_df = pd.merge(df_gdp, df_population, on='Country', how='inner')

In [19]:
print(merged_df.head())

   #_x        Country GDP\n(nominal, 2022)    GDP\n(abbrev.) GDP\ngrowth  \
0    1  United States  $25,462,700,000,000  $25.463 trillion       2.06%   
1    2          China  $17,963,200,000,000  $17.963 trillion       2.99%   
2    3          Japan   $4,231,140,000,000   $4.231 trillion       1.03%   
3    4        Germany   $4,072,190,000,000   $4.072 trillion       1.79%   
4    5          India   $3,385,090,000,000   $3.385 trillion       7.00%   

  Population\n(2022) GDP\nper capita Share of\nWorld GDP  #_y  \
0        341,534,046         $74,554              25.32%    3   
1      1,425,179,569         $12,604              17.86%    2   
2        124,997,578         $33,850               4.21%   12   
3         84,086,227         $48,429               4.05%   19   
4      1,425,423,212          $2,375               3.37%    1   

  Population\n(2024) Yearly\nChange Net\nChange Density\n(P/Km²)  \
0        345,426,571         0.57 %   1,949,236               38   
1      1,419,321

In [20]:
merged_df.to_csv('gpd_population_merged.csv', index=False)

In [21]:
df_csv = pd.read_csv("gpd_population_merged.csv")
df_csv

Unnamed: 0,#_x,Country,"GDP\n(nominal, 2022)",GDP\n(abbrev.),GDP\ngrowth,Population\n(2022),GDP\nper capita,Share of\nWorld GDP,#_y,Population\n(2024),Yearly\nChange,Net\nChange,Density\n(P/Km²),Land Area\n(Km²),Migrants\n(net),Fert.\nRate,Med.\nAge,Urban\nPop %,World\nShare
0,1,United States,"$25,462,700,000,000",$25.463 trillion,2.06%,341534046,"$74,554",25.32%,3,345426571,0.57 %,1949236,38,9147420,1286132,1.6,38,82 %,4.23 %
1,2,China,"$17,963,200,000,000",$17.963 trillion,2.99%,1425179569,"$12,604",17.86%,2,1419321278,-0.23 %,-3263655,151,9388211,-318992,1.0,40,66 %,17.39 %
2,3,Japan,"$4,231,140,000,000",$4.231 trillion,1.03%,124997578,"$33,850",4.21%,12,123753041,-0.50 %,-617906,339,364555,153357,1.2,49,93 %,1.52 %
3,4,Germany,"$4,072,190,000,000",$4.072 trillion,1.79%,84086227,"$48,429",4.05%,19,84552242,0.00 %,4011,243,348560,36954,1.4,45,76 %,1.04 %
4,5,India,"$3,385,090,000,000",$3.385 trillion,7.00%,1425423212,"$2,375",3.37%,1,1450935791,0.89 %,12866195,488,2973190,-630830,2.0,28,37 %,17.78 %
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
172,173,Sao Tome & Principe,"$546,680,342",$547 million,0.93%,226305,"$2,416",0.00%,188,235536,2.02 %,4665,245,960,-604,3.6,19,78 %,0.00 %
173,174,Micronesia,"$427,094,119",$427 million,-0.62%,523477,$816,0.00%,174,526923,0.32 %,1683,753,700,-5076,2.8,26,75 %,0.01 %
174,175,Marshall Islands,"$279,667,900",$280 million,1.50%,40077,"$6,978",0.00%,217,37548,-3.29 %,-1279,209,180,-1765,2.9,21,N.A.,0.00 %
175,176,Kiribati,"$223,352,943",$223 million,1.56%,130469,"$1,712",0.00%,193,134518,1.50 %,1988,166,810,-471,3.1,23,57 %,0.00 %


In [22]:
df_csv.head(25)

Unnamed: 0,#_x,Country,"GDP\n(nominal, 2022)",GDP\n(abbrev.),GDP\ngrowth,Population\n(2022),GDP\nper capita,Share of\nWorld GDP,#_y,Population\n(2024),Yearly\nChange,Net\nChange,Density\n(P/Km²),Land Area\n(Km²),Migrants\n(net),Fert.\nRate,Med.\nAge,Urban\nPop %,World\nShare
0,1,United States,"$25,462,700,000,000",$25.463 trillion,2.06%,341534046,"$74,554",25.32%,3,345426571,0.57 %,1949236,38,9147420,1286132,1.6,38,82 %,4.23 %
1,2,China,"$17,963,200,000,000",$17.963 trillion,2.99%,1425179569,"$12,604",17.86%,2,1419321278,-0.23 %,-3263655,151,9388211,-318992,1.0,40,66 %,17.39 %
2,3,Japan,"$4,231,140,000,000",$4.231 trillion,1.03%,124997578,"$33,850",4.21%,12,123753041,-0.50 %,-617906,339,364555,153357,1.2,49,93 %,1.52 %
3,4,Germany,"$4,072,190,000,000",$4.072 trillion,1.79%,84086227,"$48,429",4.05%,19,84552242,0.00 %,4011,243,348560,36954,1.4,45,76 %,1.04 %
4,5,India,"$3,385,090,000,000",$3.385 trillion,7.00%,1425423212,"$2,375",3.37%,1,1450935791,0.89 %,12866195,488,2973190,-630830,2.0,28,37 %,17.78 %
5,6,United Kingdom,"$3,070,670,000,000",$3.071 trillion,4.10%,68179315,"$45,038",3.05%,21,69138192,0.66 %,455230,286,241930,417114,1.6,40,84 %,0.85 %
6,7,France,"$2,782,910,000,000",$2.783 trillion,2.56%,66277409,"$41,989",2.77%,23,66548530,0.17 %,109708,122,547557,90527,1.6,42,82 %,0.82 %
7,8,Russia,"$2,240,420,000,000",$2.240 trillion,-2.07%,145579899,"$15,390",2.23%,9,144820423,-0.43 %,-620077,9,16376870,-178042,1.5,40,75 %,1.77 %
8,9,Canada,"$2,139,840,000,000",$2.140 trillion,3.40%,38821259,"$55,120",2.13%,38,39742430,1.13 %,443325,4,9093510,368599,1.3,40,80 %,0.49 %
9,10,Italy,"$2,010,430,000,000",$2.010 trillion,3.67%,59619115,"$33,721",2.00%,25,59342867,-0.26 %,-156586,202,294140,95246,1.2,48,72 %,0.73 %


In [23]:
df_csv.columns


Index(['#_x', 'Country', 'GDP\n(nominal, 2022)', 'GDP\n(abbrev.)',
       'GDP\ngrowth', 'Population\n(2022)', 'GDP\nper capita',
       'Share of\nWorld GDP', '#_y', 'Population\n(2024)', 'Yearly\nChange',
       'Net\nChange', 'Density\n(P/Km²)', 'Land Area\n(Km²)',
       'Migrants\n(net)', 'Fert.\nRate', 'Med.\nAge', 'Urban\nPop %',
       'World\nShare'],
      dtype='object')