In [1]:
# import dependencies
import pandas as pd
import numpy as np

In [2]:
filter_years = [2008, 2013]
data_input_folder = "crop_yield_data/"
data_output_folder = "crop_yield_data_cleaned/"

In [3]:
#load in datasets
# Yields
yield_df = pd.read_csv(f"{data_input_folder}yield.csv")

# Average Temperature (C)
temp_df = pd.read_csv(f"{data_input_folder}temp.csv")
temp_df = temp_df.rename(columns = {'year':'Year', 'country':'Area', 'avg_temp': "Value"})

# Nutrients in fertilizer used for Agricultural Use
# Item Codes 3102 = Nitrogen, 3103 = Phosphate, 3104 = Potash
nutrients_df = pd.read_csv(f"{data_input_folder}nutrients_au.csv")

# Pesticides
pesticides_df = pd.read_csv(f"{data_input_folder}pesticides.csv")

# Land Use
# Item Codes 6610 = Agricultural Land, 6621 = Arable Land
land_df = pd.read_csv(f"{data_input_folder}land.csv")

### This is a test to see what year range we can select.  Looking at each dataset, we can count the number of unique countries present for a given year in a given dataset.

In [4]:
# Count the number of values for different years to select a year range for forecasting

yield_counts = pd.DataFrame(yield_df.drop_duplicates(['Year', 'Area']) \
                           .dropna()['Year'].value_counts().sort_index(ascending=False).head(15))
temp_counts = pd.DataFrame(temp_df.drop_duplicates(['Year', 'Area']) \
                           .dropna()['Year'].value_counts().sort_index(ascending=False).head(15))
nitrogen_counts = pd.DataFrame(nutrients_df.loc[nutrients_df['Item Code'] == 3102].drop_duplicates(['Year', 'Area']) \
                               .dropna()['Year'].value_counts().sort_index(ascending=False).head(15))
phosphate_counts = pd.DataFrame(nutrients_df.loc[nutrients_df['Item Code'] == 3103].drop_duplicates(['Year', 'Area']) \
                               .dropna()['Year'].value_counts().sort_index(ascending=False).head(15))
potash_counts = pd.DataFrame(nutrients_df.loc[nutrients_df['Item Code'] == 3104].drop_duplicates(['Year', 'Area']) \
                               .dropna()['Year'].value_counts().sort_index(ascending=False).head(15))
pesticides_counts = pd.DataFrame(pesticides_df.drop_duplicates(['Year', 'Area']) \
                           .dropna()['Year'].value_counts().sort_index(ascending=False).head(15))
agri_counts = pd.DataFrame(land_df.loc[land_df['Item Code'] == 6610].drop_duplicates(['Year', 'Area']) \
                               .dropna()['Year'].value_counts().sort_index(ascending=False).head(15))
arable_counts = pd.DataFrame(land_df.loc[land_df['Item Code'] == 6621].drop_duplicates(['Year', 'Area']) \
                               .dropna()['Year'].value_counts().sort_index(ascending=False).head(15))

#### Final results: 2013 is the latest year that we have sufficient data for each dataset.

In [5]:
merged = yield_counts.merge(temp_counts, "outer", left_index = True, right_index = True, suffixes = ["_y", None])
merged = merged.merge(nitrogen_counts, "outer", left_index = True, right_index = True, suffixes = ["_t", None])
merged = merged.merge(phosphate_counts, "outer", left_index = True, right_index = True, suffixes = ["_n", None])
merged = merged.merge(potash_counts, "outer", left_index = True, right_index = True, suffixes = ["_ph", None])
merged = merged.merge(pesticides_counts, "outer", left_index = True, right_index = True, suffixes = ["_po", None])
merged = merged.merge(agri_counts, "outer", left_index = True, right_index = True, suffixes = ["_pes", None])
merged = merged.merge(arable_counts, "outer", left_index = True, right_index = True, suffixes = ["_ag", "_ar"])

merged.sort_index(ascending=False)

Unnamed: 0,Year_y,Year_t,Year_n,Year_ph,Year_po,Year_pes,Year_ag,Year_ar
2020,,,168.0,167.0,167.0,,227.0,221.0
2019,,,168.0,168.0,168.0,,227.0,221.0
2018,,,168.0,168.0,168.0,,227.0,221.0
2017,,,168.0,168.0,168.0,,226.0,220.0
2016,204.0,,168.0,168.0,168.0,163.0,227.0,221.0
2015,204.0,,166.0,167.0,166.0,163.0,227.0,221.0
2014,204.0,,168.0,168.0,168.0,163.0,226.0,221.0
2013,204.0,137.0,168.0,168.0,168.0,163.0,226.0,220.0
2012,204.0,137.0,167.0,167.0,167.0,163.0,227.0,221.0
2011,203.0,137.0,165.0,162.0,164.0,163.0,226.0,220.0


In [6]:
# df should have unique ['year', 'area'] keys
# num_years = years to backdate the value column
# keep = list of columns to keep in output
def clean_dataset(df, years, num_years, keep):
    
    year_col = 'year'
    country_col = 'area'
    val_col = 'value'
    
    keep = [i.lower() for i in keep]
    df.columns = df.columns.str.lower()
    
    # Drop other columns
    if (year_col not in keep) or (country_col not in keep):
        print(f"Invalid 'keep' value ({*keep,}).  Must include {year_col} and {country_col}.")
        return None
    df2 = df[keep]
    
    # Drop duplicate key-entries and NaNs, reset the index
    df2 = df2.drop_duplicates([year_col, country_col]).dropna().reset_index(drop=True)
    
    #Filter by [years]
    df2 = df2.loc[(df2[year_col] >= years[0]) & (df2[year_col] <= years[1])]
    
    # Create new columns for prior years
    new_cols = []
    for i in range(num_years):
        new_cols.append(f"{val_col}_{i+1}")
        df2[new_cols[i-1]] = np.nan
    
    # Populate previous years
    for index, row in df2.iterrows():
        year = row[year_col]
        country = row[country_col]
        year_list = df.loc[df[country_col] == country][year_col].to_list()
        for j in range(len(new_cols)):
            if (year - j - 1) in year_list:
                loc_bool = (df[year_col] == (year - j - 1)) & (df[country_col] == country)
                df2.loc[index, new_cols[j]] = df.loc[loc_bool][val_col].values[0]
    
    # Drop any columns with missing data
    df2 = df2.dropna()
    
    # Throw out any country that doesn't have full data for [years]
    for country in df2[country_col].unique():
        if len(df2[df2[country_col] == country]) < years[1] - years[0] + 1:
            df2 = df2[df2[country_col] != country]
    
    return df2

In [7]:
crops = yield_df['Item'].unique()
to_keep = ['Year', 'Area', 'Item', 'Value']

yield_df_clean = pd.DataFrame(columns=[i.lower() for i in to_keep])
for crop in crops:
    clean_crop = clean_dataset(yield_df[yield_df['Item'] == crop], \
                               years=filter_years, num_years=5, keep=to_keep)
    yield_df_clean = pd.concat([yield_df_clean, clean_crop])
yield_df_clean

Unnamed: 0,year,area,item,value,value_1,value_2,value_3,value_4,value_5
47,2008,Afghanistan,Maize,26277,26277.0,26204.0,12069.0,16000.0,8400.0
48,2009,Afghanistan,Maize,21429,26277.0,26277.0,26204.0,12069.0,16000.0
49,2010,Afghanistan,Maize,16448,21429.0,26277.0,26277.0,26204.0,12069.0
50,2011,Afghanistan,Maize,16400,16448.0,21429.0,26277.0,26277.0,26204.0
51,2012,Afghanistan,Maize,21986,16400.0,16448.0,21429.0,26277.0,26277.0
...,...,...,...,...,...,...,...,...,...
2646,2009,Venezuela (Bolivarian Republic of),Plantains and others,102084,94499.0,86392.0,70096.0,88711.0,93109.0
2647,2010,Venezuela (Bolivarian Republic of),Plantains and others,99665,102084.0,94499.0,86392.0,70096.0,88711.0
2648,2011,Venezuela (Bolivarian Republic of),Plantains and others,103657,99665.0,102084.0,94499.0,86392.0,70096.0
2649,2012,Venezuela (Bolivarian Republic of),Plantains and others,107671,103657.0,99665.0,102084.0,94499.0,86392.0


In [8]:
temp_df_clean = clean_dataset(temp_df, \
                              years=filter_years, num_years=5, keep=['Year', 'Area', 'Value'])
display(temp_df_clean.head())
nitrogen_df_clean = clean_dataset(nutrients_df.loc[nutrients_df['Item Code'] == 3102], \
                                  years=filter_years, num_years=3, keep=['Year', 'Area', 'Value'])
display(nitrogen_df_clean.head())

phosphate_df_clean = clean_dataset(nutrients_df.loc[nutrients_df['Item Code'] == 3103], \
                                  years=filter_years, num_years=3, keep=['Year', 'Area', 'Value'])
display(phosphate_df_clean.head())

potash_df_clean = clean_dataset(nutrients_df.loc[nutrients_df['Item Code'] == 3104], \
                                  years=filter_years, num_years=3, keep=['Year', 'Area', 'Value'])
display(potash_df_clean.head())

pesticides_df_clean = clean_dataset(pesticides_df, \
                                  years=filter_years, num_years=3, keep=['Year', 'Area', 'Value'])
display(pesticides_df_clean.head())

agri_df_clean = clean_dataset(land_df.loc[land_df['Item Code'] == 6610], \
                                  years=filter_years, num_years=0, keep=['Year', 'Area', 'Value'])
display(agri_df_clean.head())

arable_df_clean = clean_dataset(land_df.loc[land_df['Item Code'] == 6621], \
                                  years=filter_years, num_years=0, keep=['Year', 'Area', 'Value'])
display(arable_df_clean.head())

Unnamed: 0,year,area,value,value_1,value_2,value_3,value_4,value_5
145,2008,Côte D'Ivoire,26.94,27.01,26.99,26.98,26.99,27.05
146,2009,Côte D'Ivoire,26.98,26.94,27.01,26.99,26.98,26.99
147,2010,Côte D'Ivoire,27.45,26.98,26.94,27.01,26.99,26.98
148,2011,Côte D'Ivoire,27.02,27.45,26.98,26.94,27.01,26.99
149,2012,Côte D'Ivoire,26.77,27.02,27.45,26.98,26.94,27.01


Unnamed: 0,year,area,value,value_1,value_2,value_3
45,2008,Afghanistan,14432.03,16377.29,20463.22,20466.15
46,2009,Afghanistan,14155.88,14432.03,16377.29,20463.22
47,2010,Afghanistan,12929.01,14155.88,14432.03,16377.29
48,2011,Afghanistan,22970.63,12929.01,14155.88,14432.03
49,2012,Afghanistan,17065.25,22970.63,12929.01,14155.88


Unnamed: 0,year,area,value,value_1,value_2,value_3
39,2008,Afghanistan,0.0,0.0,4406.3,9202.64
40,2009,Afghanistan,556.23,0.0,0.0,4406.3
41,2010,Afghanistan,925.85,556.23,0.0,0.0
42,2011,Afghanistan,6845.57,925.85,556.23,0.0
43,2012,Afghanistan,1261.44,6845.57,925.85,556.23


Unnamed: 0,year,area,value,value_1,value_2,value_3
11,2008,Afghanistan,0.0,0.0,0.0,105.6
12,2009,Afghanistan,0.0,0.0,0.0,0.0
13,2010,Afghanistan,0.0,0.0,0.0,0.0
14,2011,Afghanistan,0.0,0.0,0.0,0.0
15,2012,Afghanistan,196.78,0.0,0.0,0.0


Unnamed: 0,year,area,value,value_1,value_2,value_3
18,2008,Albania,1069.54,1006.57,943.61,880.64
19,2009,Albania,1132.5,1069.54,1006.57,943.61
20,2010,Albania,1311.17,1132.5,1069.54,1006.57
21,2011,Albania,1302.63,1311.17,1132.5,1069.54
22,2012,Albania,766.25,1302.63,1311.17,1132.5


Unnamed: 0,year,area,value
47,2008,Afghanistan,37910.0
48,2009,Afghanistan,37910.0
49,2010,Afghanistan,37911.0
50,2011,Afghanistan,37910.0
51,2012,Afghanistan,37910.0


Unnamed: 0,year,area,value
47,2008,Afghanistan,7794.0
48,2009,Afghanistan,7793.0
49,2010,Afghanistan,7793.0
50,2011,Afghanistan,7791.0
51,2012,Afghanistan,7790.0


# Country Name Comparisons

## Any country that doesn't appear in every dataset can be excluded from the final dataset.  In many cases, an INNER JOIN on "area" will be sufficient to rule out those countries.  However, we do want to correct name differences for identical countries.  

In [9]:
# Return the disjoint of two dataframes, marked with which dataframe they came from.
def disjoint_dfs(df1, df2):
    left_only = pd.merge(df1, df2, how='outer', indicator=True).loc[lambda x : x['_merge']=='left_only']
    right_only = pd.merge(df1, df2, how='outer', indicator=True).loc[lambda x : x['_merge']=='right_only']
    return pd.merge(left_only, right_only, how='outer')

# Return the inner join of two datasets
def innerjoin_dfs(df1, df2):
    return pd.merge(df1, df2, how='inner')

In [10]:
agri_co = pd.DataFrame(agri_df_clean['area'].unique())
arable_co = pd.DataFrame(arable_df_clean['area'].unique())

nitrogen_co = pd.DataFrame(nitrogen_df_clean['area'].unique())
phosphate_co = pd.DataFrame(phosphate_df_clean['area'].unique())
potash_co = pd.DataFrame(potash_df_clean['area'].unique())

yield_co = pd.DataFrame(yield_df_clean['area'].unique())

pesticides_co = pd.DataFrame(pesticides_df_clean['area'].unique())

temp_co = pd.DataFrame(temp_df_clean['area'].unique())

### For the two land use datasets, arable_df is a subset of agri_df, so an INNER JOIN will be sufficient to rule out the extra countries in agri_df.  Future comparisons should be made against merged_co.

In [11]:
# Compare agri_df to arable_df
disjoint_dfs(agri_co, arable_co)

Unnamed: 0,0,_merge
0,Falkland Islands (Malvinas),left_only
1,Greenland,left_only
2,Nauru,left_only
3,Norfolk Island,left_only
4,Tokelau,left_only
5,Tuvalu,left_only


In [12]:
merged_co = innerjoin_dfs(agri_co, arable_co)
merged_co

Unnamed: 0,0
0,Afghanistan
1,Albania
2,Algeria
3,American Samoa
4,Andorra
...,...
214,Wallis and Futuna Islands
215,Western Sahara
216,Yemen
217,Zambia


### For the three nutrient dataframes, there is no country present in all three dataframes that is simply misnamed.  phosphate_df and potash_df are both non-overlapping subsets of nitrogen_df.

In [13]:
# Compare nitrogen_df to phosphate_df
print(disjoint_dfs(nitrogen_co, phosphate_co))
# Compare phosphate_df to potash_df
print(disjoint_dfs(nitrogen_co, potash_co))
# Compare phosphate_df to potash_df
print(disjoint_dfs(phosphate_co, potash_co))

                     0     _merge
0  Antigua and Barbuda  left_only
1             Mongolia  left_only
2                Qatar  left_only
3               Rwanda  left_only
4           Seychelles  left_only
5                Yemen  left_only
                     0     _merge
0  Antigua and Barbuda  left_only
1                Congo  left_only
2             Ethiopia  left_only
3               Guyana  left_only
4           Kyrgyzstan  left_only
5             Maldives  left_only
6             Mongolia  left_only
7                Nepal  left_only
8                Qatar  left_only
9               Rwanda  left_only
            0      _merge
0       Congo   left_only
1    Ethiopia   left_only
2      Guyana   left_only
3  Kyrgyzstan   left_only
4    Maldives   left_only
5       Nepal   left_only
6  Seychelles  right_only
7       Yemen  right_only


### Comparing the nutrient datasets to the merged list, they are all subsets of the merged list, so an INNER JOIN is sufficient.

In [14]:
# Compare nitrogen_df to merged
disjoint_dfs(phosphate_co, merged_co)

Unnamed: 0,0,_merge
0,American Samoa,right_only
1,Andorra,right_only
2,Antigua and Barbuda,right_only
3,Aruba,right_only
4,Bahamas,right_only
...,...,...
60,United States Virgin Islands,right_only
61,Vanuatu,right_only
62,Wallis and Futuna Islands,right_only
63,Western Sahara,right_only


In [15]:
merged_co = innerjoin_dfs(merged_co, phosphate_co)
merged_co = innerjoin_dfs(merged_co, potash_co)
merged_co

Unnamed: 0,0
0,Afghanistan
1,Albania
2,Algeria
3,Angola
4,Argentina
...,...
143,Uzbekistan
144,Venezuela (Bolivarian Republic of)
145,Viet Nam
146,Zambia


### Comparing the nutrient datasets to yield_df, there are three corrections that need to be made.  The name in all datasets should be updated.
#### Turkey = Türkiye
#### United Kingdom = United Kingdom of Great Britain and Northern Ireland
#### North Macedonia = The former Yugoslav Republic of Macedonia

In [16]:
# Compare phosphate_df to yield_df
disjoint_dfs(merged_co, yield_co)

Unnamed: 0,0,_merge
0,North Macedonia,left_only
1,Türkiye,left_only
2,United Kingdom of Great Britain and Northern I...,left_only
3,Antigua and Barbuda,right_only
4,Bahamas,right_only
5,Botswana,right_only
6,Cabo Verde,right_only
7,Chad,right_only
8,Comoros,right_only
9,Congo,right_only


In [17]:
merged_co[0] = merged_co[0].str.replace("Türkiye", "Turkey")
merged_co[0] = merged_co[0].str.replace("United Kingdom of Great Britain and Northern Ireland", "United Kingdom")
yield_co[0] = yield_co[0].str.replace("The former Yugoslav Republic of Macedonia", "North Macedonia")
merged_co = innerjoin_dfs(merged_co, yield_co)
merged_co

Unnamed: 0,0
0,Afghanistan
1,Albania
2,Algeria
3,Angola
4,Argentina
...,...
143,Uzbekistan
144,Venezuela (Bolivarian Republic of)
145,Viet Nam
146,Zambia


### Of the mismatches here, one we can recognize ("North Macedonia"), which will already be a name change for us.  Other potential matches:

#### Congo = Democratic Republic of the Congo.  The "Republic of the Congo" is a separate country, but the phrase "Congo" traditionally refers to the DRC, so this is an okay rule to add.  To avoid ambiguity, the longer name will be kept.

#### China - there are a few "China" references in the merged, and I've seen "China, mainland" as well, but that doesn't seem to be present in pesticides_df.  "China" seems to refer to a total encompassing country - it should be dropped and then, at the end, the "China (mainland)" entry can be kept.
##### However, we *MUST* make sure to remove the plain "China" rows in the nutrient, land use, and yield  datasets before renaming all the "China, mainland" entries to "China"

In [18]:
disjoint_dfs(pesticides_co, merged_co)

Unnamed: 0,0,_merge
0,Antigua and Barbuda,left_only
1,Bahamas,left_only
2,Bermuda,left_only
3,Botswana,left_only
4,Cabo Verde,left_only
5,Chad,left_only
6,"China, Hong Kong SAR",left_only
7,"China, Macao SAR",left_only
8,Comoros,left_only
9,Congo,left_only


In [19]:
pesticides_co[0] = pesticides_co[0].str.replace("The former Yugoslav Republic of Macedonia", "North Macedonia")
pesticides_co[0] = pesticides_co[0].str.replace("Congo", "Democratic Republic of the Congo")
merged_co = innerjoin_dfs(merged_co, pesticides_co)
merged_co

Unnamed: 0,0
0,Albania
1,Algeria
2,Angola
3,Argentina
4,Armenia
...,...
129,Uruguay
130,Venezuela (Bolivarian Republic of)
131,Viet Nam
132,Zambia


### temp_df has quite a few name changes, some of which are better names.  Updating all dfs will be ideal, again.

#### Bolivia = Bolivia (Plurinational State of)
#### China = China (mainland)
#### Côte d'Ivoire = Côte D'Ivoire
#### Czechia = Czech Republic
#### Democratic Republic of the Congo = Congo (Democratic Republic Of The)
#### Iran = Iran (Islamic Republic of)
#### Moldova = Republic of Moldova
#### Russia = Russian Federation
#### Syria = Syrian Arab Republic
#### South Korea = Republic of Korea
#### Taiwan = China, Taiwan Province of
#### Tanzania = United Republic of Tanzania	
#### United States = United States of America
#### Venezuela = Venezuela (Bolivarian Republic of)
#### Vietnam = Viet Nam

In [20]:
# Compare merged_co to temp_co
with pd.option_context('display.max_rows', None,):
    display(disjoint_dfs(merged_co, temp_co))

Unnamed: 0,0,_merge
0,Barbados,left_only
1,Belize,left_only
2,Bhutan,left_only
3,Bolivia (Plurinational State of),left_only
4,Brunei Darussalam,left_only
5,"China, mainland",left_only
6,"China, Taiwan Province of",left_only
7,Cook Islands,left_only
8,Costa Rica,left_only
9,Côte d'Ivoire,left_only


In [21]:
merged_co[0] = merged_co[0].str.replace("Bolivia (Plurinational State of)", "Bolivia", regex=False)
merged_co[0] = merged_co[0].str.replace("China, mainland", "China", regex=False)
merged_co[0] = merged_co[0].str.replace("Iran (Islamic Republic of)", "Iran", regex=False)
merged_co[0] = merged_co[0].str.replace("Republic of Moldova", "Moldova")
merged_co[0] = merged_co[0].str.replace("Russian Federation", "Russia")
merged_co[0] = merged_co[0].str.replace("Republic of Korea", "South Korea")
merged_co[0] = merged_co[0].str.replace("Syrian Arab Republic", "Syria")
merged_co[0] = merged_co[0].str.replace("China, Taiwan Province of", "Taiwan")
merged_co[0] = merged_co[0].str.replace("United Republic of Tanzania", "Tanzania")
merged_co[0] = merged_co[0].str.replace("United States of America", "United States")
merged_co[0] = merged_co[0].str.replace("Venezuela (Bolivarian Republic of)", "Venezuela", regex=False)
merged_co[0] = merged_co[0].str.replace("Viet Nam", "Vietnam")
temp_co[0] = temp_co[0].str.replace("Côte D'Ivoire", "Côte d'Ivoire")
temp_co[0] = temp_co[0].str.replace("Czech Republic", "Czechia")
temp_co[0] = temp_co[0].str.replace("Congo (Democratic Republic Of The)", "Democratic Republic of the Congo", regex=False)
temp_co[0] = temp_co[0].str.replace("Macedonia", "North Macedonia")
merged_co = innerjoin_dfs(merged_co, temp_co)
merged_co

Unnamed: 0,0
0,Albania
1,Algeria
2,Angola
3,Argentina
4,Armenia
...,...
101,Uruguay
102,Venezuela
103,Vietnam
104,Zambia


### After cleaning up country names, we are left with 106 valid countries to compare.

In [22]:
print(merged_co[0].sort_values().to_list())

['Albania', 'Algeria', 'Angola', 'Argentina', 'Armenia', 'Australia', 'Austria', 'Azerbaijan', 'Bahrain', 'Bangladesh', 'Belarus', 'Belgium', 'Bolivia', 'Brazil', 'Bulgaria', 'Burkina Faso', 'Burundi', 'Cameroon', 'Canada', 'Central African Republic', 'Chile', 'China', 'Colombia', 'Croatia', 'Czechia', "Côte d'Ivoire", 'Democratic Republic of the Congo', 'Denmark', 'Dominican Republic', 'Ecuador', 'Egypt', 'El Salvador', 'Estonia', 'Finland', 'France', 'Germany', 'Ghana', 'Greece', 'Guatemala', 'Guinea', 'Honduras', 'Hungary', 'India', 'Indonesia', 'Iran', 'Iraq', 'Ireland', 'Italy', 'Jamaica', 'Japan', 'Kazakhstan', 'Kenya', 'Latvia', 'Lebanon', 'Libya', 'Lithuania', 'Madagascar', 'Malawi', 'Malaysia', 'Mali', 'Mauritius', 'Mexico', 'Moldova', 'Morocco', 'Mozambique', 'Namibia', 'Netherlands', 'New Zealand', 'Nicaragua', 'Niger', 'North Macedonia', 'Norway', 'Pakistan', 'Papua New Guinea', 'Peru', 'Poland', 'Portugal', 'Romania', 'Russia', 'Saudi Arabia', 'Senegal', 'Slovakia', 'Slove

In [23]:
# Modification of original dataframes, in accordance with testing

def remove_china(df):
    # Remove exact "China" rows first, before renaming "China, mainland" to "China"
    return(df[df['area'] != "China"])

def clean_countries(df):
    df['area'] = df['area'].str.replace("Türkiye", "Turkey")
    df['area'] = df['area'].str.replace("United Kingdom of Great Britain and Northern Ireland", "United Kingdom")
    df['area'] = df['area'].str.replace("The former Yugoslav Republic of Macedonia", "North Macedonia")
    df['area'] = df['area'].str.replace("Bolivia (Plurinational State of)", "Bolivia", regex=False)
    df['area'] = df['area'].str.replace("China, mainland", "China", regex=False)
    df['area'] = df['area'].str.replace("Iran (Islamic Republic of)", "Iran", regex=False)
    df['area'] = df['area'].str.replace("Republic of Moldova", "Moldova")
    df['area'] = df['area'].str.replace("Russian Federation", "Russia")
    df['area'] = df['area'].str.replace("Republic of Korea", "South Korea")
    df['area'] = df['area'].str.replace("Syrian Arab Republic", "Syria")
    df['area'] = df['area'].str.replace("China, Taiwan Province of", "Taiwan")
    df['area'] = df['area'].str.replace("United Republic of Tanzania", "Tanzania")
    df['area'] = df['area'].str.replace("United States of America", "United States")
    df['area'] = df['area'].str.replace("Venezuela (Bolivarian Republic of)", "Venezuela", regex=False)
    df['area'] = df['area'].str.replace("Viet Nam", "Vietnam")
    df['area'] = df['area'].str.replace("Côte D'Ivoire", "Côte d'Ivoire")
    df['area'] = df['area'].str.replace("Czech Republic", "Czechia")
    df['area'] = df['area'].str.replace("Congo (Democratic Republic Of The)", "Democratic Republic of the Congo", regex=False)
    return df

# Don't remove China from temp_df - it doesn't have "China, mainland"
arable_df_clean, agri_df_clean, nitrogen_df_clean, phosphate_df_clean, potash_df_clean, yield_df_clean, pesticides_df_clean = \
    [remove_china(df) for df in [arable_df_clean, agri_df_clean, nitrogen_df_clean, phosphate_df_clean, potash_df_clean, \
               yield_df_clean, pesticides_df_clean]]

arable_df_clean, agri_df_clean, nitrogen_df_clean, phosphate_df_clean, potash_df_clean, yield_df_clean, pesticides_df_clean, temp_df_clean = \
    [clean_countries(df) for df in [arable_df_clean, agri_df_clean, nitrogen_df_clean, phosphate_df_clean, potash_df_clean, \
    yield_df_clean, pesticides_df_clean, temp_df_clean]]
 
# Clean pesticides_df and temp_df separately for "Congo" and "Macedonia", as they would trigger in other datasets
pesticides_df_clean['area'] = pesticides_df_clean['area'].str.replace("Congo", "Democratic Republic of the Congo")
temp_df_clean['area'] = temp_df_clean['area'].str.replace("Macedonia", "North Macedonia")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['area'] = df['area'].str.replace("Türkiye", "Turkey")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['area'] = df['area'].str.replace("United Kingdom of Great Britain and Northern Ireland", "United Kingdom")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['area'] = df['area'].str.replace("T

In [24]:
agri_co = pd.DataFrame(agri_df_clean['area'].unique())
arable_co = pd.DataFrame(arable_df_clean['area'].unique())

nitrogen_co = pd.DataFrame(nitrogen_df_clean['area'].unique())
phosphate_co = pd.DataFrame(phosphate_df_clean['area'].unique())
potash_co = pd.DataFrame(potash_df_clean['area'].unique())

yield_co = pd.DataFrame(yield_df_clean['area'].unique())

pesticides_co = pd.DataFrame(pesticides_df_clean['area'].unique())

temp_co = pd.DataFrame(temp_df_clean['area'].unique())

merged_co = innerjoin_dfs(agri_co, \
            innerjoin_dfs(arable_co, \
            innerjoin_dfs(nitrogen_co, \
            innerjoin_dfs(phosphate_co, \
            innerjoin_dfs(potash_co, \
            innerjoin_dfs(yield_co, \
            innerjoin_dfs(pesticides_co, temp_co)))))))[0].sort_values()
display(merged_co)
print(merged_co.to_list())

0        Albania
1        Algeria
2         Angola
3      Argentina
4        Armenia
         ...    
101      Uruguay
102    Venezuela
103      Vietnam
104       Zambia
105     Zimbabwe
Name: 0, Length: 106, dtype: object

['Albania', 'Algeria', 'Angola', 'Argentina', 'Armenia', 'Australia', 'Austria', 'Azerbaijan', 'Bahrain', 'Bangladesh', 'Belarus', 'Belgium', 'Bolivia', 'Brazil', 'Bulgaria', 'Burkina Faso', 'Burundi', 'Cameroon', 'Canada', 'Central African Republic', 'Chile', 'China', 'Colombia', 'Croatia', 'Czechia', "Côte d'Ivoire", 'Democratic Republic of the Congo', 'Denmark', 'Dominican Republic', 'Ecuador', 'Egypt', 'El Salvador', 'Estonia', 'Finland', 'France', 'Germany', 'Ghana', 'Greece', 'Guatemala', 'Guinea', 'Honduras', 'Hungary', 'India', 'Indonesia', 'Iran', 'Iraq', 'Ireland', 'Italy', 'Jamaica', 'Japan', 'Kazakhstan', 'Kenya', 'Latvia', 'Lebanon', 'Libya', 'Lithuania', 'Madagascar', 'Malawi', 'Malaysia', 'Mali', 'Mauritius', 'Mexico', 'Moldova', 'Morocco', 'Mozambique', 'Namibia', 'Netherlands', 'New Zealand', 'Nicaragua', 'Niger', 'North Macedonia', 'Norway', 'Pakistan', 'Papua New Guinea', 'Peru', 'Poland', 'Portugal', 'Romania', 'Russia', 'Saudi Arabia', 'Senegal', 'Slovakia', 'Slove

In [25]:
#load in datasets
# Yields
yield_df_clean.to_csv(f"{data_output_folder}yield_clean.csv", index=False)

# Average Temperature (C)
temp_df_clean.to_csv(f"{data_output_folder}temp_clean.csv", index=False)

# Nutrients in fertilizer used for Agricultural Use
# Item Codes 3102 = Nitrogen, 3103 = Phosphate, 3104 = Potash
nitrogen_df_clean.to_csv(f"{data_output_folder}nitrogen_clean.csv", index=False)
phosphate_df_clean.to_csv(f"{data_output_folder}phosphate_clean.csv", index=False)
potash_df_clean.to_csv(f"{data_output_folder}potash_clean.csv", index=False)

# Pesticides
pesticides_df.to_csv(f"{data_output_folder}pesticides_clean.csv", index=False)

# Land Use
# Item Codes 6610 = Agricultural Land, 6621 = Arable Land
agri_df_clean.to_csv(f"{data_output_folder}agri_clean.csv", index=False)
arable_df_clean.to_csv(f"{data_output_folder}arable_clean.csv", index=False)