<a href="https://colab.research.google.com/github/stegossauro/stegossauro/blob/main/World_gpd_analysis_visualization_randomforest_pred.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
# Fetch the Kaggle dataset through kagglehub; whatever dataset_download
# returns is kept in a module-level variable for later use.
_dataset_handle = 'sazidthe1/world-gdp-growth'
sazidthe1_world_gdp_growth_path = kagglehub.dataset_download(_dataset_handle)

print('Data source import complete.')


In this project, I demonstrate several methods for visually analyzing this dataset, along with an example of prediction on it. The code is organized so that it is easy to reuse and modify as needed.

The first step involves importing the necessary libraries and defining a dictionary to enable a comparative analysis between different continents using this dataset.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor

# Load the raw table; the file is decoded as latin-1.
df = pd.read_csv("/kaggle/input/world-gdp-growth/world_gdp_data.csv", encoding='latin-1')

# Continent -> member countries. Names are matched verbatim against
# df['country_name']. A few countries are listed under both Asia and Europe.
continents = {
    "Asia": ["Afghanistan", "Armenia", "Azerbaijan", "Bahrain", "Bangladesh", "Bhutan", "Brunei Darussalam", "Cambodia",
             "China, People's Republic of", "Georgia", "Hong Kong SAR", "India", "Indonesia", "Iran", "Iraq", "Israel",
             "Japan", "Jordan", "Kazakhstan", "Korea, Republic of", "Kuwait", "Kyrgyz Republic", "Lao P.D.R.", "Lebanon",
             "Macao SAR", "Malaysia", "Maldives", "Mongolia", "Myanmar", "Nepal", "Oman", "Pakistan", "Philippines",
             "Qatar", "Saudi Arabia", "Singapore", "Sri Lanka", "Syria", "Taiwan Province of China", "Tajikistan",
             "Thailand", "Timor-Leste", "Turkey", "Turkmenistan", "United Arab Emirates", "Uzbekistan", "Vietnam", "West Bank and Gaza", "Yemen"],
    "Europe": ["Albania", "Andorra", "Armenia", "Austria", "Azerbaijan", "Belarus", "Belgium", "Bosnia and Herzegovina",
               "Bulgaria", "Croatia", "Cyprus", "Czech Republic", "Denmark", "Estonia", "Finland", "France", "Georgia",
               "Germany", "Greece", "Hungary", "Iceland", "Ireland", "Italy", "Kazakhstan", "Kosovo", "Latvia", "Liechtenstein",
               "Lithuania", "Luxembourg", "Malta", "Moldova", "Monaco", "Montenegro", "Netherlands", "North Macedonia",
               "Norway", "Poland", "Portugal", "Romania", "Russia", "San Marino", "Serbia", "Slovak Republic", "Slovenia",
               "Spain", "Sweden", "Switzerland", "Turkey", "Ukraine", "United Kingdom"],
    "Africa": ["Algeria", "Angola", "Benin", "Botswana", "Burkina Faso", "Burundi", "Cabo Verde", "Cameroon", "Central African Republic",
               "Chad", "Comoros", "Congo, Dem. Rep. of the", "Congo, Republic of the", "Côte d'Ivoire", "Djibouti", "Egypt", "Equatorial Guinea",
               "Eritrea", "Eswatini", "Ethiopia", "Gabon", "Gambia, The", "Ghana", "Guinea", "Guinea-Bissau", "Kenya", "Lesotho", "Liberia", "Libya",
               "Madagascar", "Malawi", "Mali", "Mauritania", "Mauritius", "Morocco", "Mozambique", "Namibia", "Niger", "Nigeria", "Rwanda", "São Tomé and Príncipe",
               "Senegal", "Seychelles", "Sierra Leone", "Somalia", "South Africa", "South Sudan, Republic of", "Sudan", "Tanzania", "Togo", "Tunisia", "Uganda",
               "Zambia", "Zimbabwe"],
    "North America": ["Antigua and Barbuda", "Bahamas, The", "Barbados", "Belize", "Canada", "Costa Rica", "Cuba", "Dominica", "Dominican Republic",
                      "El Salvador", "Grenada", "Guatemala", "Haiti", "Honduras", "Jamaica", "Mexico", "Nicaragua", "Panama", "Saint Kitts and Nevis",
                      "Saint Lucia", "Saint Vincent and the Grenadines", "Trinidad and Tobago", "United States"],
    "South America": ["Argentina", "Bolivia", "Brazil", "Chile", "Colombia", "Ecuador", "Guyana", "Paraguay", "Peru", "Suriname", "Uruguay", "Venezuela"],
    "Oceania": ["Australia", "Fiji", "Kiribati", "Marshall Islands", "Micronesia, Fed. States of", "Nauru", "New Zealand", "Palau", "Papua New Guinea",
                "Samoa", "Solomon Islands", "Tonga", "Tuvalu", "Vanuatu"]
}

# Reverse lookup (country -> continent), built once. setdefault keeps the
# first continent that lists a country, so a country that appears under two
# continents is labelled with the one declared first in `continents`.
_continent_of = {}
for _continent, _members in continents.items():
    for _member in _members:
        _continent_of.setdefault(_member, _continent)

# dict.get yields None for any country that is not listed anywhere.
df['Continent'] = df['country_name'].apply(_continent_of.get)



The following code segment analyzes and visualizes, by continent, the presence of null data, meaning values that are absent from the dataset. Null values can skew the final results and cause inconsistencies. With this method, we can also observe which regions have the most missing data.

In [None]:
def _null_summary(frame):
    """Summarize missing vs. present cells of ``frame``.

    Returns a dict with the keys ``existing_data``, ``null_data``,
    ``percent_null_data`` and ``percent_existing_data``. An empty frame
    yields zero for every field instead of dividing zero by zero.
    """
    total_cells = int(frame.size)
    null_cells = int(frame.isnull().sum().sum())
    if total_cells:
        percent_null = (null_cells / total_cells) * 100
        percent_existing = 100 - percent_null
    else:
        percent_null = percent_existing = 0.0
    return {
        'existing_data': total_cells - null_cells,
        'null_data': null_cells,
        'percent_null_data': percent_null,
        'percent_existing_data': percent_existing,
    }


# Measure missing data on the value columns only. The two leading columns and
# the derived 'Continent' label would otherwise count as "existing data" and
# dilute the null percentage.
data_columns = df.columns[2:].drop('Continent', errors='ignore')

results = {
    continent: _null_summary(df.loc[df['Continent'] == continent, data_columns])
    for continent in continents
}

for continent, result in results.items():
    print(f"Continent {continent}: total existing data = {result['existing_data']}, null data = {result['null_data']}, {result['percent_null_data']:.2f}% of values are null")

# Three charts per row; the number of rows follows the number of continents.
n_cols = 3
n_rows = max(1, -(-len(results) // n_cols))  # ceiling division
fig, axs = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows), squeeze=False)

for ax, (continent, result) in zip(axs.flat, results.items()):
    ax.bar(
        ['Existing', 'Null'],
        [result['percent_existing_data'], result['percent_null_data']],
        color=['green', 'red'],
    )
    ax.set_title(f'{continent}')
    ax.set_ylabel('Percentage')
    ax.set_ylim([0, 100])

# Hide grid cells that have no continent to show.
for ax in axs.flat[len(results):]:
    ax.set_visible(False)

fig.suptitle('Percentage of null values by continent', fontsize=16)
plt.tight_layout()
plt.show()


In the next step, a visualization of the average GDP evolution per continent is implemented in two separate graphs.

In [None]:
# Coerce every column after the two leading columns to numeric; values that
# cannot be parsed become NaN.
# NOTE: this slice also contains the derived 'Continent' column, which
# therefore becomes all-NaN. The side effect is kept on purpose: the later
# cells average every column from index 2 onward and need them all numeric.
for col in df.columns[2:]:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Value columns only: without this, 'Continent' would be averaged like a year
# and appear as an extra (empty) position on the x-axis.
data_columns = df.columns[2:].drop('Continent', errors='ignore')
years = data_columns.tolist()

plt.figure(figsize=(12, 8))

for continent, countries in continents.items():
    df_continent = df[df['country_name'].isin(countries)]
    # Column-wise mean over the continent's countries, ignoring missing cells.
    avg_gdp = df_continent[data_columns].mean(axis=0, skipna=True)
    avg_gdp_values = avg_gdp.values.tolist()
    plt.plot(years, avg_gdp_values, marker='o', label=continent)

plt.legend()
plt.title('Average GDP Evolution per Continent')
plt.xlabel('Year')
plt.ylabel('Average GDP')
plt.xticks(years[::5])  # label every fifth column to keep the axis readable
plt.tight_layout()
plt.show()


In [None]:
# Value columns only: the derived 'Continent' column also sits after the two
# leading columns and would otherwise show up as an extra, empty x position.
data_columns = df.columns[2:].drop('Continent', errors='ignore')
years = data_columns.tolist()

plt.figure(figsize=(12, 8))

# One panel per continent on a 2x3 grid.
for i, (continent, countries) in enumerate(continents.items(), start=1):
    plt.subplot(2, 3, i)
    df_continent = df[df['country_name'].isin(countries)]
    # Column-wise mean over the continent's countries, ignoring missing cells.
    avg_gdp = df_continent[data_columns].mean(axis=0, skipna=True)
    avg_gdp_values = avg_gdp.values.tolist()
    plt.plot(years, avg_gdp_values, marker='o')
    plt.xticks(years[::10])  # label every tenth column
    plt.title(continent)

plt.suptitle('Average GDP Evolution per Continent', fontsize=16)
plt.tight_layout()
plt.show()


This type of visualization serves as an excellent starting point for any analysis. Interesting trends over the past 40 years can be observed, with the most intriguing aspect being how clearly the global average GDP per continent has been influenced by the pandemic.

Here, I have created an example for the same type of graph but in detail for individual countries rather than continents.

In [None]:
# Change the names in this list to choose which countries are plotted.
south_american_countries = ["Brazil", "Argentina", "Chile", "Colombia", "Peru", "Uruguay"]

# Value columns only: the derived 'Continent' column also sits after the two
# leading columns and must not be plotted as if it were a year.
data_columns = df.columns[2:].drop('Continent', errors='ignore')
years = data_columns.tolist()

# Three panels per row; the number of rows follows the length of the list, so
# editing the list above cannot overflow the grid.
n_cols = 3
n_rows = max(1, -(-len(south_american_countries) // n_cols))  # ceiling division

plt.figure(figsize=(15, 4 * n_rows))
for i, country in enumerate(south_american_countries, start=1):

    plt.subplot(n_rows, n_cols, i)
    plt.title(country)
    df_country = df[df['country_name'] == country]
    if df_country.empty:
        # A name that matches no row would otherwise fail with a shape
        # mismatch between the x and y values.
        print(f"No data found for country: {country}")
        continue
    # The column-wise mean equals the row itself for a single match and
    # still produces one value per column if a name matches several rows.
    gdp_values = df_country[data_columns].mean(axis=0, skipna=True)
    plt.plot(years, gdp_values.values.tolist(), marker='o')
    plt.xticks(years[::10])
plt.suptitle('GDP Evolution in South American Countries', fontsize=16)
plt.tight_layout()

plt.show()

# Same series, overlaid on a single chart for direct comparison.
plt.figure(figsize=(10, 6))
for country in south_american_countries:
    df_country = df[df['country_name'] == country]
    if df_country.empty:
        continue
    gdp_values = df_country[data_columns].mean(axis=0, skipna=True)
    plt.plot(years, gdp_values.values.tolist(), marker='o', label=country)

plt.title('Comparison of Average GDP Evolution in South American Countries')
plt.xlabel('Year')
plt.ylabel('GDP')
plt.legend()
plt.xticks(years[::5])
plt.grid(True)
plt.show()


The last part of the code predicts the next three years with a RandomForest model and prints, for each continent, the three countries with the highest and the three countries with the lowest predicted GDP growth. Because the null values observed earlier can distort a time-series prediction, they are handled before the model is fitted to avoid errors. Additionally, countries with too many null values (in this case, more than 8) are considered inconclusive: they are excluded from the rankings and listed afterward.

In [None]:
# Rows with more missing values than this are not forecast.
MAX_MISSING_VALUES = 8
# NOTE(review): the forecast years are fixed here; confirm they lie beyond the
# last data column of the dataset in use.
FORECAST_YEARS = (2025, 2026, 2027)


def _forecast_country(observed, forecast_years, random_state=42):
    """Forecast one value per year in ``forecast_years`` for a single country.

    ``observed`` is a Series of non-null float values whose index labels are
    years (convertible with ``int``); it must hold at least two values.

    The forest is trained on (year, previous observed value) -> value. That
    is the same feature layout used when forecasting, where each prediction
    is fed back as the "previous value" of the following year. Training on
    the target value itself would only teach the model to echo its input.
    After missing values are dropped, "previous" means the previous available
    observation, which may be more than one year earlier.

    Returns a list of predicted values, one per forecast year.
    """
    years = observed.index.astype(int).to_numpy()
    values = observed.to_numpy(dtype=float)

    train_x = np.column_stack([years[1:], values[:-1]])
    train_y = values[1:]

    # A fixed seed makes the rankings reproducible between runs.
    model = RandomForestRegressor(random_state=random_state)
    model.fit(train_x, train_y)

    forecasts = []
    previous_value = values[-1]
    for year in forecast_years:
        previous_value = model.predict(np.array([[year, previous_value]]))[0]
        forecasts.append(previous_value)
    return forecasts


def _print_ranking(heading, entries):
    """Print ``heading`` followed by the per-year forecasts of each entry.

    Each entry is a ``(row_label, average, forecasts)`` tuple; ``row_label``
    is the row's index label in ``df``.
    """
    print(heading)
    for row_label, _, forecasts in entries:
        print("Country:", df.loc[row_label, "country_name"])
        print("Predictions for the upcoming years:")
        for year, gdp_value in zip(FORECAST_YEARS, forecasts):
            print(f"{year}: {gdp_value}")
        print()


predictions = []        # (continent, row_label, average, forecasts) for every forecast
ignored_countries = []  # index labels of rows skipped for lack of data

# Value columns only. The derived 'Continent' column also sits after the two
# leading columns; counting it would add one spurious null to every row once
# it has been coerced to NaN, shifting the missing-value threshold by one.
data_columns = df.columns[2:].drop('Continent', errors='ignore')

for continent, countries in continents.items():
    continent_data = df[df['country_name'].isin(countries)]
    continent_predictions = []

    for row_label, row in continent_data.iterrows():
        series = row[data_columns]
        too_sparse = series.isnull().sum() > MAX_MISSING_VALUES
        observed = None if too_sparse else series.dropna().astype(float)
        # At least two observations are needed to form one (previous, current) pair.
        if too_sparse or len(observed) < 2:
            # A country listed under two continents is visited twice; record it once.
            if row_label not in ignored_countries:
                ignored_countries.append(row_label)
            continue

        forecasts = _forecast_country(observed, FORECAST_YEARS)
        continent_predictions.append((row_label, np.mean(forecasts), forecasts))

    # Highest average forecast first.
    continent_predictions.sort(key=lambda entry: entry[1], reverse=True)
    predictions.extend((continent, *entry) for entry in continent_predictions)

    _print_ranking(
        f"\nTop 3 countries with the highest GDP growth in {continent}:",
        continent_predictions[:3],
    )
    _print_ranking(
        f"\nBottom 3 countries with the lowest GDP growth in {continent}:",
        continent_predictions[-3:],
    )


print(f"\nIgnored countries for prediction (more than {MAX_MISSING_VALUES} missing values):")
for index in ignored_countries:
    print(df.loc[index, "country_name"])
