In [1]:
# Cell 1
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
from scipy import stats
from scipy.spatial.distance import cdist
import random
import warnings
# NOTE(review): this silences every warning category for the rest of the
# session, including numpy RuntimeWarnings (e.g. division by zero) that later
# numeric code may emit. Consider narrowing the filter.
warnings.filterwarnings('ignore')

# Notebook banner.
print("World Economic Data Analysis - Machine Learning Edition")
print("=" * 60)

World Economic Data Analysis - Machine Learning Edition


In [2]:
# Cell 2
# Load the raw country-level indicators and take a first look at them.
df = pd.read_csv('all_countries_data.csv')

print(f"Dataset shape: {df.shape}")
print(f"Columns: {list(df.columns)}")
print("\nFirst few rows:")
print(df.head())

# Per-column count of missing values; this drives the imputation in Cell 3.
print("\nMissing values:")
print(df.isnull().sum())

Dataset shape: (167, 6)
Columns: ['country', 'unemployment_rate', 'tourism_millions', 'gdp_per_capita', 'cost_of_living', 'corruption_index']

First few rows:
          country  unemployment_rate  tourism_millions  gdp_per_capita  \
0     Afghanistan                NaN               NaN             NaN   
1         Albania                NaN               2.7             NaN   
2         Algeria                NaN               NaN             NaN   
3  American Samoa               29.8               NaN             NaN   
4          Angola                NaN               NaN             NaN   

   cost_of_living  corruption_index  
0             NaN              84.0  
1            52.0               NaN  
2            29.9              67.0  
3             NaN               NaN  
4            23.0              71.0  

Missing values:
country                0
unemployment_rate    103
tourism_millions     126
gdp_per_capita       117
cost_of_living        60
corruption_index      57
d

In [3]:
# Cell 3
print("\n" + "=" * 50)
print("DATA PREPROCESSING")
print("=" * 50)

# Work on a copy so the raw frame loaded in Cell 2 stays untouched.
df_analysis = df.copy()

numerical_cols = ['unemployment_rate', 'tourism_millions', 'gdp_per_capita',
                  'cost_of_living', 'corruption_index']

# Fill every gap in a numeric column with that column's own median.
column_medians = df_analysis[numerical_cols].median()
df_analysis[numerical_cols] = df_analysis[numerical_cols].fillna(column_medians)

print("Missing values handled using median imputation")
remaining_complete = len(df_analysis.dropna())
print(f"Complete cases now: {remaining_complete}")

print("\nBasic Statistics:")
summary_stats = df_analysis[numerical_cols].describe()
print(summary_stats)


DATA PREPROCESSING
Missing values handled using median imputation
Complete cases now: 167

Basic Statistics:
       unemployment_rate  tourism_millions  gdp_per_capita  cost_of_living  \
count         167.000000        167.000000      167.000000      167.000000   
mean            6.082036          5.845509    49343.598802       57.649102   
std             5.597335         10.670739    12724.110219       27.756742   
min             0.100000          0.000000    30422.000000       22.800000   
25%             5.000000          4.100000    47316.000000       41.900000   
50%             5.000000          4.100000    47316.000000       48.900000   
75%             5.000000          4.100000    47316.000000       60.450000   
max            36.000000        117.100000   134754.000000      157.600000   

       corruption_index  
count        167.000000  
mean          62.544910  
std           18.559805  
min           12.000000  
25%           62.500000  
50%           69.000000  
75%  

In [4]:
# Cell 4
print("\n" + "="*50)
print("EXPLORATORY DATA ANALYSIS")
print("="*50)

# Pairwise correlations between the numeric indicators.
correlation_matrix = df_analysis[numerical_cols].corr()

# Pin the colour range to [-1, 1]: correlations live in that interval, and a
# symmetric range puts the neutral midpoint of the diverging RdBu scale at
# zero. Without it the scale is stretched over whatever min/max the data
# happens to have, so the colour no longer tells positive from negative.
fig_corr = px.imshow(correlation_matrix,
                     text_auto=True,
                     aspect="auto",
                     zmin=-1,
                     zmax=1,
                     title="Economic Indicators Correlation Matrix",
                     color_continuous_scale='RdBu')
fig_corr.show()


EXPLORATORY DATA ANALYSIS


In [5]:
# Cell 5
# Original histograms (overview)
# 2x3 subplot grid with five titled panels; the sixth slot (row 2, col 3)
# receives no trace below and stays empty.
fig_dist = make_subplots(
    rows=2, cols=3,
    subplot_titles=('GDP per Capita', 'Cost of Living', 'Corruption Index', 
                   'Unemployment Rate', 'Tourism (Millions)'),
    specs=[[{'secondary_y': False}, {'secondary_y': False}, {'secondary_y': False}],
           [{'secondary_y': False}, {'secondary_y': False}, {'type': 'xy'}]]
)

# One histogram per indicator, added in the same order as subplot_titles so
# each trace lands under its matching title.
fig_dist.add_trace(go.Histogram(x=df_analysis['gdp_per_capita'], name='GDP per Capita'), row=1, col=1)
fig_dist.add_trace(go.Histogram(x=df_analysis['cost_of_living'], name='Cost of Living'), row=1, col=2)
fig_dist.add_trace(go.Histogram(x=df_analysis['corruption_index'], name='Corruption Index'), row=1, col=3)
fig_dist.add_trace(go.Histogram(x=df_analysis['unemployment_rate'], name='Unemployment Rate'), row=2, col=1)
fig_dist.add_trace(go.Histogram(x=df_analysis['tourism_millions'], name='Tourism'), row=2, col=2)

# The legend is hidden; the subplot titles identify each panel instead.
fig_dist.update_layout(height=600, showlegend=False, title_text="Distribution of Economic Indicators")
fig_dist.show()

# Individual scatter plots showing ALL country names
# Three plots with the same layout: country on the x axis, one indicator on
# the y axis, tick labels rotated 45 degrees.
# NOTE(review): df_analysis holds the median-imputed values from Cell 3, so
# countries with no reported value are plotted at the column median.
print("Interactive Plots with All Country Names:")
print("=" * 50)

# GDP per Capita - scatter plot with all countries
fig_gdp = px.scatter(df_analysis, x='country', y='gdp_per_capita',
                    hover_data=['country', 'gdp_per_capita'],
                    title='GDP per Capita by Country (Hover to see values)',
                    labels={'gdp_per_capita': 'GDP per Capita'})
fig_gdp.update_xaxes(tickangle=45)
fig_gdp.update_layout(height=500, xaxis_title="Countries")
fig_gdp.show()

# Cost of Living - scatter plot with all countries
fig_cost = px.scatter(df_analysis, x='country', y='cost_of_living',
                     hover_data=['country', 'cost_of_living'],
                     title='Cost of Living by Country (Hover to see values)',
                     labels={'cost_of_living': 'Cost of Living Index'})
fig_cost.update_xaxes(tickangle=45)
fig_cost.update_layout(height=500, xaxis_title="Countries")
fig_cost.show()

# Corruption Index - scatter plot with all countries
fig_corruption = px.scatter(df_analysis, x='country', y='corruption_index',
                           hover_data=['country', 'corruption_index'],
                           title='Corruption Index by Country (Hover to see values)',
                           labels={'corruption_index': 'Corruption Index'})
fig_corruption.update_xaxes(tickangle=45)
fig_corruption.update_layout(height=500, xaxis_title="Countries")
fig_corruption.show()

# Alternative: Strip plots (better for seeing all countries)
print("\nStrip Plots - All Countries Visible:")
print("=" * 40)

# GDP Strip Plot
# One marker per row of df_analysis; the country name is shown on hover.
fig_strip_gdp = px.strip(df_analysis, y='gdp_per_capita',
                        hover_data=['country'],
                        title='GDP per Capita Distribution - All Countries')
fig_strip_gdp.show()

# Box plot with individual points
# points="all" draws every observation next to the box, not only outliers.
fig_box_gdp = px.box(df_analysis, y='gdp_per_capita',
                    points="all",
                    hover_data=['country'],
                    title='GDP per Capita Distribution with All Countries')
fig_box_gdp.show()

# Show extreme values with country names
print("\nCountries with Extreme Values:")
print("=" * 30)

# Each section ranks the countries on one indicator and prints
# "country: value" pairs, ten per section.
print("\nTop 10 Highest GDP per Capita:")
top_gdp = df_analysis[['country', 'gdp_per_capita']].nlargest(10, 'gdp_per_capita')
for name, value in zip(top_gdp['country'], top_gdp['gdp_per_capita']):
    print(f"  {name}: ${value:,.0f}")

print("\nTop 10 Lowest GDP per Capita:")
bottom_gdp = df_analysis[['country', 'gdp_per_capita']].nsmallest(10, 'gdp_per_capita')
for name, value in zip(bottom_gdp['country'], bottom_gdp['gdp_per_capita']):
    print(f"  {name}: ${value:,.0f}")

print("\nTop 10 Highest Cost of Living:")
top_cost = df_analysis[['country', 'cost_of_living']].nlargest(10, 'cost_of_living')
for name, value in zip(top_cost['country'], top_cost['cost_of_living']):
    print(f"  {name}: {value:.1f}")

# The largest index values are the ones reported as "most corrupt" here.
print("\nTop 10 Most Corrupt Countries:")
most_corrupt = df_analysis[['country', 'corruption_index']].nlargest(10, 'corruption_index')
for name, value in zip(most_corrupt['country'], most_corrupt['corruption_index']):
    print(f"  {name}: {value:.0f}")

print("\nTop 10 Least Corrupt Countries:")
least_corrupt = df_analysis[['country', 'corruption_index']].nsmallest(10, 'corruption_index')
for name, value in zip(least_corrupt['country'], least_corrupt['corruption_index']):
    print(f"  {name}: {value:.0f}")

# Summary table of all countries (first 20)
print(f"\nSample of All Countries Data:")
print("=" * 30)
sample_columns = ['country', 'gdp_per_capita', 'cost_of_living', 'corruption_index']
sample_data = df_analysis.head(20)[sample_columns]
print(sample_data.to_string(index=False))

Interactive Plots with All Country Names:



Strip Plots - All Countries Visible:



Countries with Extreme Values:

Top 10 Highest GDP per Capita:
  Luxembourg: $134,754
  Singapore: $116,486
  Ireland: $106,456
  Qatar: $93,521
  Bermuda: $85,192
  Norway: $79,201
  Switzerland: $77,324
  Macao: $73,802
  United States: $69,288
  Brunei: $66,620

Top 10 Lowest GDP per Capita:
  Oman: $30,422
  Turkey: $30,472
  Greece: $31,295
  Cyprus: $31,509
  Panama: $31,680
  Slovakia: $33,010
  Russia: $33,361
  Croatia: $33,801
  Bahamas: $34,108
  Latvia: $34,469

Top 10 Highest Cost of Living:
  Bermuda: 157.6
  Switzerland: 142.4
  Cayman Islands: 137.9
  Israel: 130.2
  Iceland: 128.0
  New Caledonia: 125.8
  Norway: 124.6
  Turks and Caicos Islands: 124.6
  Barbados: 121.5
  Denmark: 119.9

Top 10 Most Corrupt Countries:
  South Sudan: 89
  Somalia: 87
  Syria: 87
  Venezuela: 86
  Afghanistan: 84
  Yemen: 84
  Equatorial Guinea: 83
  Libya: 83
  Burundi: 81
  Congo (Dem. Republic): 81

Top 10 Least Corrupt Countries:
  Denmark: 12
  Finland: 12
  New Zealand: 12
  Norwa

In [6]:
# Cell 6
# Pairwise scatter-plot matrix over all numeric indicators; the country name
# is attached as hover text.
fig_scatter = px.scatter_matrix(df_analysis, 
                               dimensions=numerical_cols,
                               hover_data=['country'],
                               title="Economic Indicators - Interactive Scatter Plot Matrix",
                               height=800)

# Hide the diagonal panels and the upper triangle (mirror of the lower one).
fig_scatter.update_traces(diagonal_visible=False, showupperhalf=False)
fig_scatter.show()

def show_top_bottom_countries(df, column, n=5):
    """Print the ``n`` largest and ``n`` smallest rows of ``df`` for ``column``.

    Args:
        df: DataFrame with a ``'country'`` column and the numeric ``column``.
        column: Name of the column to rank on. It is also used, upper-cased
            and with underscores turned into spaces, as the section heading.
        n: Number of rows to list in each direction.

    Returns:
        None. The listing is written to stdout, values with two decimals.
    """
    heading = column.upper().replace('_', ' ')
    print(f"\n{heading}:")

    # (caption, ranking method) pairs. The ranking is evaluated only after
    # its caption has been printed, preserving the original output order.
    sections = (
        (f"TOP {n} Countries:", df.nlargest),
        (f"\nBOTTOM {n} Countries:", df.nsmallest),
    )
    for caption, rank in sections:
        print(caption)
        selected = rank(n, column)[['country', column]]
        for name, value in zip(selected['country'], selected[column]):
            print(f"   {name}: {value:.2f}")

# Print the top/bottom five countries for every numeric indicator.
for col in numerical_cols:
    show_top_bottom_countries(df_analysis, col)

# Four pairwise scatter plots, each pairing GDP per capita with one other
# indicator. The remaining indicators are exposed through hover_data.
fig1 = px.scatter(df_analysis, x='gdp_per_capita', y='corruption_index',
                 hover_data=['country', 'cost_of_living', 'unemployment_rate'],
                 title='GDP per Capita vs Corruption Index',
                 labels={'gdp_per_capita': 'GDP per Capita ($)', 
                        'corruption_index': 'Corruption Index'})
fig1.show()

fig2 = px.scatter(df_analysis, x='gdp_per_capita', y='cost_of_living',
                 hover_data=['country', 'corruption_index', 'unemployment_rate'],
                 title='GDP per Capita vs Cost of Living',
                 labels={'gdp_per_capita': 'GDP per Capita ($)', 
                        'cost_of_living': 'Cost of Living Index'})
fig2.show()

# From here on GDP per capita moves to the y axis.
fig3 = px.scatter(df_analysis, x='tourism_millions', y='gdp_per_capita',
                 hover_data=['country', 'corruption_index', 'cost_of_living'],
                 title='Tourism vs GDP per Capita',
                 labels={'tourism_millions': 'Tourism (Millions)', 
                        'gdp_per_capita': 'GDP per Capita ($)'})
fig3.show()

fig4 = px.scatter(df_analysis, x='unemployment_rate', y='gdp_per_capita',
                 hover_data=['country', 'corruption_index', 'cost_of_living'],
                 title='Unemployment Rate vs GDP per Capita',
                 labels={'unemployment_rate': 'Unemployment Rate (%)', 
                        'gdp_per_capita': 'GDP per Capita ($)'})
fig4.show()



UNEMPLOYMENT RATE:
TOP 5 Countries:
   Marshall Islands: 36.00
   South Africa: 33.60
   Kiribati: 30.60
   Kosovo: 30.50
   American Samoa: 29.80

BOTTOM 5 Countries:
   Cocos (Keeling) Islands: 0.10
   Qatar: 0.30
   Cambodia: 0.60
   Niger: 0.80
   Gibraltar: 1.00

TOURISM MILLIONS:
TOP 5 Countries:
   France: 117.10
   Mexico: 51.10
   United States: 45.00
   Italy: 38.40
   Hungary: 31.60

BOTTOM 5 Countries:
   Moldova: 0.00
   Bermuda: 0.10
   Serbia: 0.40
   Ethiopia: 0.50
   Luxembourg: 0.50

GDP PER CAPITA:
TOP 5 Countries:
   Luxembourg: 134754.00
   Singapore: 116486.00
   Ireland: 106456.00
   Qatar: 93521.00
   Bermuda: 85192.00

BOTTOM 5 Countries:
   Oman: 30422.00
   Turkey: 30472.00
   Greece: 31295.00
   Cyprus: 31509.00
   Panama: 31680.00

COST OF LIVING:
TOP 5 Countries:
   Bermuda: 157.60
   Switzerland: 142.40
   Cayman Islands: 137.90
   Israel: 130.20
   Iceland: 128.00

BOTTOM 5 Countries:
   Tajikistan: 22.80
   Angola: 23.00
   Kyrgyzstan: 25.30
   Pakista

In [7]:
# Cell 7
print("\n" + "="*50)
print("LINEAR REGRESSION ANALYSIS")
print("="*50)

def simple_linear_regression(X, y):
    """Fit ordinary least squares with an intercept term.

    Args:
        X: Feature matrix of shape (n_samples, n_features), or a 1-D array
            for a single feature.
        y: Target values of length n_samples.

    Returns:
        Coefficient vector ``theta`` where ``theta[0]`` is the intercept and
        ``theta[1:]`` are the feature weights, in the column order of ``X``.
    """
    X_with_bias = np.column_stack([np.ones(len(X)), X])
    # Solve the least-squares problem directly instead of inverting
    # X^T X. Explicit inversion only raises LinAlgError for an *exactly*
    # singular matrix, so nearly collinear features used to slip through and
    # give numerically unstable coefficients. lstsq handles rank-deficient
    # designs and returns the minimum-norm solution.
    theta, *_ = np.linalg.lstsq(X_with_bias, y, rcond=None)
    return theta

def predict_linear(X, theta):
    """Evaluate a fitted linear model on ``X``.

    Args:
        X: Feature matrix (n_samples, n_features) or a 1-D single-feature array.
        theta: Coefficients with the intercept first, followed by one weight
            per feature column.

    Returns:
        Array of predictions, one per row of ``X``.
    """
    # Prepend a column of ones so theta[0] acts as the intercept.
    intercept_column = np.ones(len(X))
    design = np.column_stack((intercept_column, X))
    return np.matmul(design, theta)

def calculate_r2(y_true, y_pred):
    """Coefficient of determination (R^2) of predictions against targets.

    Args:
        y_true: Observed target values (array-like).
        y_pred: Predicted values, same length as ``y_true``.

    Returns:
        ``1 - SS_res / SS_tot``. When ``y_true`` is constant the ratio is
        undefined; the function then returns 1.0 for an exact fit and 0.0
        otherwise, instead of nan / -inf. Empty input returns nan.
    """
    # Accept lists and other array-likes, not only numpy arrays.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    if y_true.size == 0:
        return float('nan')

    ss_res = np.sum((y_true - y_pred) ** 2)
    ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)
    if ss_tot == 0:
        # Constant target: avoid the 0-division and report a finite score.
        return 1.0 if ss_res == 0 else 0.0
    return 1 - (ss_res / ss_tot)

def calculate_mse(y_true, y_pred):
    """Mean of the squared differences between ``y_true`` and ``y_pred``."""
    residuals = y_true - y_pred
    return np.mean(np.square(residuals))

print("Linear regression functions defined successfully")


LINEAR REGRESSION ANALYSIS
Linear regression functions defined successfully


In [8]:
# Cell 8
# Predict GDP per capita from the four remaining indicators.
# NOTE(review): both the features and the target come from df_analysis, i.e.
# they include the median-imputed values produced in Cell 3.
X_features = ['cost_of_living', 'corruption_index', 'unemployment_rate', 'tourism_millions']
X = df_analysis[X_features].values
y = df_analysis['gdp_per_capita'].values

# Reproducible 80/20 split: draw the training rows without replacement and
# use every remaining row as the test set.
np.random.seed(42)
n_samples = len(X)
train_indices = np.random.choice(n_samples, size=int(0.8 * n_samples), replace=False)
test_indices = np.setdiff1d(np.arange(n_samples), train_indices)

X_train, X_test = X[train_indices], X[test_indices]
y_train, y_test = y[train_indices], y[test_indices]

# Fit on the training rows only; theta[0] is the intercept.
theta = simple_linear_regression(X_train, y_train)

y_pred_train = predict_linear(X_train, theta)
y_pred_test = predict_linear(X_test, theta)

mse_train = calculate_mse(y_train, y_pred_train)
mse_test = calculate_mse(y_test, y_pred_test)
r2_train = calculate_r2(y_train, y_pred_train)
r2_test = calculate_r2(y_test, y_pred_test)

print(f"Linear Regression Results:")
print(f"Train MSE: {mse_train:.2f}")
print(f"Test MSE: {mse_test:.2f}")
print(f"Train R²: {r2_train:.4f}")
print(f"Test R²: {r2_test:.4f}")

# Rank features by the absolute size of their coefficient (intercept excluded).
# NOTE(review): X is used in raw units (no scaling above), so coefficient
# magnitudes depend on each feature's scale and are not directly comparable.
feature_importance = pd.DataFrame({
    'Feature': X_features,
    'Coefficient': theta[1:],
    'Abs_Coefficient': np.abs(theta[1:])
}).sort_values('Abs_Coefficient', ascending=False)

print(f"\nFeature Importance:")
print(feature_importance)


Linear Regression Results:
Train MSE: 85210563.72
Test MSE: 252386738.37
Train R²: 0.2538
Test R²: 0.2499

Feature Importance:
             Feature  Coefficient  Abs_Coefficient
1   corruption_index  -214.154117       214.154117
2  unemployment_rate  -101.801451       101.801451
3   tourism_millions   -84.265791        84.265791
0     cost_of_living    68.318609        68.318609


In [9]:
# Cell 9
# Predictions for every row (training and test rows together), using the
# coefficients fitted on the training split in Cell 8.
y_pred_all = predict_linear(X, theta)
r2_all = calculate_r2(y, y_pred_all)

# Actual vs predicted; the dashed diagonal marks where prediction == actual.
fig_all = px.scatter(x=y, y=y_pred_all, 
                    title=f'Linear Regression: All Countries (R² = {r2_all:.4f})',
                    labels={'x': 'Actual GDP per Capita', 'y': 'Predicted GDP per Capita'})
fig_all.add_trace(go.Scatter(x=[y.min(), y.max()], 
                            y=[y.min(), y.max()], 
                            mode='lines', name='Perfect Prediction', 
                            line=dict(dash='dash', color='red')))
fig_all.show()

# Same comparison, with training and test rows drawn as separate traces.
fig_combined = go.Figure()

fig_combined.add_trace(go.Scatter(
    x=y_train, y=y_pred_train,
    mode='markers',
    name=f'Training Set (n={len(y_train)})',
    marker=dict(color='blue', size=6)
))

fig_combined.add_trace(go.Scatter(
    x=y_test, y=y_pred_test,
    mode='markers',
    name=f'Test Set (n={len(y_test)})',
    marker=dict(color='red', size=8)
))

fig_combined.add_trace(go.Scatter(
    x=[y.min(), y.max()], 
    y=[y.min(), y.max()],
    mode='lines',
    name='Perfect Prediction',
    line=dict(dash='dash', color='black')
))

fig_combined.update_layout(
    title=f'Linear Regression: Training vs Test Performance (Test R² = {r2_test:.4f})',
    xaxis_title='Actual GDP per Capita',
    yaxis_title='Predicted GDP per Capita'
)
fig_combined.show()

# Attach the prediction and its absolute error to a copy of the data so the
# hover tooltip can show the country behind each point.
results_df = df_analysis.copy()
results_df['Predicted_GDP'] = y_pred_all
results_df['Prediction_Error'] = np.abs(y - y_pred_all)

fig_interactive = px.scatter(results_df, 
                           x='gdp_per_capita', 
                           y='Predicted_GDP',
                           hover_data=['country', 'cost_of_living', 'corruption_index'],
                           title=f'Interactive: All Countries (R² = {r2_all:.4f})',
                           labels={'gdp_per_capita': 'Actual GDP per Capita', 
                                  'Predicted_GDP': 'Predicted GDP per Capita'})
fig_interactive.add_trace(go.Scatter(x=[y.min(), y.max()], 
                                   y=[y.min(), y.max()], 
                                   mode='lines', name='Perfect Prediction', 
                                   line=dict(dash='dash', color='red')))
fig_interactive.show()

# Bar chart of the absolute coefficients computed in Cell 8.
fig_features = px.bar(feature_importance, x='Feature', y='Abs_Coefficient',
                     title='Linear Regression - Feature Importance')
fig_features.show()


In [10]:
# Cell 10
print("\n" + "="*50)
print("CLUSTERING ANALYSIS")
print("="*50)

def standardize_data(X):
    """Scale each column of ``X`` to zero mean and unit (population) std.

    Args:
        X: Array-like of shape (n_samples, n_features).

    Returns:
        Float array of the same shape. A column with zero variance is only
        centred, so it comes back as all zeros instead of NaN.
    """
    X = np.asarray(X, dtype=float)
    column_mean = X.mean(axis=0)
    column_std = X.std(axis=0)
    # A constant column has std == 0; dividing by it would fill the column
    # with NaN and poison every distance computed downstream.
    safe_std = np.where(column_std == 0, 1.0, column_std)
    return (X - column_mean) / safe_std

def kmeans_simple(X, k, max_iters=100, random_state=42):
    """Cluster the rows of ``X`` into ``k`` groups with Lloyd's k-means.

    Args:
        X: Array-like of shape (n_samples, n_features).
        k: Number of clusters; must not exceed n_samples.
        max_iters: Upper bound on assignment/update rounds.
        random_state: Seed for picking the initial centroids. The default
            reproduces the initial draw of the previous global
            ``np.random.seed(42)`` behaviour.

    Returns:
        ``(labels, centroids)``: ``labels`` holds the index of the nearest
        centroid for every row, ``centroids`` has shape (k, n_features).
    """
    X = np.asarray(X, dtype=float)
    n_samples = X.shape[0]

    # A private generator keeps the draw reproducible without resetting the
    # process-wide numpy random state as a side effect.
    rng = np.random.RandomState(random_state)
    centroids = X[rng.choice(n_samples, k, replace=False)]

    for _ in range(max_iters):
        labels = np.argmin(cdist(X, centroids), axis=1)

        new_centroids = centroids.copy()
        for cluster_id in range(k):
            members = X[labels == cluster_id]
            # An empty cluster keeps its previous centroid. Taking the mean
            # of zero rows would give NaN, which then never compares close
            # and corrupts every later distance computation.
            if len(members) > 0:
                new_centroids[cluster_id] = members.mean(axis=0)

        if np.allclose(centroids, new_centroids):
            break
        centroids = new_centroids

    # Assign against the centroids actually returned. This also defines
    # ``labels`` when max_iters == 0.
    labels = np.argmin(cdist(X, centroids), axis=1)
    return labels, centroids

def calculate_inertia(X, labels, centroids):
    """Within-cluster sum of squared distances to the assigned centroid.

    Args:
        X: Data matrix, one row per sample.
        labels: Cluster index for every row of ``X``.
        centroids: Sequence of cluster centres; position ``i`` belongs to
            label ``i``.

    Returns:
        Total squared distance, summed over all non-empty clusters.
    """
    total = 0
    for cluster_id, center in enumerate(centroids):
        members = X[labels == cluster_id]
        if len(members) == 0:
            # Clusters without members contribute nothing.
            continue
        total += np.square(members - center).sum()
    return total


CLUSTERING ANALYSIS


In [11]:
# Cell 11
# Cluster on all five numeric indicators, standardised so that no single
# column dominates the Euclidean distance.
X_cluster = df_analysis[numerical_cols].values
X_scaled = standardize_data(X_cluster)

# Inertia for k = 1..10, collected for the elbow plot.
inertias = []
K_range = range(1, 11)

for k in K_range:
    if k == 1:
        # One cluster: inertia is the total squared deviation from the
        # overall mean, so k-means does not need to run.
        inertias.append(np.sum((X_scaled - np.mean(X_scaled, axis=0)) ** 2))
    else:
        labels, centroids = kmeans_simple(X_scaled, k)
        inertia = calculate_inertia(X_scaled, labels, centroids)
        inertias.append(inertia)

# Line plus markers so each evaluated k is visible.
fig_elbow = px.line(x=list(K_range), y=inertias, 
                    title='Elbow Method for Optimal Number of Clusters',
                    labels={'x': 'Number of Clusters (k)', 'y': 'Inertia'})
fig_elbow.add_scatter(x=list(K_range), y=inertias, mode='markers', marker=dict(size=8))
fig_elbow.show()

In [12]:
# Cell 12
# NOTE(review): k is hard-coded; presumably read off the elbow plot in
# Cell 11 - confirm.
optimal_k = 4
cluster_labels, centroids = kmeans_simple(X_scaled, optimal_k)

# Keep the labels on a copy so df_analysis itself is not modified here.
df_clustered = df_analysis.copy()
df_clustered['Cluster'] = cluster_labels

print(f"K-means clustering completed with {optimal_k} clusters")
print(f"Cluster distribution:")
print(pd.Series(cluster_labels).value_counts().sort_index())

print("\nCLUSTER CHARACTERISTICS")
print("="*30)

# Per-cluster mean of every indicator, in original (unscaled) units.
cluster_summary = df_clustered.groupby('Cluster')[numerical_cols].mean()
print(cluster_summary)

K-means clustering completed with 4 clusters
Cluster distribution:
0      4
1     30
2    126
3      7
Name: count, dtype: int64

CLUSTER CHARACTERISTICS
         unemployment_rate  tourism_millions  gdp_per_capita  cost_of_living  \
Cluster                                                                        
0                 6.950000         62.900000    53317.250000       84.275000   
1                 5.246667          4.326667    63056.733333      107.933333   
2                 4.887302          4.494444    46065.063492       45.394444   
3                30.671429          4.071429    47316.000000       47.514286   

         corruption_index  
Cluster                    
0               43.750000  
1               31.400000  
2               70.301587  
3               67.142857  


In [13]:
# Cell 13
# Plotly Express colours a numeric column on a continuous scale. Cluster ids
# are category labels, so plot them as strings to get one discrete colour and
# one legend entry per cluster. category_orders keeps the legend in numeric
# order regardless of row order.
cluster_plot_data = df_clustered.assign(Cluster=df_clustered['Cluster'].astype(str))
cluster_order = {'Cluster': sorted(cluster_plot_data['Cluster'].unique(), key=int)}

fig_cluster = px.scatter(cluster_plot_data, x='gdp_per_capita', y='corruption_index',
                        color='Cluster',
                        category_orders=cluster_order,
                        hover_data=['country', 'cost_of_living', 'unemployment_rate'],
                        title='Country Clusters: GDP per Capita vs Corruption Index',
                        labels={'gdp_per_capita': 'GDP per Capita',
                               'corruption_index': 'Corruption Index'})
fig_cluster.show()

fig_cluster2 = px.scatter(cluster_plot_data, x='tourism_millions', y='cost_of_living',
                         color='Cluster',
                         category_orders=cluster_order,
                         hover_data=['country', 'gdp_per_capita', 'corruption_index'],
                         title='Country Clusters: Tourism vs Cost of Living',
                         labels={'tourism_millions': 'Tourism (Millions)',
                                'cost_of_living': 'Cost of Living Index'})
fig_cluster2.show()


In [14]:
# Cell 14
print("\n" + "="*50)
print("CLASSIFICATION ANALYSIS")
print("="*50)

def _stump_predict(values, threshold, polarity):
    """Apply a one-feature threshold rule and return 0/1 predictions."""
    if polarity == 1:
        return (values > threshold).astype(int)
    return (values <= threshold).astype(int)


def simple_decision_tree(X, y, test_size=0.2, random_state=42):
    """Fit a one-split decision stump for a binary (0/1) target.

    The data is split into train/test rows; every observed value of every
    feature is tried as a threshold, in both directions, and the rule with
    the highest training accuracy is kept.

    Args:
        X: Array-like of shape (n_samples, n_features).
        y: Array-like of 0/1 labels, length n_samples.
        test_size: Fraction of rows held out for testing.
        random_state: Seed for the train/test draw. The default reproduces
            the split of the previous global ``np.random.seed(42)`` behaviour.

    Returns:
        Dict with ``best_feature`` (column index), ``best_threshold``,
        ``best_polarity`` (1: value > threshold -> class 1; -1: value <=
        threshold -> class 1), ``train_accuracy``, ``test_accuracy`` (nan if
        the test split is empty), ``y_test`` and ``y_pred``.
    """
    X = np.asarray(X)
    y = np.asarray(y)

    # A private generator keeps the split reproducible without resetting the
    # process-wide numpy random state.
    rng = np.random.RandomState(random_state)
    n_samples = len(X)
    test_indices = rng.choice(n_samples, size=int(test_size * n_samples), replace=False)
    train_indices = np.setdiff1d(np.arange(n_samples), test_indices)

    X_train, X_test = X[train_indices], X[test_indices]
    y_train, y_test = y[train_indices], y[test_indices]

    best_accuracy = 0
    best_feature = 0
    best_threshold = 0
    best_polarity = 1

    for feature_idx in range(X.shape[1]):
        feature_values = X_train[:, feature_idx]

        for threshold in np.unique(feature_values):
            # Try both directions. Testing only "greater than" makes a
            # feature that is inversely related to the label unusable.
            # The "greater than" rule goes first and the comparison is
            # strict, so ties still resolve to the original rule.
            for polarity in (1, -1):
                predictions = _stump_predict(feature_values, threshold, polarity)
                accuracy = np.mean(predictions == y_train)

                if accuracy > best_accuracy:
                    best_accuracy = accuracy
                    best_feature = feature_idx
                    best_threshold = threshold
                    best_polarity = polarity

    test_predictions = _stump_predict(X_test[:, best_feature], best_threshold, best_polarity)
    # With no held-out rows there is nothing to score; report nan explicitly.
    if len(y_test) > 0:
        test_accuracy = np.mean(test_predictions == y_test)
    else:
        test_accuracy = float('nan')

    return {
        'best_feature': best_feature,
        'best_threshold': best_threshold,
        'best_polarity': best_polarity,
        'train_accuracy': best_accuracy,
        'test_accuracy': test_accuracy,
        'y_test': y_test,
        'y_pred': test_predictions
    }


CLASSIFICATION ANALYSIS


In [15]:
# Cell 15
# Binary target: 1 when GDP per capita is strictly above the median, else 0.
# NOTE(review): gdp_per_capita in df_analysis was median-imputed in Cell 3,
# so every imputed row equals the median and falls into category 0. The
# "low GDP" class is therefore mostly countries with no reported GDP.
gdp_median = df_analysis['gdp_per_capita'].median()
df_analysis['GDP_Category'] = (df_analysis['gdp_per_capita'] > gdp_median).astype(int)

print(f"GDP per capita median: {gdp_median:.2f}")
print(f"High GDP countries: {df_analysis['GDP_Category'].sum()}")
print(f"Low GDP countries: {len(df_analysis) - df_analysis['GDP_Category'].sum()}")

# Same four predictors as the regression in Cell 8.
X_class_features = ['cost_of_living', 'corruption_index', 'unemployment_rate', 'tourism_millions']
X_class = df_analysis[X_class_features].values
y_class = df_analysis['GDP_Category'].values

# Single-threshold classifier with its own internal train/test split.
results = simple_decision_tree(X_class, y_class)

print(f"\nSimple Decision Tree Results:")
print(f"Best feature: {X_class_features[results['best_feature']]}")
print(f"Best threshold: {results['best_threshold']:.2f}")
print(f"Train accuracy: {results['train_accuracy']:.4f}")
print(f"Test accuracy: {results['test_accuracy']:.4f}")



GDP per capita median: 47316.00
High GDP countries: 25
Low GDP countries: 142

Simple Decision Tree Results:
Best feature: cost_of_living
Best threshold: 74.70
Train accuracy: 0.9254
Test accuracy: 0.8788


In [16]:
# Cell 16
# Rank features by the absolute correlation with the 0/1 GDP label.
feature_importance_class = []
for i, feature in enumerate(X_class_features):
    correlation = np.corrcoef(X_class[:, i], y_class)[0, 1]
    feature_importance_class.append(abs(correlation))

feature_importance_df = pd.DataFrame({
    'Feature': X_class_features,
    'Importance': feature_importance_class
}).sort_values('Importance', ascending=False)

print(f"Feature Importance:")
print(feature_importance_df)

# Use classification-specific names here. Rebinding the global `y_test`
# would overwrite the regression test targets from Cell 8, so re-running the
# regression plots in Cell 9 afterwards would pair them with the wrong data.
y_test_class, y_pred_class = results['y_test'], results['y_pred']
tn = np.sum((y_test_class == 0) & (y_pred_class == 0))
fp = np.sum((y_test_class == 0) & (y_pred_class == 1))
fn = np.sum((y_test_class == 1) & (y_pred_class == 0))
tp = np.sum((y_test_class == 1) & (y_pred_class == 1))

print(f"\nConfusion Matrix:")
print(f"True Negatives: {tn}")
print(f"False Positives: {fp}")
print(f"False Negatives: {fn}")
print(f"True Positives: {tp}")

# Each ratio falls back to 0 when its denominator is empty.
precision = tp / (tp + fp) if (tp + fp) > 0 else 0
recall = tp / (tp + fn) if (tp + fn) > 0 else 0
f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

print(f"\nClassification Metrics:")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1_score:.4f}")

Feature Importance:
             Feature  Importance
1   corruption_index    0.721645
0     cost_of_living    0.643951
3   tourism_millions    0.183730
2  unemployment_rate    0.065116

Confusion Matrix:
True Negatives: 25
False Positives: 4
False Negatives: 0
True Positives: 4

Classification Metrics:
Precision: 0.5000
Recall: 1.0000
F1-Score: 0.6667


In [17]:
# Cell 17
fig_class_importance = px.bar(feature_importance_df, x='Feature', y='Importance',
                             title='Classification - Feature Importance')
fig_class_importance.show()

best_feature_name = X_class_features[results['best_feature']]
# Replace underscores before title-casing; str.title() alone renders
# 'cost_of_living' as 'Cost_Of_Living' in the figure title.
best_feature_label = best_feature_name.replace('_', ' ').title()

# The class label is categorical. As an integer column Plotly Express would
# colour it on a continuous scale, so plot it as a string to get two
# discrete colours.
class_plot_data = df_analysis.assign(GDP_Category=df_analysis['GDP_Category'].astype(str))

fig_classification = px.scatter(class_plot_data, x='gdp_per_capita', y=best_feature_name,
                               color='GDP_Category',
                               category_orders={'GDP_Category': ['0', '1']},
                               title=f'Classification: GDP Category vs {best_feature_label}',
                               labels={'GDP_Category': 'GDP Category (0=Low, 1=High)'})
# Horizontal line at the threshold chosen by the decision stump.
fig_classification.add_hline(y=results['best_threshold'], line_dash="dash",
                           annotation_text=f"Decision Threshold: {results['best_threshold']:.2f}")
fig_classification.show()

# Cell 18
# Text summary assembled from results computed in the earlier cells.
print("\n" + "="*50)
print("KEY INSIGHTS AND RECOMMENDATIONS")
print("="*50)

print("LINEAR REGRESSION INSIGHTS:")
print(f"The model explains {r2_test:.1%} of the variance in GDP per capita")
print(f"Most important factors for GDP prediction:")
# feature_importance is already sorted by absolute coefficient (Cell 8).
for i, row in feature_importance.head(3).iterrows():
    print(f"  - {row['Feature']}: {row['Coefficient']:.2f}")

print(f"\nCLUSTERING INSIGHTS:")
print(f"Countries naturally group into {optimal_k} distinct economic clusters")
print(f"Cluster characteristics suggest different economic development stages")

print(f"\nCLASSIFICATION INSIGHTS:")
print(f"Can predict high vs low GDP countries with {results['test_accuracy']:.1%} accuracy")
print(f"Best predictor: {X_class_features[results['best_feature']]}")
print(f"Most important factors for GDP classification:")
for i, row in feature_importance_df.head(3).iterrows():
    print(f"  - {row['Feature']}: {row['Importance']:.3f}")

# NOTE(review): the recommendation and next-step lines below are fixed
# strings, not derived from the computed results. For example, the recorded
# run reports a negative tourism_millions coefficient while the text suggests
# tourism as a growth strategy. Review the wording against the current numbers.
print(f"\nRECOMMENDATIONS:")
print("Focus on reducing corruption index to improve economic outcomes")
print("Consider tourism development as a GDP growth strategy")
print("Monitor cost of living relative to income levels")
print("Address unemployment through targeted economic policies")

print(f"\nNEXT STEPS:")
print("Collect more recent data for time series analysis")
print("Include additional economic indicators")
print("Develop predictive models for future economic trends")
print("Create country-specific policy recommendations")

print("\n" + "="*50)
print("ANALYSIS COMPLETE!")
print("="*50)


KEY INSIGHTS AND RECOMMENDATIONS
LINEAR REGRESSION INSIGHTS:
The model explains 25.0% of the variance in GDP per capita
Most important factors for GDP prediction:
  - corruption_index: -214.15
  - unemployment_rate: -101.80
  - tourism_millions: -84.27

CLUSTERING INSIGHTS:
Countries naturally group into 4 distinct economic clusters
Cluster characteristics suggest different economic development stages

CLASSIFICATION INSIGHTS:
Can predict high vs low GDP countries with 87.9% accuracy
Best predictor: cost_of_living
Most important factors for GDP classification:
  - corruption_index: 0.722
  - cost_of_living: 0.644
  - tourism_millions: 0.184

RECOMMENDATIONS:
Focus on reducing corruption index to improve economic outcomes
Consider tourism development as a GDP growth strategy
Monitor cost of living relative to income levels
Address unemployment through targeted economic policies

NEXT STEPS:
Collect more recent data for time series analysis
Include additional economic indicators
Develop

In [18]:
# Visualize feature importance
fig_class_importance = px.bar(feature_importance_df, x='Feature', y='Importance',
                             title='Classification - Feature Importance (Correlation)')
fig_class_importance.show()

# Visualize classification results
best_feature_name = X_class_features[results['best_feature']]
# Replace underscores before title-casing; str.title() alone renders
# 'cost_of_living' as 'Cost_Of_Living' in the figure title.
best_feature_label = best_feature_name.replace('_', ' ').title()

# The class label is categorical. As an integer column Plotly Express would
# colour it on a continuous scale, so plot it as a string to get two
# discrete colours.
class_plot_data = df_analysis.assign(GDP_Category=df_analysis['GDP_Category'].astype(str))

fig_classification = px.scatter(class_plot_data, x='gdp_per_capita', y=best_feature_name,
                               color='GDP_Category',
                               category_orders={'GDP_Category': ['0', '1']},
                               title=f'Classification: GDP Category vs {best_feature_label}',
                               labels={'GDP_Category': 'GDP Category (0=Low, 1=High)'})
# Horizontal line at the threshold chosen by the decision stump.
fig_classification.add_hline(y=results['best_threshold'], line_dash="dash",
                           annotation_text=f"Decision Threshold: {results['best_threshold']:.2f}")
fig_classification.show()