In [49]:
import pandas as pd
import numpy as np
import duckdb
from sklearn.linear_model import LinearRegression
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn import preprocessing

In [50]:
# Load the combined country/year indicator table; the path is relative, so the
# CSV must sit in the notebook's working directory.
combined_df = pd.read_csv('combined_data.csv')

In [51]:
# Preview the first rows to check the column names and spot missing values.
combined_df.head()

Unnamed: 0,Country,Year,GDP per Capita,Population(Million),Average Spending on Higher Education (USD/student),Government Expenditure On Education (%),Government Spending Teritary (% GDP),Household Income per Capita,Number of Universities,Primary Enrollment rate (% gross),Secondary Enrollment rate (% gross),Tertiary Enrollment rate (% gross),Population with Tertiary Education (%)
0,AUS,1995,22442.32,18.004882,,5.13413,1.4,,187,101.29127,143.23387,70.68857,24.697142
1,AUS,1996,23289.4,18.224767,,5.231,1.6,,187,101.58029,148.636,75.64778,
2,AUS,1997,24477.41,18.423037,,,,,187,101.18403,152.93597,80.90665,25.702168
3,AUS,1998,25708.26,18.607584,,,,,187,100.92862,,,28.139578
4,AUS,1999,27139.25,18.812264,,,,,187,100.65884,,,28.981586


In [52]:
# Collapse the per-year rows into one row per country by averaging every
# indicator column in SQL. `combined_df` in the FROM clause is the pandas
# DataFrame loaded earlier in this notebook; DuckDB resolves it by variable name.
#
# NOTE: the AS aliases set the output column names that later cells rely on.
# Two of them differ from the CSV headers: "Population(Million)" becomes
# "Population (Million)", and "GDP per capita" is spelled with a lower-case "c"
# both as the input identifier and as the alias (the CSV header is
# "GDP per Capita"; the displayed result shows the lookup still resolves).
#
# NOTE(review): AVG appears to skip missing values (countries with NaN rows
# still get a numeric average in the output), so different countries may be
# averaged over different sets of years — confirm this is intended.
query = """
        SELECT 
            Country,
            AVG("GDP per capita") AS "GDP per capita",
            AVG("Population(Million)") AS "Population (Million)",
            AVG("Average Spending on Higher Education (USD/student)")
            AS "Average Spending on Higher Education (USD/student)",
            AVG("Government Expenditure On Education (%)")
            AS "Government Expenditure On Education (%)",
            AVG("Government Spending Teritary (% GDP)")
            AS "Government Spending Teritary (% GDP)",
            AVG("Household Income per Capita")
            AS "Household Income per Capita",
            AVG("Number of Universities")
            AS "Number of Universities",
            AVG("Primary Enrollment rate (% gross)")
            AS "Primary Enrollment rate (% gross)",
            AVG("Secondary Enrollment rate (% gross)")
            AS "Secondary Enrollment rate (% gross)",
            AVG("Tertiary Enrollment rate (% gross)")
            AS "Tertiary Enrollment rate (% gross)",
            AVG("Population with Tertiary Education (%)")
            AS "Population with Tertiary Education (%)"
        FROM combined_df
        GROUP BY Country
        
        """
# Execute the query and convert the result to a pandas DataFrame.
df = duckdb.sql(query).df()
df.head()

Unnamed: 0,Country,GDP per capita,Population (Million),Average Spending on Higher Education (USD/student),Government Expenditure On Education (%),Government Spending Teritary (% GDP),Household Income per Capita,Number of Universities,Primary Enrollment rate (% gross),Secondary Enrollment rate (% gross),Tertiary Enrollment rate (% gross),Population with Tertiary Education (%)
0,AUS,38143.8412,21.22152,17397.198571,5.063198,1.24375,35349.783486,187.0,102.619088,147.339639,100.465227,40.367922
1,AUT,39269.2264,8.311855,18464.9425,5.47438,1.5,33898.52129,84.0,101.832554,100.166013,70.926428,35.767432
2,BEL,36588.5528,10.709564,17163.086154,5.935744,1.275,31524.486336,142.0,102.60156,155.965544,65.986742,41.729849
3,CAN,37217.9484,33.088363,22755.0575,5.031936,1.733333,30849.148394,383.0,100.06532,106.513946,67.084939,53.696534
4,CHE,49494.9216,7.678109,24848.25,4.900874,1.245455,38082.094754,103.0,103.458437,97.630051,48.29867,35.897165


In [53]:
# Display the per-country averages (still includes every country at this point).
df

Unnamed: 0,Country,GDP per capita,Population (Million),Average Spending on Higher Education (USD/student),Government Expenditure On Education (%),Government Spending Teritary (% GDP),Household Income per Capita,Number of Universities,Primary Enrollment rate (% gross),Secondary Enrollment rate (% gross),Tertiary Enrollment rate (% gross),Population with Tertiary Education (%)
0,AUS,38143.8412,21.22152,17397.198571,5.063198,1.24375,35349.783486,187.0,102.619088,147.339639,100.465227,40.367922
1,AUT,39269.2264,8.311855,18464.9425,5.47438,1.5,33898.52129,84.0,101.832554,100.166013,70.926428,35.767432
2,BEL,36588.5528,10.709564,17163.086154,5.935744,1.275,31524.486336,142.0,102.60156,155.965544,65.986742,41.729849
3,CAN,37217.9484,33.088363,22755.0575,5.031936,1.733333,30849.148394,383.0,100.06532,106.513946,67.084939,53.696534
4,CHE,49494.9216,7.678109,24848.25,4.900874,1.245455,38082.094754,103.0,103.458437,97.630051,48.29867,35.897165
5,CHL,15891.7824,16.581038,7544.6592,4.042522,0.771429,14680.116641,130.0,101.931867,93.304423,58.761617,27.00079
6,COL,10366.1188,42.574965,5183.9889,4.300569,0.870588,,299.0,118.238453,87.256054,36.675314,26.549254
7,CRI,12656.1856,4.313503,8235.508,5.670795,1.22381,14398.927904,68.0,111.558533,91.478651,49.80273,23.838134
8,CZE,25498.4176,10.397214,10111.913067,4.152662,0.840909,21668.392262,64.0,101.572463,96.110753,48.297053,20.194433
9,DEU,37336.2512,81.991879,17381.116667,4.74316,1.213333,34478.737746,461.0,103.14862,101.347733,61.839059,25.446672


In [54]:
# Write the per-country averages to disk; index=False keeps the row index out
# of the file. This runs before any countries are filtered out of `df`.
df.to_csv('country_average.csv', index=False)

In [55]:
# Exclude COL, ISL and ISR from the modelling table.
# NOTE(review): presumably these are dropped for missing indicator values (COL
# has no Household Income per Capita in the table above) — confirm for ISL/ISR.
excluded_countries = ["COL", "ISL", "ISR"]
df = df[~df['Country'].isin(excluded_countries)]

In [56]:
# Renumber the rows 0..n-1 after the filter; drop=True discards the old index
# instead of keeping it as a column.
df = df.reset_index(drop=True)

In [57]:
# Display the filtered, re-indexed table that feeds the regression below.
df

Unnamed: 0,Country,GDP per capita,Population (Million),Average Spending on Higher Education (USD/student),Government Expenditure On Education (%),Government Spending Teritary (% GDP),Household Income per Capita,Number of Universities,Primary Enrollment rate (% gross),Secondary Enrollment rate (% gross),Tertiary Enrollment rate (% gross),Population with Tertiary Education (%)
0,AUS,38143.8412,21.22152,17397.198571,5.063198,1.24375,35349.783486,187.0,102.619088,147.339639,100.465227,40.367922
1,AUT,39269.2264,8.311855,18464.9425,5.47438,1.5,33898.52129,84.0,101.832554,100.166013,70.926428,35.767432
2,BEL,36588.5528,10.709564,17163.086154,5.935744,1.275,31524.486336,142.0,102.60156,155.965544,65.986742,41.729849
3,CAN,37217.9484,33.088363,22755.0575,5.031936,1.733333,30849.148394,383.0,100.06532,106.513946,67.084939,53.696534
4,CHE,49494.9216,7.678109,24848.25,4.900874,1.245455,38082.094754,103.0,103.458437,97.630051,48.29867,35.897165
5,CHL,15891.7824,16.581038,7544.6592,4.042522,0.771429,14680.116641,130.0,101.931867,93.304423,58.761617,27.00079
6,CRI,12656.1856,4.313503,8235.508,5.670795,1.22381,14398.927904,68.0,111.558533,91.478651,49.80273,23.838134
7,CZE,25498.4176,10.397214,10111.913067,4.152662,0.840909,21668.392262,64.0,101.572463,96.110753,48.297053,20.194433
8,DEU,37336.2512,81.991879,17381.116667,4.74316,1.213333,34478.737746,461.0,103.14862,101.347733,61.839059,25.446672
9,DNK,38765.7204,5.491185,17796.165769,7.830206,2.28,29396.827843,81.0,100.77911,124.672983,71.262708,37.325815


In [58]:
# Predictor columns for the multi-feature regression; the target is
# "Population with Tertiary Education (%)".
input_vars = [
    "GDP per capita",
    "Population (Million)",
    "Average Spending on Higher Education (USD/student)",
    "Government Expenditure On Education (%)",
    "Government Spending Teritary (% GDP)",
    "Household Income per Capita",
    "Number of Universities",
    "Primary Enrollment rate (% gross)",
    "Secondary Enrollment rate (% gross)",
    "Tertiary Enrollment rate (% gross)",
]


def Normalizer(df_cols, fit_on=None):
    """Standardise columns to zero mean and unit variance.

    Parameters
    ----------
    df_cols : array-like of shape (n_samples, n_features)
        Values to transform.
    fit_on : array-like of shape (m_samples, n_features), optional
        Data used to estimate the per-column mean and standard deviation.
        Defaults to ``df_cols`` itself, which is the original single-argument
        behaviour. Pass the training rows here when transforming held-out
        data so the test set does not influence the scaling statistics.

    Returns
    -------
    numpy.ndarray
        ``df_cols`` scaled with the statistics estimated from ``fit_on``.
    """
    reference = df_cols if fit_on is None else fit_on
    scaler = preprocessing.StandardScaler().fit(reference)
    return scaler.transform(df_cols)


features_raw = df[input_vars].values
y = df["Population with Tertiary Education (%)"]

# Split BEFORE scaling: fitting the scaler on all rows would leak test-set
# statistics into preprocessing. The rows chosen for each split depend only on
# the row count and random_state, so the partition matches the previous one.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    features_raw, y, test_size=0.3, random_state=2950
)
X_train = Normalizer(X_train_raw)
X_test = Normalizer(X_test_raw, fit_on=X_train_raw)

# Keep `x` as the scaled full feature matrix (as before), now expressed in the
# training-set scale so it is consistent with X_train / X_test.
x = Normalizer(features_raw, fit_on=X_train_raw)

model = LinearRegression().fit(X_train, y_train)

In [59]:
# Score the fitted model on the rows it was trained on and on the held-out
# rows; a large gap between the two indicates overfitting.
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# Training split: MSE, then RMSE (its square root, on the target's own scale).
mse_train = mean_squared_error(y_train, y_train_pred)
rmse_train = np.sqrt(mse_train)

# Held-out split: same two metrics.
mse_test = mean_squared_error(y_test, y_test_pred)
rmse_test = np.sqrt(mse_test)

# sep="" reproduces plain label + value concatenation.
print("Mean Squared Error on Training Data: ", mse_train, sep="")
print("Mean Squared Error on Testing data: ", mse_test, sep="")
print("Root Mean Squared Error on Training Data: ", rmse_train, sep="")
print("Root Mean Squared Error on Testing Data: ", rmse_test, sep="")

Mean Squared Error on Training Data: 30.336330076317726
Mean Squared Error on Testing data: 102.69656515821106
Root Mean Squared Error on Training Data: 5.507842597271433
Root Mean Squared Error on Testing Data: 10.133931377220346


In [60]:
# Single-feature regression on the whole table (no train/test split):
# standardised GDP per capita -> share of population with tertiary education.
# Selecting with a list of one column yields the (n_samples, 1) shape that
# scikit-learn expects, so no reshape is needed.
# NOTE: this rebinds `x`, `y` and `model`; re-run the multi-feature cell before
# re-scoring that model.
x = Normalizer(df[["GDP per capita"]].to_numpy())
y = df[["Population with Tertiary Education (%)"]].to_numpy()
model = LinearRegression().fit(x, y)

# Because x is standardised, the coefficient is the change in the target per
# one standard deviation of GDP per capita.
print(model.coef_)

[[5.38228627]]
