# Country Averages DataFrame

In [9]:
import pandas as pd
import numpy as np
import duckdb
from sklearn.linear_model import LinearRegression
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn import preprocessing
import statsmodels.api as sm

We created this data frame in order to deal with the missing values in our data set. This data frame contains the average value of each variable aggregated across all years for each country.

In [10]:
# Load the combined country/year panel from the CSV in the working directory.
combined_df = pd.read_csv(filepath_or_buffer="combined_data.csv")

In [11]:
# Preview the first five rows to check that the file was parsed as expected.
combined_df.head(n=5)

Unnamed: 0,Country,Year,GDP per Capita,Population(Million),Average Spending on Higher Education (USD/student),Government Expenditure On Education (%),Government Spending Teritary (% Gov Spending),Household Income per Capita,Number of Universities,Primary Enrollment rate (% gross),Secondary Enrollment rate (% gross),Tertiary Enrollment rate (% gross),Population with Tertiary Education (%)
0,AUS,1995,22442.32,18.004882,,5.13413,1.4,,187,101.29127,143.23387,70.68857,24.697142
1,AUS,1996,23289.4,18.224767,,5.231,1.6,,187,101.58029,148.636,75.64778,
2,AUS,1997,24477.41,18.423037,,,,,187,101.18403,152.93597,80.90665,25.702168
3,AUS,1998,25708.26,18.607584,,,,,187,100.92862,,,28.139578
4,AUS,1999,27139.25,18.812264,,,,,187,100.65884,,,28.981586


In [12]:
# List the dtype pandas inferred for each column.
combined_df.dtypes

Country                                                object
Year                                                    int64
GDP per Capita                                        float64
Population(Million)                                   float64
Average Spending on Higher Education (USD/student)    float64
Government Expenditure On Education (%)               float64
Government Spending Teritary (% Gov Spending)         float64
Household Income per Capita                           float64
Number of Universities                                  int64
Primary Enrollment rate (% gross)                     float64
Secondary Enrollment rate (% gross)                   float64
Tertiary Enrollment rate (% gross)                    float64
Population with Tertiary Education (%)                float64
dtype: object

Every variable must be numeric before it can be averaged, so we explicitly coerce the `Government Spending Teritary (% Gov Spending)` column to a numeric dtype. Any entry that cannot be parsed as a number becomes NaN.

In [13]:
# Coerce the tertiary-spending column to a numeric dtype; entries that cannot
# be parsed as numbers become NaN (errors="coerce").
gov_spending = pd.to_numeric(
    combined_df["Government Spending Teritary (% Gov Spending)"],
    errors="coerce",
)
# pd.to_numeric returns a new Series, so the result has to be written back;
# otherwise combined_df (which the later query reads) is left unchanged.
combined_df["Government Spending Teritary (% Gov Spending)"] = gov_spending

In [14]:
# Re-inspect the column dtypes after the numeric-coercion cell.
combined_df.dtypes

Country                                                object
Year                                                    int64
GDP per Capita                                        float64
Population(Million)                                   float64
Average Spending on Higher Education (USD/student)    float64
Government Expenditure On Education (%)               float64
Government Spending Teritary (% Gov Spending)         float64
Household Income per Capita                           float64
Number of Universities                                  int64
Primary Enrollment rate (% gross)                     float64
Secondary Enrollment rate (% gross)                   float64
Tertiary Enrollment rate (% gross)                    float64
Population with Tertiary Education (%)                float64
dtype: object

Now that every column other than `Country` has a numeric dtype (either float or integer), we can create the country-averages data frame.

To create it, we select every variable column in `combined_df` (all columns except `Year`) and compute its average for each `Country`.

In [15]:
# Collapse the country/year panel to one row per country by averaging every
# variable over all available years. DuckDB reads `combined_df` directly from
# the notebook namespace.
# NOTE(review): "Population(Million)" is exported as "Population (Million)"
# (with a space); kept as-is in case downstream notebooks rely on that name.
query = """
        SELECT
            Country,
            AVG("GDP per Capita") AS "GDP per Capita",
            AVG("Population(Million)") AS "Population (Million)",
            AVG("Average Spending on Higher Education (USD/student)")
            AS "Average Spending on Higher Education (USD/student)",
            AVG("Government Expenditure On Education (%)")
            AS "Government Expenditure On Education (%)",
            AVG("Government Spending Teritary (% Gov Spending)")
            AS "Government Spending Teritary (% Gov Spending)",
            AVG("Household Income per Capita")
            AS "Household Income per Capita",
            AVG("Number of Universities")
            AS "Number of Universities",
            AVG("Primary Enrollment rate (% gross)")
            AS "Primary Enrollment rate (% gross)",
            AVG("Secondary Enrollment rate (% gross)")
            AS "Secondary Enrollment rate (% gross)",
            AVG("Tertiary Enrollment rate (% gross)")
            AS "Tertiary Enrollment rate (% gross)",
            AVG("Population with Tertiary Education (%)")
            AS "Population with Tertiary Education (%)"
        FROM combined_df
        GROUP BY Country
        -- GROUP BY alone does not guarantee row order; sort so that the
        -- preview and the exported CSV are identical on every run.
        ORDER BY Country
        """
df = duckdb.sql(query).df()
df.head()

Unnamed: 0,Country,GDP per Capita,Population (Million),Average Spending on Higher Education (USD/student),Government Expenditure On Education (%),Government Spending Teritary (% Gov Spending),Household Income per Capita,Number of Universities,Primary Enrollment rate (% gross),Secondary Enrollment rate (% gross),Tertiary Enrollment rate (% gross),Population with Tertiary Education (%)
0,AUS,38143.8412,21.22152,17397.198571,5.063198,1.24375,35349.783486,187.0,102.619088,147.339639,100.465227,40.367922
1,AUT,39269.2264,8.311855,18464.9425,5.47438,1.5,33898.52129,84.0,101.832554,100.166013,70.926428,35.767432
2,BEL,36588.5528,10.709564,17163.086154,5.935744,1.275,31524.486336,142.0,102.60156,155.965544,65.986742,41.729849
3,CAN,37217.9484,33.088363,22755.0575,5.031936,1.733333,30849.148394,383.0,100.06532,106.513946,67.084939,53.696534
4,CHE,49494.9216,7.678109,24848.25,4.900874,1.245455,38082.094754,103.0,103.458437,97.630051,48.29867,35.897165


In [16]:
# Display every row that still has at least one missing value; an empty
# result means no NaNs remain in the averaged frame.
df.loc[df.isna().any(axis="columns")]

Unnamed: 0,Country,GDP per Capita,Population (Million),Average Spending on Higher Education (USD/student),Government Expenditure On Education (%),Government Spending Teritary (% Gov Spending),Household Income per Capita,Number of Universities,Primary Enrollment rate (% gross),Secondary Enrollment rate (% gross),Tertiary Enrollment rate (% gross),Population with Tertiary Education (%)


Because the filter for rows with NaN values returns an empty data frame, we know that `df` does not contain any NaNs.

In [17]:
# Save the per-country averages for later use; the row index is not written.
df.to_csv(path_or_buf="country_average.csv", index=False)