In [1]:
import os

# Directory that holds the data files read by this notebook (College.csv).
# The path below is the original author's local course folder; set the
# ECON5763_DATA_DIR environment variable to run the notebook on another
# machine without editing this cell.
DATA_DIR = os.environ.get(
    'ECON5763_DATA_DIR',
    '/Users/robbyjeffries/MSEA2022/Spring 2022/ECON 5763, Economic Analytics/Data',
)

# Later cells read files by relative name, so make DATA_DIR the working directory.
os.chdir(DATA_DIR)


In [2]:
import numpy as np 
import pandas as pd
import math

In [3]:
# Load the College dataset; the relative file name is resolved against the
# current working directory.
raw0 = pd.read_csv(filepath_or_buffer='College.csv')

In [4]:
# Preview the first five rows of the freshly loaded data.
raw0.head(n=5)

Unnamed: 0.1,Unnamed: 0,Private,Apps,Accept,Enroll,Top10perc,Top25perc,F.Undergrad,P.Undergrad,Outstate,Room.Board,Books,Personal,PhD,Terminal,S.F.Ratio,perc.alumni,Expend,Grad.Rate
0,Abilene Christian University,Yes,1660,1232,721,23,52,2885,537,7440,3300,450,2200,70,78,18.1,12,7041,60
1,Adelphi University,Yes,2186,1924,512,16,29,2683,1227,12280,6450,750,1500,29,30,12.2,16,10527,56
2,Adrian College,Yes,1428,1097,336,22,50,1036,99,11250,3750,400,1165,53,66,12.9,30,8735,54
3,Agnes Scott College,Yes,417,349,137,60,89,510,63,12960,5450,450,875,92,97,7.7,37,19016,59
4,Alaska Pacific University,Yes,193,146,55,16,44,249,869,7560,4120,800,1500,76,72,11.9,2,10922,15


In [5]:
# Convert the "Private" variable (Yes/No) to a 0/1 dummy using a built-in function.
# - drop_first=True drops the first category level, leaving one indicator column.
# - dtype='uint8' pins an integer 0/1 result; pandas >= 2.0 would otherwise return
#   booleans, which no longer matches the uint8 column this notebook displays.
# - .iloc[:, 0] picks that single indicator column so a Series, not a one-column
#   DataFrame, is assigned back.
raw0['Private'] = pd.get_dummies(raw0['Private'], drop_first=True, dtype='uint8').iloc[:, 0]

In [6]:
# Inspect the recoded column to confirm it now holds the dummy values.
raw0['Private']

0      1
1      1
2      1
3      1
4      1
      ..
772    0
773    1
774    1
775    1
776    1
Name: Private, Length: 777, dtype: uint8

In [7]:
# Rename 'perc.alumni' to 'palumni', the name the plots and regression
# formulas further down in this notebook refer to. The frame is modified in place.
raw0.rename({'perc.alumni': 'palumni'}, axis='columns', inplace=True)

In [8]:
# Preview the first five rows again to check the recoded and renamed columns.
raw0.head(n=5)

Unnamed: 0.1,Unnamed: 0,Private,Apps,Accept,Enroll,Top10perc,Top25perc,F.Undergrad,P.Undergrad,Outstate,Room.Board,Books,Personal,PhD,Terminal,S.F.Ratio,palumni,Expend,Grad.Rate
0,Abilene Christian University,1,1660,1232,721,23,52,2885,537,7440,3300,450,2200,70,78,18.1,12,7041,60
1,Adelphi University,1,2186,1924,512,16,29,2683,1227,12280,6450,750,1500,29,30,12.2,16,10527,56
2,Adrian College,1,1428,1097,336,22,50,1036,99,11250,3750,400,1165,53,66,12.9,30,8735,54
3,Agnes Scott College,1,417,349,137,60,89,510,63,12960,5450,450,875,92,97,7.7,37,19016,59
4,Alaska Pacific University,1,193,146,55,16,44,249,869,7560,4120,800,1500,76,72,11.9,2,10922,15


### <font color='green'> Plotting Library: matplotlib.pyplot</font>

* matplotlib.pyplot is a collection of functions that make matplotlib work like MATLAB. Each pyplot function makes some change to a figure: e.g., creates a figure, creates a plotting area in a figure, plots some lines in a plotting area, decorates the plot with labels, etc.
    
    1. Introduction: https://matplotlib.org/tutorials/index.html

    2. Useful examples and codes: https://matplotlib.org/gallery/index.html
    
    3. Style reference: https://matplotlib.org/3.2.1/gallery/style_sheets/style_sheets_reference.html

In [None]:
# Simple scatter plot of palumni against Top10perc
import matplotlib.pyplot as plt

# The style must be selected before the figure is created so it applies to it.
plt.style.use('ggplot')

# Explicit Figure/Axes pair instead of the implicit pyplot "current figure".
fig, ax = plt.subplots(figsize=(8, 6), dpi=100)
ax.scatter('Top10perc', 'palumni', data=raw0)  # column names are looked up in raw0
ax.set_xlabel('Top10perc')
ax.set_ylabel('palumni')
plt.show()

### <font color='green'> Running OLS using "statsmodels"</font>

* statsmodels.formula.api: an interface for specifying models using formula strings and DataFrames. 
  (API reference: https://www.statsmodels.org/stable/api.html) 

* Useful examples and codes: https://www.statsmodels.org/stable/examples/index.html

In [None]:
# Formula interface of statsmodels: models are written as 'y ~ x1 + x2' strings
# whose variable names are looked up as columns of the DataFrame passed as `data`.
import statsmodels.formula.api as smf

# Regress palumni on Top10perc and Outstate and keep the fitted results.
OLSres = smf.ols(
    formula='palumni ~ Top10perc + Outstate',
    data=raw0,
).fit()

In [None]:
# Print the plain-text regression summary of the fitted model.
print(OLSres.summary().as_text())

In [None]:
# Interaction and higher-order terms: np.power(Top10perc,2) adds a squared term,
# and Top10perc*Outstate adds both variables together with their interaction.
# Note: this rebinds OLSres, replacing the model fitted earlier.
OLSres = smf.ols(
    formula='palumni ~ np.power(Top10perc,2) + Top10perc*Outstate',
    data=raw0,
).fit()
print(OLSres.summary().as_text())

In [None]:
# Scatter plot of fitted (predicted) and observed palumni against Top10perc.
# The predictions come from whichever model OLSres currently holds.
fig, ax = plt.subplots()

ax.scatter(raw0['Top10perc'], OLSres.predict(), alpha=1, label='predicted')  # fitted
ax.scatter(raw0['Top10perc'], raw0['palumni'], alpha=0.5, label='observed')  # original

ax.legend()
ax.set_title('OLS predicted values')
ax.set_xlabel('Top10perc')
ax.set_ylabel('palumni')
plt.show()

In [None]:
# Access individual estimates from the fitted results object. The full list of
# available attributes is in the OLSResults reference:
# https://www.statsmodels.org/stable/generated/statsmodels.regression.linear_model.OLSResults.html#statsmodels.regression.linear_model.OLSResults

OLSres.params # parameter estimates (coefficients) of the most recently fitted OLSres model

### <font color='green'> Making a table for multiple regressions using "statsmodels.iolib.summary2"</font>

In [None]:
# Fit three nested specifications of palumni, each adding regressors to the
# previous one, and bind the fitted results to OLS1, OLS2 and OLS3 in order.
OLS1, OLS2, OLS3 = (
    smf.ols(formula=spec, data=raw0).fit()
    for spec in (
        'palumni ~ Top10perc',
        'palumni ~ Top10perc + Private + Outstate',
        'palumni ~ Top10perc + Private + Outstate + Personal + Expend',
    )
)

In [None]:
from statsmodels.iolib.summary2 import summary_col

# A dictionary stores data as key-value pairs and is indexed by key instead of
# by position. Here each key is the label of an extra row in the table, and
# each value is a function that takes a fitted result and returns the text to
# show for that model.
info_dict = {
    'BIC': lambda res: format(res.bic, '.2f'),
    'No. observations': lambda res: str(int(res.nobs)),
}

# One column per fitted model; regressor_order fixes the order of the rows.
results_table = summary_col(
    results=[OLS1, OLS2, OLS3],
    float_format='%0.2f',
    stars=True,
    model_names=[f'Model {k}' for k in (1, 2, 3)],
    info_dict=info_dict,
    regressor_order=[
        'Intercept',
        'Top10perc',
        'Private',
        'Outstate',
        'Personal',
        'Expend',
    ],
)

results_table.add_title('OLS Regressions')

print(results_table)

### <font color='darkred'> HW2: Pick five combinations of the regressors to explain the percent of alumni (`palumni`). The regressors may include interactions of two variables in the dataset or squared/cubed variables.</font>
    
1. Run five regressions with each combination
2. Produce a table summarizing the results of your five regressions as above
    
### One of you will present your regression results in the coming Python session. Please check and interpret your regression results carefully.