In [1]:
# basic packages
import os

import numpy as np
import pandas as pd

# SettingWithCopyWarning lives in pandas.errors from pandas 1.5 onwards; the
# older pandas.core.common location is tried next so old installs keep working.
# The name is not used elsewhere in the visible cells, so a missing class must
# not stop the notebook from running.
try:
    from pandas.errors import SettingWithCopyWarning
except ImportError:
    try:
        from pandas.core.common import SettingWithCopyWarning
    except ImportError:
        SettingWithCopyWarning = None  # not importable from either location

# widen the notebook display so large tables are shown in full
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# turn off pandas' chained-assignment warning for the whole session
pd.options.mode.chained_assignment = None

In [2]:
# modeling packages
import statsmodels.api as sm
import statsmodels.formula as smf
from statsmodels.iolib.summary2 import summary_col # creating summary tables
import math

In [3]:
# main data: move into the project folder, then read the Stata file from there
project_dir = 'C:\\Users\\rbishwakarma\\Desktop\\value_proposition_in_python\\value_proposition'
os.chdir(project_dir)
df = pd.read_stata('final.dta')

In [4]:
# keep only the rows where both 'early_careerlog' and 'stem' are present,
# then renumber the index from 0
df = (
    df
    .dropna(axis=0, how='any', subset=['early_careerlog', 'stem'])
    .reset_index(drop=True)
)

# Table 12

In [5]:
# Models: the regressor columns of each specification, in table order
spec_columns = [
    # model1
    ['instruction_FTE', 'academic_support_FTE', 'student_service_FTE', 'R1'],
    # model2
    ['instruction_FTE', 'academic_support_FTE', 'student_service_FTE',
     'research_FTE', 'SATtoACT_math_25', 'pell_grants', 'stem', 'labor_market', 'R1'],
    # model3
    ['academic_support_FTE', 'student_service_FTE', 'research_FTE', 'SATtoACT_math_25',
     'pell_grants', 'stem', 'salary_instruction', 'labor_market', 'R1'],
    # model4
    ['academic_support_FTE', 'student_service_FTE', 'research_FTE', 'SATtoACT_math_25',
     'pell_grants', 'stem', 'salary_instruction', 'female', 'white_percent',
     'hispanic_percent', 'asian_percent', 'labor_market', 'R1'],
    # model5
    ['instruction_FTE', 'academic_support_FTE', 'student_service_FTE', 'research_FTE',
     'SATtoACT_math_25', 'pell_grants', 'stem', 'female', 'white_percent',
     'hispanic_percent', 'asian_percent', 'labor_market', 'R1'],
]

# one regressor frame per specification, also exposed under individual names
models = [df[cols] for cols in spec_columns]
model1, model2, model3, model4, model5 = models

models_name = ['model%d' % k for k in range(1, len(models) + 1)]


# Compute weights for the WLS

In [6]:
# Models WLS
y = df['early_careerlog']
weight = df['students_FTE_total']

# The second-stage regressors (constant + students_FTE_total) are the same for
# every specification, so they are built once.
second_stage_exog = sm.add_constant(weight)

# creating weights: one 'weight_4_<model>' column per specification
for spec_name, design in zip(models_name, models):
    # step 1: unweighted fit of the specification, keep the absolute residuals
    first_stage = sm.OLS(y, sm.add_constant(design)).fit()
    abs_resid = abs(first_stage.resid)

    # step 2: regress the absolute residuals on students_FTE_total
    second_stage = sm.OLS(abs_resid, second_stage_exog).fit()
    fitted_abs_resid = second_stage.predict()

    # weight: squared reciprocal of the step-2 fitted values
    df.loc[:, 'weight_4_' + str(spec_name)] = (1 / fitted_abs_resid) ** 2

# WLS Models

In [7]:
# models: WLS fit of every specification with its own 'weight_4_model<k>' column
y = df['early_careerlog']

wls_results = []
for k, design in enumerate([model1, model2, model3, model4, model5], start=1):
    fit = sm.WLS(y, sm.add_constant(design), weights=df['weight_4_model%d' % k]).fit()
    # keep the in-sample predictions next to the data as 'predict<k>'
    df.loc[:, 'predict%d' % k] = fit.predict()
    wls_results.append(fit)

# individual names used by the later cells
m1, m2, m3, m4, m5 = wls_results


In [8]:
# predicted values: exponentiate the fitted 'predict<k>' value of the UNM row
import math
columns = ['predict1', 'predict2', 'predict3', 'predict4', 'predict5']
names = ['m1', 'm2', 'm3', 'm4', 'm5']

table_unm = df.loc[df['name'] == 'University of New Mexico-Main Campus']

# math.exp needs a plain number. Handing it a Series only works through the
# deprecated implicit float conversion of one-element Series, and fails with an
# opaque TypeError when the filter matches zero or several rows.
if len(table_unm) != 1:
    raise ValueError(
        'expected exactly one row named University of New Mexico-Main Campus, '
        'found %d' % len(table_unm)
    )

for name, column in zip(names, columns):
    x = table_unm[column].iloc[0]  # scalar fitted value for this model
    p = math.exp(x)
    print('Predicted UNM '+str(name) + '  ' '%0.2f' % p)

Predicted UNM m1  48771.47
Predicted UNM m2  45421.82
Predicted UNM m3  45166.45
Predicted UNM m4  44767.34
Predicted UNM m5  44998.21


# OLS Models

In [9]:
# models m6-m10: the five specifications refit with 'pwht' as weights and a
# covariance clustered on 'state'
w = df['pwht']
s = df['state']

# regression results
cluster_results = []
for k, design in enumerate([model1, model2, model3, model4, model5], start=6):
    fit = sm.WLS(y, sm.add_constant(design), weights=w).fit(
        cov_type='cluster', cov_kwds={'groups': s}
    )
    # keep the in-sample predictions next to the data as 'predict<k>'
    df.loc[:, 'predict%d' % k] = fit.predict()
    cluster_results.append(fit)

# individual names used by the later cells
m6, m7, m8, m9, m10 = cluster_results



In [10]:
# predicted values - OLS: exponentiate the fitted 'predict<k>' value of the UNM row
columns = ['predict6', 'predict7', 'predict8', 'predict9', 'predict10']
names = ['m6', 'm7', 'm8', 'm9', 'm10']

table_unm = df.loc[df['name'] == 'University of New Mexico-Main Campus']

# math.exp needs a plain number. Handing it a Series only works through the
# deprecated implicit float conversion of one-element Series, and fails with an
# opaque TypeError when the filter matches zero or several rows.
if len(table_unm) != 1:
    raise ValueError(
        'expected exactly one row named University of New Mexico-Main Campus, '
        'found %d' % len(table_unm)
    )

for name, column in zip(names, columns):
    x = table_unm[column].iloc[0]  # scalar fitted value for this model
    p = math.exp(x)
    print('Predicted UNM '+str(name) + '  ' '%0.2f' % p)

Predicted UNM m6  49195.74
Predicted UNM m7  45731.67
Predicted UNM m8  45373.66
Predicted UNM m9  45333.39
Predicted UNM m10  45868.43


# Export table

In [11]:
# view table
# Columns alternate between the m1-m5 fit and the m6-m10 fit of the same
# specification, so every even column is one of the state-clustered fits.
# Column (6) is m8 and is therefore labelled 'OLS-VCE' like columns (2), (4),
# (8) and (10); it was previously labelled plain 'OLS'.
table = summary_col(
    [m1, m6, m2, m7, m3, m8, m4, m9, m5, m10],
    stars=True,
    float_format='%0.3f',
    # row order of the regressors; 'const' is listed last
    regressor_order=['instruction_FTE', 'academic_support_FTE', 'student_service_FTE',
                     'R1', 'research_FTE', 'SATtoACT_math_25', 'pell_grants', 'stem',
                     'labor_market', 'salary_instruction',
                     'female',
                     'white_percent', 'hispanic_percent',
                     'asian_percent', 'const'],
    model_names=['(1)\n WLS', '(2)\n OLS-VCE', '(3)\n WLS', '(4)\n OLS-VCE', '(5) \n WLS',
                 '(6)\n OLS-VCE', '(7)\n WLS', '(8)\n OLS-VCE', '(9)\n WLS', '(10) \n OLS-VCE'],
    # extra rows: R-squared to two decimals and the observation count
    info_dict={'R2': lambda x: "{:.2f}".format(x.rsquared),
               'N': lambda x: "{0:d}".format(int(x.nobs))})

In [12]:
# show the summary table built in the previous cell as text in the cell output
print(table)


                        (1)       (2)       (3)       (4)       (5)       (6)       (7)       (8)       (9)      (10)   
                         WLS     OLS-VCE     WLS     OLS-VCE     WLS       OLS       WLS     OLS-VCE     WLS     OLS-VCE
------------------------------------------------------------------------------------------------------------------------
instruction_FTE      0.003***  0.003***  0.000     -0.000                                            0.000     -0.000   
                     (0.001)   (0.001)   (0.000)   (0.000)                                           (0.000)   (0.000)  
academic_support_FTE 0.003**   0.000     0.001     0.000     0.000     0.000     0.000     -0.000    0.000     0.000    
                     (0.001)   (0.001)   (0.001)   (0.001)   (0.001)   (0.001)   (0.001)   (0.001)   (0.001)   (0.001)  
student_service_FTE  0.009**   0.011     0.006***  0.009***  0.004*    0.006***  0.003     0.005**   0.006**   0.009*** 
                     (0.005)   

In [13]:
# directory to save tables
import os
os.chdir('C:\\Users\\rbishwakarma\\Desktop\\value_proposition_in_python\\value_proposition\\tables')

# plain-text rendering of the summary table
results_text = table.as_text()

# the with-block closes the file even if the write raises
with open("table12.txt", "w") as text_file:
    text_file.write(results_text)

# Table 11

In [14]:
# main data: reload the Stata file from the project folder
os.chdir('C:\\Users\\rbishwakarma\\Desktop\\value_proposition_in_python\\value_proposition')

# keep only the rows where both 'early_careerlog' and 'stem' are present,
# then renumber the index from 0
df = (
    pd.read_stata('final.dta')
    .dropna(axis=0, how='any', subset=['early_careerlog', 'stem'])
    .reset_index(drop=True)
)

In [15]:
# Models: the regressor columns of each specification, in table order
spec_columns = [
    # model1
    ['instruction_FTE', 'academic_support_FTE', 'student_service_FTE', 'R1'],
    # model2
    ['instruction_FTE', 'academic_support_FTE', 'student_service_FTE',
     'research_FTE', 'SATtoACT_math_25', 'pell_grants', 'stem', 'R1'],
    # model3
    ['academic_support_FTE', 'student_service_FTE', 'research_FTE', 'SATtoACT_math_25',
     'pell_grants', 'stem', 'salary_instruction', 'R1'],
    # model4
    ['academic_support_FTE', 'student_service_FTE', 'research_FTE', 'SATtoACT_math_25',
     'pell_grants', 'stem', 'salary_instruction', 'female', 'white_percent',
     'hispanic_percent', 'asian_percent', 'R1'],
    # model5
    ['instruction_FTE', 'academic_support_FTE', 'student_service_FTE', 'research_FTE',
     'SATtoACT_math_25', 'pell_grants', 'stem', 'female', 'white_percent',
     'hispanic_percent', 'asian_percent', 'R1'],
]

# one regressor frame per specification, also exposed under individual names
models = [df[cols] for cols in spec_columns]
model1, model2, model3, model4, model5 = models

models_name = ['model%d' % k for k in range(1, len(models) + 1)]

In [16]:
# Models WLS
y = df['early_careerlog']
weight = df['students_FTE_total']

# The second-stage regressors (constant + students_FTE_total) are the same for
# every specification, so they are built once.
second_stage_exog = sm.add_constant(weight)

# creating weights: one 'weight_4_<model>' column per specification
for spec_name, design in zip(models_name, models):
    # step 1: unweighted fit of the specification, keep the absolute residuals
    first_stage = sm.OLS(y, sm.add_constant(design)).fit()
    abs_resid = abs(first_stage.resid)

    # step 2: regress the absolute residuals on students_FTE_total
    second_stage = sm.OLS(abs_resid, second_stage_exog).fit()
    fitted_abs_resid = second_stage.predict()

    # weight: squared reciprocal of the step-2 fitted values
    df.loc[:, 'weight_4_' + str(spec_name)] = (1 / fitted_abs_resid) ** 2

In [17]:
# models WLS: fit every specification with its own 'weight_4_model<k>' column
y = df['early_careerlog']

m1 = sm.WLS(y, sm.add_constant(model1), weights = df['weight_4_model1']).fit()
df.loc[:, 'predict1'] = m1.predict()

m2 = sm.WLS(y, sm.add_constant(model2), weights = df['weight_4_model2']).fit()
df.loc[:,'predict2'] = m2.predict()

m3 = sm.WLS(y, sm.add_constant(model3), weights = df['weight_4_model3']).fit()
df.loc[:,'predict3'] = m3.predict()

m4 = sm.WLS(y, sm.add_constant(model4), weights = df['weight_4_model4']).fit()
df.loc[:,'predict4'] = m4.predict()

m5 = sm.WLS(y, sm.add_constant(model5), weights = df['weight_4_model5']).fit()
df.loc[:,'predict5'] = m5.predict()

# predicted values: exponentiate the fitted 'predict<k>' value of the UNM row
import math
columns = ['predict1', 'predict2', 'predict3', 'predict4', 'predict5']
names = ['m1', 'm2', 'm3', 'm4', 'm5']

table_unm = df.loc[df['name'] == 'University of New Mexico-Main Campus']

# math.exp needs a plain number. Handing it a Series only works through the
# deprecated implicit float conversion of one-element Series, and fails with an
# opaque TypeError when the filter matches zero or several rows.
if len(table_unm) != 1:
    raise ValueError(
        'expected exactly one row named University of New Mexico-Main Campus, '
        'found %d' % len(table_unm)
    )

for name, column in zip(names, columns):
    x = table_unm[column].iloc[0]  # scalar fitted value for this model
    p = math.exp(x)
    print('Predicted UNM '+str(name) + '  ' '%0.2f' % p)


Predicted UNM m1  48771.47
Predicted UNM m2  46021.25
Predicted UNM m3  45541.41
Predicted UNM m4  45198.96
Predicted UNM m5  45509.45


In [18]:
# models VCE OLS model: the five specifications refit with 'pwht' as weights
# and a covariance clustered on 'state'
w = df['pwht']
s = df['state']
# regression results
m6 = sm.WLS(y, sm.add_constant(model1), weights = w).fit(cov_type='cluster', cov_kwds = {'groups': s})
df.loc[:,'predict6'] = m6.predict()

m7 = sm.WLS(y, sm.add_constant(model2), weights = w).fit(cov_type='cluster', cov_kwds = {'groups': s})
df.loc[:,'predict7'] = m7.predict()

m8 = sm.WLS(y, sm.add_constant(model3), weights = w).fit(cov_type='cluster', cov_kwds = {'groups': s})
df.loc[:,'predict8'] = m8.predict()

m9 = sm.WLS(y, sm.add_constant(model4), weights = w).fit(cov_type='cluster', cov_kwds = {'groups': s})
df.loc[:,'predict9'] = m9.predict()

m10 = sm.WLS(y, sm.add_constant(model5), weights = w).fit(cov_type='cluster', cov_kwds = {'groups': s})
df.loc[:,'predict10'] = m10.predict()

# predicted values - OLS: exponentiate the fitted 'predict<k>' value of the UNM row
columns = ['predict6', 'predict7', 'predict8', 'predict9', 'predict10']
names = ['m6', 'm7', 'm8', 'm9', 'm10']

table_unm = df.loc[df['name'] == 'University of New Mexico-Main Campus']

# math.exp needs a plain number. Handing it a Series only works through the
# deprecated implicit float conversion of one-element Series, and fails with an
# opaque TypeError when the filter matches zero or several rows.
if len(table_unm) != 1:
    raise ValueError(
        'expected exactly one row named University of New Mexico-Main Campus, '
        'found %d' % len(table_unm)
    )

for name, column in zip(names, columns):
    x = table_unm[column].iloc[0]  # scalar fitted value for this model
    p = math.exp(x)
    print('Predicted UNM '+str(name) + '  ' '%0.2f' % p)


Predicted UNM m6  49195.74
Predicted UNM m7  46084.15
Predicted UNM m8  45564.13
Predicted UNM m9  45610.93
Predicted UNM m10  46257.79


In [19]:
# view and export table
# Columns alternate between the m1-m5 fit and the m6-m10 fit of the same
# specification, so every even column is one of the state-clustered fits.
# Column (6) is m8 and is therefore labelled 'OLS-VCE' like columns (2), (4),
# (8) and (10); it was previously labelled plain 'OLS'.
table = summary_col(
    [m1, m6, m2, m7, m3, m8, m4, m9, m5, m10],
    stars=True,
    float_format='%0.4f',
    # row order of the regressors; 'const' is listed last
    regressor_order=['instruction_FTE', 'academic_support_FTE', 'student_service_FTE',
                     'R1', 'research_FTE', 'SATtoACT_math_25', 'pell_grants', 'stem',
                     'salary_instruction',
                     'female',
                     'white_percent', 'hispanic_percent',
                     'asian_percent', 'const'],
    model_names=['(1)\n WLS', '(2)\n OLS-VCE', '(3)\n WLS', '(4)\n OLS-VCE', '(5) \n WLS',
                 '(6)\n OLS-VCE', '(7)\n WLS', '(8)\n OLS-VCE', '(9)\n WLS', '(10) \n OLS-VCE'],
    # extra rows: R-squared to two decimals and the observation count
    info_dict={'R2': lambda x: "{:.2f}".format(x.rsquared),
               'N': lambda x: "{0:d}".format(int(x.nobs))})

print(table)

# save table
os.chdir('C:\\Users\\rbishwakarma\\Desktop\\value_proposition_in_python\\value_proposition\\tables')

# plain-text rendering of the summary table
results_text = table.as_text()

# the with-block closes the file even if the write raises
with open("table11.txt", "w") as text_file:
    text_file.write(results_text)


                        (1)        (2)        (3)        (4)        (5)        (6)        (7)        (8)        (9)       (10)    
                         WLS      OLS-VCE      WLS      OLS-VCE      WLS        OLS        WLS      OLS-VCE      WLS      OLS-VCE 
----------------------------------------------------------------------------------------------------------------------------------
instruction_FTE      0.0029***  0.0033***  0.0002     -0.0001                                                0.0000     -0.0002   
                     (0.0006)   (0.0009)   (0.0005)   (0.0004)                                               (0.0005)   (0.0004)  
academic_support_FTE 0.0029**   0.0004     0.0007     0.0002     0.0002     0.0001     0.0001     -0.0000    0.0005     0.0000    
                     (0.0013)   (0.0014)   (0.0007)   (0.0006)   (0.0007)   (0.0006)   (0.0007)   (0.0006)   (0.0007)   (0.0006)  
student_service_FTE  0.0091**   0.0110     0.0075***  0.0093***  0.0039*    0.0055

# Table 13

In [20]:
# main data: reload the Stata file from the project folder
os.chdir('C:\\Users\\rbishwakarma\\Desktop\\value_proposition_in_python\\value_proposition')

# keep only the rows where both 'early_careerlog' and 'stem' are present,
# then renumber the index from 0
df = (
    pd.read_stata('final.dta')
    .dropna(axis=0, how='any', subset=['early_careerlog', 'stem'])
    .reset_index(drop=True)
)

In [21]:
# Models: the regressor columns of each specification, in table order
spec_columns = [
    # model1
    ['instruction_FTE', 'academic_support_FTE', 'student_service_FTE', 'R1'],
    # model2
    ['instruction_FTE', 'academic_support_FTE', 'student_service_FTE',
     'research_FTE', 'SATtoACT_math_25', 'pell_grants', 'stem', 'R1', 'labor_market_STEM'],
    # model3
    ['academic_support_FTE', 'student_service_FTE', 'research_FTE', 'SATtoACT_math_25',
     'pell_grants', 'stem', 'salary_instruction', 'R1', 'labor_market_STEM'],
    # model4
    ['academic_support_FTE', 'student_service_FTE', 'research_FTE', 'SATtoACT_math_25',
     'pell_grants', 'stem', 'salary_instruction', 'female', 'white_percent',
     'hispanic_percent', 'asian_percent', 'R1', 'labor_market_STEM'],
    # model5
    ['instruction_FTE', 'academic_support_FTE', 'student_service_FTE', 'research_FTE',
     'SATtoACT_math_25', 'pell_grants', 'stem', 'female', 'white_percent',
     'hispanic_percent', 'asian_percent', 'R1', 'labor_market_STEM'],
]

# one regressor frame per specification, also exposed under individual names
models = [df[cols] for cols in spec_columns]
model1, model2, model3, model4, model5 = models

models_name = ['model%d' % k for k in range(1, len(models) + 1)]

# Models weight
y = df['early_careerlog']
weight = df['students_FTE_total']

# The second-stage regressors (constant + students_FTE_total) are the same for
# every specification, so they are built once.
second_stage_exog = sm.add_constant(weight)

# creating weights: one 'weight_4_<model>' column per specification
for spec_name, design in zip(models_name, models):
    # step 1: unweighted fit of the specification, keep the absolute residuals
    first_stage = sm.OLS(y, sm.add_constant(design)).fit()
    abs_resid = abs(first_stage.resid)

    # step 2: regress the absolute residuals on students_FTE_total
    second_stage = sm.OLS(abs_resid, second_stage_exog).fit()
    fitted_abs_resid = second_stage.predict()

    # weight: squared reciprocal of the step-2 fitted values
    df.loc[:, 'weight_4_' + str(spec_name)] = (1 / fitted_abs_resid) ** 2

In [22]:
# models WLS: fit every specification with its own 'weight_4_model<k>' column
y = df['early_careerlog']

m1 = sm.WLS(y, sm.add_constant(model1), weights = df['weight_4_model1']).fit()
df.loc[:, 'predict1'] = m1.predict()

m2 = sm.WLS(y, sm.add_constant(model2), weights = df['weight_4_model2']).fit()
df.loc[:,'predict2'] = m2.predict()

m3 = sm.WLS(y, sm.add_constant(model3), weights = df['weight_4_model3']).fit()
df.loc[:,'predict3'] = m3.predict()

m4 = sm.WLS(y, sm.add_constant(model4), weights = df['weight_4_model4']).fit()
df.loc[:,'predict4'] = m4.predict()

m5 = sm.WLS(y, sm.add_constant(model5), weights = df['weight_4_model5']).fit()
df.loc[:,'predict5'] = m5.predict()

# predicted values: exponentiate the fitted 'predict<k>' value of the UNM row
import math
columns = ['predict1', 'predict2', 'predict3', 'predict4', 'predict5']
names = ['m1', 'm2', 'm3', 'm4', 'm5']

table_unm = df.loc[df['name'] == 'University of New Mexico-Main Campus']

# math.exp needs a plain number. Handing it a Series only works through the
# deprecated implicit float conversion of one-element Series, and fails with an
# opaque TypeError when the filter matches zero or several rows.
if len(table_unm) != 1:
    raise ValueError(
        'expected exactly one row named University of New Mexico-Main Campus, '
        'found %d' % len(table_unm)
    )

for name, column in zip(names, columns):
    x = table_unm[column].iloc[0]  # scalar fitted value for this model
    p = math.exp(x)
    print('Predicted UNM '+str(name) + '  ' '%0.2f' % p)


Predicted UNM m1  48771.47
Predicted UNM m2  44618.50
Predicted UNM m3  44643.95
Predicted UNM m4  44562.49
Predicted UNM m5  44586.88


In [23]:
# models VCE OLS model: the five specifications refit with 'pwht' as weights
# and a covariance clustered on 'state'
w = df['pwht']
s = df['state']
# regression results
m6 = sm.WLS(y, sm.add_constant(model1), weights = w).fit(cov_type='cluster', cov_kwds = {'groups': s})
df.loc[:,'predict6'] = m6.predict()

m7 = sm.WLS(y, sm.add_constant(model2), weights = w).fit(cov_type='cluster', cov_kwds = {'groups': s})
df.loc[:,'predict7'] = m7.predict()

m8 = sm.WLS(y, sm.add_constant(model3), weights = w).fit(cov_type='cluster', cov_kwds = {'groups': s})
df.loc[:,'predict8'] = m8.predict()

m9 = sm.WLS(y, sm.add_constant(model4), weights = w).fit(cov_type='cluster', cov_kwds = {'groups': s})
df.loc[:,'predict9'] = m9.predict()

m10 = sm.WLS(y, sm.add_constant(model5), weights = w).fit(cov_type='cluster', cov_kwds = {'groups': s})
df.loc[:,'predict10'] = m10.predict()

# predicted values - OLS: exponentiate the fitted 'predict<k>' value of the UNM row
columns = ['predict6', 'predict7', 'predict8', 'predict9', 'predict10']
names = ['m6', 'm7', 'm8', 'm9', 'm10']

table_unm = df.loc[df['name'] == 'University of New Mexico-Main Campus']

# math.exp needs a plain number. Handing it a Series only works through the
# deprecated implicit float conversion of one-element Series, and fails with an
# opaque TypeError when the filter matches zero or several rows.
if len(table_unm) != 1:
    raise ValueError(
        'expected exactly one row named University of New Mexico-Main Campus, '
        'found %d' % len(table_unm)
    )

for name, column in zip(names, columns):
    x = table_unm[column].iloc[0]  # scalar fitted value for this model
    p = math.exp(x)
    print('Predicted UNM '+str(name) + '  ' '%0.2f' % p)

Predicted UNM m6  49195.74
Predicted UNM m7  45302.43
Predicted UNM m8  45150.95
Predicted UNM m9  45379.88
Predicted UNM m10  45796.47


In [24]:
# view and export table
# Columns alternate between the m1-m5 fit and the m6-m10 fit of the same
# specification, so every even column is one of the state-clustered fits.
# Column (6) is m8 and is therefore labelled 'OLS-VCE' like columns (2), (4),
# (8) and (10); it was previously labelled plain 'OLS'.
table = summary_col(
    [m1, m6, m2, m7, m3, m8, m4, m9, m5, m10],
    stars=True,
    float_format='%0.4f',
    # row order of the regressors; 'const' is listed last
    regressor_order=['instruction_FTE', 'academic_support_FTE', 'student_service_FTE',
                     'R1', 'research_FTE', 'SATtoACT_math_25', 'pell_grants', 'stem',
                     'labor_market_STEM', 'salary_instruction',
                     'female',
                     'white_percent', 'hispanic_percent',
                     'asian_percent', 'const'],
    model_names=['(1)\n WLS', '(2)\n OLS-VCE', '(3)\n WLS', '(4)\n OLS-VCE', '(5) \n WLS',
                 '(6)\n OLS-VCE', '(7)\n WLS', '(8)\n OLS-VCE', '(9)\n WLS', '(10) \n OLS-VCE'],
    # extra rows: R-squared to two decimals and the observation count
    info_dict={'R2': lambda x: "{:.2f}".format(x.rsquared),
               'N': lambda x: "{0:d}".format(int(x.nobs))})

print(table)

# save table
os.chdir('C:\\Users\\rbishwakarma\\Desktop\\value_proposition_in_python\\value_proposition\\tables')

# plain-text rendering of the summary table
results_text = table.as_text()

# the with-block closes the file even if the write raises
with open("table13.txt", "w") as text_file:
    text_file.write(results_text)


                        (1)        (2)        (3)        (4)        (5)        (6)        (7)        (8)        (9)       (10)    
                         WLS      OLS-VCE      WLS      OLS-VCE      WLS        OLS        WLS      OLS-VCE      WLS      OLS-VCE 
----------------------------------------------------------------------------------------------------------------------------------
instruction_FTE      0.0029***  0.0033***  0.0003     -0.0000                                                0.0001     -0.0001   
                     (0.0006)   (0.0009)   (0.0005)   (0.0004)                                               (0.0005)   (0.0003)  
academic_support_FTE 0.0029**   0.0004     0.0008     0.0003     0.0003     0.0001     0.0002     -0.0000    0.0005     0.0001    
                     (0.0013)   (0.0014)   (0.0007)   (0.0006)   (0.0007)   (0.0006)   (0.0007)   (0.0007)   (0.0007)   (0.0006)  
student_service_FTE  0.0091**   0.0110     0.0073***  0.0095***  0.0043*    0.0057