In [1]:
# basic packages
import pandas as pd

# widen the notebook display so large frames / regression tables are not truncated
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# SettingWithCopyWarning lives in pandas.errors since pandas 1.5; the old
# pandas.core.common location raises ImportError there. Try the current
# location first and fall back for older installations.
try:
    from pandas.errors import SettingWithCopyWarning
except ImportError:
    try:
        from pandas.core.common import SettingWithCopyWarning
    except ImportError:
        # the name is not referenced by the cells below, so a missing class
        # must not stop the notebook from running
        SettingWithCopyWarning = None

# silence chained-assignment warnings (columns are added to dropna() results below)
pd.options.mode.chained_assignment = None


import numpy as np
import os

In [2]:
# modeling packages
import statsmodels.api as sm
import statsmodels.formula as smf
from statsmodels.iolib.summary2 import summary_col # creating summary tables
import math

# Table 15

In [3]:
# main data: switch to the project folder and load the Stata file
os.chdir('C:\\Users\\rbishwakarma\\Desktop\\value_proposition_in_python\\value_proposition')
df = pd.read_stata('final.dta')

# keep only the rows where the outcome (retentionrate) is observed
df = df.dropna(subset=['retentionrate'])

# second sample: additionally require 'stem', re-indexed 0..n-1
df2 = df.dropna(subset=['stem']).reset_index(drop=True)

In [4]:
# regressor sets for the three specifications (column order is kept as-is)
core_terms = ['SCE', 'SCE2']
controls = ['research_FTE', 'SATtoACT_math_25', 'pell_grants', 'stem']
demographics = ['female', 'white_percent', 'hispanic_percent', 'asian_percent']

# model 1 uses the full sample (df); models 2 and 3 use the 'stem' sample (df2)
model1 = df.loc[:, core_terms + ['R1']]
model2 = df2.loc[:, core_terms + controls + ['R1']]
model3 = df2.loc[:, core_terms + controls + demographics + ['R1']]

# design frames and the suffixes used for their weight columns
models = [model2, model3]
models_name = ['model2', 'model3']

In [5]:
# weights
# For each specification:
#   1. fit OLS of the outcome on the regressors and take |residuals|
#   2. regress |residuals| on students_FTE_total
#   3. weight = (1 / fitted value from step 2) squared

# ---- Model 1 (full sample, df) ----
y1 = df['retentionrate']

first_stage = sm.OLS(y1, sm.add_constant(model1)).fit()
abs_resid = first_stage.resid.abs()

size_fit = sm.OLS(abs_resid, sm.add_constant(df['students_FTE_total'])).fit()
df.loc[:, 'weight_model1'] = (1 / size_fit.predict()) ** 2


# ---- Models 2 and 3 ('stem' sample, df2) ----
y = df2['retentionrate']
weight = df2['students_FTE_total']
size_design = sm.add_constant(weight)  # identical for both specifications

for label, design in zip(models_name, models):
    # step 1: absolute OLS residuals
    abs_resid = sm.OLS(y, sm.add_constant(design)).fit().resid.abs()

    # step 2: fitted |residual| as a function of size
    size_fit = sm.OLS(abs_resid, size_design).fit()

    # step 3: store the weight column for this specification
    df2.loc[:, 'weight_' + label] = (1 / size_fit.predict()) ** 2
    

In [6]:
# WLS fits using the weight columns estimated above; fitted values are
# stored next to the data as predict1..predict3

# model 1 (full sample)
design1 = sm.add_constant(model1)
m1 = sm.WLS(y1, design1, weights=df['weight_model1']).fit()
df.loc[:, 'predict1'] = m1.predict()

# model 2 ('stem' sample)
design2 = sm.add_constant(model2)
m2 = sm.WLS(y, design2, weights=df2['weight_model2']).fit()
df2.loc[:, 'predict2'] = m2.predict()

# model 3 ('stem' sample)
design3 = sm.add_constant(model3)
m3 = sm.WLS(y, design3, weights=df2['weight_model3']).fit()
df2.loc[:, 'predict3'] = m3.predict()

In [7]:
# Predicted values for UNM from the WLS models (m1-m3).
# Each fitted value x is mapped through exp(x) / (1 + exp(x)) before printing
# (presumably the outcome is stored on the logit scale - confirm against the data prep).
unm_name = 'University of New Mexico-Main Campus'

# prediction - model1
# .item() extracts the scalar explicitly and raises unless exactly one row
# matches; passing a one-element Series to math.exp() relies on implicit
# float(Series) conversion, which pandas has deprecated.
x = df.loc[df['name'] == unm_name, 'predict1'].item()
p1 = math.exp(x)/(1+math.exp(x))
print('Predicted UNM m1  %0.4f' % p1)

# predicted values WLS
columns = ['predict2', 'predict3']
names = ['m2', 'm3']

table_unm = df2.loc[df2['name'] == unm_name]

for col, label in zip(columns, names):
    x = table_unm[col].item()
    p = math.exp(x)/(1+math.exp(x))
    print('Predicted UNM ' + label + '  %0.4f' % p)

Predicted UNM m1  0.8801
Predicted UNM m2  0.8275
Predicted UNM m3  0.8325


In [8]:
# "OLS" columns of the table: fits weighted by 'pwht' with standard errors
# clustered on 'state'; fitted values are stored as predict4..predict6
w = df2['pwht']
s = df2['state']

# model 1 on the full sample (weights and clusters come from df)
m4 = sm.WLS(y1, sm.add_constant(model1), weights=df['pwht']).fit(
    cov_type='cluster',
    cov_kwds={'groups': df['state']},
)
df.loc[:, 'predict4'] = m4.predict()

# model 2 on the 'stem' sample
m5 = sm.WLS(y, sm.add_constant(model2), weights=w).fit(
    cov_type='cluster',
    cov_kwds={'groups': s},
)
df2.loc[:, 'predict5'] = m5.predict()

# model 3 on the 'stem' sample
m6 = sm.WLS(y, sm.add_constant(model3), weights=w).fit(
    cov_type='cluster',
    cov_kwds={'groups': s},
)
df2.loc[:, 'predict6'] = m6.predict()

In [9]:
# Predicted values for UNM from the cluster-robust models (m4-m6).
# Each fitted value x is mapped through exp(x) / (1 + exp(x)) before printing
# (presumably the outcome is stored on the logit scale - confirm against the data prep).
unm_name = 'University of New Mexico-Main Campus'

# prediction - model 4
# .item() extracts the scalar explicitly and raises unless exactly one row
# matches; passing a one-element Series to math.exp() relies on implicit
# float(Series) conversion, which pandas has deprecated.
x = df.loc[df['name'] == unm_name, 'predict4'].item()
p4 = math.exp(x)/(1+math.exp(x))
print('Predicted UNM m4  %0.4f' % p4)

#predicted values - OLS
columns = ['predict5', 'predict6']
names = ['m5', 'm6']

table_unm = df2.loc[df2['name'] == unm_name]

for col, label in zip(columns, names):
    x = table_unm[col].item()
    p = math.exp(x)/(1+math.exp(x))
    print('Predicted UNM ' + label + '  %0.4f' % p)

Predicted UNM m4  0.8768
Predicted UNM m5  0.8282
Predicted UNM m6  0.8418


In [10]:
# display and print table
# Columns alternate WLS / cluster-robust fits for the three specifications.
# Column (6) is m6, which is fit with the same state-clustered covariance as
# columns (2) and (4), so it carries the same 'OLS-VCE' label.
table = summary_col([m1, m4, m2, m5, m3, m6], stars=True, float_format='%0.3f',
                      regressor_order = ['SCE', 'SCE2',
                      'R1', 'research_FTE', 'SATtoACT_math_25', 'pell_grants', 'stem', 'female', 
                      'white_percent', 'hispanic_percent', 
                      'asian_percent', 'const'],
                      model_names = ['(1)\n WLS', '(2)\n OLS-VCE','(3)\n WLS', '(4)\n OLS-VCE', 
                                     '(5) \n WLS', '(6)\n OLS-VCE'],
                      info_dict={'R2':lambda x: "{:.2f}".format(x.rsquared),
                                'N':lambda x: "{0:d}".format(int(x.nobs))})
# export table
os.chdir('C:\\Users\\rbishwakarma\\Desktop\\value_proposition_in_python\\value_proposition\\tables')

results_text = table.as_text()

# context manager guarantees the file is closed even if the write fails
with open("table15.txt", "w") as text_file:
    text_file.write(results_text)


# Table 16

In [11]:
# main data: switch to the project folder and load the Stata file
os.chdir('C:\\Users\\rbishwakarma\\Desktop\\value_proposition_in_python\\value_proposition')
df = pd.read_stata('final.dta')

# keep only the rows where the outcome (fourrate) is observed
df = df.dropna(subset=['fourrate'])

# second sample: additionally require 'stem', re-indexed 0..n-1
df2 = df.dropna(subset=['stem']).reset_index(drop=True)

In [12]:
# regressor sets for the three specifications (column order is kept as-is)
core_terms = ['SCE', 'SCE2']
controls = ['research_FTE', 'SATtoACT_math_25', 'pell_grants', 'stem']
demographics = ['female', 'white_percent', 'hispanic_percent', 'asian_percent']

# model 1 uses the full sample (df); models 2 and 3 use the 'stem' sample (df2)
model1 = df.loc[:, core_terms + ['R1']]
model2 = df2.loc[:, core_terms + controls + ['R1']]
model3 = df2.loc[:, core_terms + controls + demographics + ['R1']]

# design frames and the suffixes used for their weight columns
models = [model2, model3]
models_name = ['model2', 'model3']

In [13]:
# weights
# For each specification:
#   1. fit OLS of the outcome on the regressors and take |residuals|
#   2. regress |residuals| on students_FTE_total
#   3. weight = (1 / fitted value from step 2) squared

# ---- Model 1 (full sample, df) ----
y1 = df['fourrate']

first_stage = sm.OLS(y1, sm.add_constant(model1)).fit()
abs_resid = first_stage.resid.abs()

size_fit = sm.OLS(abs_resid, sm.add_constant(df['students_FTE_total'])).fit()
df.loc[:, 'weight_model1'] = (1 / size_fit.predict()) ** 2


# ---- Models 2 and 3 ('stem' sample, df2) ----
y = df2['fourrate']
weight = df2['students_FTE_total']
size_design = sm.add_constant(weight)  # identical for both specifications

for label, design in zip(models_name, models):
    # step 1: absolute OLS residuals
    abs_resid = sm.OLS(y, sm.add_constant(design)).fit().resid.abs()

    # step 2: fitted |residual| as a function of size
    size_fit = sm.OLS(abs_resid, size_design).fit()

    # step 3: store the weight column for this specification
    df2.loc[:, 'weight_' + label] = (1 / size_fit.predict()) ** 2

In [14]:
# WLS fits using the weight columns estimated above; fitted values are
# stored next to the data as predict1..predict3

# model 1 (full sample)
design1 = sm.add_constant(model1)
m1 = sm.WLS(y1, design1, weights=df['weight_model1']).fit()
df.loc[:, 'predict1'] = m1.predict()

# model 2 ('stem' sample)
design2 = sm.add_constant(model2)
m2 = sm.WLS(y, design2, weights=df2['weight_model2']).fit()
df2.loc[:, 'predict2'] = m2.predict()

# model 3 ('stem' sample)
design3 = sm.add_constant(model3)
m3 = sm.WLS(y, design3, weights=df2['weight_model3']).fit()
df2.loc[:, 'predict3'] = m3.predict()

In [15]:
# Predicted values for UNM from the WLS models (m1-m3).
# Each fitted value x is mapped through exp(x) / (1 + exp(x)) before printing
# (presumably the outcome is stored on the logit scale - confirm against the data prep).
unm_name = 'University of New Mexico-Main Campus'

# prediction - model1
# .item() extracts the scalar explicitly and raises unless exactly one row
# matches; passing a one-element Series to math.exp() relies on implicit
# float(Series) conversion, which pandas has deprecated.
x = df.loc[df['name'] == unm_name, 'predict1'].item()
p1 = math.exp(x)/(1+math.exp(x))
print('Predicted UNM m1  %0.4f' % p1)

# predicted values WLS
columns = ['predict2', 'predict3']
names = ['m2', 'm3']

table_unm = df2.loc[df2['name'] == unm_name]

for col, label in zip(columns, names):
    x = table_unm[col].item()
    p = math.exp(x)/(1+math.exp(x))
    print('Predicted UNM ' + label + '  %0.4f' % p)

Predicted UNM m1  0.4275
Predicted UNM m2  0.2931
Predicted UNM m3  0.2940


In [16]:
# "OLS" columns of the table: fits weighted by 'pwht' with standard errors
# clustered on 'state'; fitted values are stored as predict4..predict6
w = df2['pwht']
s = df2['state']

# model 1 on the full sample (weights and clusters come from df)
m4 = sm.WLS(y1, sm.add_constant(model1), weights=df['pwht']).fit(
    cov_type='cluster',
    cov_kwds={'groups': df['state']},
)
df.loc[:, 'predict4'] = m4.predict()

# model 2 on the 'stem' sample
m5 = sm.WLS(y, sm.add_constant(model2), weights=w).fit(
    cov_type='cluster',
    cov_kwds={'groups': s},
)
df2.loc[:, 'predict5'] = m5.predict()

# model 3 on the 'stem' sample
m6 = sm.WLS(y, sm.add_constant(model3), weights=w).fit(
    cov_type='cluster',
    cov_kwds={'groups': s},
)
df2.loc[:, 'predict6'] = m6.predict()

In [17]:
# Predicted values for UNM from the cluster-robust models (m4-m6).
# Each fitted value x is mapped through exp(x) / (1 + exp(x)) before printing
# (presumably the outcome is stored on the logit scale - confirm against the data prep).
unm_name = 'University of New Mexico-Main Campus'

# prediction - model 4
# .item() extracts the scalar explicitly and raises unless exactly one row
# matches; passing a one-element Series to math.exp() relies on implicit
# float(Series) conversion, which pandas has deprecated.
x = df.loc[df['name'] == unm_name, 'predict4'].item()
p4 = math.exp(x)/(1+math.exp(x))
print('Predicted UNM m4  %0.4f' % p4)

#predicted values - OLS
columns = ['predict5', 'predict6']
names = ['m5', 'm6']

table_unm = df2.loc[df2['name'] == unm_name]

for col, label in zip(columns, names):
    x = table_unm[col].item()
    p = math.exp(x)/(1+math.exp(x))
    print('Predicted UNM ' + label + '  %0.4f' % p)

Predicted UNM m4  0.4298
Predicted UNM m5  0.2961
Predicted UNM m6  0.2931


In [18]:
# display and print table
# Columns alternate WLS / cluster-robust fits for the three specifications.
# Column (6) is m6, which is fit with the same state-clustered covariance as
# columns (2) and (4), so it carries the same 'OLS-VCE' label.
table = summary_col([m1, m4, m2, m5, m3, m6], stars=True, float_format='%0.3f',
                      regressor_order = ['SCE', 'SCE2',
                      'R1', 'research_FTE', 'SATtoACT_math_25', 'pell_grants', 'stem', 'female', 
                      'white_percent', 'hispanic_percent', 
                      'asian_percent', 'const'],
                      model_names = ['(1)\n WLS', '(2)\n OLS-VCE','(3)\n WLS', '(4)\n OLS-VCE', 
                                     '(5) \n WLS', '(6)\n OLS-VCE'],
                      info_dict={'R2':lambda x: "{:.2f}".format(x.rsquared),
                                'N':lambda x: "{0:d}".format(int(x.nobs))})
# export table
os.chdir('C:\\Users\\rbishwakarma\\Desktop\\value_proposition_in_python\\value_proposition\\tables')

results_text = table.as_text()

# context manager guarantees the file is closed even if the write fails
with open("table16.txt", "w") as text_file:
    text_file.write(results_text)

# Table 17

In [19]:
# main data: switch to the project folder and load the Stata file
os.chdir('C:\\Users\\rbishwakarma\\Desktop\\value_proposition_in_python\\value_proposition')
df = pd.read_stata('final.dta')

# keep only the rows where the outcome (sixrate) is observed
df = df.dropna(subset=['sixrate'])

# second sample: additionally require 'stem', re-indexed 0..n-1
df2 = df.dropna(subset=['stem']).reset_index(drop=True)

In [20]:
# regressor sets for the three specifications (column order is kept as-is)
core_terms = ['SCE', 'SCE2']
controls = ['research_FTE', 'SATtoACT_math_25', 'pell_grants', 'stem']
demographics = ['female', 'white_percent', 'hispanic_percent', 'asian_percent']

# model 1 uses the full sample (df); models 2 and 3 use the 'stem' sample (df2)
model1 = df.loc[:, core_terms + ['R1']]
model2 = df2.loc[:, core_terms + controls + ['R1']]
model3 = df2.loc[:, core_terms + controls + demographics + ['R1']]

# design frames and the suffixes used for their weight columns
models = [model2, model3]
models_name = ['model2', 'model3']

In [21]:
# weights
# For each specification:
#   1. fit OLS of the outcome on the regressors and take |residuals|
#   2. regress |residuals| on students_FTE_total
#   3. weight = (1 / fitted value from step 2) squared

# ---- Model 1 (full sample, df) ----
y1 = df['sixrate']

first_stage = sm.OLS(y1, sm.add_constant(model1)).fit()
abs_resid = first_stage.resid.abs()

size_fit = sm.OLS(abs_resid, sm.add_constant(df['students_FTE_total'])).fit()
df.loc[:, 'weight_model1'] = (1 / size_fit.predict()) ** 2


# ---- Models 2 and 3 ('stem' sample, df2) ----
y = df2['sixrate']
weight = df2['students_FTE_total']
size_design = sm.add_constant(weight)  # identical for both specifications

for label, design in zip(models_name, models):
    # step 1: absolute OLS residuals
    abs_resid = sm.OLS(y, sm.add_constant(design)).fit().resid.abs()

    # step 2: fitted |residual| as a function of size
    size_fit = sm.OLS(abs_resid, size_design).fit()

    # step 3: store the weight column for this specification
    df2.loc[:, 'weight_' + label] = (1 / size_fit.predict()) ** 2

In [22]:
# WLS fits using the weight columns estimated above; fitted values are
# stored next to the data as predict1..predict3

# model 1 (full sample)
design1 = sm.add_constant(model1)
m1 = sm.WLS(y1, design1, weights=df['weight_model1']).fit()
df.loc[:, 'predict1'] = m1.predict()

# model 2 ('stem' sample)
design2 = sm.add_constant(model2)
m2 = sm.WLS(y, design2, weights=df2['weight_model2']).fit()
df2.loc[:, 'predict2'] = m2.predict()

# model 3 ('stem' sample)
design3 = sm.add_constant(model3)
m3 = sm.WLS(y, design3, weights=df2['weight_model3']).fit()
df2.loc[:, 'predict3'] = m3.predict()

In [23]:
# Predicted values for UNM from the WLS models (m1-m3).
# Each fitted value x is mapped through exp(x) / (1 + exp(x)) before printing
# (presumably the outcome is stored on the logit scale - confirm against the data prep).
unm_name = 'University of New Mexico-Main Campus'

# prediction - model1
# .item() extracts the scalar explicitly and raises unless exactly one row
# matches; passing a one-element Series to math.exp() relies on implicit
# float(Series) conversion, which pandas has deprecated.
x = df.loc[df['name'] == unm_name, 'predict1'].item()
p1 = math.exp(x)/(1+math.exp(x))
print('Predicted UNM m1  %0.4f' % p1)

# predicted values WLS
columns = ['predict2', 'predict3']
names = ['m2', 'm3']

table_unm = df2.loc[df2['name'] == unm_name]

for col, label in zip(columns, names):
    x = table_unm[col].item()
    p = math.exp(x)/(1+math.exp(x))
    print('Predicted UNM ' + label + '  %0.4f' % p)

Predicted UNM m1  0.6932
Predicted UNM m2  0.5788
Predicted UNM m3  0.5747


In [24]:
# "OLS" columns of the table: fits weighted by 'pwht' with standard errors
# clustered on 'state'; fitted values are stored as predict4..predict6
w = df2['pwht']
s = df2['state']

# model 1 on the full sample (weights and clusters come from df)
m4 = sm.WLS(y1, sm.add_constant(model1), weights=df['pwht']).fit(
    cov_type='cluster',
    cov_kwds={'groups': df['state']},
)
df.loc[:, 'predict4'] = m4.predict()

# model 2 on the 'stem' sample
m5 = sm.WLS(y, sm.add_constant(model2), weights=w).fit(
    cov_type='cluster',
    cov_kwds={'groups': s},
)
df2.loc[:, 'predict5'] = m5.predict()

# model 3 on the 'stem' sample
m6 = sm.WLS(y, sm.add_constant(model3), weights=w).fit(
    cov_type='cluster',
    cov_kwds={'groups': s},
)
df2.loc[:, 'predict6'] = m6.predict()

In [25]:
# Predicted values for UNM from the cluster-robust models (m4-m6).
# Each fitted value x is mapped through exp(x) / (1 + exp(x)) before printing
# (presumably the outcome is stored on the logit scale - confirm against the data prep).
unm_name = 'University of New Mexico-Main Campus'

# prediction - model 4
# .item() extracts the scalar explicitly and raises unless exactly one row
# matches; passing a one-element Series to math.exp() relies on implicit
# float(Series) conversion, which pandas has deprecated.
x = df.loc[df['name'] == unm_name, 'predict4'].item()
p4 = math.exp(x)/(1+math.exp(x))
print('Predicted UNM m4  %0.4f' % p4)

#predicted values - OLS
columns = ['predict5', 'predict6']
names = ['m5', 'm6']

table_unm = df2.loc[df2['name'] == unm_name]

for col, label in zip(columns, names):
    x = table_unm[col].item()
    p = math.exp(x)/(1+math.exp(x))
    print('Predicted UNM ' + label + '  %0.4f' % p)

Predicted UNM m4  0.6880
Predicted UNM m5  0.5772
Predicted UNM m6  0.5916


In [26]:
# display and print table
# Columns alternate WLS / cluster-robust fits for the three specifications.
# Column (6) is m6, which is fit with the same state-clustered covariance as
# columns (2) and (4), so it carries the same 'OLS-VCE' label.
table = summary_col([m1, m4, m2, m5, m3, m6], stars=True, float_format='%0.3f',
                      regressor_order = ['SCE', 'SCE2',
                      'R1', 'research_FTE', 'SATtoACT_math_25', 'pell_grants', 'stem', 'female', 
                      'white_percent', 'hispanic_percent', 
                      'asian_percent', 'const'],
                      model_names = ['(1)\n WLS', '(2)\n OLS-VCE','(3)\n WLS', '(4)\n OLS-VCE', 
                                     '(5) \n WLS', '(6)\n OLS-VCE'],
                      info_dict={'R2':lambda x: "{:.2f}".format(x.rsquared),
                                'N':lambda x: "{0:d}".format(int(x.nobs))})
# export table
os.chdir('C:\\Users\\rbishwakarma\\Desktop\\value_proposition_in_python\\value_proposition\\tables')

results_text = table.as_text()

# context manager guarantees the file is closed even if the write fails
with open("table17.txt", "w") as text_file:
    text_file.write(results_text)