In [33]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# Loading clustered dataset
file = "Resources/MLM/clustered_data.csv"
clustered_df = pd.read_csv(file, index_col=0)
clustered_df

Unnamed: 0_level_0,abortion_status,total_community_health_centers,uninsured,total_insured,maternal_mortality,population,land_area_sqmi,no_doctor_visits,mammogram,no_provider,...,abortions_occurring_state,abortions_residence_state,no_services,few_services,restricted_services,full_service,PC 1,PC 2,PC 3,Class
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Alabama,Abortion Ban In Effect,17,165700,1462900,109.285826,4767100,50645,0.12,0.74,0.13,...,5700,9060,8,44,5,0,-3.441609,1.226503,0.812852,0
Alaska,Abortion Available,27,29200,207200,84.486218,701700,570641,0.12,0.63,0.23,...,1240,1320,3,7,0,4,-2.944785,0.679185,3.155152,0
Arizona,Status of pre-Roe ban unclear,23,309000,2132900,87.07292,7098000,113594,0.11,0.68,0.19,...,13320,13820,19,28,7,0,-1.71355,1.250492,0.170481,0
Arkansas,Abortion Ban In Effect,12,83800,865900,124.819154,2922500,52035,0.12,0.7,0.13,...,3250,4510,9,31,3,0,-3.252214,0.370151,1.47372,0
California,Abortion Available,175,1034300,11885100,32.123048,38642700,155779,0.09,0.68,0.19,...,154060,152400,51,97,0,124,10.480858,9.234399,-1.186718,2
Colorado,Abortion Available,19,204300,1758600,43.906723,5611800,103642,0.11,0.67,0.17,...,13420,11830,17,34,0,15,-0.319058,-0.121292,-0.656476,1
Connecticut,Abortion Available,16,58100,1038000,53.795577,3453300,4842,0.07,0.78,0.1,...,11170,11460,12,8,0,18,3.635465,-3.194741,-0.861823,4
Delaware,Abortion Available,3,24800,295800,48.113934,940300,1949,0.09,0.74,0.12,...,1830,2870,10,0,0,2,2.135973,-3.430561,0.971375,4
District of Columbia,Abortion Available,8,7600,251600,78.882128,671300,61,0.06,0.74,0.15,...,9410,5010,2,0,0,2,2.323861,-3.451662,0.935299,4
Florida,"Abortion available, pre-viability gestational ...",47,1011800,6381600,67.248213,20992000,53625,0.14,0.74,0.2,...,77400,73830,22,129,0,48,0.90052,6.484922,-1.530564,3


In [3]:
clustered_df.dtypes

abortion_status                               object
total_community_health_centers                 int64
uninsured                                      int64
total_insured                                  int64
maternal_mortality                           float64
population                                     int64
land_area_sqmi                                 int64
no_doctor_visits                             float64
mammogram                                    float64
no_provider                                  float64
pap_smear                                    float64
prescription_contraception                    object
otc_methods                                   object
male_sterilization                            object
female_sterilization                          object
cost_sharing                                  object
teen_births                                  float64
poverty_under_200                            float64
percent_of_all_us_abortions                  f

In [42]:
# using all features (minus principal components) as independent variables
chc_df = pd.get_dummies(data = clustered_df.drop(columns=['PC 1','PC 2','PC 3','prescription_contraception',
                              'otc_methods','male_sterilization','female_sterilization','cost_sharing']),
                        columns=['abortion_status'])
chc_df.dtypes

total_community_health_centers                                                                        int64
uninsured                                                                                             int64
total_insured                                                                                         int64
maternal_mortality                                                                                  float64
population                                                                                            int64
land_area_sqmi                                                                                        int64
no_doctor_visits                                                                                    float64
mammogram                                                                                           float64
no_provider                                                                                         float64
pap_smear                   

In [18]:
train_df = chc_df[chc_df['Class']==4]
X_train = train_df.drop(columns=['total_community_health_centers','Class'])
y_train = train_df[['total_community_health_centers']]

In [37]:
test_df = chc_df[(chc_df['Class']!=4) & (chc_df['Class']!=2)]
X_test = test_df.drop(columns=['total_community_health_centers','Class'])
y_test = test_df[['total_community_health_centers']]
y_test

Unnamed: 0_level_0,total_community_health_centers
state,Unnamed: 1_level_1
Alabama,17
Alaska,27
Arizona,23
Arkansas,12
Colorado,19
Florida,47
Georgia,35
Hawaii,14
Idaho,14
Indiana,27


In [27]:
# Scaling the remaining columns
scalerX = StandardScaler().fit(X_train)
scalery = StandardScaler().fit(y_train)
X_train_scaled = scalerX.transform(X_train)
y_train_scaled = scalery.transform(y_train)
X_test_scaled = scalerX.transform(X_test)
y_test_scaled = scalery.transform(y_test)
X_test_scaled

array([[ 6.33195608e-01,  7.55474735e-03,  3.55776448e+00,
         2.82344820e-02,  2.57190746e-01,  1.87794214e+00,
         2.45651842e-01, -6.35641726e-02,  1.84021377e-02,
         3.05724904e+00,  1.46053397e+00, -7.15018225e-01,
         2.40711500e+00, -7.12702431e-01, -4.82621623e-01,
         1.23377418e-01,  9.95235149e-01,  5.00000000e+00,
        -1.44145888e+00,  0.00000000e+00, -1.00000000e+00,
         1.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00],
       [-8.44260811e-01, -1.15439580e+00,  2.02556722e+00,
        -1.13869783e+00,  1.28464274e+01,  1.87794214e+00,
        -2.45651842e+00,  2.16118187e+00, -2.30026721e+00,
         1.33646447e+00,  6.20471599e-01, -1.00943749e+00,
        -4.36733944e-01, -9.94712189e-01, -1.01686290e+00,
        -5.96324188e-01, -7.64259074e-01,  0.00000000e+00,
        -1.03377354e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00

In [21]:
# Creating an object of LinearRegression class
LR = LinearRegression()

# Fitting the training data
LR.fit(X_train_scaled, y_train_scaled)

LinearRegression()

In [22]:
y_pred_scaled = LR.predict(X_test_scaled)
y_pred_scaled

array([[-2.26632539],
       [11.16698436],
       [-0.09820302],
       [-4.85055381],
       [-0.58444413],
       [ 6.47551906],
       [ 0.18234378],
       [-1.52588449],
       [-0.9823054 ],
       [-2.6866812 ],
       [ 0.02533821],
       [-2.84241159],
       [-2.84917612],
       [-1.80297408],
       [ 0.16228198],
       [ 0.35048385],
       [-3.44823538],
       [ 0.99228728],
       [ 0.63369367],
       [-1.61279559],
       [-0.75855547],
       [ 0.75434238],
       [-0.50439831],
       [-0.47535369],
       [-3.9194538 ],
       [ 1.43785822],
       [-0.06718896],
       [-0.6429594 ],
       [ 2.17517691],
       [-1.99085116],
       [14.98364814],
       [-1.0071466 ],
       [-1.85413574],
       [ 1.40182483],
       [ 0.64861789]])

In [24]:
# Returning data to pre-scaled format
y_pred = scalery.inverse_transform(y_pred_scaled)

array([[-5.40626154e+00],
       [ 1.47302521e+02],
       [ 1.92407788e+01],
       [-3.47835620e+01],
       [ 1.37132291e+01],
       [ 9.39703217e+01],
       [ 2.24300123e+01],
       [ 3.01101099e+00],
       [ 9.19037425e+00],
       [-1.01848325e+01],
       [ 2.06451856e+01],
       [-1.19551631e+01],
       [-1.20320617e+01],
       [-1.38921185e-01],
       [ 2.22019514e+01],
       [ 2.43414150e+01],
       [-1.88421190e+01],
       [ 3.16373847e+01],
       [ 2.75609214e+01],
       [ 2.02301255e+00],
       [ 1.17339454e+01],
       [ 2.89324462e+01],
       [ 1.46231836e+01],
       [ 1.49533605e+01],
       [-2.41988921e+01],
       [ 3.67025993e+01],
       [ 1.95933441e+01],
       [ 1.30480322e+01],
       [ 4.50843787e+01],
       [-2.27469268e+00],
       [ 1.90690047e+02],
       [ 8.90798148e+00],
       [-7.20522783e-01],
       [ 3.62929746e+01],
       [ 2.77305787e+01]])

In [29]:
compare_df = y_test['total_community_health_centers'].to_frame()
compare_df['predictions'] = y_pred.astype(int)
compare_df['addl_chc_needed'] = compare_df['predictions'] - compare_df['total_community_health_centers']
compare_df['Class'] = clustered_df['Class']
#compare_df = compare_df[compare_df['addl_chc_needed']>0]
compare_df.sort_values(by=['addl_chc_needed'],ascending=False)

Unnamed: 0_level_0,total_community_health_centers,predictions,addl_chc_needed,Class
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Alaska,27,147,120,0
Texas,72,190,118,3
Florida,47,93,46,3
South Dakota,4,45,41,0
Wyoming,6,27,21,0
Wisconsin,16,36,20,1
Montana,14,27,13,0
Rhode Island,8,19,11,1
North Dakota,4,14,10,0
Minnesota,16,24,8,1


In [54]:
# Getting feature importance
importance = LR.coef_

d = {'Feature':X_train.columns, 'Importance':importance[0]}
fi_df = pd.DataFrame(data=d)
print(fi_df)
fi_df.sort_values(by=['Importance'], ascending=False)

                                              Feature  Importance
0                                           uninsured    0.389561
1                                       total_insured    0.334030
2                                  maternal_mortality   -0.209684
3                                          population    0.399309
4                                      land_area_sqmi    1.340777
5                                    no_doctor_visits   -0.553699
6                                           mammogram    0.407875
7                                         no_provider   -0.380262
8                                           pap_smear   -0.066725
9                                         teen_births   -1.122156
10                                  poverty_under_200    0.763528
11                        percent_of_all_us_abortions    0.556908
12          percent_residents_traveling_outside_state    0.717067
13                          abortions_occurring_state    0.589229
14        

Unnamed: 0,Feature,Importance
4,land_area_sqmi,1.340777
10,poverty_under_200,0.763528
12,percent_residents_traveling_outside_state,0.717067
13,abortions_occurring_state,0.589229
11,percent_of_all_us_abortions,0.556908
6,mammogram,0.407875
3,population,0.399309
0,uninsured,0.389561
1,total_insured,0.33403
17,restricted_services,0.0


In [55]:
chc2_df = clustered_df.drop(columns=['PC 1','PC 2','PC 3','prescription_contraception',
                                     'otc_methods','male_sterilization','female_sterilization','cost_sharing',
                                     'abortion_status','maternal_mortality','no_doctor_visits','no_provider',
                                     'pap_smear','teen_births','abortions_residence_state','no_services',
                                     'few_services','restricted_services','full_service',])
chc2_df.head()

Unnamed: 0_level_0,total_community_health_centers,uninsured,total_insured,population,land_area_sqmi,mammogram,poverty_under_200,percent_of_all_us_abortions,percent_residents_traveling_outside_state,abortions_occurring_state,Class
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Alabama,17,165700,1462900,4767100,50645,0.74,0.332,0.6,47,5700,0
Alaska,27,29200,207200,701700,570641,0.63,0.285,0.1,7,1240,0
Arizona,23,309000,2132900,7098000,113594,0.68,0.298,1.4,6,13320,0
Arkansas,12,83800,865900,2922500,52035,0.7,0.334,0.3,37,3250,0
California,175,1034300,11885100,38642700,155779,0.68,0.272,16.6,0,154060,2


In [56]:
train2_df = chc2_df[chc2_df['Class']==4]
X_train2 = train2_df.drop(columns=['total_community_health_centers','Class'])
y_train2 = train2_df[['total_community_health_centers']]

In [57]:
test2_df = chc2_df[(chc2_df['Class']!=4) & (chc2_df['Class']!=2)]
X_test2 = test2_df.drop(columns=['total_community_health_centers','Class'])
y_test2 = test2_df[['total_community_health_centers']]

In [59]:
# Scaling the remaining columns
scalerX2 = StandardScaler().fit(X_train2)
scalery2 = StandardScaler().fit(y_train2)
X_train_scaled2 = scalerX2.transform(X_train2)
y_train_scaled2 = scalery2.transform(y_train2)
X_test_scaled2 = scalerX2.transform(X_test2)
y_test_scaled2 = scalery2.transform(y_test2)

In [65]:
# Creating an object of LinearRegression class
LR2 = LinearRegression()

# Fitting the training data
LR2.fit(X_train_scaled2, y_train_scaled2)

LinearRegression()

In [66]:
y_pred_scaled2 = LR2.predict(X_test_scaled2)

In [67]:
# Returning data to pre-scaled format
y_pred2 = scalery2.inverse_transform(y_pred_scaled2)

In [68]:
compare_df2 = y_test2['total_community_health_centers'].to_frame()
compare_df2['predictions'] = y_pred2.astype(int)
compare_df2['addl_chc_needed'] = compare_df2['predictions'] - compare_df2['total_community_health_centers']
compare_df2['Class'] = clustered_df['Class']
#compare_df = compare_df[compare_df['addl_chc_needed']>0]
compare_df.sort_values(by=['addl_chc_needed'],ascending=False)

Unnamed: 0_level_0,total_community_health_centers,predictions,addl_chc_needed,Class
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Alaska,27,147,120,0
Texas,72,190,118,3
Florida,47,93,46,3
South Dakota,4,45,41,0
Wyoming,6,27,21,0
Wisconsin,16,36,20,1
Montana,14,27,13,0
Rhode Island,8,19,11,1
North Dakota,4,14,10,0
Minnesota,16,24,8,1


In [69]:
# Getting feature importance
importance2 = LR2.coef_

d2 = {'Feature':X_train2.columns, 'Importance':importance2[0]}
fi_df2 = pd.DataFrame(data=d2)
print(fi_df2)
fi_df.sort_values(by=['Importance'], ascending=False)

                                     Feature  Importance
0                                  uninsured   -1.061690
1                              total_insured   -4.471782
2                                 population    7.066438
3                             land_area_sqmi    0.045600
4                                  mammogram   -0.046098
5                          poverty_under_200    0.084878
6                percent_of_all_us_abortions    4.545018
7  percent_residents_traveling_outside_state   -0.043456
8                  abortions_occurring_state   -5.396517


Unnamed: 0,Feature,Importance
4,land_area_sqmi,1.340777
10,poverty_under_200,0.763528
12,percent_residents_traveling_outside_state,0.717067
13,abortions_occurring_state,0.589229
11,percent_of_all_us_abortions,0.556908
6,mammogram,0.407875
3,population,0.399309
0,uninsured,0.389561
1,total_insured,0.33403
17,restricted_services,0.0


In [183]:
# using only population, land area, and Class (cluster output) as features
chc2_df = clustered_df[['total_community_health_centers','population','land_area_sqmi','Class']]
chc2_df

Unnamed: 0_level_0,total_community_health_centers,population,land_area_sqmi,Class
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Alabama,17,4767100,50645,0
Alaska,27,701700,570641,0
Arizona,23,7098000,113594,0
Arkansas,12,2922500,52035,0
California,175,38642700,155779,2
Colorado,19,5611800,103642,1
Connecticut,16,3453300,4842,4
Delaware,3,940300,1949,4
District of Columbia,8,671300,61,4
Florida,47,20992000,53625,3


In [186]:
#training_df = chc2_df[(chc_df['Class']==4) | (chc_df['Class']==2)]
training2_df = chc2_df[chc2_df['Class']==4]
X_train2 = training2_df.drop(columns='total_community_health_centers')
y_train2 = training2_df['total_community_health_centers']

In [187]:
fit2_df = chc2_df
X2 = fit2_df.drop(columns=['total_community_health_centers'])
y2 = fit2_df[['total_community_health_centers']]

In [188]:
# Creating an object of LinearRegression class
LR = LinearRegression()

# Fitting the training data
LR.fit(X_train2, y_train2)

LinearRegression()

In [189]:
y_pred2 = LR.predict(X2)
y_pred2

array([ 20.93461749,  25.3555075 ,  29.05046266,  16.02234673,
       115.00112832,  24.76605675,  16.05570467,   9.22226197,
         8.44423349,  64.59093395,  35.97847458,  10.43558836,
        13.77907482,  41.47362737,  25.18253126,  16.50484242,
        16.60020433,  19.40300633,  19.99414217,  11.04658177,
        22.70348931,  24.7291396 ,  34.53084319,  23.79724587,
        15.72280026,  24.61769852,  13.72386359,  13.9369235 ,
        18.01346604,  10.43233926,  30.21705164,  15.69930428,
        58.80392108,  35.30177167,  10.63797652,  38.32699752,
        18.93936342,  20.55829334,  41.19686679,   9.40150303,
        20.9104531 ,  11.16188139,  25.68874316,  90.30142058,
        17.53310998,   8.51773329,  29.78388448,  28.53522874,
        12.01852602,  23.48184171,  11.01463615])

In [191]:
compare2_df = y2['total_community_health_centers'].to_frame()
compare2_df['predictions'] = y_pred2.astype(int)
compare2_df['addl_chc_needed'] = compare2_df['predictions'] - compare2_df['total_community_health_centers']
compare2_df['Class'] = clustered_df['Class']
compare2_df = compare2_df[compare2_df['addl_chc_needed']>0]
compare2_df.sort_values(by=['addl_chc_needed'],ascending=False)

Unnamed: 0_level_0,total_community_health_centers,predictions,addl_chc_needed,Class
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Texas,72,90,18,3
Florida,47,64,17,3
Nevada,8,18,10,4
New Jersey,23,30,7,4
Wisconsin,16,23,7,1
South Dakota,4,11,7,0
Minnesota,16,23,7,1
Delaware,3,9,6,4
North Dakota,4,10,6,0
Nebraska,7,13,6,0
