In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import r2_score
from sqlalchemy import create_engine
from config import db_password

In [2]:
# Loading clustered dataset
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5433/Reproducitve_Access"
engine = create_engine(db_string)
clustered_df = pd.read_sql('clustered_data', engine.connect(), index_col='state')
#file = "Resources/MLM/clustered_data.csv"
#clustered_df = pd.read_csv(file, index_col=0)
clustered_df

Unnamed: 0_level_0,abortion_status,total_community_health_centers,uninsured,insured,maternal_mortality,population,no_doctor_visits,mammogram,no_provider,pap_smear,...,abortions_residence_state,no_services,few_services,restricted_services,full_service,land_area_sqmi,PC 1,PC 2,PC 3,Class
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Alabama,Abortion Ban In Effect,17,165700,1297200,109.285826,4767100,0.12,0.74,0.13,0.74,...,9060,8,44,5,0,50645,-3.371831,0.786414,0.583273,0
Alaska,Abortion Available,27,29200,178000,84.486218,701700,0.12,0.63,0.23,0.65,...,1320,3,7,0,4,570641,-3.042478,0.957139,3.526903,0
Arizona,Status of pre-Roe ban unclear,23,309000,1823900,87.07292,7098000,0.11,0.68,0.19,0.69,...,13820,19,28,7,0,113594,-1.532947,0.99858,-0.055758,0
Arkansas,Abortion Ban In Effect,12,83800,782100,124.819154,2922500,0.12,0.7,0.13,0.68,...,4510,9,31,3,0,52035,-3.173066,0.138136,1.333202,0
California,Abortion Available,175,1034300,10850800,32.123048,38642700,0.09,0.68,0.19,0.74,...,152400,51,97,0,124,155779,8.765665,9.705473,-1.151322,4
Colorado,Abortion Available,19,204300,1554300,43.906723,5611800,0.11,0.67,0.17,0.69,...,11830,17,34,0,15,103642,-0.428704,-0.073767,-0.44344,3
Connecticut,Abortion Available,16,58100,979900,53.795577,3453300,0.07,0.78,0.1,0.8,...,11460,12,8,0,18,4842,3.868529,-2.793821,-1.031519,1
Delaware,Abortion Available,3,24800,271000,48.113934,940300,0.09,0.74,0.12,0.75,...,2870,10,0,0,2,1949,2.530963,-2.964892,0.810874,1
District of Columbia,Abortion Available,8,7600,244000,78.882128,671300,0.06,0.74,0.15,0.8,...,5010,2,0,0,2,61,2.728529,-2.960128,0.713949,1
Florida,"Abortion available, pre-viability gestational ...",47,1011800,5369800,67.248213,20992000,0.14,0.74,0.2,0.73,...,73830,22,129,0,48,53625,-0.099063,6.005208,-1.264445,2


In [3]:
clustered_df.dtypes

abortion_status                               object
total_community_health_centers                 int64
uninsured                                      int64
insured                                        int64
maternal_mortality                           float64
population                                     int64
no_doctor_visits                             float64
mammogram                                    float64
no_provider                                  float64
pap_smear                                    float64
prescription_contraception                    object
otc_methods                                   object
male_sterilization                            object
female_sterilization                          object
cost_sharing                                  object
teen_births                                  float64
poverty_under_200                            float64
percent_of_all_us_abortions                  float64
percent_residents_traveling_outside_state     

In [4]:
# using all features (minus principal components) as independent variables
chc_df = pd.get_dummies(data = clustered_df.drop(columns=['PC 1','PC 2','PC 3','prescription_contraception',
                              'otc_methods','male_sterilization','female_sterilization','cost_sharing']),
                        columns=['abortion_status'])
chc_df.dtypes

total_community_health_centers                                                                        int64
uninsured                                                                                             int64
insured                                                                                               int64
maternal_mortality                                                                                  float64
population                                                                                            int64
no_doctor_visits                                                                                    float64
mammogram                                                                                           float64
no_provider                                                                                         float64
pap_smear                                                                                           float64
teen_births                 

In [5]:
train_df = chc_df[(chc_df['Class']==4)|(chc_df['Class']==2)]
X_train = train_df.drop(columns=['total_community_health_centers','Class'])
y_train = train_df[['total_community_health_centers']]

In [6]:
test_df = chc_df[(chc_df['Class']!=4) & (chc_df['Class']!=2)]
X_test = test_df.drop(columns=['total_community_health_centers','Class'])
y_test = test_df[['total_community_health_centers']]
y_test

Unnamed: 0_level_0,total_community_health_centers
state,Unnamed: 1_level_1
Alabama,17
Alaska,27
Arizona,23
Arkansas,12
Colorado,19
Connecticut,16
Delaware,3
District of Columbia,8
Georgia,35
Hawaii,14


In [7]:
# Scaling the remaining columns
scalerX = StandardScaler().fit(X_train)
scalery = StandardScaler().fit(y_train)
X_train_scaled = scalerX.transform(X_train)
y_train_scaled = scalery.transform(y_train)
X_test_scaled = scalerX.transform(X_test)
y_test_scaled = scalery.transform(y_test)
X_test_scaled



array([[-1.50831479, -2.65903306,  3.36110615, ...,  0.        ,
         0.        ,  0.        ],
       [-1.73892717, -3.167189  ,  1.73020947, ...,  0.        ,
         0.        ,  0.        ],
       [-1.26621403, -2.41989285,  1.90031877, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [-1.73571718, -3.03597289,  0.72968113, ...,  1.        ,
         0.        ,  0.        ],
       [-1.60985182, -2.48636357, -1.98083858, ...,  0.        ,
         0.        ,  1.        ],
       [-1.7559908 , -3.18376127, -0.6063898 , ...,  1.        ,
         0.        ,  0.        ]])

In [8]:
# Creating an object of LinearRegression class
LR = LinearRegression()

# Fitting the training data
LR.fit(X_train_scaled, y_train_scaled)

LinearRegression()

In [9]:
y_pred_scaled = LR.predict(X_test_scaled)
y_pred_scaled

array([[-3.14243038],
       [-1.50266803],
       [-1.88012044],
       [-3.06322486],
       [-1.23850029],
       [-2.2533362 ],
       [-2.6242074 ],
       [-2.91185433],
       [-1.7813512 ],
       [-2.26194875],
       [-1.87735388],
       [-0.9698887 ],
       [-2.32915448],
       [-2.18392495],
       [-2.04117995],
       [-3.08886213],
       [-3.05481735],
       [-2.07305364],
       [-1.91060082],
       [-1.98597703],
       [-1.48910421],
       [-1.4137746 ],
       [-3.15340638],
       [-2.92760319],
       [-1.97665307],
       [-2.4544892 ],
       [-1.77739019],
       [-1.90719969],
       [-1.58179438],
       [-1.79941545],
       [-1.71919047],
       [-2.38568557],
       [-1.7281255 ],
       [-2.1771138 ],
       [-1.76100169],
       [-1.32020901],
       [-2.67365573],
       [-2.96285074],
       [-3.06682222],
       [-2.68971742],
       [-1.37887478],
       [-1.66462776],
       [-1.98437435],
       [-1.34764757],
       [-2.99863354],
       [-1

In [10]:
# Returning data to pre-scaled format
y_pred = scalery.inverse_transform(y_pred_scaled)

In [11]:
compare_df = y_test['total_community_health_centers'].to_frame()
compare_df['predictions'] = y_pred.astype(int)
compare_df['addl_chc_needed'] = compare_df['predictions'] - compare_df['total_community_health_centers']
compare_df['Class'] = clustered_df['Class']
#compare_df = compare_df[compare_df['addl_chc_needed']>0]
compare_df.sort_values(by=['addl_chc_needed'],ascending=False)

Unnamed: 0_level_0,total_community_health_centers,predictions,addl_chc_needed,Class
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Colorado,19,26,7,3
Utah,13,19,6,0
Minnesota,16,18,2,3
Illinois,45,40,-5,1
Washington,27,21,-6,1
Vermont,11,5,-6,1
Nevada,8,0,-8,1
New Jersey,23,9,-14,1
Alaska,27,13,-14,0
New Hampshire,10,-6,-16,3


In [12]:
# Getting feature importance
importance = LR.coef_

d = {'Feature':X_train.columns, 'Importance':importance[0]}
fi_df = pd.DataFrame(data=d)
print(fi_df)
fi_df.sort_values(by=['Importance'], ascending=False)

                                              Feature  Importance
0                                           uninsured    0.054011
1                                             insured    0.155543
2                                  maternal_mortality   -0.140265
3                                          population    0.157948
4                                    no_doctor_visits   -0.018778
5                                           mammogram   -0.138314
6                                         no_provider    0.039526
7                                           pap_smear   -0.051479
8                                         teen_births   -0.013536
9                                   poverty_under_200   -0.039475
10                        percent_of_all_us_abortions    0.098319
11          percent_residents_traveling_outside_state   -0.027477
12                          abortions_occurring_state    0.098741
13                          abortions_residence_state    0.106217
14        

Unnamed: 0,Feature,Importance
3,population,0.157948
1,insured,0.155543
13,abortions_residence_state,0.106217
17,full_service,0.099292
12,abortions_occurring_state,0.098741
10,percent_of_all_us_abortions,0.098319
18,land_area_sqmi,0.071728
0,uninsured,0.054011
14,no_services,0.048009
20,abortion_status_Abortion Available,0.041472


In [14]:
# Accuracy score based on training set
y_train_pred = LR.predict(X_train_scaled)
y_train_pred = scalery.inverse_transform(y_train_pred)
r2_score(y_train, y_train_pred)

1.0

In [15]:
chc2_df = clustered_df.drop(columns=['PC 1','PC 2','PC 3',
                                     'prescription_contraception','abortions_occurring_state',
                                     'percent_residents_traveling_outside_state','mammogram','insured','uninsured',
                                     'otc_methods','percent_of_all_us_abortions','male_sterilization',
                                     'female_sterilization','cost_sharing','abortion_status','no_doctor_visits',
                                     'no_provider','pap_smear','maternal_mortality','teen_births',
                                     'abortions_residence_state','few_services','restricted_services',
                                    'full_service','no_services'])
chc2_df

Unnamed: 0_level_0,total_community_health_centers,population,poverty_under_200,land_area_sqmi,Class
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Alabama,17,4767100,0.332,50645,0
Alaska,27,701700,0.285,570641,0
Arizona,23,7098000,0.298,113594,0
Arkansas,12,2922500,0.334,52035,0
California,175,38642700,0.272,155779,4
Colorado,19,5611800,0.219,103642,3
Connecticut,16,3453300,0.228,4842,1
Delaware,3,940300,0.292,1949,1
District of Columbia,8,671300,0.271,61,1
Florida,47,20992000,0.325,53625,2


In [16]:
train2_df = chc2_df[(chc2_df['Class']==4)|(chc2_df['Class']==1)]
X_train2 = train2_df.drop(columns=['total_community_health_centers','Class'])
y_train2 = train2_df[['total_community_health_centers']]

In [17]:
test2_df = chc2_df[(chc2_df['Class']!=4) & (chc2_df['Class']!=1)]
X_test2 = test2_df.drop(columns=['total_community_health_centers','Class'])
y_test2 = test2_df[['total_community_health_centers']]

In [18]:
# Scaling the remaining columns
scalerX2 = StandardScaler().fit(X_train2)
scalery2 = StandardScaler().fit(y_train2)
X_train_scaled2 = scalerX2.transform(X_train2)
y_train_scaled2 = scalery2.transform(y_train2)
X_test_scaled2 = scalerX2.transform(X_test2)
y_test_scaled2 = scalery2.transform(y_test2)



In [19]:
# Creating an object of LinearRegression class
LR2 = LinearRegression()

# Fitting the training data
LR2.fit(X_train_scaled2, y_train_scaled2)

LinearRegression()

In [20]:
y_pred_scaled2 = LR2.predict(X_test_scaled2)

In [21]:
# Returning data to pre-scaled format
y_pred2 = scalery2.inverse_transform(y_pred_scaled2)

In [22]:
compare_df2 = y_test2['total_community_health_centers'].to_frame()
compare_df2['predictions'] = y_pred2.astype(int)
compare_df2['addl_chc_needed'] = compare_df2['predictions'] - compare_df2['total_community_health_centers']
compare_df2['Class'] = clustered_df['Class']
#compare_df2 = compare_df[compare_df['addl_chc_needed']>0]
compare_df2.sort_values(by=['addl_chc_needed'],ascending=False)

Unnamed: 0_level_0,total_community_health_centers,predictions,addl_chc_needed,Class
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Texas,72,130,58,2
Florida,47,85,38,2
Alaska,27,46,19,0
Arizona,23,35,12,0
Minnesota,16,26,10,3
Wisconsin,16,25,9,3
Colorado,19,28,9,3
Georgia,35,43,8,0
Pennsylvania,42,50,8,3
Nebraska,7,12,5,0


In [23]:
# Getting feature importance
importance2 = LR2.coef_

d2 = {'Feature':X_train2.columns, 'Importance':importance2[0]}
fi_df2 = pd.DataFrame(data=d2)
print(fi_df2)
fi_df2.sort_values(by=['Importance'], ascending=False)

             Feature  Importance
0         population    0.922041
1  poverty_under_200    0.000754
2     land_area_sqmi    0.094442


Unnamed: 0,Feature,Importance
0,population,0.922041
2,land_area_sqmi,0.094442
1,poverty_under_200,0.000754


In [24]:
# Accuracy score based on training set
y_train_pred = LR2.predict(X_train_scaled2)
y_train_pred = scalery2.inverse_transform(y_train_pred)
r2_score(y_train2, y_train_pred)

0.951516727365398

In [23]:
# Saving recommendation data
file_path = "Resources/MLM/recommended_chcs.csv"
compare_df2.to_csv(file_path, index=True)

In [24]:
compare_df2.to_sql(name='recommended_chcs', con=engine)