In [1]:
# Import dependencies
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
#from tensorflow.keras.callbacks import ModelCheckpoint
from sqlalchemy.orm import Session
from sqlalchemy import create_engine
from config import pw_postgres

In [2]:
# Import and read data

DB_PASS = pw_postgres
aws_database = 'shrbfk-final-project.cuitpsvagrne.us-east-2.rds.amazonaws.com:5432'
db_string = "postgresql+psycopg2://postgres:" + DB_PASS + "@" + aws_database + "/vet_data"
try:
    db_engine = create_engine(db_string)
    
except Exception as e:
    print(f"\nFailed to create database connection to {aws_database}.\n", e)
    exit()
    
combined_df = pd.read_sql_table('people_vet_join', db_engine)

In [3]:
combined_df.head()

Unnamed: 0,FIPS,State,County,PopChangeRate1819,PopChangeRate1019,TotalPopEst2019,NetMigrationRate1019,NaturalChangeRate1019,Net_International_Migration_Rate_2010_2019,PopChangeRate0010,...,WhiteVetsPct,BlackVetsPct,HispanicVetsPct,OtherRaceVetsPct,LessThanHSVetsPct,HighSchOnlyVetsPct,SomeCollegeVetsPct,CollegeDegreeVetsPct,EmployeedVetsPct,UnemployeedVetsPct
0,1000,AL,Alabama,0.317,2.461,4903185,1.059,1.402,0.809,7.48,...,72.534198,22.945607,2.022065,3.059596,6.396034,28.257189,37.38537,27.961408,71.330519,4.319786
1,1001,AL,Autauga,0.605,2.001,55869,0.686,1.315,-0.029,24.96,...,84.028832,9.673748,5.576631,1.11912,3.007812,20.859375,31.621094,44.511719,86.54934,3.55846
2,1003,AL,Baldwin,2.469,21.911,223234,21.001,0.91,0.714,29.8,...,90.62198,5.360321,1.439251,2.92936,3.612916,27.659792,36.676731,32.05056,74.927707,3.430532
3,1005,AL,Barbour,-0.748,-9.664,24686,-8.797,-0.867,0.161,-5.44,...,59.563253,37.349398,1.054217,3.087349,12.349398,39.834337,36.746988,11.069277,49.752066,9.966777
4,1007,AL,Bibb,0.121,-2.081,22394,-2.099,0.017,0.525,10.03,...,79.134682,18.492673,2.372645,0.0,6.90151,54.708843,22.286125,16.103523,56.901408,0.0


In [4]:
df_labels = combined_df[['State', 'County']]
combined_df = combined_df.drop(columns=['FIPS', 'State', 'County'])
#combined_df = combined_df.drop(axis=0, index=0)

In [5]:
#combined_df = combined_df.drop(axis=0, index=0)

In [6]:
# Drop non-helpful columns(if there are any)
#combined_df.drop(columns=['columnname1', 'columnname2'], axis=1, inplace=True)
combined_df.head()

Unnamed: 0,PopChangeRate1819,PopChangeRate1019,TotalPopEst2019,NetMigrationRate1019,NaturalChangeRate1019,Net_International_Migration_Rate_2010_2019,PopChangeRate0010,NetMigrationRate0010,NaturalChangeRate0010,Immigration_Rate_2000_2010,...,WhiteVetsPct,BlackVetsPct,HispanicVetsPct,OtherRaceVetsPct,LessThanHSVetsPct,HighSchOnlyVetsPct,SomeCollegeVetsPct,CollegeDegreeVetsPct,EmployeedVetsPct,UnemployeedVetsPct
0,0.317,2.461,4903185,1.059,1.402,0.809,7.48,3.3,3.3,1.222931,...,72.534198,22.945607,2.022065,3.059596,6.396034,28.257189,37.38537,27.961408,71.330519,4.319786
1,0.605,2.001,55869,0.686,1.315,-0.029,24.96,11.87,5.46,-0.010222,...,84.028832,9.673748,5.576631,1.11912,3.007812,20.859375,31.621094,44.511719,86.54934,3.55846
2,2.469,21.911,223234,21.001,0.91,0.714,29.8,26.17,3.32,1.584455,...,90.62198,5.360321,1.439251,2.92936,3.612916,27.659792,36.676731,32.05056,74.927707,3.430532
3,-0.748,-9.664,24686,-8.797,-0.867,0.161,-5.44,-4.8,2.29,1.828365,...,59.563253,37.349398,1.054217,3.087349,12.349398,39.834337,36.746988,11.069277,49.752066,9.966777
4,0.121,-2.081,22394,-2.099,0.017,0.525,10.03,6.43,2.1,0.341485,...,79.134682,18.492673,2.372645,0.0,6.90151,54.708843,22.286125,16.103523,56.901408,0.0


In [7]:
combined_df.columns

Index(['PopChangeRate1819', 'PopChangeRate1019', 'TotalPopEst2019',
       'NetMigrationRate1019', 'NaturalChangeRate1019',
       'Net_International_Migration_Rate_2010_2019', 'PopChangeRate0010',
       'NetMigrationRate0010', 'NaturalChangeRate0010',
       'Immigration_Rate_2000_2010',
       ...
       'WhiteVetsPct', 'BlackVetsPct', 'HispanicVetsPct', 'OtherRaceVetsPct',
       'LessThanHSVetsPct', 'HighSchOnlyVetsPct', 'SomeCollegeVetsPct',
       'CollegeDegreeVetsPct', 'EmployeedVetsPct', 'UnemployeedVetsPct'],
      dtype='object', length=102)

In [8]:
# Look at value counts for binning if needed
column_count= combined_df.TotalVets.value_counts()
column_count

496.0      6
169.0      6
871.0      5
1207.0     5
667.0      5
          ..
25027.0    1
2891.0     1
15642.0    1
768.0      1
3896.0     1
Name: TotalVets, Length: 2536, dtype: int64

In [9]:
# Visualize the value count of column_count to identify which values to replace if needed
#column_count.plot.density()

In [10]:
# Split into features and target arrays
y = combined_df['TotalVets'].values
X = combined_df.drop(['TotalVets'],1).values

In [11]:
# Split preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split( X, y, random_state=42)

In [12]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [13]:
# Fit the model
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)
lin_reg_y_pred = lin_reg.predict(X_test)
mse = mean_squared_error(y_test, lin_reg_y_pred)
print(mse)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [None]:
# Fitting Random Forest Regression to the dataset
regressor = RandomForestRegressor(n_estimators = 10, random_state = 0)
regressor.fit(X.reshape(-1,1), y.reshape(-1, 1))

In [None]:
ridge = Ridge()
ridge.fit(X_train, y_train)
y_pred_ridge = ridge.predict(X_test)
mse = mean_squared_error(y_test, y_pred_ridge)
print(mse)

In [None]:
lasso = Lasso()
lasso.fit(X_train, y_train)
y_pred_lasso = lasso.predict(X_test)
mse = mean_squared_error(y_test, y_pred_lasso)
print(mse)

In [None]:
# Fit the model
##lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

In [None]:
# Fit the model
#lin_reg = LinearRegression()
#lin_reg.fit(X_train, y_train)
lin_reg_y_pred = lin_reg.predict(X_test)
mse = mean_squared_error(y_test, lin_reg_y_pred)
print(mse)