In [1]:
import pickle

import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.preprocessing import StandardScaler

%matplotlib inline
from matplotlib import pyplot as plt

In [2]:
# Load the county-level employment table; the first CSV column is used as the index.
employment_county_df = pd.read_csv(
    '../data_files/Employment_by_County.csv',
    index_col=0,
)
employment_county_df.head(5)

Unnamed: 0_level_0,county,labor_force,employed,unemployed,unemployed_pct,population,race_white,race_black,race_native,race_asian,race_islander,race_other,race_two_or_more
geoid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1001,"Autauga County, Alabama",25838,24576,1262,4.9,58805,42160,11445,217,881,35,910,3157
1003,"Baldwin County, Alabama",96763,91338,5425,5.6,231767,189399,18217,1582,2067,143,5335,15024
1005,"Barbour County, Alabama",8587,7982,605,7.0,25223,11317,11933,116,117,1,1039,700
1007,"Bibb County, Alabama",8640,8067,573,6.6,22293,16555,4413,60,32,9,465,759
1009,"Blount County, Alabama",24661,23653,1008,4.1,59134,50663,845,337,178,24,3431,3656


# Copy of the dataframe to feed the ML model



In [3]:
# Work on a copy so the frame loaded from the CSV is not touched by later column edits.
employment_county_copy = employment_county_df.copy()

In [4]:
# Column names available before any columns are added or dropped.
employment_county_copy.columns.to_list()

['county',
 'labor_force',
 'employed',
 'unemployed',
 'unemployed_pct',
 'population',
 'race_white',
 'race_black',
 'race_native',
 'race_asian',
 'race_islander',
 'race_other',
 'race_two_or_more']

In [5]:
# Dtypes and non-null counts for every column.
employment_county_copy.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3125 entries, 1001 to 56045
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   county            3125 non-null   object 
 1   labor_force       3125 non-null   int64  
 2   employed          3125 non-null   int64  
 3   unemployed        3125 non-null   int64  
 4   unemployed_pct    3125 non-null   float64
 5   population        3125 non-null   int64  
 6   race_white        3125 non-null   int64  
 7   race_black        3125 non-null   int64  
 8   race_native       3125 non-null   int64  
 9   race_asian        3125 non-null   int64  
 10  race_islander     3125 non-null   int64  
 11  race_other        3125 non-null   int64  
 12  race_two_or_more  3125 non-null   int64  
dtypes: float64(1), int64(11), object(1)
memory usage: 341.8+ KB


In [6]:
# Combine three race-count columns into a single `race_others` feature.
# The frame is rebuilt from `employment_county_df` instead of being mutated in
# place, so this cell can be re-run even after the three source columns have
# been dropped from `employment_county_copy`.
employment_county_copy = employment_county_df.assign(
    race_others=employment_county_df[
        ['race_native', 'race_islander', 'race_other']
    ].sum(axis=1)
)
employment_county_copy.head(5)

Unnamed: 0_level_0,county,labor_force,employed,unemployed,unemployed_pct,population,race_white,race_black,race_native,race_asian,race_islander,race_other,race_two_or_more,race_others
geoid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1001,"Autauga County, Alabama",25838,24576,1262,4.9,58805,42160,11445,217,881,35,910,3157,1162
1003,"Baldwin County, Alabama",96763,91338,5425,5.6,231767,189399,18217,1582,2067,143,5335,15024,7060
1005,"Barbour County, Alabama",8587,7982,605,7.0,25223,11317,11933,116,117,1,1039,700,1156
1007,"Bibb County, Alabama",8640,8067,573,6.6,22293,16555,4413,60,32,9,465,759,534
1009,"Blount County, Alabama",24661,23653,1008,4.1,59134,50663,845,337,178,24,3431,3656,3792


In [7]:
# Fewer feature columns for the model: the three counts dropped here are
# already summed into `race_others`.
employment_county_copy = employment_county_copy.drop(
    columns=['race_native', 'race_islander', 'race_other']
)

In [8]:
# True if any cell in the frame is missing.
employment_county_copy.isnull().values.any()

False

In [9]:
# (rows, columns) after adding `race_others` and dropping its three source columns.
employment_county_copy.shape

(3125, 11)

In [10]:
# Column names remaining after the drop.
employment_county_copy.columns.to_list()

['county',
 'labor_force',
 'employed',
 'unemployed',
 'unemployed_pct',
 'population',
 'race_white',
 'race_black',
 'race_asian',
 'race_two_or_more',
 'race_others']

In [11]:
# Re-check dtypes and non-null counts on the reduced frame.
employment_county_copy.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3125 entries, 1001 to 56045
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   county            3125 non-null   object 
 1   labor_force       3125 non-null   int64  
 2   employed          3125 non-null   int64  
 3   unemployed        3125 non-null   int64  
 4   unemployed_pct    3125 non-null   float64
 5   population        3125 non-null   int64  
 6   race_white        3125 non-null   int64  
 7   race_black        3125 non-null   int64  
 8   race_asian        3125 non-null   int64  
 9   race_two_or_more  3125 non-null   int64  
 10  race_others       3125 non-null   int64  
dtypes: float64(1), int64(9), object(1)
memory usage: 293.0+ KB


# Targeting "unemployed" but we can target any of the other employment columns

- labor_force
- employed
- unemployed_pct

In [12]:
# Features are the five race-count columns; the target is the `unemployed` count.
feature_columns = [
    'race_white',
    'race_black',
    'race_asian',
    'race_two_or_more',
    'race_others',
]
X = employment_county_copy[feature_columns]
y = employment_county_copy['unemployed']

In [13]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

In [14]:
# Fit a 10-tree random forest on the training split; random_state=0 fixes the seed.
model = RandomForestRegressor(n_estimators=10,random_state=0)
model.fit(X_train,y_train)

RandomForestRegressor(n_estimators=10, random_state=0)

In [15]:
# Predictions for the held-out rows, used by the metric cells below.
y_pred = model.predict(X_test)

In [16]:
# R² of the held-out predictions against the true test targets.
# (`r2_score` is imported in the notebook's top import cell.)
r2_score(y_test, y_pred)

0.9208485402280945

In [17]:
# Score on the training split, for comparison with the test score.
model.score(X_train,y_train)

0.9300568140070107

In [18]:
# Score on the held-out split; this reproduces the r2_score(y_test, y_pred) value.
model.score(X_test,y_test)

0.9208485402280945

In [19]:
# Mean absolute error of the test predictions, in the same units as the target.
# (`mean_absolute_error` is imported in the notebook's top import cell.)
mae = mean_absolute_error(y_test, y_pred)
mae

807.2108800000001

## Feature importance

In [20]:
# Pair each importance with its feature name, then list them highest first.
importance_pairs = zip(model.feature_importances_, X.columns)
sorted(importance_pairs, reverse=True)

[(0.31313917769364513, 'race_white'),
 (0.20811900505711378, 'race_others'),
 (0.20096570924356402, 'race_two_or_more'),
 (0.16777759355891686, 'race_asian'),
 (0.1099985144467602, 'race_black')]

In [21]:
# Serialize the fitted model to disk. The context manager closes the file handle,
# flushing the pickle bytes, even if dumping raises.
# NOTE(review): whatever loads this file should only unpickle it from a trusted location.
filename = '../Flask_presentation/static/model/finalized_model_unemployed.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)