In [134]:
import pickle

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.preprocessing import StandardScaler

%matplotlib inline

In [135]:
# Load the county-level employment table; the first CSV column becomes the index.
employment_csv_path = '../data_files/Employment_by_County.csv'
employment_county_df = pd.read_csv(employment_csv_path, index_col=0)
employment_county_df.head()

Unnamed: 0_level_0,county,labor_force,employed,unemployed,unemployed_pct,population,race_white,race_black,race_native,race_asian,race_islander,race_other,race_two_or_more
geoid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1001,"Autauga County, Alabama",25838,24576,1262,4.9,58805,42160,11445,217,881,35,910,3157
1003,"Baldwin County, Alabama",96763,91338,5425,5.6,231767,189399,18217,1582,2067,143,5335,15024
1005,"Barbour County, Alabama",8587,7982,605,7.0,25223,11317,11933,116,117,1,1039,700
1007,"Bibb County, Alabama",8640,8067,573,6.6,22293,16555,4413,60,32,9,465,759
1009,"Blount County, Alabama",24661,23653,1008,4.1,59134,50663,845,337,178,24,3431,3656


In [136]:
# Work on an independent (deep) copy so the frame read from disk stays untouched.
employment_county_copy = employment_county_df.copy(deep=True)

In [137]:
# Fold the three smaller race columns into a single row-wise total.
minor_race_columns = ['race_native', 'race_islander', 'race_other']
employment_county_copy['race_others'] = (
    employment_county_copy[minor_race_columns].sum(axis='columns')
)
employment_county_copy.head()

Unnamed: 0_level_0,county,labor_force,employed,unemployed,unemployed_pct,population,race_white,race_black,race_native,race_asian,race_islander,race_other,race_two_or_more,race_others
geoid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1001,"Autauga County, Alabama",25838,24576,1262,4.9,58805,42160,11445,217,881,35,910,3157,1162
1003,"Baldwin County, Alabama",96763,91338,5425,5.6,231767,189399,18217,1582,2067,143,5335,15024,7060
1005,"Barbour County, Alabama",8587,7982,605,7.0,25223,11317,11933,116,117,1,1039,700,1156
1007,"Bibb County, Alabama",8640,8067,573,6.6,22293,16555,4413,60,32,9,465,759,534
1009,"Blount County, Alabama",24661,23653,1008,4.1,59134,50663,845,337,178,24,3431,3656,3792


In [138]:
# Reduce the column count: remove the three race columns by name.
employment_county_copy = employment_county_copy.drop(
    columns=['race_native', 'race_islander', 'race_other']
)

In [139]:
# True if any cell in the frame is missing.
employment_county_copy.isna().to_numpy().any()

False

In [140]:
# Dimensions of the reduced frame as (rows, columns).
employment_county_copy.shape

(3125, 11)

In [141]:
# Remaining column names as a plain Python list.
list(employment_county_copy.columns)

['county',
 'labor_force',
 'employed',
 'unemployed',
 'unemployed_pct',
 'population',
 'race_white',
 'race_black',
 'race_asian',
 'race_two_or_more',
 'race_others']

In [142]:
# Summarise index, per-column non-null counts, dtypes and memory usage.
employment_county_copy.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3125 entries, 1001 to 56045
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   county            3125 non-null   object 
 1   labor_force       3125 non-null   int64  
 2   employed          3125 non-null   int64  
 3   unemployed        3125 non-null   int64  
 4   unemployed_pct    3125 non-null   float64
 5   population        3125 non-null   int64  
 6   race_white        3125 non-null   int64  
 7   race_black        3125 non-null   int64  
 8   race_asian        3125 non-null   int64  
 9   race_two_or_more  3125 non-null   int64  
 10  race_others       3125 non-null   int64  
dtypes: float64(1), int64(9), object(1)
memory usage: 293.0+ KB


In [143]:
# Features: the five race-count columns. Target: the labor-force size.
feature_columns = [
    'race_white',
    'race_black',
    'race_asian',
    'race_two_or_more',
    'race_others',
]
X = employment_county_copy[feature_columns]
y = employment_county_copy['labor_force']

In [144]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

In [145]:
# Random forest with 10 trees and a fixed seed, fitted on the training split.
model = RandomForestRegressor(random_state=0, n_estimators=10)
model.fit(X=X_train, y=y_train)

RandomForestRegressor(n_estimators=10, random_state=0)

In [146]:
# Predictions for the held-out rows, reused by the metric cells.
y_pred = model.predict(X=X_test)

In [147]:
# Coefficient of determination (R^2) of the held-out predictions.
# r2_score comes from the notebook's top-level import cell rather than a
# mid-notebook import, so all dependencies are visible in one place.
r2_score(y_test, y_pred)

0.9808327818575827

In [148]:
# Model score (R^2 for scikit-learn regressors) on the training split.
model.score(X=X_train, y=y_train)

0.9756797782440916

In [149]:
# Model score (R^2 for scikit-learn regressors) on the held-out split.
model.score(X=X_test, y=y_test)

0.9808327818575827

In [150]:
# Mean absolute error of the held-out predictions, in the same units as
# labor_force. mean_absolute_error comes from the notebook's top-level
# import cell rather than a mid-notebook import.
mae = mean_absolute_error(y_test, y_pred)
mae

4423.9664

## Feature importance

In [151]:
# Pair each importance with its feature name and list them largest first.
importance_pairs = list(zip(model.feature_importances_, X.columns))
importance_pairs.sort(reverse=True)
importance_pairs

[(0.5845257307770042, 'race_white'),
 (0.20918184329404005, 'race_two_or_more'),
 (0.10531549717435326, 'race_asian'),
 (0.06333469709843384, 'race_black'),
 (0.03764223165616864, 'race_others')]

### The MAE seems a bit high, but consider the min–max range of `labor_force`

In [152]:
# Range of the target over the full dataset, to put the MAE in context.
labor_force_all = employment_county_df['labor_force']
min_labor_force = labor_force_all.min()
max_labor_force = labor_force_all.max()
print ('min = ', min_labor_force)
print('max = ', max_labor_force)

min =  184
max =  4921499


In [153]:
# Compare the first ten held-out targets with the model's predictions for the same rows.
actual_sample = list(y_test.head(10))
predicted_sample = list(model.predict(X_test.head(10)))
print(f'Actual:\t\t{actual_sample}')
print(f'Predicted:\t{predicted_sample}')

Actual:		[5008, 3075, 28305, 27311, 9649, 20710, 15909, 136480, 5561, 13659]
Predicted:	[5178.6, 3277.1, 28422.4, 25899.3, 9625.8, 21722.4, 18502.6, 133694.6, 5313.1, 12604.8]


In [154]:
# Serialise the fitted model to disk with pickle.
filename = '../Flask_presentation/static/model/finalized_model.sav'
# Use a context manager so the file handle is flushed and closed even if
# pickling raises; a bare open(...) passed to pickle.dump is never closed
# explicitly.
# NOTE: only unpickle this file from a trusted location — pickle.load can
# execute arbitrary code.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)