In [None]:
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import sklearn
from google.colab import files
from joblib import dump
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

Step 1: Load dataset

Note: The 'census.csv' file was created using [U2_Build_Data_Cleaning.ipynb](https://colab.research.google.com/drive/1N_smUCm3c5zPi4yUCYwDkZpK2kq-FtJF?usp=sharing).

In [None]:
# Location of the census CSV inside the Colab runtime
file = '/content/census.csv'
# Parse the CSV into the DataFrame that the following cells build on
census = pd.read_csv(filepath_or_buffer=file)

Step 2: Feature engineering


In [None]:
# Total minority population of county (%): add the group columns one by one,
# left to right, starting from the first group.
minority_groups = ['Black', 'Hispanic', 'Asian', 'Native', 'Pacific']
census['Minority_pop'] = sum(
    (census[group] for group in minority_groups[1:]),
    census[minority_groups[0]],
)

In [None]:
# OLD: top features in the decision tree after permutation
#['Hispanic', 'Black', 'IncomePerCap', 'Drive', 'MeanCommute', 'PublicWork']

Step 3: Linear regression model predicting the distance from each county's population center to the closest spring

In [None]:
# Train / validation / test split (roughly 60% / 20% / 20% of the rows)
target = 'closest_spring'
# Columns that must not be used as predictors (the target itself plus the
# columns listed here); every remaining column becomes a feature.
non_feature_cols = [
    target, 'CountyId', 'State', 'County', 'Latitude', 'Longitude',
    'has_spring', 'coordinates', 'to_spring_cat',
]
features = census.columns.drop(non_feature_cols)

X, y = census[features], census[target]

# First hold out 20% of all rows as the test set ...
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)
# ... then take 25% of the remainder (20% of all rows) as the validation set
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.25, random_state=1
)

In [None]:
# Fit a linear regression on all features and report R2 on the training and
# validation sets (rounded to 4 decimal places)
lin_model = LinearRegression().fit(X_train, y_train)
train_r2 = round(lin_model.score(X_train, y_train), 4)
val_r2 = round(lin_model.score(X_val, y_val), 4)
print(f'Linear regression R2 (training set): {train_r2}')
print(f'Linear regression R2 (validation set): {val_r2}')

Linear regression R2 (training set): 0.226
Linear regression R2 (validation set): 0.1854


In [None]:
# Mean absolute error of the full linear model on the validation set.
# `mean_absolute_error` is imported in the notebook's top import cell rather
# than here, so later cells that use it do not depend on this cell having run.
pred = lin_model.predict(X_val)
true = y_val
print('mean absolute error', mean_absolute_error(true, pred))

mean absolute error 101.51890106431814


In [None]:
# Select K best: score every feature against the target with f_regression on
# the training set only, keep the k top-scoring columns, and apply the same
# column selection to the validation and test sets.
k = 17
Kbest = SelectKBest(score_func=f_regression, k=k)
X_train_kbest = Kbest.fit_transform(X_train, y_train)
X_val_kbest, X_test_kbest = (Kbest.transform(part) for part in (X_val, X_test))

# Refit the linear regression on the reduced feature set and report R2
kbest_model = LinearRegression().fit(X_train_kbest, y_train)
kbest_train_r2 = round(kbest_model.score(X_train_kbest, y_train), 4)
kbest_val_r2 = round(kbest_model.score(X_val_kbest, y_val), 4)
print(f'Linear regression R2 (training set): {kbest_train_r2}')
print(f'Linear regression R2 (validation set): {kbest_val_r2}')

# Validation-set mean absolute error of the reduced model
pred, true = kbest_model.predict(X_val_kbest), y_val
print('mean absolute error', mean_absolute_error(true, pred))

Linear regression R2 (training set): 0.2058
Linear regression R2 (validation set): 0.1864
mean absolute error 100.99097010903917


In [None]:
# Translate the selector's chosen column positions back into feature names
cols = Kbest.get_support(indices=True)  # integer positions of the kept columns
X_KbestFeatures = X.iloc[:, cols]       # full dataset restricted to those columns
Kbest_cols = X_KbestFeatures.columns.tolist()
Kbest_cols

['Hispanic',
 'White',
 'Black',
 'Pacific',
 'Poverty',
 'ChildPoverty',
 'Service',
 'Production',
 'Drive',
 'Carpool',
 'Transit',
 'OtherTransp',
 'MeanCommute',
 'PrivateWork',
 'PublicWork',
 'Unemployment',
 'Minority_pop']

Step 4: Export model for Dash app

In [None]:
# Serialize the k-best linear model to a compressed joblib file
joblib.dump(kbest_model, 'kbest.joblib', compress=True)
# Print the library versions used to create the file, in requirements-style
# `name==version` form
print(
    f'joblib=={joblib.__version__}',
    f'scikit-learn=={sklearn.__version__}',
    sep='\n',
)

joblib==1.0.1
scikit-learn==0.22.2.post1


In [None]:
# Send the serialized model file from the Colab runtime to the browser
files.download(filename='kbest.joblib')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>