# Quantifying Linear Regression

## Work In Progress

In [36]:
# Import dependencies
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression

In [12]:
df = pd.read_csv('../data_files/TechnicalJobs_DF_draft.csv', index_col=[0], header=0)

In [13]:
df.columns

Index(['area_title', 'jobs_1000', 'tot_emp', 'emp_prse', 'population',
       'median_age', 'average_household_income', 'family_poverty',
       'educational_attainment_bachelors', 'educational_attainment_graduate',
       'educational_attainment_high_school',
       'educational_attainment_no_diploma',
       'educational_attainment_some_college', 'race_asian', 'race_white',
       'race_black', 'race_hispanic', 'race_native', 'race_islander',
       'race_two', 'race_other'],
      dtype='object')

In [91]:
# Target
y = df['jobs_1000']

# Adding all demographic column as X, comment to test
X = df[['population',
       'median_age', 'average_household_income', 'family_poverty',
       'educational_attainment_bachelors', 'educational_attainment_graduate',
       'educational_attainment_high_school',
       'educational_attainment_no_diploma',
       'educational_attainment_some_college', 'race_asian', 'race_white',
       'race_black', 'race_hispanic', 'race_native', 'race_islander',
       'race_two', 'race_other']].copy()


In [90]:
# LabelEncode all, comment to test
# ToDo: Assign variable weights to some/each variable
# Convert any column directly proportional to population into a percent value (Suggested by Kyle)
X['population'] = LabelEncoder().fit_transform(X['population'])
X['family_poverty'] = LabelEncoder().fit_transform(X['family_poverty'])
X['educational_attainment_graduate'] = LabelEncoder().fit_transform(X['educational_attainment_graduate'])
X['average_household_income'] = LabelEncoder().fit_transform(X['average_household_income'])
X['educational_attainment_bachelors'] = LabelEncoder().fit_transform(X['educational_attainment_bachelors'])
X['educational_attainment_high_school'] = LabelEncoder().fit_transform(X['educational_attainment_high_school'])
X['educational_attainment_no_diploma'] = LabelEncoder().fit_transform(X['educational_attainment_no_diploma'])
X['educational_attainment_some_college'] = LabelEncoder().fit_transform(X['educational_attainment_some_college'])
X['race_asian'] = LabelEncoder().fit_transform(X['race_asian'])
X['race_white'] = LabelEncoder().fit_transform(X['race_white'])
X['race_black'] = LabelEncoder().fit_transform(X['race_black'])
X['race_hispanic'] = LabelEncoder().fit_transform(X['race_hispanic'])
X['race_native'] = LabelEncoder().fit_transform(X['race_native'])
X['race_islander'] = LabelEncoder().fit_transform(X['race_islander'])
X['race_two'] = LabelEncoder().fit_transform(X['race_two'])
X['race_other'] = LabelEncoder().fit_transform(X['race_other'])


In [83]:
X

Unnamed: 0,population,median_age,average_household_income,family_poverty,educational_attainment_bachelors,educational_attainment_graduate,educational_attainment_high_school,educational_attainment_no_diploma,educational_attainment_some_college,race_asian,race_white,race_black,race_hispanic,race_native,race_islander,race_two,race_other,educational_attainment_graduate\t
0,110,27,47,93,98,92,108,98,114,99,110,95,122,100,21,97,80,92
1,109,33,32,118,82,71,113,104,109,94,107,129,50,54,25,119,40,71
2,85,37,76,100,74,85,82,84,89,70,53,136,49,49,26,71,34,85
3,146,35,83,136,141,147,137,128,148,133,144,100,151,133,74,133,110,147
4,32,38,73,40,27,28,40,51,29,49,22,97,23,39,2,30,8,28
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150,53,31,66,66,41,31,54,67,56,40,49,76,95,68,16,53,64,31
151,130,34,115,127,127,120,128,129,132,119,139,117,137,123,34,134,60,120
152,59,27,21,28,85,95,25,25,62,86,67,86,59,38,0,57,47,95
153,78,32,53,80,44,54,74,107,75,34,61,22,130,97,13,74,29,54


In [84]:

# Create a linear model
model = LinearRegression()

# Fit (train) our model to the data
model.fit(X, y)

LinearRegression()

## Quantifying our model

* mean squared error (MSE)

* R2 Score

There are a variety of ways to quantify the model, but MSE and R2 are very common

In [85]:
from sklearn.metrics import mean_squared_error, r2_score

# Use our model to predict a value
predicted = model.predict(X)

# Score the prediction with MSE and R2
mse = mean_squared_error(y, predicted)
r2 = r2_score(y, predicted)

print(f"mean squared error (MSE): {mse}")
print(f"R-squared (R2 ): {r2}")

mean squared error (MSE): 376.4697414422712
R-squared (R2 ): 0.403740095034936


A good MSE score will be close to zero, while a good [R2 score](https://en.wikipedia.org/wiki/Coefficient_of_determination) will be close to 1.

R2 score is the default scoring for many of the Sklearn models

In [86]:
# Overall Score for the model
model.score(X, y)

0.403740095034936

## Validation

We also want to understand how well our model performs on new data. 

One approach for this is to split the data into a training and a testing dataset.

We fit (train) the model by using the training data, and we score and validate the model by using the testing data.

This train/test splitting is so common that Sklearn provides a mechanism for doing this. 

## Testing and training data

To quantify our model against new input values, we often split the data into training and testing data. The model is then fit to the training data and scored by the test data. Sklearn preprocessing provides a library for automatically splitting the data into training and testing data.

In [87]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

Train the model by using the training data

In [88]:
model.fit(X_train, y_train)

LinearRegression()

And score the model by using the unseen testing data

In [89]:
model.score(X_test, y_test)

0.20496230369253376