# Feature Engineering 2: dimensionality and PCA

### Imports and getting set up

In [21]:
import pandas as pd
import numpy as np
import scipy.stats as st
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.feature_selection import SelectKBest, f_classif
import statsmodels.api as sm


### DB Connection & Call

In [2]:
confile = list(pd.read_csv('../../dbcon.csv'))
postgres_db = 'useducation'
db_connection = 'postgresql://{}:{}@{}:{}/{}'.format(confile[0], confile[1], confile[2], confile[3], postgres_db)

In [3]:
query = '''
SELECT *
FROM useducation
;'''

useducation = pd.read_sql(query, db_connection)

In [4]:
display(
    useducation.info(),
    useducation.head(),
    useducation.isna().mean()
)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1492 entries, 0 to 1491
Data columns (total 25 columns):
PRIMARY_KEY                     1492 non-null object
STATE                           1492 non-null object
YEAR                            1492 non-null int64
ENROLL                          1229 non-null float64
TOTAL_REVENUE                   1280 non-null float64
FEDERAL_REVENUE                 1280 non-null float64
STATE_REVENUE                   1280 non-null float64
LOCAL_REVENUE                   1280 non-null float64
TOTAL_EXPENDITURE               1280 non-null float64
INSTRUCTION_EXPENDITURE         1280 non-null float64
SUPPORT_SERVICES_EXPENDITURE    1280 non-null float64
OTHER_EXPENDITURE               1229 non-null float64
CAPITAL_OUTLAY_EXPENDITURE      1280 non-null float64
GRADES_PK_G                     1319 non-null float64
GRADES_KG_G                     1360 non-null float64
GRADES_4_G                      1361 non-null float64
GRADES_8_G                      1

None

Unnamed: 0,PRIMARY_KEY,STATE,YEAR,ENROLL,TOTAL_REVENUE,FEDERAL_REVENUE,STATE_REVENUE,LOCAL_REVENUE,TOTAL_EXPENDITURE,INSTRUCTION_EXPENDITURE,...,GRADES_4_G,GRADES_8_G,GRADES_12_G,GRADES_1_8_G,GRADES_9_12_G,GRADES_ALL_G,AVG_MATH_4_SCORE,AVG_MATH_8_SCORE,AVG_READING_4_SCORE,AVG_READING_8_SCORE
0,1992_ALABAMA,ALABAMA,1992,,2678885.0,304177.0,1659028.0,715680.0,2653798.0,1481703.0,...,57948.0,58025.0,41167.0,471564.0,196386.0,676174.0,208.327876,252.187522,207.963517,
1,1992_ALASKA,ALASKA,1992,,1049591.0,106780.0,720711.0,222100.0,972488.0,498362.0,...,9748.0,8789.0,6714.0,79117.0,30847.0,112335.0,,,,258.859712
2,1992_ARIZONA,ARIZONA,1992,,3258079.0,297888.0,1369815.0,1590376.0,3401580.0,1435908.0,...,55433.0,49081.0,37410.0,437127.0,175210.0,614881.0,215.253932,265.366278,206.212716,262.169895
3,1992_ARKANSAS,ARKANSAS,1992,,1711959.0,178571.0,958785.0,574603.0,1743022.0,964323.0,...,34632.0,36011.0,27651.0,281338.0,123113.0,405259.0,210.206028,256.31209,208.634458,264.619665
4,1992_CALIFORNIA,CALIFORNIA,1992,,26260025.0,2072470.0,16546514.0,7641041.0,27138832.0,14358922.0,...,418418.0,363296.0,270675.0,3286034.0,1372011.0,4717112.0,208.398961,260.892247,196.764414,


PRIMARY_KEY                     0.000000
STATE                           0.000000
YEAR                            0.000000
ENROLL                          0.176273
TOTAL_REVENUE                   0.142091
FEDERAL_REVENUE                 0.142091
STATE_REVENUE                   0.142091
LOCAL_REVENUE                   0.142091
TOTAL_EXPENDITURE               0.142091
INSTRUCTION_EXPENDITURE         0.142091
SUPPORT_SERVICES_EXPENDITURE    0.142091
OTHER_EXPENDITURE               0.176273
CAPITAL_OUTLAY_EXPENDITURE      0.142091
GRADES_PK_G                     0.115952
GRADES_KG_G                     0.088472
GRADES_4_G                      0.087802
GRADES_8_G                      0.087802
GRADES_12_G                     0.087802
GRADES_1_8_G                    0.087802
GRADES_9_12_G                   0.087802
GRADES_ALL_G                    0.115952
AVG_MATH_4_SCORE                0.640751
AVG_MATH_8_SCORE                0.643432
AVG_READING_4_SCORE             0.642761
AVG_READING_8_SC

In [5]:
useducation = useducation.fillna(method='ffill')
useducation = useducation.fillna(method='bfill')
display(
    useducation.head(5),
    useducation.isna().mean(),
    
)

Unnamed: 0,PRIMARY_KEY,STATE,YEAR,ENROLL,TOTAL_REVENUE,FEDERAL_REVENUE,STATE_REVENUE,LOCAL_REVENUE,TOTAL_EXPENDITURE,INSTRUCTION_EXPENDITURE,...,GRADES_4_G,GRADES_8_G,GRADES_12_G,GRADES_1_8_G,GRADES_9_12_G,GRADES_ALL_G,AVG_MATH_4_SCORE,AVG_MATH_8_SCORE,AVG_READING_4_SCORE,AVG_READING_8_SCORE
0,1992_ALABAMA,ALABAMA,1992,89711.0,2678885.0,304177.0,1659028.0,715680.0,2653798.0,1481703.0,...,57948.0,58025.0,41167.0,471564.0,196386.0,676174.0,208.327876,252.187522,207.963517,258.859712
1,1992_ALASKA,ALASKA,1992,89711.0,1049591.0,106780.0,720711.0,222100.0,972488.0,498362.0,...,9748.0,8789.0,6714.0,79117.0,30847.0,112335.0,208.327876,252.187522,207.963517,258.859712
2,1992_ARIZONA,ARIZONA,1992,89711.0,3258079.0,297888.0,1369815.0,1590376.0,3401580.0,1435908.0,...,55433.0,49081.0,37410.0,437127.0,175210.0,614881.0,215.253932,265.366278,206.212716,262.169895
3,1992_ARKANSAS,ARKANSAS,1992,89711.0,1711959.0,178571.0,958785.0,574603.0,1743022.0,964323.0,...,34632.0,36011.0,27651.0,281338.0,123113.0,405259.0,210.206028,256.31209,208.634458,264.619665
4,1992_CALIFORNIA,CALIFORNIA,1992,89711.0,26260025.0,2072470.0,16546514.0,7641041.0,27138832.0,14358922.0,...,418418.0,363296.0,270675.0,3286034.0,1372011.0,4717112.0,208.398961,260.892247,196.764414,264.619665


PRIMARY_KEY                     0.0
STATE                           0.0
YEAR                            0.0
ENROLL                          0.0
TOTAL_REVENUE                   0.0
FEDERAL_REVENUE                 0.0
STATE_REVENUE                   0.0
LOCAL_REVENUE                   0.0
TOTAL_EXPENDITURE               0.0
INSTRUCTION_EXPENDITURE         0.0
SUPPORT_SERVICES_EXPENDITURE    0.0
OTHER_EXPENDITURE               0.0
CAPITAL_OUTLAY_EXPENDITURE      0.0
GRADES_PK_G                     0.0
GRADES_KG_G                     0.0
GRADES_4_G                      0.0
GRADES_8_G                      0.0
GRADES_12_G                     0.0
GRADES_1_8_G                    0.0
GRADES_9_12_G                   0.0
GRADES_ALL_G                    0.0
AVG_MATH_4_SCORE                0.0
AVG_MATH_8_SCORE                0.0
AVG_READING_4_SCORE             0.0
AVG_READING_8_SCORE             0.0
dtype: float64

#### Create a new score variable from the weighted averages of all score variables in the datasets. Notice that the number of students in the 4th grade isn't the same as the number of students in the 8th grade. So, you should appropriately weigh the scores!.

In [13]:
useducation['weighted_avg_score'] = (
    ((useducation['GRADES_4_G'] * ((useducation['AVG_MATH_4_SCORE'] 
                                    + useducation['AVG_READING_4_SCORE']) / 2))
     + (useducation['GRADES_8_G'] * ((useducation['AVG_MATH_8_SCORE'] 
                                      + useducation['AVG_READING_8_SCORE']) / 2))) 
    / (useducation['GRADES_8_G'] + useducation['GRADES_4_G'])
)
useducation

Unnamed: 0,PRIMARY_KEY,STATE,YEAR,ENROLL,TOTAL_REVENUE,FEDERAL_REVENUE,STATE_REVENUE,LOCAL_REVENUE,TOTAL_EXPENDITURE,INSTRUCTION_EXPENDITURE,...,GRADES_8_G,GRADES_12_G,GRADES_1_8_G,GRADES_9_12_G,GRADES_ALL_G,AVG_MATH_4_SCORE,AVG_MATH_8_SCORE,AVG_READING_4_SCORE,AVG_READING_8_SCORE,weighted_avg_score
0,1992_ALABAMA,ALABAMA,1992,89711.0,2678885.0,304177.0,1659028.0,715680.0,2653798.0,1481703.0,...,58025.0,41167.0,471564.0,196386.0,676174.0,208.327876,252.187522,207.963517,258.859712,231.850385
1,1992_ALASKA,ALASKA,1992,89711.0,1049591.0,106780.0,720711.0,222100.0,972488.0,498362.0,...,8789.0,6714.0,79117.0,30847.0,112335.0,208.327876,252.187522,207.963517,258.859712,230.609124
2,1992_ARIZONA,ARIZONA,1992,89711.0,3258079.0,297888.0,1369815.0,1590376.0,3401580.0,1435908.0,...,49081.0,37410.0,437127.0,175210.0,614881.0,215.253932,265.366278,206.212716,262.169895,235.639070
3,1992_ARKANSAS,ARKANSAS,1992,89711.0,1711959.0,178571.0,958785.0,574603.0,1743022.0,964323.0,...,36011.0,27651.0,281338.0,123113.0,405259.0,210.206028,256.312090,208.634458,264.619665,235.441283
4,1992_CALIFORNIA,CALIFORNIA,1992,89711.0,26260025.0,2072470.0,16546514.0,7641041.0,27138832.0,14358922.0,...,363296.0,270675.0,3286034.0,1372011.0,4717112.0,208.398961,260.892247,196.764414,264.619665,230.547249
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1487,2017_VIRGINIA,VIRGINIA,2017,1595024.0,23766529.0,1680983.0,9277802.0,12807744.0,23352516.0,12247509.0,...,1103.0,1024.0,9403.0,5084.0,446508.0,248.041059,290.081407,227.598382,273.402640,258.746848
1488,2017_WASHINGTON,WASHINGTON,2017,1595024.0,23766529.0,1680983.0,9277802.0,12807744.0,23352516.0,12247509.0,...,1103.0,1024.0,9403.0,5084.0,446508.0,241.701001,289.131257,223.123409,267.744778,254.341566
1489,2017_WEST_VIRGINIA,WEST_VIRGINIA,2017,1595024.0,23766529.0,1680983.0,9277802.0,12807744.0,23352516.0,12247509.0,...,1103.0,1024.0,9403.0,5084.0,446508.0,236.057019,273.415854,217.316396,271.617023,248.522644
1490,2017_WISCONSIN,WISCONSIN,2017,1595024.0,23766529.0,1680983.0,9277802.0,12807744.0,23352516.0,12247509.0,...,1103.0,1024.0,9403.0,5084.0,446508.0,240.389213,288.143144,219.612694,258.700304,250.689122


#### What are the correlations between this newly created score variable and the expenditure types? Which 1 of the expenditure types is more correlated than the others?

In [20]:
exp_cols = [col for col in useducation.columns if 'EXPENDITURE' in col]
newdata = useducation[exp_cols + ['weighted_avg_score']]
score_corr = newdata.corr()['weighted_avg_score']
score_corr['weighted_avg_score'] = 0
score_corr.sort_values(ascending=False)

SUPPORT_SERVICES_EXPENDITURE    0.249299
INSTRUCTION_EXPENDITURE         0.233953
TOTAL_EXPENDITURE               0.230003
OTHER_EXPENDITURE               0.205974
CAPITAL_OUTLAY_EXPENDITURE      0.142972
weighted_avg_score              0.000000
Name: weighted_avg_score, dtype: float64

SUPPORT SERVICES EXPENDITURE

#### Now, apply PCA to the 4 expenditure types. How much of the total variance is explained by the 1st component?

In [23]:
X = newdata.drop(columns=['weighted_avg_score'])
scaler = StandardScaler()
scaled_X = scaler.fit_transform(X)
expscore_pca = PCA(4)
expscore_pca.fit(scaled_X)
expscore_pca.explained_variance_ratio_

array([0.95459576, 0.02718245, 0.01518939, 0.00292327])

95%

#### What is the correlation between the overall score variable and the 1st principal component?

In [29]:
newdata['PCA_comp1'] = expscore_pca.transform(scaled_X)[:,0]
newdata.corr()['weighted_avg_score']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


TOTAL_EXPENDITURE               0.230003
INSTRUCTION_EXPENDITURE         0.233953
SUPPORT_SERVICES_EXPENDITURE    0.249299
OTHER_EXPENDITURE               0.205974
CAPITAL_OUTLAY_EXPENDITURE      0.142972
weighted_avg_score              1.000000
PCA_comp1                       0.217959
Name: weighted_avg_score, dtype: float64

22%

#### If you were to choose the best variables for your model, would you prefer using the 1st principal component instead of the expenditure variables? Why?

Yes, as per the goal of PCA, we were able to successfully reduce the dimensions of our data by 3 columns while retaining similar correlation (although we would prefer higher with a component) and 95% explained variance in that component