In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

data = pd.read_csv("data/CASchools.csv")
data.head()

Unnamed: 0,district,school,county,grades,students,teachers,calworks,lunch,computer,expenditure,income,english,read,math,size,score
0,75119,Sunol Glen Unified,Alameda,KK-08,195,10.9,0.5102,2.0408,67,6384.911133,22.690001,0.0,691.599976,690.0,17.889909,690.799988
1,61499,Manzanita Elementary,Butte,KK-08,240,11.15,15.4167,47.916698,101,5099.380859,9.824,4.583333,660.5,661.900024,21.524664,661.200012
2,61549,Thermalito Union Elementary,Butte,KK-08,1550,82.900002,55.032299,76.322601,169,5501.95459,8.978,30.000002,636.299988,650.900024,18.697225,643.600006
3,61457,Golden Feather Union Elementary,Butte,KK-08,243,14.0,36.475399,77.049202,85,7101.831055,8.978,0.0,651.900024,643.5,17.357143,647.700012
4,61523,Palermo Union Elementary,Butte,KK-08,1335,71.5,33.108601,78.427002,171,5235.987793,9.080333,13.857677,641.799988,639.900024,18.671329,640.850006


In [2]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [3]:
numeric_features = ["students", "teachers", "calworks", "lunch", "computer", "expenditure", "income"]

X = data[numeric_features]
y = data.read

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [4]:
from sklearn.feature_selection import SelectFromModel
SelectFromModel?

In [5]:
selector = SelectFromModel(DecisionTreeRegressor(), threshold=0.05)

In [6]:
selector.fit(X_train, y_train)

SelectFromModel(estimator=DecisionTreeRegressor(), threshold=0.05)

In [7]:
selector.transform(X_train)

array([[ 56.29940033,  14.22672749],
       [ 87.77819824,   9.70899963],
       [ 80.9756012 ,   8.25800037],
       [ 78.42700195,   9.08033276],
       [ 42.9640007 ,  19.99699974],
       [ 55.7901001 ,  15.59285736],
       [ 35.82089996,  11.42599964],
       [ 91.54640198,   9.63000011],
       [ 35.26789856,   9.70899963],
       [ 35.75910187,  16.29299927],
       [ 76.77259827,   9.98600006],
       [ 55.09389877,  15.05137539],
       [ 82.57219696,  10.23966694],
       [ 32.11009979,   9.66499996],
       [ 63.43280029,   7.38500023],
       [ 23.3010006 ,   8.77600002],
       [ 32.13679886,  23.48374939],
       [ 32.12440109,  17.70899963],
       [  0.        ,  22.52899933],
       [ 44.68090057,   9.48499966],
       [ 36.88999939,  14.59766674],
       [ 24.81480026,  13.46700001],
       [ 51.6529007 ,   9.92599964],
       [  0.59799999,  40.40200043],
       [ 98.13079834,   5.33500004],
       [ 28.125     ,  11.11600018],
       [ 15.3302002 ,  22.13899994],
 

In [8]:
selector.get_support()

array([False, False, False,  True, False, False,  True])

In [10]:
selector.estimator_.feature_importances_

array([0.04961506, 0.02259876, 0.01871219, 0.702662  , 0.02623632,
       0.04157713, 0.13859854])

In [12]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso

In [13]:
pipeline = make_pipeline(StandardScaler(), 
                         SelectFromModel(Lasso(), threshold=0.01),
                         LinearRegression())

pipeline.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('selectfrommodel',
                 SelectFromModel(estimator=Lasso(), threshold=0.01)),
                ('linearregression', LinearRegression())])

In [14]:
r2_score(y_test, pipeline.predict(X_test))

0.8542179901902835

In [18]:
pipeline.steps[1][1].get_support()

array([ True, False, False,  True, False,  True,  True])

In [19]:
from sklearn.model_selection import GridSearchCV

In [20]:
pipeline = make_pipeline(StandardScaler(), 
                         SelectFromModel(Lasso(), threshold=0.01),
                         LinearRegression())
param_grid = {"selectfrommodel__threshold": [0.005, 0.01, 0.1]}

optimizer = GridSearchCV(pipeline, param_grid=param_grid)
optimizer.fit(X_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('standardscaler', StandardScaler()),
                                       ('selectfrommodel',
                                        SelectFromModel(estimator=Lasso(),
                                                        threshold=0.01)),
                                       ('linearregression',
                                        LinearRegression())]),
             param_grid={'selectfrommodel__threshold': [0.005, 0.01, 0.1]})

In [21]:
optimizer.best_params_

{'selectfrommodel__threshold': 0.005}

In [22]:
from sklearn.feature_selection import RFE, RFECV
?RFE

In [23]:
pipeline = make_pipeline(StandardScaler(), 
                         RFE(Lasso(), n_features_to_select=5, step=1),
                         LinearRegression())

pipeline.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('rfe', RFE(estimator=Lasso(), n_features_to_select=5)),
                ('linearregression', LinearRegression())])

In [24]:
?RFECV

In [26]:
pipeline = make_pipeline(StandardScaler(), 
                         RFECV(Lasso(), step=1, cv=5, scoring="r2", n_jobs=-1),
                         LinearRegression())

pipeline.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('rfecv',
                 RFECV(cv=5, estimator=Lasso(), n_jobs=-1, scoring='r2')),
                ('linearregression', LinearRegression())])

In [28]:
pipeline.steps[1][1].get_support()

array([ True, False, False,  True, False,  True,  True])

In [30]:
from sklearn.feature_selection import SelectKBest, f_regression, f_classif
?SelectKBest

In [33]:
pipeline = make_pipeline(StandardScaler(), 
                         SelectKBest(f_regression, k=5),
                         RFE(Lasso(), n_features_to_select=3, step=1),
                         LinearRegression())

pipeline.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('selectkbest',
                 SelectKBest(k=5,
                             score_func=<function f_regression at 0x7f83e4e20670>)),
                ('rfe', RFE(estimator=Lasso(), n_features_to_select=3)),
                ('linearregression', LinearRegression())])

In [34]:
r2_score(y_test, pipeline.predict(X_test))

0.8579025167702519