In [31]:
#Tables and matrices
import numpy as np
import pandas as pd

#Stats
import scipy.stats as st
from scipy.optimize import fmin
from scipy import integrate
from scipy.stats.mstats import mquantiles
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
#from sklearn.preprocessing import OneHotEncoder
from patsy import dmatrix
from mlxtend.feature_selection import SequentialFeatureSelector

#Probabilistic programs
#!pip install numpy mkl #if you are in an intel machine i.e. in mac M# chips no
#!pip install pymc
#!pip install pytensor
import pymc as pm
import pytensor.tensor as pt
#import aesara.tensor as at
print('Running on PyMC v{}'.format(pm.__version__))


#Graphs 
#IMPORTANT: properly install ipywidgets and nodejs for interactive graphs
#If you are in jupyterlab, activate the widget extension (it should be in the latest versions)
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib.gridspec import GridSpec
from matplotlib import animation, rc
from IPython.display import display, HTML, Markdown
import ipywidgets as widgets
from ipywidgets import interact, interactive, fixed, HBox, VBox, Layout
from mpl_toolkits.mplot3d import axes3d
import arviz as az

Running on PyMC v5.10.3


# Forward selection

In [83]:
#We want to predict the salary of a baseball player
#Load data
Hitters = pd.read_csv("Hitters.csv")
Hitters = Hitters.dropna().reset_index(drop=True)

#Dummy coding of categorical data
League_l = ['N','A'] #first element will be the reference 0, second 1, and so on
Div_l = ['E', 'W']
dm = dmatrix("~ 1 + C(League, levels=League_l) + C(Division, levels=Div_l) + C(NewLeague, levels=League_l)", 
             Hitters) #Dummy coding of categorical variables
print(dm.design_info.column_names) #to change manually to more readable names

#Concatenate dummy coded variables with non-categorical variables
non_cat_cols = np.append(np.array(Hitters.columns[0:13]), Hitters.columns[15:19])
Hitters_d = pd.concat([pd.DataFrame(dm, columns=["Intercept", "League [1_American]", "Division [1_West]", "New_League [1_American]"]), 
                       Hitters.loc[:, non_cat_cols]], axis = 1)
Hitters_d.drop(columns=['Intercept'], inplace = True)
print(Hitters_d.shape)
Hitters_d.dtypes

['Intercept', 'C(League, levels=League_l)[T.A]', 'C(Division, levels=Div_l)[T.W]', 'C(NewLeague, levels=League_l)[T.A]']
(263, 20)


League [1_American]        float64
Division [1_West]          float64
New_League [1_American]    float64
AtBat                        int64
Hits                         int64
HmRun                        int64
Runs                         int64
RBI                          int64
Walks                        int64
Years                        int64
CAtBat                       int64
CHits                        int64
CHmRun                       int64
CRuns                        int64
CRBI                         int64
CWalks                       int64
PutOuts                      int64
Assists                      int64
Errors                       int64
Salary                     float64
dtype: object

In [76]:
sfs = SequentialFeatureSelector(linear_model.LinearRegression(),
                                k_features=3, #k<features in data, see documentation
                                forward=True,
                                scoring='r2', #goodness measure
                                cv=None #Cross validation
                               )
X = Hitters_d.loc[:, Hitters_d.columns[0:20]]
y = Hitters_d.loc[:, "Salary"]
selected_features = sfs.fit(X, y)

In [81]:
selected_features.k_feature_names_

('League [1_American]', 'Division [1_West]', 'Salary')

In [82]:
selected_features.subsets_

{1: {'feature_idx': (19,),
  'cv_scores': array([1.]),
  'avg_score': 1.0,
  'feature_names': ('Salary',)},
 2: {'feature_idx': (0, 19),
  'cv_scores': array([1.]),
  'avg_score': 1.0,
  'feature_names': ('League [1_American]', 'Salary')},
 3: {'feature_idx': (0, 1, 19),
  'cv_scores': array([1.]),
  'avg_score': 1.0,
  'feature_names': ('League [1_American]', 'Division [1_West]', 'Salary')}}