In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score as acc
from mlxtend.feature_selection import SequentialFeatureSelector as sfs
from sklearn.preprocessing import Imputer
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.cross_validation import train_test_split
from sklearn import tree



# NOTE: 
### Added another column called "lpsa2" which converts the regular "lpsa" into categorical data.
### This was done because I used a Random Forest classifier for the Forward Stepwise Selection.
### Python does not have a package that performs regular stepwise selection on continuous data, which "lpsa" is. The following is how I converted "lpsa" to "lpsa2":
### For "lpsa2", If "lpsa"<1, then 1; If 1<="lpsa"<2, then 2; ... If 5<="lpsa"<6, then 6.

In [2]:
df1 = pd.read_csv('/Users/rodney/Documents/Jupyter/HW_IE_691/HW_1/PCD_Train_2.csv')
df2 = pd.read_csv('/Users/rodney/Documents/Jupyter/HW_IE_691/HW_1/PCD_Test_2.csv')

print(df1)
print(df2)

    Unnamed: 0    lcavol   lweight  age      lbph  svi       lcp  gleason  \
0            1 -0.579818  2.769459   50 -1.386294    0 -1.386294        6   
1            2 -0.994252  3.319626   58 -1.386294    0 -1.386294        6   
2            3 -0.510826  2.691243   74 -1.386294    0 -1.386294        7   
3            4 -1.203973  3.282789   58 -1.386294    0 -1.386294        6   
4            5  0.751416  3.432373   62 -1.386294    0 -1.386294        6   
5            6 -1.049822  3.228826   50 -1.386294    0 -1.386294        6   
6            8  0.693147  3.539509   58  1.536867    0 -1.386294        6   
7           11  0.254642  3.604138   65 -1.386294    0 -1.386294        6   
8           12 -1.347074  3.598681   63  1.266948    0 -1.386294        6   
9           13  1.613430  3.022861   63 -1.386294    0 -0.597837        7   
10          14  1.477049  2.998229   67 -1.386294    0 -1.386294        7   
11          16  1.541159  3.061052   66 -1.386294    0 -1.386294        6   

In [3]:
df1 = df1.drop(columns=['train','lpsa'])
df2 = df2.drop(columns=['train','lpsa'])

print(df1)
print(df2)

    Unnamed: 0    lcavol   lweight  age      lbph  svi       lcp  gleason  \
0            1 -0.579818  2.769459   50 -1.386294    0 -1.386294        6   
1            2 -0.994252  3.319626   58 -1.386294    0 -1.386294        6   
2            3 -0.510826  2.691243   74 -1.386294    0 -1.386294        7   
3            4 -1.203973  3.282789   58 -1.386294    0 -1.386294        6   
4            5  0.751416  3.432373   62 -1.386294    0 -1.386294        6   
5            6 -1.049822  3.228826   50 -1.386294    0 -1.386294        6   
6            8  0.693147  3.539509   58  1.536867    0 -1.386294        6   
7           11  0.254642  3.604138   65 -1.386294    0 -1.386294        6   
8           12 -1.347074  3.598681   63  1.266948    0 -1.386294        6   
9           13  1.613430  3.022861   63 -1.386294    0 -0.597837        7   
10          14  1.477049  2.998229   67 -1.386294    0 -1.386294        7   
11          16  1.541159  3.061052   66 -1.386294    0 -1.386294        6   

In [4]:
X_train = df1[['lcavol', 'lweight', 'age', 'lbph', 'svi', 'lcp', 'gleason', 'pgg45']]
y_train = df1['lpsa2']

X_test = df2[['lcavol', 'lweight', 'age', 'lbph', 'svi', 'lcp', 'gleason', 'pgg45']]
y_test = df2['lpsa2']

print (X_train)
print (y_train)
# print (X_test)
# print (y_test)

      lcavol   lweight  age      lbph  svi       lcp  gleason  pgg45
0  -0.579818  2.769459   50 -1.386294    0 -1.386294        6      0
1  -0.994252  3.319626   58 -1.386294    0 -1.386294        6      0
2  -0.510826  2.691243   74 -1.386294    0 -1.386294        7     20
3  -1.203973  3.282789   58 -1.386294    0 -1.386294        6      0
4   0.751416  3.432373   62 -1.386294    0 -1.386294        6      0
5  -1.049822  3.228826   50 -1.386294    0 -1.386294        6      0
6   0.693147  3.539509   58  1.536867    0 -1.386294        6      0
7   0.254642  3.604138   65 -1.386294    0 -1.386294        6      0
8  -1.347074  3.598681   63  1.266948    0 -1.386294        6      0
9   1.613430  3.022861   63 -1.386294    0 -0.597837        7     30
10  1.477049  2.998229   67 -1.386294    0 -1.386294        7      5
11  1.541159  3.061052   66 -1.386294    0 -1.386294        6      0
12 -0.415515  3.516013   70  1.244155    0 -0.597837        7     30
13  2.288486  3.649359   66 -1.386

In [5]:
print('Training dataset shape:', X_train.shape, y_train.shape)
print('Testing dataset shape:', X_test.shape, y_test.shape)

Training dataset shape: (67, 8) (67,)
Testing dataset shape: (30, 8) (30,)


# NOTE:
### Arbitrarily selected a desired subset of size 4 predictors/features.
### The next shell builds a random forest classifier and uses it for forward stepwise selection.

In [6]:
# Build RF classifier to use in predictor/feature selection
clf = RandomForestClassifier(n_estimators=100, n_jobs=-1)

# Build step forward predictor/feature selection
sfs1 = sfs(clf,
           k_features=5,
           forward=True,
           floating=False,
           verbose=2,
           scoring='accuracy',
           cv=5)

# Perform SFFS
sfs1 = sfs1.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.5s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   11.1s finished

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    9.6s finished

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    8.3s finished

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    6.8s finished



[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    5.5s finished

[2018-09-06 14:40:41] Features: 5/5 -- score: 0.5533554680613504

# INTERPRETATION OF RESULTS:
### The above output shows the iterations. At each iteration, the best available predictor/feature that improves the model was added to the subset.
### The iterations stopped once 5 predictors/features were added to the subset.
### The "score" displayed is the "accuracy" of the model with the selected predictors/features in it. The results show that the best performing model had a subset of 5 predictors/features and an accuracy score of 0.5533554680613504. 

### The next shell displays/prints which 5 predictors/features were selected for the subset using forward piecewise selection.

In [8]:
# Which predictors/freatures?
predictor_cols = list(sfs1.k_feature_idx_)
print(predictor_cols)

# Predictor Columns Key
# 0 = 'lcavol' 
# 1 = 'lweight' 
# 2 = 'age'
# 3 = 'lbph'
# 4 = 'svi', 
# 5 = 'lcp'
# 6 = 'gleason'
# 7 = 'pgg45'

[2, 4, 5, 6, 7]


# INTERPRETATION OF RESULTS:
### The above output shows the 5 predictors/features selected as the best for the model. The feature are 'lweight', 'svi', 'lcp','gleason','pgg45' based off the Key displayed in the above shell

### The next shell builds the full model using just the 5 selected predictors/features instead of using all 8.
### The training and testing accuracy using the 5 selected predictors/features is printed. 

In [10]:
# Build full model with the 5 selected predictors/features
clf = RandomForestClassifier(n_estimators=1000, random_state=42, max_depth=4)
clf.fit(X_train[['lweight', 'svi', 'lcp','gleason','pgg45']], y_train)

y_train_pred = clf.predict(X_train[['lweight', 'svi', 'lcp','gleason','pgg45']])
print('Training accuracy on selected predictors/features: %.3f' % acc(y_train, y_train_pred))

y_test_pred = clf.predict(X_test[['lweight', 'svi', 'lcp','gleason','pgg45']])
print('Testing accuracy on selected predictors/features: %.3f' % acc(y_test, y_test_pred))

Training accuracy on selected predictors/features: 0.791
Testing accuracy on selected predictors/features: 0.400


### The next shell builds the full model using ALL the 8 predictors/features instead of using the subset of 5.
### The training and testing accuracy using ALL predictors/features is printed. 

In [11]:
# Build full model on ALL predictors/features, for comparison
clf = RandomForestClassifier(n_estimators=1000, random_state=42, max_depth=4)
clf.fit(X_train, y_train)

y_train_pred = clf.predict(X_train)
print('Training accuracy on all predictors/features: %.3f' % acc(y_train, y_train_pred))

y_test_pred = clf.predict(X_test)
print('Testing accuracy on all predictors/features: %.3f' % acc(y_test, y_test_pred))

Training accuracy on all predictors/features: 0.940
Testing accuracy on all predictors/features: 0.500
