Train a variety of supervised models on the data and see how well we can do in identifying genre

In [4]:
import numpy as np, pandas as pd

from   sklearn.model_selection  import train_test_split, GridSearchCV
from   sklearn.svm              import LinearSVC
from   sklearn                  import ensemble, tree
from   sklearn.linear_model     import LogisticRegression
from   sklearn.neighbors        import KNeighborsClassifier


In [5]:
# Read in X and y for supervised learning

X = pd.read_parquet('/home/stu/final_capstone/features_final/X.parq') 

y = pd.read_parquet('/home/stu/final_capstone/features_final/y.parq')["genre"].ravel()   # otherwise get complaints!!!

#X.hvplot.table()
print(type(y))
print(y.shape)

<class 'numpy.ndarray'>
(1200,)


In [6]:
# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=13)  # random_state just for reproducibility


In [7]:
# First try to reproduce test results we saw in the EDA notebook
rfb = ensemble.RandomForestClassifier(n_estimators=600,           
                                      criterion='entropy', 
                                      max_depth=10, 
                                      min_samples_split=2, 
                                      min_samples_leaf=1, 
                                      min_weight_fraction_leaf=0.0, 
                                      max_features='auto', 
                                      max_leaf_nodes=None, 
                                      min_impurity_decrease=0.0, 
                                      min_impurity_split=None, 
                                      bootstrap=True, 
                                      oob_score=False, 
                                      n_jobs=-1,                  # Throw everything at it!!!
                                      random_state=13,            # For reproducibility 
                                      verbose=0,
                                      warm_start=False, 
                                      class_weight=None, 
                                      ccp_alpha=0.0, 
                                      max_samples=None
                                     )
rfb.fit(X_train, y_train)
     
print("R^2 Test:      ", rfb.score(X_test, y_test))


R^2 Test:       0.8133333333333334


* Now let's see if any other models can do better than the random forest; tune each of these in turn:

    * KNN
    * Linear SVC
    * Gradient Boosting
    * Logistic Regression


In [9]:
# Try a KNN classifier

kn = KNeighborsClassifier(
#                           n_neighbors=5,
#                           weights='uniform', 
                          algorithm='auto', 
                          leaf_size=30, 
                          p=2, 
                          metric='minkowski', 
                          metric_params=None, 
                          n_jobs=-1,            # Didn't seem to speed things up much...
                          )
params = {
    'n_neighbors':range(9,11), 
    'weights':['uniform','distance'],  
}
kng = GridSearchCV(kn, params, n_jobs=-1, cv=5, verbose=1, refit=True)
kng.fit(X_train, y_train)
test_score = kng.best_estimator_.score(X_test, y_test)
print("Best Estimator:", kng.best_estimator_) 
print("  Params:      ", kng.best_params_)
print("R^2 Training:  ", kng.best_score_)           
print("R^2 Test:      ", test_score)                


Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best Estimator: KNeighborsClassifier(n_jobs=-1, n_neighbors=10, weights='distance')
  Params:       {'n_neighbors': 10, 'weights': 'distance'}
R^2 Training:   0.7177777777777777
R^2 Test:       0.7666666666666667


In [10]:
# Now try linear SVC

sv = LinearSVC(penalty='l2', 
               loss='squared_hinge', 
#               dual=True, 
               tol=0.0001, 
#               C=1.0, 
               multi_class='ovr', 
               fit_intercept=True, 
               intercept_scaling=1, 
               class_weight=None, 
               verbose=1,              
               random_state=13,        # For reproducibility
               max_iter=500            # Default is 1000
              ) 
params = {
   'dual':[True, False],
   'C':np.arange(.03,.06,.01),
}
svg = GridSearchCV(sv, params, n_jobs=-1, cv=5, verbose=1, refit=True)
svg.fit(X_train, y_train)
print('# of iterations %s' % svg.best_estimator_.n_iter_)
print("Best Estimator:", svg.best_estimator_) 
print("  Params:      ", svg.best_params_)
print("R^2 Training:  ", svg.best_score_)           
print("R^2 Test:      ", svg.best_estimator_.score(X_test, y_test))


Fitting 5 folds for each of 6 candidates, totalling 30 fits
[LibLinear]# of iterations 269
Best Estimator: LinearSVC(C=0.04, max_iter=500, random_state=13, verbose=1)
  Params:       {'C': 0.04, 'dual': True}
R^2 Training:   0.7566666666666666
R^2 Test:       0.7866666666666666


In [11]:
# Try Gradient Boosting
gb = ensemble.GradientBoostingClassifier(loss='deviance', 
                                         learning_rate=0.1, 
#                                         n_estimators=100, 
                                         subsample=1.0, 
                                         criterion='friedman_mse', 
                                         min_samples_split=2, 
                                         min_samples_leaf=1, 
                                         min_weight_fraction_leaf=0.0, 
#                                         max_depth=3, 
                                         min_impurity_decrease=0.0, 
                                         min_impurity_split=None, 
                                         init=None, 
                                         random_state=13, 
                                         max_features=None, 
                                         verbose=1, 
                                         max_leaf_nodes=None, 
                                         warm_start=False, 
                                         validation_fraction=0.1, 
                                         n_iter_no_change=None, 
                                         tol=0.0001, 
                                         ccp_alpha=0.0,
                                        )
params = {
    'n_estimators':[100, 200, 300],        
    'max_depth':[2, 3, 4],               
}

gbg = GridSearchCV(gb, params, n_jobs=-1, cv=5, verbose=1)
gbg.fit(X_train, y_train)

test_score = gbg.best_estimator_.score(X_test, y_test)

print("Best Estimator:", gbg.best_estimator_) 
print("  Params:      ", gbg.best_params_)
print("R^2 Training:  ", gbg.best_score_)           
print("R^2 Test:      ", test_score)               
print("\n")


Fitting 5 folds for each of 9 candidates, totalling 45 fits
      Iter       Train Loss   Remaining Time 
         1           1.5980            1.54m
         2           1.4449            1.52m
         3           1.3307            1.53m
         4           1.2339            1.52m
         5           1.1546            1.50m
         6           1.0807            1.48m
         7           1.0117            1.47m
         8           0.9552            1.46m
         9           0.9014            1.45m
        10           0.8543            1.44m
        20           0.5366            1.38m
        30           0.3594            1.30m
        40           0.2596            1.24m
        50           0.1946            1.21m
        60           0.1471            1.13m
        70           0.1143            1.04m
        80           0.0888           58.18s
        90           0.0702           53.39s
       100           0.0561           48.72s
       200           0.0066            

In [12]:
# Try logistic Regression

lr = LogisticRegression(
#                       penalty='l2', 
                        dual=False, 
                        tol=0.0001, 
#                        C=1.0, 
                        fit_intercept=True, 
                        intercept_scaling=1, 
                        class_weight=None, 
                        random_state=None, 
                        solver='lbfgs', 
                        max_iter=100,
                        multi_class='auto', 
                        verbose=1, 
                        warm_start=False, 
                        n_jobs=None, 
                        l1_ratio=None
                       )
params = {
    'penalty':[None, 'l2'],
    'C':np.arange(.1,.4, .1),
}
lrg = GridSearchCV(lr, params, n_jobs=-1, cv=5, verbose=1, refit=True)
lrg.fit(X_train, y_train)

test_score = lrg.best_estimator_.score(X_test, y_test)

print('# of iterations %s' % lrg.best_estimator_.n_iter_[0])
print("Best Estimator:", lrg.best_estimator_) 
print("  Params:      ", lrg.best_params_)
print("R^2 Training:  ", lrg.best_score_)      
print("R^2 Test:      ", test_score)


Fitting 5 folds for each of 8 candidates, totalling 40 fits


        nan 0.73555556]
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


# of iterations 87
Best Estimator: LogisticRegression(C=0.2, verbose=1)
  Params:       {'C': 0.2, 'penalty': 'l2'}
R^2 Training:   0.7466666666666667
R^2 Test:       0.7833333333333333


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.4s finished


In [14]:
# Try a decision tree again, although I doubt it will do well:
dt = tree.DecisionTreeClassifier(
                            criterion='entropy', 
                            splitter='best', 
#                            max_depth=None, 
                            min_samples_split=2, 
                            min_samples_leaf=1, 
                            min_weight_fraction_leaf=0.0, 
#                            max_features=None, 
                            random_state=13,                  # for reproducibility 
                            max_leaf_nodes=None, 
                            min_impurity_decrease=0.0, 
                            min_impurity_split=None, 
                            class_weight=None, 
                            ccp_alpha=0.0
                           )
params = {
   'max_depth':range(8,11),
   'max_features':[None,'auto'],
}
dtg = GridSearchCV(dt, params, n_jobs=-1, cv=5, verbose=1, refit=True)
dtg.fit(X_train, y_train)
print("Best Estimator:", dtg.best_estimator_) 
print("  Params:      ", dtg.best_params_)
print("R^2 Training:  ", dtg.best_score_)           
print("R^2 Test:      ", dtg.best_estimator_.score(X_test, y_test))
print("\n")


Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best Estimator: DecisionTreeClassifier(criterion='entropy', max_depth=9, random_state=13)
  Params:       {'max_depth': 9, 'max_features': None}
R^2 Training:   0.5222222222222223
R^2 Test:       0.5266666666666666




### Summary of Supervised Learning Results


| Model               | Key Parameters                 |  R-squared (test) | 
| :-------            | :------------:                 |      :-----:      |
| Gradient Boosting   | 200 estimators, max depth 3    |      82.3%        |
| Random Forest       | 600 estimators, max depth 10   |      81.3%        |
| Linear SVC          | 270 iterations, L2             |      78.7%        |
| Logistic Regression | 87 iterations, L2              |      78.3%        |
| Decision Tree       | Entropy, max depth 9           |      52.7%        |


Observations:

* Random Forest and Gradient Boosting give the best performance; single decision tree is unacceptable.
* Gradient Boosting performs the best, but at a fairly severe performance cost
    * Slow to run even on this data set; might not scale well
* I would choose the random forest as the best supervised learning method for the genre identification task