In [None]:
import os
from datetime import datetime

import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC


# Helper Functions
def save_model(model, algorithm: str, experiment: str) -> None:
    """Persist a fitted model with joblib under the project's models folder.

    The file is written to
    ``<two levels above cwd>/models/genre_classification/<algorithm>/``
    and named ``<experiment>_<YYYY-MM-DD>_<algorithm>.pkl``.

    Args:
        model: Object to serialize (anything ``joblib.dump`` accepts).
        algorithm: Algorithm name; used as the sub-directory and file suffix.
        experiment: Experiment label; used as the file prefix.
    """
    today = datetime.today().strftime("%Y-%m-%d")

    model_dir = os.path.join(
        os.path.dirname(os.path.dirname(os.getcwd())),
        'models',
        'genre_classification',
        algorithm)

    # joblib.dump does not create missing parent directories, so make sure
    # the per-algorithm folder exists before writing.
    os.makedirs(model_dir, exist_ok=True)

    path = os.path.join(model_dir, f'{experiment}_{today}_{algorithm}.pkl')

    # The original string lacked the f-prefix and printed the placeholder
    # text literally instead of the destination path.
    print(f"Saving Model:\t{path}")
    joblib.dump(model, path)

In [None]:
# The raw track-feature table is a tab-separated file stored two directory
# levels above the notebook's working directory, under data/raw.
project_root = os.path.dirname(os.path.dirname(os.getcwd()))
data_path = os.path.join(project_root, 'data', 'raw', 'track_features_10k.tsv')

df = pd.read_csv(data_path, sep='\t')

# NOTE(review): later cells fit on x_train / y_train and score on
# x_test / y_test, but no visible cell derives them from `df`
# (train_test_split is imported and never called) - TODO add the split.

In [None]:
# Baseline pipelines: each standardizes the features and then fits one
# classifier with default hyper-parameters (tuned later by grid search).
rf_pipe = Pipeline(steps=[
    ('scalar', StandardScaler()),
    ('rf', RandomForestClassifier()),
])

mlr_pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('mlr', LogisticRegression(multi_class='multinomial')),
])

svm_pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('svc', SVC()),
])

# MultinomialNB raises on negative feature values, and StandardScaler
# centres every column on zero, so every fit of the original pipeline failed
# (grid-search scores came back as nan). Rescale to [0, 1] instead;
# clip=True keeps held-out rows inside that range even when they fall
# outside the min/max seen during fit.
mnb_pipe = Pipeline([
    ('scalar', MinMaxScaler(clip=True)),
    ('mnb', MultinomialNB())
])

# Hard-voting ensemble over a random forest, a multinomial logistic
# regression and an RBF-kernel SVM, each with fixed hyper-parameters,
# applied to standardized features.
vclf_pipe = Pipeline(steps=[
    ('scalar', StandardScaler()),
    (
        'vclf',
        VotingClassifier(
            estimators=[
                ('rf', RandomForestClassifier(
                    n_estimators=300,
                    max_depth=70,
                    max_features='sqrt',
                    min_samples_split=5,
                    min_samples_leaf=5,
                    bootstrap=False,
                )),
                ('mlr', LogisticRegression(
                    multi_class='multinomial',
                    solver='saga',
                    penalty='l1',
                    C=10,
                )),
                ('svc', SVC(kernel='rbf', gamma='auto', C=1)),
            ],
            voting='hard',
            n_jobs=-1,
        ),
    ),
])

In [None]:
# Grid for the multinomial logistic regression pipeline.
# 'newton-cg' and 'lbfgs' only support the L2 penalty, so pairing them with
# 'l1' makes those candidates fail during the search. The grid is therefore
# split into two sub-grids that only contain valid solver/penalty pairs;
# GridSearchCV accepts a list of dicts and searches their union.
mlr_param_grid = [
    {
        'mlr__solver': ['newton-cg', 'lbfgs'],
        'mlr__penalty': ['l2'],
        'mlr__C': [0.1, 1, 10],
        'mlr__max_iter': list(range(100, 500, 50))
    },
    {
        'mlr__solver': ['saga'],
        'mlr__penalty': ['l1', 'l2'],
        'mlr__C': [0.1, 1, 10],
        'mlr__max_iter': list(range(100, 500, 50))
    }
]
mlr_search = GridSearchCV(
    estimator=mlr_pipe,
    param_grid=mlr_param_grid,
    n_jobs=-1,
    verbose=10)

# Grid for the random forest pipeline.
# For classifiers max_features='auto' was an alias of 'sqrt' (so it only
# duplicated every candidate) and it was removed in scikit-learn 1.3, where
# it raises a parameter-validation error. 'log2' is searched in its place.
rf_param_grid = {
    'rf__n_estimators': list(range(100, 500, 50)),
    'rf__max_features': ['sqrt', 'log2'],
    'rf__max_depth': [None] + list(range(10, 110, 20)),
    'rf__min_samples_split': [2, 5, 10],
    'rf__min_samples_leaf': [1, 2, 5],
    'rf__bootstrap': [False]
}
rf_search = GridSearchCV(
    estimator=rf_pipe,
    param_grid=rf_param_grid,
    n_jobs=-1,
    verbose=10)

# Grid for the SVM pipeline: RBF kernel only, varying the regularization
# strength and the gamma heuristic.
svm_param_grid = {
    'svc__C': [0.1, 1, 10],
    'svc__kernel': ['rbf'],
    'svc__gamma': ['auto', 'scale'],
}
svm_search = GridSearchCV(svm_pipe, svm_param_grid, n_jobs=-1, verbose=10)

# mnb_param_grid = {
#     'mnb__alpha': [0, .1, 1, 5]
# }
# mnb_search = GridSearchCV(
#     estimator=mnb_pipe,
#     param_grid=mnb_param_grid,
#     n_jobs=-1,
#     verbose=10)


In [None]:
# Run the logistic-regression grid search, then persist the fitted search.
mlr_search.fit(X=x_train, y=y_train)

save_model(mlr_search, 'multinomial_logistic_regression', 'grid_search_results')

NameError: name 'x_train' is not defined

In [None]:
# Run the random-forest grid search, then persist the fitted search.
rf_search.fit(X=x_train, y=y_train)

save_model(rf_search, 'random_forest', 'grid_search_results')

NameError: name 'x_train' is not defined

In [None]:
# Run the SVM grid search, then persist the fitted search.
svm_search.fit(X=x_train, y=y_train)

save_model(svm_search, 'support_vector_machine', 'grid_search_results')

Fitting 5 folds for each of 6 candidates, totalling 30 fits


Saving Model:	{str(path)}


In [None]:
# Fit the voting ensemble on the training split; the fitted pipeline is the
# cell's displayed value.
vclf_pipe.fit(X=x_train, y=y_train)

Pipeline(steps=[('scalar', StandardScaler()),
                ('vclf',
                 VotingClassifier(estimators=[('rf',
                                               RandomForestClassifier(bootstrap=False,
                                                                      max_depth=70,
                                                                      max_features='sqrt',
                                                                      min_samples_leaf=5,
                                                                      min_samples_split=5,
                                                                      n_estimators=300)),
                                              ('mlr',
                                               LogisticRegression(C=10,
                                                                  multi_class='multinomial',
                                                                  penalty='l1',
                                            

In [None]:
# Score the fitted voting ensemble on the held-out split.
vclf_pipe.score(X=x_test, y=y_test)

AttributeError: predict_proba is not available when  probability=False

[CV 1/5; 1/4] START mnb__alpha=0................................................
[CV 1/5; 1/4] END ...................mnb__alpha=0;, score=nan total time=   0.0s
[CV 2/5; 2/4] START mnb__alpha=0.1..............................................
[CV 2/5; 2/4] END .................mnb__alpha=0.1;, score=nan total time=   0.0s
[CV 3/5; 2/4] START mnb__alpha=0.1..............................................
[CV 3/5; 2/4] END .................mnb__alpha=0.1;, score=nan total time=   0.0s
[CV 2/5; 3/4] START mnb__alpha=1................................................
[CV 2/5; 3/4] END ...................mnb__alpha=1;, score=nan total time=   0.0s
[CV 4/5; 4/4] START mnb__alpha=5................................................
[CV 4/5; 4/4] END ...................mnb__alpha=5;, score=nan total time=   0.0s
[CV 2/5; 1/4] START mnb__alpha=0................................................
[CV 2/5; 1/4] END ...................mnb__alpha=0;, score=nan total time=   0.0s
[CV 5/5; 2/4] START mnb__alp

[CV 3/5; 1/4] START mnb__alpha=0................................................
[CV 3/5; 1/4] END ...................mnb__alpha=0;, score=nan total time=   0.0s
[CV 1/5; 3/4] START mnb__alpha=1................................................
[CV 1/5; 3/4] END ...................mnb__alpha=1;, score=nan total time=   0.0s
[CV 5/5; 3/4] START mnb__alpha=1................................................
[CV 5/5; 3/4] END ...................mnb__alpha=1;, score=nan total time=   0.0s
[CV 5/5; 1/4] START mnb__alpha=0................................................
[CV 5/5; 1/4] END ...................mnb__alpha=0;, score=nan total time=   0.0s
[CV 2/5; 4/4] START mnb__alpha=5................................................
[CV 2/5; 4/4] END ...................mnb__alpha=5;, score=nan total time=   0.0s
