# Sync Link
### Part 4: Modeling

In [35]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.svm import LinearSVC
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer
import pickle

In [2]:
sync = pd.read_csv('./data/cleaned_sync.csv')

In [3]:
sync.columns

Index(['title', 'artist', 'year', 'explicit', 'styles', 'languages',
       'title_artist', 'd_id', 'd_isrc', 'd_release', 'd_album_id', 'd_album',
       'd_art', 'lyric_url', 'l_writer', 'n_writers', 'n_pub', 'l_pub',
       's_uri', 's_dance', 's_energy', 's_key', 's_loudness', 's_mode',
       's_speech', 's_acoustic', 's_inst', 's_live', 's_valence', 's_tempo',
       's_duration', 's_time_sig', 'score', 'synced', 'Blues', 'Rock',
       'Country', 'Pop', 'Alternative', 'Love', 'Jazz', 'Oldies', 'Soft rock',
       'Disco', 'Funk', 'Electro', 'Soul', 'Rock 'n Roll', 'Christian',
       'Gospel', 'Folk', 'Duet', '80s', 'Reggae', 'Hard/Metal', 'Kids',
       'Teen pop', 'Punk/Grunge', 'Rap', 'Dance', 'Traditionnal', 'Celtic',
       'R&B', 'Latin', 'Ska', 'Musical', 'Christmas', 'Classical', 'Humour',
       'French pop', 'World/Folk', 'Zouk/Creole', 'Schlager'],
      dtype='object')

In [4]:
sync['synced'].value_counts(normalize=True)

0    0.611568
1    0.388432
Name: synced, dtype: float64

Since the classes are split 61/38, 61% is the baseline score to beat.

In [5]:
features_1 = ['year', 'explicit', 's_duration', 's_tempo', 's_energy', 's_mode', 's_valence', 's_key', 'n_pub']

The function below will automate trying out a few different model/feature combinations. Even though this function doesn't address the hyperparameters, I'm hoping by quickly running through a variety of classification models, I can elimanate those that perform substantially poorer before tuning hyperparameters.

In [6]:
def try_model(model, features):
    X = sync[features]
    y = sync['synced']
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 21)
    m = model()
    m.fit(X_train, y_train)
    
    r_train = m.score(X_train, y_train)
    r_test = m.score(X_test, y_test)
    
    print(f'Train Score: {r_train}, Test Score: {r_test}, for features: {features}')
    return m

In [7]:
#Ada Boost model
model_1 = try_model(AdaBoostClassifier, features_1)

Train Score: 0.6898640903880793, Test Score: 0.6591355599214146, for features: ['year', 'explicit', 's_duration', 's_tempo', 's_energy', 's_mode', 's_valence', 's_key', 'n_pub']


In [8]:
#Logistic Regression
model_1 = try_model(LogisticRegression, features_1)

Train Score: 0.6191255935811364, Test Score: 0.6070726915520629, for features: ['year', 'explicit', 's_duration', 's_tempo', 's_energy', 's_mode', 's_valence', 's_key', 'n_pub']


In [9]:
#Random Forests
model_1 = try_model(RandomForestClassifier, features_1)

Train Score: 0.9009333551662028, Test Score: 0.6645383104125737, for features: ['year', 'explicit', 's_duration', 's_tempo', 's_energy', 's_mode', 's_valence', 's_key', 'n_pub']


In [10]:
#Extra Trees
model_1 = try_model(ExtraTreesClassifier, features_1)

Train Score: 0.9009333551662028, Test Score: 0.6586444007858546, for features: ['year', 'explicit', 's_duration', 's_tempo', 's_energy', 's_mode', 's_valence', 's_key', 'n_pub']


In [11]:
#Support Vector
model_1 = try_model(LinearSVC, features_1)

Train Score: 0.38545930898968395, Test Score: 0.3973477406679764, for features: ['year', 'explicit', 's_duration', 's_tempo', 's_energy', 's_mode', 's_valence', 's_key', 'n_pub']




Of the models above, the Random Forest has the highest test score. It also has the highest variance, but that can be reigned in with some other techniques.

Next, I'll try a few more models with different features.

In [12]:
features_2 = ['year', 'explicit', 
       'n_writers', 'n_pub', 
       's_dance', 's_energy', 's_key', 's_loudness', 's_mode',
       's_speech', 's_acoustic', 's_inst', 's_live', 's_valence', 's_tempo',
       's_duration', 's_time_sig']

In [13]:
#Logistic Regression
model_2 = try_model(LogisticRegression, features_2)

Train Score: 0.6187981005403635, Test Score: 0.6075638506876228, for features: ['year', 'explicit', 'n_writers', 'n_pub', 's_dance', 's_energy', 's_key', 's_loudness', 's_mode', 's_speech', 's_acoustic', 's_inst', 's_live', 's_valence', 's_tempo', 's_duration', 's_time_sig']


In [14]:
#Random Forests
model_2 = try_model(RandomForestClassifier, features_2)

Train Score: 0.933682659243491, Test Score: 0.6640471512770137, for features: ['year', 'explicit', 'n_writers', 'n_pub', 's_dance', 's_energy', 's_key', 's_loudness', 's_mode', 's_speech', 's_acoustic', 's_inst', 's_live', 's_valence', 's_tempo', 's_duration', 's_time_sig']


In [15]:
#Extra Trees
model_2 = try_model(ExtraTreesClassifier, features_2)

Train Score: 0.933682659243491, Test Score: 0.6630648330058939, for features: ['year', 'explicit', 'n_writers', 'n_pub', 's_dance', 's_energy', 's_key', 's_loudness', 's_mode', 's_speech', 's_acoustic', 's_inst', 's_live', 's_valence', 's_tempo', 's_duration', 's_time_sig']


Again, Random Forests out performs the Logistic Regression. It's looking like that will be the most promising model.

This third batch of features contains the genre dummy columns.

In [16]:
features_3 = ['year', 'explicit', 
       'n_writers', 'n_pub', 
        's_dance', 's_energy', 's_key', 's_loudness', 's_mode',
       's_speech', 's_acoustic', 's_inst', 's_live', 's_valence', 's_tempo',
       's_duration', 's_time_sig','Blues', 'Rock',
       'Country', 'Pop', 'Alternative', 'Love', 'Jazz', 'Oldies', 'Soft rock',
       'Disco', 'Funk', 'Electro', 'Soul', 'Rock \'n Roll', 'Christian',
       'Gospel', 'Folk', 'Duet', '80s', 'Reggae', 'Hard/Metal', 'Kids',
       'Teen pop', 'Punk/Grunge', 'Rap', 'Dance', 'Traditionnal', 'Celtic',
       'R&B', 'Latin', 'Ska', 'Musical', 'Christmas', 'Classical', 'Humour',
       'French pop', 'World/Folk', 'Zouk/Creole', 'Schlager']

In [17]:
#Logistic Regression
model_3 = try_model(LogisticRegression, features_3)

Train Score: 0.6187981005403635, Test Score: 0.6070726915520629, for features: ['year', 'explicit', 'n_writers', 'n_pub', 's_dance', 's_energy', 's_key', 's_loudness', 's_mode', 's_speech', 's_acoustic', 's_inst', 's_live', 's_valence', 's_tempo', 's_duration', 's_time_sig', 'Blues', 'Rock', 'Country', 'Pop', 'Alternative', 'Love', 'Jazz', 'Oldies', 'Soft rock', 'Disco', 'Funk', 'Electro', 'Soul', "Rock 'n Roll", 'Christian', 'Gospel', 'Folk', 'Duet', '80s', 'Reggae', 'Hard/Metal', 'Kids', 'Teen pop', 'Punk/Grunge', 'Rap', 'Dance', 'Traditionnal', 'Celtic', 'R&B', 'Latin', 'Ska', 'Musical', 'Christmas', 'Classical', 'Humour', 'French pop', 'World/Folk', 'Zouk/Creole', 'Schlager']


In [18]:
#Random Forests
model_3 = try_model(RandomForestClassifier, features_3)

Train Score: 0.9924676600622236, Test Score: 0.6944990176817288, for features: ['year', 'explicit', 'n_writers', 'n_pub', 's_dance', 's_energy', 's_key', 's_loudness', 's_mode', 's_speech', 's_acoustic', 's_inst', 's_live', 's_valence', 's_tempo', 's_duration', 's_time_sig', 'Blues', 'Rock', 'Country', 'Pop', 'Alternative', 'Love', 'Jazz', 'Oldies', 'Soft rock', 'Disco', 'Funk', 'Electro', 'Soul', "Rock 'n Roll", 'Christian', 'Gospel', 'Folk', 'Duet', '80s', 'Reggae', 'Hard/Metal', 'Kids', 'Teen pop', 'Punk/Grunge', 'Rap', 'Dance', 'Traditionnal', 'Celtic', 'R&B', 'Latin', 'Ska', 'Musical', 'Christmas', 'Classical', 'Humour', 'French pop', 'World/Folk', 'Zouk/Creole', 'Schlager']


Even with different features, the Random Forests continus to perform the best. My next step will be to incorporate some of the text columns.

In [19]:
features_4 = ['year', 'explicit', 
       'n_writers', 'n_pub', 
        's_dance', 's_energy', 's_key', 's_loudness', 's_mode',
       's_speech', 's_acoustic', 's_inst', 's_live', 's_valence', 's_tempo',
       's_duration', 's_time_sig','Blues', 'Rock',
       'Country', 'Pop', 'Alternative', 'Love', 'Jazz', 'Oldies', 'Soft rock',
       'Disco', 'Funk', 'Electro', 'Soul', 'Rock \'n Roll', 'Christian',
       'Gospel', 'Folk', 'Duet', '80s', 'Reggae', 'Hard/Metal', 'Kids',
       'Teen pop', 'Punk/Grunge', 'Rap', 'Dance', 'Traditionnal', 'Celtic',
       'R&B', 'Latin', 'Ska', 'Musical', 'Christmas', 'Classical', 'Humour',
       'French pop', 'World/Folk', 'Zouk/Creole', 'Schlager', 'text']

In [20]:
sync['text'] = sync['artist'] + " " + sync['l_writer'] + " " + sync['l_pub']

In [21]:
X = sync[features_4]
y = sync['synced']

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 21)

In [None]:
# get_numeric = FunctionTransformer(lambda x: x[features_3], validate = False)
# get_text = FunctionTransformer(lambda x: x['text'], validate = False)

In [38]:
def find_numbers(df):
    return df[features_3]

In [39]:
def find_text(df):
    return df['text']

In [42]:
get_numeric = FunctionTransformer(find_numbers, validate = False)
get_text = FunctionTransformer(find_text, validate = False)

In [49]:
pipeline = Pipeline([
    ('features', FeatureUnion([
        ('numeric', Pipeline([
            ('selector', get_numeric),
            ('ss', StandardScaler())
        ])),
        ('text', Pipeline([
            ('selector', get_text),
            ('cvec', CountVectorizer())
        ]))
    ])),
    ('rf', RandomForestClassifier())
], verbose=True)

In [25]:
params = {
    'features__text__cvec__min_df': [0, 3],
    'features__text__cvec__max_features' : [None, 300],
    'rf__max_depth' : [None, 20, 50],
    'rf__min_samples_split' : [2, 5, 10],
    'rf__random_state' : [21]
}

In [26]:
grid = GridSearchCV(pipeline, params, cv = 5)
grid.fit(X_train, y_train)
grid.score(X_train, y_train)

[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   4.1s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   4.3s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   4.2s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   4.2s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   4.3s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.2s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.1s
[Pipel

[Pipeline] ................ (step 2 of 2) Processing rf, total=   2.8s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   2.8s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   0.9s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   0.9s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   0.9s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   0.9s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   0.9s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipel

[Pipeline] ................ (step 2 of 2) Processing rf, total=   2.1s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   2.0s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   2.1s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   2.1s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   4.1s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   4.1s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   4.1s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipel

[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.7s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.5s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.4s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.3s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.3s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.3s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   5.0s


0.9878827574914033

In [27]:
grid.score(X_test, y_test)

0.6979371316306483

In [28]:
grid.best_params_

{'features__text__cvec__max_features': 300,
 'features__text__cvec__min_df': 0,
 'rf__max_depth': None,
 'rf__min_samples_split': 5,
 'rf__random_state': 21}

In [32]:
params_2 = {
    'features__text__cvec__min_df': [0, 5, 10, 20],
    'features__text__cvec__max_features' : [100, 200, 300],
    'rf__max_depth' : [None, 100, 150],
    'rf__min_samples_split' : [5, 15, 20, 30],
    'rf__random_state' : [21]
}

In [33]:
grid_2 = GridSearchCV(pipeline, params_2, cv = 5)
grid_2.fit(X_train, y_train)
grid_2.score(X_test, y_test)

[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   4.3s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   4.4s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   4.4s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   4.3s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   4.4s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.8s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   4.0s
[Pipel

[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.5s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.5s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   4.2s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   4.2s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   4.2s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   4.2s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   4.3s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipel

[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.6s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.5s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.5s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.5s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   4.3s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   4.4s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   4.5s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipel

[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.7s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.4s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.4s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.4s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.4s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.4s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   4.2s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipel

[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.7s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.6s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.7s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.5s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.4s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.6s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.6s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipel

[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.4s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.4s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.5s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.4s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.5s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.3s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.3s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipel

[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.6s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.6s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.4s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.4s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.5s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.4s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.5s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipel

[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.6s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.7s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.5s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.6s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.4s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.4s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.5s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipel

[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.9s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.5s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.8s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.6s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.6s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.6s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.6s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipel

[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.7s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.7s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.7s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.3s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.4s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.3s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.3s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipel

[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.7s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.7s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.7s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.8s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.7s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.3s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.3s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipel

[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.0s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.0s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.7s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.7s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.7s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.8s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.7s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipel

[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.0s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.1s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.1s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.0s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.7s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.8s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.7s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipel

0.6969548133595285

In [34]:
grid_2.best_params_

{'features__text__cvec__max_features': 300,
 'features__text__cvec__min_df': 0,
 'rf__max_depth': None,
 'rf__min_samples_split': 15,
 'rf__random_state': 21}

In [46]:
params_3 = {'features__text__cvec__max_features': [300],
 'features__text__cvec__min_df': [0],
 'rf__max_depth': [None],
 'rf__min_samples_split': [15],
 'rf__random_state': [21]}

In [50]:
grid_3 = GridSearchCV(pipeline, params_3, cv = 5)
grid_3.fit(X_train, y_train)
grid_3.score(X_test, y_test)

[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.0s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.0s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.0s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.0s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   3.0s
[Pipeline] .......... (step 1 of 2) Processing features, total=   0.1s
[Pipeline] ................ (step 2 of 2) Processing rf, total=   4.1s


0.6969548133595285

In [83]:
features_2 = ['year', 'explicit', 
       'n_writers', 'n_pub', 
       's_dance', 's_energy', 's_key', 's_loudness', 's_mode',
       's_speech', 's_acoustic', 's_inst', 's_live', 's_valence', 's_tempo',
        's_time_sig']

In [84]:
cvec_pkl = CountVectorizer(max_features = 300)
vec = cvec_pkl.fit_transform(sync['text'])
x_text = pd.DataFrame(vec.toarray(),
             columns=cvec_pkl.get_feature_names())

In [85]:
X = pd.concat([x_text, sync[features_2]], axis = 1)
y = sync['synced']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 21)

In [86]:
rf_pkl = RandomForestClassifier(min_samples_split = 15, random_state = 21)
rf_pkl.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=15,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=21, verbose=0,
                       warm_start=False)

In [87]:
rf_pkl.score(X_test, y_test)

0.7003929273084479

In [88]:
pickle.dump(cvec_pkl, open('./model/cvec.pkl', 'wb'))
pickle.dump(rf_pkl, open('./model/rf.pkl', 'wb'))