In [296]:
# Importing libraries
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.base import TransformerMixin
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier,\
                             AdaBoostClassifier
from sklearn.svm import SVC

In [2]:
songs = pd.read_csv('datasets/cleaned_lyrics_with_sentiment_analysis')

# Setting up my train_test_split

In [3]:
X = songs['clean_lyrics']
y = songs['american_songbook']

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    stratify=y,
                                                    random_state=42)

# CountVectorizer With StopWords

In [12]:
pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('lr', LogisticRegression(max_iter=5000))
])

pipe_params = {
    'cvec__max_features': [2000, 3000, 4000, 5000],
    'cvec__min_df': [1, 2, 3 , 4],
    'cvec__max_df': [.9, .95],
    'cvec__ngram_range': [(1,1), (1,2), (1,3)],
    'cvec__stop_words': [None, 'english'],
}

gs = GridSearchCV(pipe,
                  pipe_params,
                  cv=5)

In [13]:
gs.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('cvec',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        prep

In [14]:
gs.best_score_

0.7745168599226976

In [15]:
gs.score(X_test, y_test)

0.7745098039215687

In [16]:
gs.best_params_

{'cvec__max_df': 0.95,
 'cvec__max_features': 2000,
 'cvec__min_df': 2,
 'cvec__ngram_range': (1, 1),
 'cvec__stop_words': None}

# TFIDFVectorizer

In [17]:
tfidf_pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('lr', LogisticRegression())
])

tfidf_pipe_params = {
    'tfidf__max_features': [2000, 3000, 4000, 5000],
    'tfidf__min_df': [1, 2, 3 , 4],
    'tfidf__max_df': [.9, .95],
    'tfidf__ngram_range': [(1,1), (1,2), (2,2)],
    'tfidf__stop_words': [None, 'english']
}

gs_tfidf = GridSearchCV(tfidf_pipe,
                        tfidf_pipe_params,
                        cv=5)

In [18]:
gs_tfidf.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('tfidf',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        n

In [19]:
gs_tfidf.best_score_

0.7712914834066373

In [20]:
gs_tfidf.score(X_test, y_test)

0.803921568627451

In [22]:
gs_tfidf.best_params_

{'tfidf__max_df': 0.9,
 'tfidf__max_features': 3000,
 'tfidf__min_df': 1,
 'tfidf__ngram_range': (1, 1),
 'tfidf__stop_words': None}

# Extra Trees Classifier

In [35]:
et_pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('et', ExtraTreesClassifier(random_state=42))
])

et_pipe_params = {
    'tfidf__max_features': [3000],
    'tfidf__min_df': [1],
    'tfidf__max_df': [.9],
    'tfidf__ngram_range': [(1,1)],
    'tfidf__stop_words': [None],
    'et__n_estimators': [50, 100],
    'et__max_features': [None, 'auto'],
    'et__max_depth': [None, 2, 3, 4]
}

gs_et = GridSearchCV(et_pipe,
                     et_pipe_params,
                     cv=5)

In [36]:
gs_et.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('tfidf',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        n

In [37]:
gs_et.best_score_

0.7826735972277754

In [38]:
gs_et.score(X_test, y_test)

0.7745098039215687

In [39]:
gs_et.best_params_

{'et__max_depth': None,
 'et__max_features': 'auto',
 'et__n_estimators': 100,
 'tfidf__max_df': 0.9,
 'tfidf__max_features': 3000,
 'tfidf__min_df': 1,
 'tfidf__ngram_range': (1, 1),
 'tfidf__stop_words': None}

### With CountVectorizer

In [59]:
et_cv_pipe = Pipeline([
    ('cv', CountVectorizer()),
    ('et', ExtraTreesClassifier(random_state=42))
])

et_cv_pipe_params = {
    'cv__max_features': [2000],
    'cv__min_df': [2],
    'cv__max_df': [.95],
    'cv__ngram_range': [(1,1)],
    'cv__stop_words': [None],
    'et__n_estimators': [50, 100],
    'et__max_features': [None, 'auto'],
    'et__max_depth': [None, 2, 3, 4]
}

gs_et_cv = GridSearchCV(et_cv_pipe,
                        et_cv_pipe_params,
                        cv=5)

In [60]:
gs_et_cv.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('cv',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        prepro

In [61]:
gs_et_cv.best_score_

0.7746634679461548

In [62]:
gs_et_cv.score(X_test, y_test)

0.8186274509803921

In [63]:
gs_et_cv.best_params_

{'cv__max_df': 0.95,
 'cv__max_features': 2000,
 'cv__min_df': 2,
 'cv__ngram_range': (1, 1),
 'cv__stop_words': None,
 'et__max_depth': None,
 'et__max_features': 'auto',
 'et__n_estimators': 100}

# Random Forest Classifier

In [47]:
rf_pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('rf', RandomForestClassifier(random_state=42))
])

rf_pipe_params = {
    'tfidf__max_features': [3000],
    'tfidf__min_df': [1],
    'tfidf__max_df': [.9],
    'tfidf__ngram_range': [(1,1)],
    'tfidf__stop_words': [None],
    'rf__n_estimators': [50, 100],
    'rf__max_features': [None, 'auto'],
    'rf__max_depth': [None, 2, 3, 4]
}

gs_rf = GridSearchCV(rf_pipe,
                     rf_pipe_params,
                     cv=5)

In [48]:
gs_rf.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('tfidf',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        n

In [49]:
gs_rf.best_score_

0.7516326802612288

In [50]:
gs_rf.score(X_test, y_test)

0.7745098039215687

In [51]:
gs_rf.best_params_

{'rf__max_depth': None,
 'rf__max_features': 'auto',
 'rf__n_estimators': 50,
 'tfidf__max_df': 0.9,
 'tfidf__max_features': 3000,
 'tfidf__min_df': 1,
 'tfidf__ngram_range': (1, 1),
 'tfidf__stop_words': None}

### With CountVectorizer

In [69]:
rf_cv_pipe = Pipeline([
    ('cv', CountVectorizer()),
    ('rf', RandomForestClassifier(random_state=42))
])

rf_cv_pipe_params = {
    'cv__max_features': [2000],
    'cv__min_df': [2],
    'cv__max_df': [.95],
    'cv__ngram_range': [(1,1)],
    'cv__stop_words': [None],
    'rf__n_estimators': [50, 100],
    'rf__max_features': [None, 'auto'],
    'rf__max_depth': [None, 2, 3, 4]
}

gs_rf_cv = GridSearchCV(rf_cv_pipe,
                     rf_cv_pipe_params,
                     cv=5)

In [70]:
gs_rf_cv.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('cv',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        prepro

In [71]:
gs_rf_cv.best_score_

0.7893909103025456

In [72]:
gs_rf_cv.score(X_test, y_test)

0.7892156862745098

In [73]:
gs_rf_cv.best_params_

{'cv__max_df': 0.95,
 'cv__max_features': 2000,
 'cv__min_df': 2,
 'cv__ngram_range': (1, 1),
 'cv__stop_words': None,
 'rf__max_depth': None,
 'rf__max_features': 'auto',
 'rf__n_estimators': 100}

# SVC

In [305]:
SVC_pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('SVC', SVC(random_state=42))
])

SVC_pipe_params = {
    'tfidf__max_features': [3000],
    'tfidf__min_df': [1],
    'tfidf__max_df': [.9],
    'tfidf__ngram_range': [(1,1)],
    'tfidf__stop_words': [None],
    'SVC__C': [1, .1, .01, .001],
    'SVC__degree': [3, 4, 5],
    'SVC__kernel': ['linear'],
    'SVC__probability': [True]
}

gs_SVC = GridSearchCV(SVC_pipe,
                     SVC_pipe_params,
                     cv=5)

In [306]:
gs_SVC.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('tfidf',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        n

In [307]:
gs_SVC.best_score_

0.7728775156604025

In [308]:
gs_SVC.score(X_test, y_test)

0.8333333333333334

In [309]:
gs_SVC.best_params_

{'SVC__C': 1,
 'SVC__degree': 3,
 'SVC__kernel': 'linear',
 'SVC__probability': True,
 'tfidf__max_df': 0.9,
 'tfidf__max_features': 3000,
 'tfidf__min_df': 1,
 'tfidf__ngram_range': (1, 1),
 'tfidf__stop_words': None}

#### With CountVectorizer

In [147]:
SVC_cv_pipe = Pipeline([
    ('cv', CountVectorizer()),
    ('SVC', SVC(random_state=42))
])

SVC_cv_pipe_params = {
    'cv__max_features': [2000],
    'cv__min_df': [3],
    'cv__max_df': [.95],
    'cv__ngram_range': [(1,1)],
    'cv__stop_words': [None],
    'SVC__C': [1],
    'SVC__degree': [3],
    'SVC__kernel': ['linear']
}

gs_SVC_cv = GridSearchCV(SVC_cv_pipe,
                     SVC_cv_pipe_params,
                     cv=5)

In [148]:
gs_SVC_cv.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('cv',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        prepro

In [149]:
gs_SVC_cv.best_score_

0.7271224843395974

In [150]:
gs_SVC_cv.score(X_test, y_test)

0.7450980392156863

In [151]:
gs_SVC_cv.best_params_

{'SVC__C': 1,
 'SVC__degree': 3,
 'SVC__kernel': 'linear',
 'cv__max_df': 0.95,
 'cv__max_features': 2000,
 'cv__min_df': 3,
 'cv__ngram_range': (1, 1),
 'cv__stop_words': None}

# Guassian Bayes

In [85]:
# adapted from https://stackoverflow.com/questions/28384680/scikit-learns-pipeline-a-sparse-matrix-was-passed-but-dense-data-is-required
class DenseTransformer(TransformerMixin):
    def fit(self, X, y=None, **fit_params):
        return self
    
    def transform(self, X, y=None, **fit_params):
        return X.toarray()

In [86]:
gauss_pipe = Pipeline([
    ('cv', CountVectorizer()),
    ('_', DenseTransformer()),
    ('gauss', GaussianNB())
])

gauss_params = {
    'cv__max_features': [2000, 3000, 4000],
    'cv__min_df': [1, 2, 3],
    'cv__max_df': [.95],
    'cv__ngram_range': [(1,1), (1,2)],
    'cv__stop_words': [None, 'english'],
}

gauss_gs = GridSearchCV(gauss_pipe, gauss_params, verbose=1, n_jobs=-1)

gauss_gs.fit(X_train, y_train)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    3.9s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:   13.4s finished


GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('cv',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        pre

In [88]:
gauss_gs.best_score_

0.7500333200053312

In [87]:
gauss_gs.score(X_test, y_test)

0.7598039215686274

#### TFIDFVectorizer

In [89]:
gauss_tfidf_pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('_', DenseTransformer()),
    ('gauss', GaussianNB())
])

gauss_tfidf_params = {
    'tfidf__max_features': [2000, 3000, 4000],
    'tfidf__min_df': [1, 2, 3],
    'tfidf__max_df': [.95],
    'tfidf__ngram_range': [(1,1), (1,2)],
    'tfidf__stop_words': [None, 'english'],
}

gauss_gs_tfidf = GridSearchCV(gauss_tfidf_pipe, gauss_tfidf_params, verbose=1, n_jobs=-1)

gauss_gs_tfidf.fit(X_train, y_train)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    7.2s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:   15.8s finished


GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('tfidf',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                      

In [90]:
gauss_gs_tfidf.best_score_

0.7484073037451686

In [91]:
gauss_gs_tfidf.score(X_test, y_test)

0.7647058823529411

# AdaBoost

In [102]:
ADA_cv_pipe = Pipeline([
    ('cv', CountVectorizer()),
    ('ADA', AdaBoostClassifier(random_state=42))
])

ADA_cv_pipe_params = {
    'cv__max_features': [2000, 3000, 4000],
    'cv__min_df': [1, 2, 3],
    'cv__max_df': [.95],
    'cv__ngram_range': [(1,1), (1,2)],
    'cv__stop_words': [None, 'english'],
    'ADA__n_estimators': [50, 100]
}

gs_ADA_cv = GridSearchCV(ADA_cv_pipe,
                     ADA_cv_pipe_params,
                     cv=5)

In [104]:
gs_ADA_cv.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('cv',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        prepro

In [105]:
gs_ADA_cv.best_score_

0.7452485672397708

In [106]:
gs_ADA_cv.score(X_test, y_test)

0.6862745098039216

In [112]:
gs_ADA_cv.best_params_

{'ADA__n_estimators': 100,
 'cv__max_df': 0.95,
 'cv__max_features': 2000,
 'cv__min_df': 1,
 'cv__ngram_range': (1, 2),
 'cv__stop_words': None}

#### TFIDFVector

In [107]:
ADA_pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('ADA', AdaBoostClassifier(random_state=42))
])

ADA_pipe_params = {
    'tfidf__max_features': [2000, 3000, 4000],
    'tfidf__min_df': [1, 2, 3],
    'tfidf__max_df': [.95],
    'tfidf__ngram_range': [(1,1), (1,2)],
    'tfidf__stop_words': [None, 'english'],
    'ADA__n_estimators': [50, 100]
}

gs_ADA = GridSearchCV(ADA_pipe,
                     ADA_pipe_params,
                     cv=5)

In [108]:
gs_ADA.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('tfidf',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        n

In [109]:
gs_ADA.best_score_

0.7386378781820605

In [110]:
gs_ADA.score(X_test, y_test)

0.6764705882352942

# Gradient Boosting

In [114]:
gb_cv_pipe = Pipeline([
    ('cv', CountVectorizer()),
    ('gb', GradientBoostingClassifier(random_state=42))
])

gb_cv_pipe_params = {
    'cv__max_features': [2000, 3000, 4000],
    'cv__min_df': [1, 2, 3],
    'cv__max_df': [.95],
    'cv__ngram_range': [(1,1), (1,2)],
    'cv__stop_words': [None, 'english'],
    'gb__n_estimators': [50, 100, 200]
}

gs_gb_cv = GridSearchCV(gb_cv_pipe,
                     gb_cv_pipe_params,
                     cv=5)

In [115]:
gs_gb_cv.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('cv',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        prepro

In [116]:
gs_gb_cv.best_score_

0.768146074903372

In [122]:
gs_gb_cv.score(X_test, y_test)

0.7254901960784313

In [117]:
gs_gb_cv.best_params_

{'cv__max_df': 0.95,
 'cv__max_features': 2000,
 'cv__min_df': 3,
 'cv__ngram_range': (1, 1),
 'cv__stop_words': None,
 'gb__n_estimators': 200}

#### TFIDFVectorizer

In [118]:
gb_pipe = Pipeline([
    ('tfidf', TfiTfidfVectorizerctorizer()),
    ('gb', GradientBoostingClassifier(random_state=42))
])

gb_pipe_params = {
    'tfidf__max_features': [2000, 3000, 4000],
    'tfidf__min_df': [1, 2, 3],
    'tfidf__max_df': [.95],
    'tfidf__ngram_range': [(1,1), (1,2)],
    'tfidf__stop_words': [None, 'english'],
    'gb__n_estimators': [50, 100, 200]
}

gs_gb = GridSearchCV(gb_pipe,
                     gb_pipe_params,
                     cv=5)

In [119]:
gs_gb.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('tfidf',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        n

In [120]:
gs_gb.best_score_

0.7550446488071438

In [123]:
gs_gb.score(X_test, y_test)

0.7156862745098039

In [121]:
gs_gb.best_params_

{'gb__n_estimators': 200,
 'tfidf__max_df': 0.95,
 'tfidf__max_features': 2000,
 'tfidf__min_df': 3,
 'tfidf__ngram_range': (1, 2),
 'tfidf__stop_words': None}

# All Scores combined

In [159]:
print('Training Best Score, Logisitic Regression, CVEC:', gs.best_score_)
print('Testing Score, Logisitic Regression, CVEC:', gs.score(X_test, y_test))
print()
print('='*100)
print()
print('Training Best Score, Logisitic Regression, TFIDF:', gs_tfidf.best_score_)
print('Testing Score, Logisitic Regression, TFIDF:', gs_tfidf.score(X_test, y_test))
print()
print('='*100)
print()
print('Training Best Score, Extra Tree Classifier, CVEC:', gs_et_cv.best_score_)
print('Testing Score, Extra Tree Classifier, CVEC:', gs_et_cv.score(X_test, y_test))
print()
print('='*100)
print()
print('Training Best Score, Extra Tree Classifier, TFIDF:', gs_et.best_score_)
print('Testing Score, Extra Tree Classifier, TFIDF:', gs_et.score(X_test, y_test))
print()
print('='*100)
print()
print('Training Best Score, Random Forest Classifier, CVEC:', gs_rf_cv.best_score_)
print('Testing Score, Random Forest Classifier, CVEC:', gs_rf_cv.score(X_test, y_test))
print()
print('='*100)
print()
print('Training Best Score, Random Forest Classifier, TFIDF:', gs_rf.best_score_)
print('Testing Score, Random Forest Classifier, TFIDF:', gs_rf.score(X_test, y_test))
print()
print('='*100)
print()
print('Training Best Score, SVC, CVEC:', gs_SVC_cv.best_score_)
print('Testing Score, SVC, CVEC:', gs_SVC_cv.score(X_test, y_test))
print()
print('='*100)
print()
print('Training Best Score, SVC, TFIDF:', gs_SVC.best_score_)
print('Testing Score, Extra SVC, TFIDF:', gs_SVC.score(X_test, y_test))
print()
print('='*100)
print()
print('Training Best Score, Gaussian, CVEC:', gauss_gs.best_score_)
print('Testing Score, Gaussian, CVEC:', gauss_gs.score(X_test, y_test))
print()
print('='*100)
print()
print('Training Best Score, Gaussian, TFIDF:', gauss_gs_tfidf.best_score_)
print('Testing Score, Gaussian, TFIDF:', gauss_gs_tfidf.score(X_test, y_test))
print()
print('='*100)
print()
print('Training Best Score, ADABoost, CVEC:', gs_ADA_cv.best_score_)
print('Testing Score, ADABoost, CVEC:', gs_ADA_cv.score(X_test, y_test))
print()
print('='*100)
print()
print('Training Best Score, ADABoost, TFIDF:', gs_ADA.best_score_)
print('Testing Score, ADABoost, TFIDF:', gs_ADA.score(X_test, y_test))
print()
print('='*100)
print()
print('Training Best Score, GradientBoost, CVEC:', gs_gb_cv.best_score_)
print('Testing Score, GradientBoost, CVEC:', gs_gb_cv.score(X_test, y_test))
print()
print('='*100)
print()
print('Training Best Score, GradientBoost, TFIDF:', gs_gb.best_score_)
print('Testing Score, GradientBoost, TFIDF:', gs_gb.score(X_test, y_test))

Training Best Score, Logisitic Regression, CVEC: 0.7745168599226976
Testing Score, Logisitic Regression, CVEC: 0.7745098039215687


Training Best Score, Logisitic Regression, TFIDF: 0.7712914834066373
Testing Score, Logisitic Regression, TFIDF: 0.803921568627451


Training Best Score, Extra Tree Classifier, CVEC: 0.7746634679461548
Testing Score, Extra Tree Classifier, CVEC: 0.8186274509803921


Training Best Score, Extra Tree Classifier, TFIDF: 0.7826735972277754
Testing Score, Extra Tree Classifier, TFIDF: 0.7745098039215687


Training Best Score, Random Forest Classifier, CVEC: 0.7893909103025456
Testing Score, Random Forest Classifier, CVEC: 0.7892156862745098


Training Best Score, Random Forest Classifier, TFIDF: 0.7516326802612288
Testing Score, Random Forest Classifier, TFIDF: 0.7745098039215687


Training Best Score, SVC, CVEC: 0.7271224843395974
Testing Score, SVC, CVEC: 0.7450980392156863


Training Best Score, SVC, TFIDF: 0.7728775156604025
Testing Score, Extra SVC, TFIDF: 

The best model out of the models used seems to be the SVC using the TFIDFVectorizer

**Training Score**, SVC, TFIDF: 0.7728775156604025
<br>**Testing Score**, Extra SVC, TFIDF: 0.8333333333333334

In [162]:
print('Training Best Score, SVC, TFIDF:', gs_SVC.best_score_)
print('Testing Score, Extra SVC, TFIDF:', gs_SVC.score(X_test, y_test))
print()

Training Best Score, SVC, TFIDF: 0.7728775156604025
Testing Score, Extra SVC, TFIDF: 0.8333333333333334



# SVC Removing Stopwords

In [191]:
results = gs_SVC.fit(X_train, y_train)

In [193]:
results.score

<bound method BaseSearchCV.score of GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('tfidf',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                     

In [242]:
SVC_pipe_test = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('SVC', SVC(random_state=42))
])

SVC_pipe_params_test = {
    'SVC__C': [1],
    'SVC__degree': [3],
    'SVC__kernel': ['linear'],
    'tfidf__max_df': [0.9],
    'tfidf__max_features': [3000],
    'tfidf__min_df': [1],
    'tfidf__ngram_range': [(1, 1)],
    'tfidf__stop_words': ['english']
}

gs_SVC_test = GridSearchCV(SVC_pipe_test,
                     SVC_pipe_params_test,
                     cv=5)

In [243]:
gs_SVC_test.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('tfidf',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        n

In [244]:
gs_SVC_test.best_score_

0.7597361055577768

In [245]:
gs_SVC_test.score(X_test, y_test)

0.7990196078431373

In [253]:
songs['predictions_with_stopwords'] = gs_SVC_test.predict(songs['clean_lyrics_with_stopwords'])

In [250]:
predictions_stopwords = pd.DataFrame(gs_SVC_test.best_estimator_.steps[1][1].coef_.toarray(),
             columns=gs_SVC_test.best_estimator_.steps[0][1].get_feature_names()).T.sort_values(0)

In [268]:
predictions_stopwords = predictions_stopwords.rename(columns={0: 'coefs_with_stopword'})

In [286]:
predictions_stopwords.head(20)

Unnamed: 0,coefs_with_stopword
song,-1.962134
stay,-1.275087
path,-1.151061
way,-1.140443
inside,-1.137415
belle,-1.1282
goodbye,-1.11772
wish,-1.084684
world,-1.078975
life,-1.0498


In [239]:
results.best_estimator_.steps[1][1].coef_.toarray()

array([[ 0.1248739 , -0.16165751,  0.18334382, ..., -0.13921812,
        -0.53438041, -0.18614825]])

In [274]:
predictions = pd.DataFrame(results.best_estimator_.steps[1][1].coef_.toarray(),
             columns=results.best_estimator_.steps[0][1].get_feature_names()).T.sort_values(0)

In [275]:
predictions = predictions.rename(columns={0: 'coefs'})

In [279]:
predictions

Unnamed: 0,coefs
song,-1.657070
this,-1.557221
see,-1.423257
each,-1.274532
everything,-1.151166
...,...
then,1.315654
love,1.316975
man,1.379970
was,1.450071


In [280]:
predictions_stopwords

Unnamed: 0,coefs_with_stopword
song,-1.962134
stay,-1.275087
path,-1.151061
way,-1.140443
inside,-1.137415
...,...
white,1.326396
baby,1.400892
easy,1.409904
night,1.521041


In [290]:
coef_df = pd.concat([predictions, predictions_stopwords], axis=1)

In [295]:
coef_df.to_csv('datasets/coefs_with_and_without_stopwords.csv')

In [199]:
songs['predictions'] = gs_SVC.predict(songs['clean_lyrics_with_stopwords'])

In [294]:
songs.to_csv('datasets/predictions.csv', index=False)

# Pickles!

In [297]:
import pickle

This was learned from the pickle local lesson in week 8.

In [310]:
with open('SVC.pkl', 'wb') as SCV_out:
    pickle.dump(gs_SVC, SCV_out)