# Multinominal Naive Bayes Tryouts

In [14]:
import pandas as pd
import joblib
#from GenreGuesser.model_select import gg_cross_val
#from GenreGuesser.model_select import gg_single_split_test
#from GenreGuesser.model_select import gg_grid_search
#from GenreGuesser.model_select import train_test_split
#from GenreGuesser.pipeline import get_knn_pipe
#from GenreGuesser.svm_pipeline import get_svm_pipe
#from GenreGuesser.rfc_pipeline import get_rfc_pipe
#from GenreGuesser.nbc_pipeline import get_nbc_pipe
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
#from GenreGuesser.params import GENRE_DICT, DATA_SOURCE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
#from GenreGuesser.text_preproc import clean_text
import re
import string
import nltk
nltk.download('stopwords', quiet = True)
nltk.download('punkt', quiet = True)
nltk.download('wordnet', quiet = True)
nltk.download('omw-1.4', quiet = True)
from nltk.corpus import stopwords
from nltk.corpus import words
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn import set_config
set_config(display="diagram")
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

# Load full dataset

In [22]:
# Dictionary for translating from MusicBrainz genre code to English
GENRE_DICT = {
    '100' : 'rap',
    100 : 'rap',
    '73' : 'pop',
    73 : 'pop',
    '38' : 'country',
    38 : 'country',
    '114' : 'rock',
    114 : 'rock',
    '57' : 'folk',
    57 : 'folk',
    '62' : 'jazz',
    62 : 'jazz',
    'smooth-jazz': 'jazz',
    }

# Read in the data, clean it, and restrict to the right genres
data = pd.read_csv('/Users/julia_welch/code/GenreGuesser/raw_data/full_df.csv')
data['Genre'] = data['Genre'].apply(lambda x : GENRE_DICT[x] if x in GENRE_DICT.keys() else x)
data = data.dropna()
data = data[data['Genre'].isin(['rap', 'country', 'rock', 'pop'])]

# Set the X and y values accordingly.
# X values are just strings of lyrics (will be vectorized in pipeline),
# y values are strings indicating a genre.
X = data[['Lyrics']]
y = data['Genre']

# Uncomment the following lines to undersample pop, rock, and country to 1800
rus = RandomUnderSampler(random_state = 42)
X, y = rus.fit_resample(X,y)
X = X['Lyrics']

# Split 80-20 into training/validation and test data (reliably with a fixed random state)
X_tv, X_test, y_tv, y_test = train_test_split(X, y, test_size = .2, random_state = 42)

# Uncomment the following line to see how many songs from each genre are in the data set
print(y.value_counts())

# Uncomment the following line to see the proportion of songs from each genre in the
# data set
print(y.value_counts(normalize = True))

country    1325
pop        1325
rap        1325
rock       1325
Name: Genre, dtype: int64
country    0.25
pop        0.25
rap        0.25
rock       0.25
Name: Genre, dtype: float64


# NBC Pipeline

## Preprocessing

In [23]:
def clean_text(text):
    #remove 'е'
    text = text.replace('е', 'e')

    #remove headers like [Chorus] etc
    headers = re.findall(r"\[(.*?)\]", text)
    for header in headers:
        text = text.replace(f'[{header}]', ' ')

    #separate lower/upper case words (like 'needHow')
    cap_sep_find = r'([a-z])([A-Z])'
    cap_sep_replace = r'\1 \2'
    text = re.sub(cap_sep_find, cap_sep_replace, text)

    #remove punctuation
    exclude = string.punctuation + "’‘”“"
    for punctuation in exclude:
           text = text.replace(punctuation, ' ')

    #turn text into lowercase
    text = text.lower()

    #remove numericals
    text = ''.join(word for word in text if not word.isdigit())

    #remove stopwords
    stop_words = set(stopwords.words('english'))

    #tokenise
    word_tokens = word_tokenize(text)
    text = [w for w in word_tokens if not w in stop_words]

    #lemmatise
    lemmatizer = WordNetLemmatizer()
    lemmatized = [lemmatizer.lemmatize(word) for word in text]
    text = lemmatized

    #filter out non-ascii words
    words_set = set(words.words())
    safe_set = set(['cliché', 'rosé', 'déjà', 'ménage',  'yoncé', 'beyoncé', 'café', 'crème', 'señor', 'señorita'])
    ascii_list = []
    for word in text:
        if word in words_set or word.isascii() or word in safe_set:
            ascii_list.append(word)
    text = ' '.join(ascii_list)

    #rejoin "wan na"/"gon na" to "wanna"/"gonna"
    wannas = re.findall(r"wan na", text)
    gonnas = re.findall(r"gon na", text)
    gottas = re.findall(r"got ta", text)

    for wanna in wannas:
        text = text.replace(wanna, "wanna")

    for gonna in gonnas:
        text = text.replace(gonna, "gonna")

    for gotta in gottas:
        text = text.replace(gotta, "gotta")

    return text

In [24]:
def format_func(X_in):
    '''
    Transformer function that cleans text,
    integrated as first step of pipeline
    '''
    X_out = X_in.apply(clean_text)
    return X_out

# Create Pipeline, which has the following three steps:
#   - Clean text (remove things like '[VERSE 1]', lemmatize, etc.)
#   - TF-IDF Vectorize
#   - Gaussian/MultinomialNB NaiveBayes classifier
def get_nbc_pipe():
    format_transform = FunctionTransformer(format_func)
    nbc_pipe = Pipeline([
        ('format_transform', format_transform),
        ('tfidf', TfidfVectorizer()),
        ('nbc', MultinomialNB()),
    ])

    return nbc_pipe

In [25]:
pipe = get_nbc_pipe()
pipe

## Train Pipeline

In [26]:
%%time
pipe.fit(X_tv, y_tv)

CPU times: user 9min 53s, sys: 43.8 s, total: 10min 37s
Wall time: 10min 49s


In [27]:
# Make predictions
pipe.predict_proba(X_test.iloc[0:2])

array([[0.30505371, 0.22014876, 0.17104121, 0.30375633],
       [0.22742592, 0.15824298, 0.51790435, 0.09642676]])

In [28]:
%%time
# Score model
pipe.score(X_test,y_test)

CPU times: user 2min 13s, sys: 12.1 s, total: 2min 25s
Wall time: 2min 28s


0.6452830188679245

## Cross-validate Pipeline

In [29]:
%%time
# Cross validate pipeline
cross_val_score(pipe, X_tv, y_tv, cv=5, scoring='accuracy').mean()

CPU times: user 50min 15s, sys: 4min 33s, total: 54min 48s
Wall time: 56min 5s


0.5962264150943396

## GridSearch Pipeline

In [30]:
# Inspect all pipe components parameters to find the one you want to gridsearch
pipe.get_params()

{'memory': None,
 'steps': [('format_transform',
   FunctionTransformer(func=<function format_func at 0x139e68280>)),
  ('tfidf', TfidfVectorizer()),
  ('nbc', MultinomialNB())],
 'verbose': False,
 'format_transform': FunctionTransformer(func=<function format_func at 0x139e68280>),
 'tfidf': TfidfVectorizer(),
 'nbc': MultinomialNB(),
 'format_transform__accept_sparse': False,
 'format_transform__check_inverse': True,
 'format_transform__func': <function __main__.format_func(X_in)>,
 'format_transform__inv_kw_args': None,
 'format_transform__inverse_func': None,
 'format_transform__kw_args': None,
 'format_transform__validate': False,
 'tfidf__analyzer': 'word',
 'tfidf__binary': False,
 'tfidf__decode_error': 'strict',
 'tfidf__dtype': numpy.float64,
 'tfidf__encoding': 'utf-8',
 'tfidf__input': 'content',
 'tfidf__lowercase': True,
 'tfidf__max_df': 1.0,
 'tfidf__max_features': None,
 'tfidf__min_df': 1,
 'tfidf__ngram_range': (1, 1),
 'tfidf__norm': 'l2',
 'tfidf__preprocessor': No

ERROR! Session/line number was not unique in database. History logging moved to new session 384


In [31]:
#%%time
# Instanciate grid search
#grid_search = GridSearchCV(
#    pipe, 
#    param_grid={
#        # Access any component of the pipeline, as far back as you want
#        'tfidf__max_df': [0.5, 0.8, 1.0],
#        'tfidf__min_df': [0.5, 0.8, 1.0],
#        'tfidf__ngram_range': ((1,1), (2,2)),
#        'nbc__alpha': [0.1, 0.5, 1]},
#    n_jobs=-1,
#    verbose=1,
#    cv=5,
#    refit=True,
#    scoring="accuracy")
#
#grid_search.fit(X_tv, y_tv)

In [None]:
grid_search.best_params_

In [None]:
grid_search.best_estimator_