In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, ComplementNB

from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.model_selection import cross_val_score

from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.neighbors import NearestCentroid
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

from xgboost import XGBClassifier

from functools import reduce

import matplotlib.pyplot as plt

In [2]:
# Load the song/lyric table from the notebook's input directory.
data = pd.read_json(
    '../input/annamayya-song-lyrical-map/song_lyric_map.json'
)
# Class labels; the first entry is the one the scorers use as the positive class.
lbls = ['Devotional', 'Romantic']

In [3]:
# Seed handed as random_state to several of the estimators built in Models.
RAND_STATE = 42
# NOTE(review): not referenced by any cell visible in this notebook; no
# train/test split is performed in the visible cells.
TEST_SIZE = 0.3
# A word is kept by filter_low_freq_words only when its corpus-wide count is
# strictly greater than this value.
WORD_CNT_THRESH = 10
# NOTE(review): not read by the visible cells, which pass 5 and 10 to
# run_nfold as literals.
CV_LEVELS = [5,10]

In [4]:
# Corpus-wide word counts: every lyric is joined into one string, split on
# whitespace, and the resulting tokens are tallied.
corpus_words = ' '.join(data['Lyric']).split()
freq = pd.Series(corpus_words).value_counts()
def filter_low_freq_words(s, f, thresh=None):
    """Drop rare words from a whitespace-separated string.

    Parameters
    ----------
    s : str
        Text to filter; tokens are obtained with ``str.split()``.
    f : mapping-like (e.g. ``pandas.Series`` or ``dict``)
        Word -> occurrence count. Must provide ``.get(word, default)``.
    thresh : int, optional
        A word is kept only when its count is strictly greater than this
        value. When omitted, the notebook-level ``WORD_CNT_THRESH`` is used.

    Returns
    -------
    str
        The surviving words, in their original order, joined by single spaces.
    """
    if thresh is None:
        thresh = WORD_CNT_THRESH
    # .get(..., 0) treats a word missing from the table as count 0 instead of
    # raising KeyError, so the function also works on text outside the corpus.
    return ' '.join(word for word in s.split() if f.get(word, 0) > thresh)
# Overwrite each lyric with its rare-word-filtered version (the column is
# replaced in place, so re-running this cell filters already-filtered text).
data['Lyric'] = data['Lyric'].apply(filter_low_freq_words, args=(freq,))

In [5]:
# Features are the lyric strings; the target is the genre column.
X, y = data['Lyric'], data['Genre']

In [6]:
# Bag-of-words counts followed by TF-IDF re-weighting.
# NOTE(review): both transformers are fitted on the full corpus before any
# cross-validation split, so the folds share vocabulary/IDF statistics.
count_vector, tfidf_transformer = CountVectorizer(), TfidfTransformer()
X_counts = count_vector.fit_transform(X)
X_tfidf = tfidf_transformer.fit_transform(X_counts)

In [7]:
def do_grid_search(_X,_y,mdl,params,scoring,key,n=5):
    """Fit a GridSearchCV over ``params`` for ``mdl`` and return it.

    Parameters
    ----------
    _X, _y : features and targets forwarded to ``GridSearchCV.fit``.
    mdl : estimator to tune.
    params : parameter grid handed to ``GridSearchCV``.
    scoring : scoring specification (the notebook passes a dict of scorers).
    key : value passed as ``refit``; names the metric used for the final refit.
    n : value passed as ``cv`` (default 5).

    Returns
    -------
    The fitted ``GridSearchCV`` instance. Train scores are recorded and all
    available cores are used (``n_jobs=-1``).
    """
    search_options = dict(
        scoring=scoring,
        cv=n,
        return_train_score=True,
        refit=key,
        n_jobs=-1,
    )
    # fit() returns the search object itself.
    return GridSearchCV(mdl, params, **search_options).fit(_X, _y)

In [8]:
def plot_grid_results(est,keys,title):
    """Plot per-split train and test scores of a fitted grid search.

    One panel is drawn per metric in ``keys``. Within a panel the x axis
    enumerates the scores split by split: all parameter candidates of split 0,
    then all candidates of split 1, and so on.

    Parameters
    ----------
    est : fitted search object exposing ``cv_results_`` with
        ``split{i}_train_{metric}`` / ``split{i}_test_{metric}`` entries
        (i.e. fitted with train scores recorded).
    keys : iterable of metric names as they appear in ``cv_results_``.
    title : str used as the figure title.
    """
    # n_splits_ is set on the fitted search whatever kind of ``cv`` was given;
    # est.cv is only a usable count when an int was passed, so it is the fallback.
    n_splits = est.n_splits_ if hasattr(est, 'n_splits_') else est.cv

    def _flatten(kind, metric):
        # Concatenate the per-split score arrays in split order.
        return [
            score
            for split in range(n_splits)
            for score in est.cv_results_[f'split{split}_{kind}_{metric}']
        ]

    plt.figure(figsize=[12,3])
    plt.suptitle(title)
    for i,k in enumerate(keys):
        ax = plt.subplot(1,len(keys),i+1)
        ax.plot(_flatten('train', k))
        ax.plot(_flatten('test', k))
        ax.legend(['Train','Test'])
        ax.set_ylim([0,1.1])
        ax.set_title(k)
        ax.set_xlabel('split x candidate')
        ax.set_ylabel('score')
    plt.tight_layout()
    plt.show()

In [9]:
# Metric name -> scorer; every metric treats lbls[0] as the positive class.
scorers = {
    label: make_scorer(metric, pos_label=lbls[0])
    for label, metric in (
        ('F1_Score', f1_score),
        ('Precision', precision_score),
        ('Recall', recall_score),
    )
}

In [26]:
# Display name -> (estimator, parameter grid) for every classifier compared.
# Each estimator that accepts a seed receives RAND_STATE so reruns are comparable.
Models = {
    'Logistic Regression' : (LogisticRegression(random_state=RAND_STATE),{'tol':[1e-3,1e-4,1e-5]}),
    'Multi Nomial Naive Bayes Classification': (MultinomialNB(),{'alpha':[1e-3,1e-4,1e-5]}),
    'Bernoulli Naive Bayes Classification': (BernoulliNB(),{'alpha':[1e-3,1e-4,1e-5]}),
    'Complement Naive Bayes Classification': (ComplementNB(),{'alpha':[1e-3,1e-4,1e-5]}),
    # Empty grid: a single candidate with the estimator's defaults.
    'Nearest Centroid Classification': (NearestCentroid(),{}),
    'Random Forest Classifier': (RandomForestClassifier(random_state=RAND_STATE),{'min_samples_leaf':[2,5,10]}),
    'Ridge Classifier': (RidgeClassifier(random_state=RAND_STATE), {'alpha':[1e-3,1e-4,1e-5]}),
    'SGD Classifier': (SGDClassifier(random_state=RAND_STATE),{'alpha':[1e-3,1e-4,1e-5]}),
    'Linear SVC': (LinearSVC(random_state=RAND_STATE),{'tol':[1e-2,1e-4,1e-1]}),
    # Seeded like the other estimators.
    # NOTE(review): use_label_encoder is deprecated in newer xgboost releases;
    # confirm against the installed version.
    'XGB Classifier': (XGBClassifier(use_label_encoder=True, random_state=RAND_STATE),{'eta':[0.1,0.3,0.5],'objective':['binary:logistic']}),
}

In [12]:
def run_nfold(n, Models):
    """Grid-search every entry of ``Models`` with ``n``-fold cross-validation.

    Reads the notebook-level ``X_tfidf``, ``y`` and ``scorers``. For each
    model it prints the model name and the best score of the search (refit
    metric ``'F1_Score'``) as a percentage, then plots the per-split scores.

    Parameters
    ----------
    n : value forwarded to ``do_grid_search`` as the fold count.
    Models : dict mapping a display name to ``(estimator, param_grid)``.

    Returns
    -------
    dict mapping each display name to its fitted search object.
    """
    Res = {}
    metric_names = list(scorers)
    for k, spec in Models.items():
        print(f'Model {k}')
        search = do_grid_search(X_tfidf, y, spec[0], spec[1], scorers, 'F1_Score', n)
        Res[k] = search
        print('Best F1 Score : ' ,100*search.best_score_, '%')
        plot_grid_results(search, metric_names, f'{n} Fold {k}')
    return Res

In [27]:
# Fold tag -> {model name -> fitted search}; start with the 5-fold run.
Results = {'5Fold': run_nfold(5, Models)}

In [28]:
# Add the 10-fold run next to the 5-fold one.
Results.update({'10Fold': run_nfold(10, Models)})

In [21]:
def convert_grid_search_to_df(nfold_result, name):
    """Summarise fitted grid searches as one row per model.

    Parameters
    ----------
    nfold_result : dict
        Model name -> fitted search object exposing ``best_index_``,
        ``best_params_`` and ``cv_results_`` with ``mean_train_<metric>`` /
        ``mean_test_<metric>`` entries for F1_Score, Precision and Recall.
    name : str
        Prefix for the score columns (e.g. ``'5 Fold'``).

    Returns
    -------
    pandas.DataFrame
        Columns: ``Model``, then ``'<name> Train|Test <metric>'`` for F1 Score,
        Precision and Recall, then ``Best Parameters``. Rows are sorted by
        ``'<name> Test F1 Score'`` in descending order; the index keeps each
        row's pre-sort position.
    """
    # (column label, cv_results_ metric key)
    metrics = (('F1 Score', 'F1_Score'), ('Precision', 'Precision'), ('Recall', 'Recall'))
    # (column label, cv_results_ split key)
    splits = (('Train', 'train'), ('Test', 'test'))

    columns = ['Model']
    columns += [f'{name} {split_lbl} {metric_lbl}'
                for metric_lbl, _ in metrics
                for split_lbl, _ in splits]
    columns.append('Best Parameters')

    rows = []
    # Iterate the results that were passed in rather than a notebook global,
    # so the summary always matches the searches it is given.
    for model_name, search in nfold_result.items():
        best = search.best_index_
        row = {'Model': model_name}
        for metric_lbl, metric_key in metrics:
            for split_lbl, split_key in splits:
                scores = search.cv_results_[f'mean_{split_key}_{metric_key}']
                row[f'{name} {split_lbl} {metric_lbl}'] = scores[best]
        row['Best Parameters'] = search.best_params_
        rows.append(row)

    df = pd.DataFrame(rows, columns=columns)
    return df.sort_values(by=f'{name} Test F1 Score', ascending=False)

In [29]:
# One summary table per fold setting, built from the stored grid searches.
fold_5, fold_10 = (
    convert_grid_search_to_df(Results['5Fold'], '5 Fold'),
    convert_grid_search_to_df(Results['10Fold'], '10 Fold'),
)

In [30]:
# Show the 5-fold summary table as this cell's output.
fold_5

In [31]:
# Show the 10-fold summary table as this cell's output.
fold_10