In [28]:
from shared import const
from shared import utils
import numpy as np
import pandas as pd
import re

# Gensim
import gensim
from gensim import models
import gensim.corpora as corpora
from gensim.utils import simple_preprocess, lemmatize
from gensim.models import CoherenceModel, LdaModel, LdaMulticore

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
#matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

#Choose Random forest for multiclass classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import ComplementNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score, f1_score

In [29]:
#prepare stopwords
from nltk.corpus import stopwords

# NLTK's English stop-word list plus a few extra words that are trimmed for
# the purpose of emotion classification.
stop_words = stopwords.words("english") + [
    "from", "re", "use", "day", "let", "get", "say", "know", "think", "love",
]

In [30]:
#preproccesing

#remove words fewer than 3 characters
def remove_words(list2d_words, threshold = 3):
    '''
    Drop short words from every document.

    list2d_words: list of documents, each a list of word strings.
    threshold: minimum length a word needs in order to be kept (inclusive).

    Returns a new 2d list with the same number of documents; a document
    whose words are all too short becomes an empty list.
    '''
    return [
        [word for word in sentence if len(word) >= threshold]
        for sentence in list2d_words
    ]
        

#remove stopwords
#simple_preprocess will remove punctuations and unnecessary characters altogether
def remove_stopwords(list2d_words):
    """Re-tokenize every document and drop the words listed in ``stop_words``.

    Each document (a list of words) is joined with spaces, passed through
    ``simple_preprocess`` again, and filtered against the module-level
    ``stop_words`` list. Returns one word list per input document.
    """
    filtered_docs = []
    for doc in list2d_words:
        tokens = simple_preprocess(" ".join(doc))
        filtered_docs.append([token for token in tokens if token not in stop_words])
    return filtered_docs

#lemmatize: word in third person changed to first person, etc, and only keep the nouns, adj, verb, adv
# my_lemmatize only reads token.pos_ and token.lemma_, so the dependency
# parser and the entity recognizer are switched off. The parser component is
# registered as 'parser'; the previous name 'parse' matched no component, so
# the parser kept running on every document.
nlp = spacy.load('en', disable = ['parser', 'ner'])

def my_lemmatize(list2d_words, allowed_postags = ['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatize every document and keep only tokens with an allowed POS tag.

    Each document (a list of words) is joined with spaces and run through the
    module-level spaCy pipeline ``nlp``; the lemma of every token whose
    ``pos_`` is in ``allowed_postags`` is kept. Returns one lemma list per
    input document. Tag names: https://spacy.io/api/annotation
    """
    return [
        [token.lemma_ for token in nlp(" ".join(sent)) if token.pos_ in allowed_postags]
        for sent in list2d_words
    ]

In [31]:
#tokenize each document into a list of words, removing punctuations and unnecessary characters altogether.
def sentences_to_words(sentences):
    """Generator that yields exactly one item: the token list of ``sentences``.

    The input is converted with ``str()`` and tokenized by gensim's
    ``simple_preprocess``; ``deacc=True`` additionally strips accent marks
    from the tokens.
    """
    tokens = gensim.utils.simple_preprocess(str(sentences), deacc=True)
    yield tokens

import itertools

def corpus_to_2d_words(corpus):
    """Tokenize every document of ``corpus`` into a list of words.

    corpus: indexable sequence of documents (anything ``sentences_to_words``
        accepts), accessed as ``corpus[0] .. corpus[len(corpus) - 1]``.

    Returns a list with one token list per document, in corpus order.
    """
    word_corpus = []
    for i in range(len(corpus)):
        # extend() appends in place; the previous version rebuilt the whole
        # result list for every document, which is quadratic in corpus size.
        word_corpus.extend(sentences_to_words(corpus[i]))
    return word_corpus

def words_2d_to_dict(word_2d_list):
    """Build a gensim ``corpora.Dictionary`` from a list of tokenized documents."""
    return corpora.Dictionary(word_2d_list)

def df_to_lemmatized_words(df):
    """Run the full text pipeline on a sequence of raw documents.

    Steps, in order: tokenize (``corpus_to_2d_words``), drop short words
    (``remove_words``), drop stop words (``remove_stopwords``), lemmatize and
    POS-filter (``my_lemmatize``). Returns one lemma list per document.
    """
    tokenized = corpus_to_2d_words(df)
    return my_lemmatize(remove_stopwords(remove_words(tokenized)))

def lemmatized_to_corpus(words_2d):
    """Convert lemmatized documents into a bag-of-words corpus.

    words_2d: list of documents, each a list of lemma strings.

    A dictionary is built from ``words_2d`` with ``words_2d_to_dict``; every
    document is then joined, re-tokenized with ``simple_preprocess`` and
    mapped through ``doc2bow``.

    Returns the list of bag-of-words documents. (The function previously
    built this list and then fell off the end, so callers always got None.)
    """
    dict_words = words_2d_to_dict(words_2d)
    corpus = [dict_words.doc2bow(simple_preprocess(" ".join(line))) for line in words_2d]
    return corpus
    
def build_LDA_model(dict_words, corpus):
    """Train a gensim LDA model on ``corpus`` using ``dict_words`` as id2word.

    The number of topics is read from the module-level ``NUM_TOPICS``, which
    must be set before calling. Returns the trained model.
    """
    training_options = dict(
        num_topics = NUM_TOPICS,      # tried 3, 4, 5, 6, 10, 20; 3 gave the best coherence, 4 the second best
        random_state = 100,           # the seed
        update_every = 0,             # batch mode
        chunksize = 1000,             # larger chunks put more load on memory
        passes = 10,                  # open question: how should chunksize/passes scale with dataset size?
        alpha = 'auto',
        minimum_probability = 0.0,
        per_word_topics = False,      # non-indicative words would be omitted
    )
    return gensim.models.ldamodel.LdaModel(corpus = corpus, id2word = dict_words, **training_options)
    
#evaluate by perplexity and coherence
def print_coherence_perplexity(the_model, lemmatized_words, dict_words, corpus, f=None):
    """Print the c_v coherence and the ``log_perplexity`` value of a model.

    Both lines go to ``f`` (stdout when ``f`` is None).
    """
    scorer = CoherenceModel(model = the_model, texts = lemmatized_words, dictionary = dict_words, coherence = 'c_v')
    # coherence: higher is better
    print('\nCoherence Score: ', scorer.get_coherence(), file = f)
    # NOTE(review): gensim documents log_perplexity as a per-word likelihood
    # bound rather than the perplexity itself - confirm how it is compared.
    print('\nPerplexity: ', the_model.log_perplexity(corpus), file = f)
    
#get the LDA generated topic-document dataset and responding label for classification
def generate_data(model, corpus):
    """Build the document-by-topic feature table used for classification.

    model: trained LDA model exposing ``num_topics`` and
        ``get_document_topics``.
    corpus: iterable of bag-of-words documents.

    Returns a DataFrame with one row per document and one column per topic,
    holding the topic probabilities of that document.
    """
    n_topics = model.num_topics
    topics_features = []
    for bow in corpus:
        # One inference per document. The earlier version inferred the same
        # document once per topic and picked entry j from run j, so a row was
        # stitched together from different inference runs.
        row = [0.0] * n_topics
        # Fill by topic id rather than by list position: topics below the
        # library's probability floor are left out of the returned list.
        for topic_id, probability in model.get_document_topics(bow, minimum_probability = 0.0):
            row[topic_id] = float(probability)
        topics_features.append(row)
    #generate data for supervised classification
    print("length of topics_features is : {}".format(len(topics_features)))
    gen_df_mid = pd.DataFrame(topics_features)
    return gen_df_mid

def generate_train_test(X, Y):
    """Split features and labels 80/20 with a fixed seed.

    Returns the tuple ``(X_train, X_test, y_train, y_test)``.
    """
    return tuple(train_test_split(X, Y, random_state = 0, test_size = 0.2))

#z-score normalization
def zscore_normalization(X_train, X_test):
    """Standardize both splits with statistics fitted on the training split only.

    Returns ``(X_train_scaled, X_test_scaled)``.
    """
    scaler = StandardScaler()
    scaler.fit(X_train)
    return scaler.transform(X_train), scaler.transform(X_test)

def RF_classify_score(X_tr_std, X_ts_std, y_train, y_test):
    """Fit a default RandomForestClassifier, then print and return its test accuracy."""
    forest = RandomForestClassifier()
    forest.fit(X_tr_std, y_train)
    score = accuracy_score(y_test, forest.predict(X_ts_std))
    print(score)
    return score

def complement_classify_score(X_train, X_test, y_train, y_test):
    """Fit ComplementNB (alpha = 0.1), then print and return its test accuracy."""
    classifier = ComplementNB(alpha = 0.1)
    classifier.fit(X_train, y_train)
    score = accuracy_score(y_test, classifier.predict(X_test))
    print(score)
    return score
    
def randomForest_LDA(X, Y):
    """Hold-out evaluation of a random forest on z-scored LDA features.

    Splits with ``generate_train_test``, standardizes with
    ``zscore_normalization`` and lets ``RF_classify_score`` print the
    accuracy. Returns nothing.
    """
    train_X, test_X, train_y, test_y = generate_train_test(X, Y)
    scaled_train, scaled_test = zscore_normalization(train_X, test_X)
    RF_classify_score(scaled_train, scaled_test, train_y, test_y)
    
def complementNB_LDA(X, Y):
    """Hold-out evaluation of Complement Naive Bayes on LDA features.

    Splits with ``generate_train_test`` and lets ``complement_classify_score``
    print the accuracy. The features are passed through without z-scoring.
    Returns nothing.
    """
    X_train, X_test, y_train, y_test = generate_train_test(X,Y)
    # Fixed the misspelled callee name (was complement_classify_socre), which
    # raised NameError on every call.
    complement_classify_score(X_train, X_test, y_train, y_test)


In [32]:
# Create the TF-IDF model
def build_tfidf_model(corpus):
    """Fit a TF-IDF model (SMART scheme 'ntc') on ``corpus`` and return the re-weighted corpus."""
    weighting = models.TfidfModel(corpus, smartirs='ntc')
    return weighting[corpus]

In [46]:
 #use k-fold validation:
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
# 10-fold cross-validation. The shuffle is seeded so that every classifier is
# scored on the same folds and a re-run reproduces the reported averages.
kFolder = KFold(n_splits = 10, shuffle = True, random_state = 0)

def kfold_classification(X, Y, clf, normalization_func = None, f = None):
    """Cross-validate ``clf`` on the folds of the module-level ``kFolder``.

    X: feature table supporting ``.iloc[rows, :]``.
    Y: labels supporting ``.iloc[rows]``.
    clf: estimator with ``fit`` and ``predict``; it is re-fitted on every fold.
    normalization_func: optional ``(X_train, X_test) -> (X_train, X_test)``
        applied to each fold before fitting.
    f: stream for the summary line (stdout when None).

    Returns the mean accuracy over all folds.
    """
    fold_accuracies = []
    for train_rows, test_rows in kFolder.split(X, Y):
        X_train = np.array(X.iloc[train_rows, :])
        X_test = np.array(X.iloc[test_rows, :])
        y_train = np.array(Y.iloc[train_rows])
        y_test = np.array(Y.iloc[test_rows])
        if normalization_func is not None:
            X_train, X_test = normalization_func(X_train, X_test)
        clf.fit(X_train, y_train)
        predictions = clf.predict(X_test)
        fold_accuracies.append(accuracy_score(y_test, predictions))
    k_fold_avg = sum(fold_accuracies) / len(fold_accuracies)
    print("the average of 10 rounds is {}".format(k_fold_avg), file = f)
    return k_fold_avg

#plot topic distribution for each class: add up the sum of portion of each topic
def plot_topics_per_class(df, gen_df_mid):
    """Plot, for each topic, the summed topic weight over the class-1 documents.

    df: DataFrame with a ``y`` column; rows where ``y == 1`` are the class
        that is plotted.
    gen_df_mid: document-by-topic weight table with one row per row of
        ``df`` and one column per topic (e.g. the output of ``generate_data``).

    Raises ValueError when the two inputs do not have the same number of rows.
    """
    weights = np.asarray(gen_df_mid, dtype = float)
    # Select rows by position instead of by index label, so a frame that still
    # carries the labels of the table it was sampled from lines up correctly.
    in_class = np.asarray(df['y'] == 1)
    if len(in_class) != weights.shape[0]:
        raise ValueError("df has {} rows but gen_df_mid has {}".format(len(in_class), weights.shape[0]))
    # Column sums over the selected rows. The earlier version iterated over an
    # int (TypeError) and indexed the table as [document][topic], which on a
    # DataFrame means [column][row].
    topics = weights[in_class].sum(axis = 0).tolist()
    plot_x = list(range(1, len(topics) + 1))  # topic ids shown 1-based
    plt.plot(plot_x, topics)
    plt.xlabel("topic ID")
    plt.ylabel("topic portion in the class")
    plt.show()
            
         
# df = pd.read_csv(const.CLEAN_UNEVEN_SPOTIFY)


[   4    6    9 ... 4928 4932 4934]


In [47]:
#Build LDA model
# For every dataset and every topic count in 3..8: build one "class c vs.
# rest" LDA feature table per class c in 1..4, then write the 10-fold Random
# Forest and Complement Naive Bayes accuracies, averaged over the four
# classes, to the dataset's result file.
# The numbered names (df_1, Y1, corpus_1, ...) are kept because later cells
# read them.
data_file = [const.CLEAN_UNEVEN_SPOTIFY, const.CLEAN_UNEVEN_DEEZER]
for every in data_file:
    name = str(every).split('/')[-1].split('.')[0]
    # One handle per dataset, released even if the run is interrupted. It used
    # to be re-opened for every topic count while only the last was closed.
    # NOTE(review): the 'uneven_spotify_' prefix is also applied to the Deezer
    # file; left unchanged so existing result files keep their names.
    with open('uneven_spotify_' + name + '_count.txt', 'a') as f:
        for i in range(3, 9):
            NUM_TOPICS = i  # module-level global read by build_LDA_model
            df_full = pd.read_csv(every)
            #print(df_full.shape)
            # reset_index returns a new frame, so its result has to be kept;
            # the bare df_k.reset_index() calls used before had no effect.
            df_1, df_2, df_3, df_4 = [
                utils.get_class_based_data(
                    df_full, class_id, limit_size = True, include_other_classes = True
                ).reset_index(drop = True)
                for class_id in (1, 2, 3, 4)
            ]
            class_frames = (df_1, df_2, df_3, df_4)
            print(df_1.head(2))
            print("df_1.y.unique() is ", df_1.y.unique())

            # labels are the last column of each class frame
            Y1, Y2, Y3, Y4 = [frame.iloc[:, -1] for frame in class_frames]
            print("length of y1 is :")
            print(len(Y1))
            print("length of y2 is : {}".format(len(Y2)))
            train_dflist_1, train_dflist_2, train_dflist_3, train_dflist_4 = [
                frame.lyrics[:].tolist() for frame in class_frames
            ]
            print("length of train_dflist_1 is : {}".format(len(train_dflist_1)))

            lemmatized_words_1, lemmatized_words_2, lemmatized_words_3, lemmatized_words_4 = [
                df_to_lemmatized_words(lyrics)
                for lyrics in (train_dflist_1, train_dflist_2, train_dflist_3, train_dflist_4)
            ]
            print("length of lemmatized_words_1 is : {}".format(len(lemmatized_words_1)))
            lemmatized_sets = (lemmatized_words_1, lemmatized_words_2, lemmatized_words_3, lemmatized_words_4)

            #turn the 2d word list to dict
            dict_words_1, dict_words_2, dict_words_3, dict_words_4 = [
                words_2d_to_dict(words) for words in lemmatized_sets
            ]
            print("length of dict_words_1 is : {}".format(len(dict_words_1)))
            dictionaries = (dict_words_1, dict_words_2, dict_words_3, dict_words_4)

            #there's 2 input for lDA: dictinary and the corpus
            #create bag of words
            corpus_1, corpus_2, corpus_3, corpus_4 = [
                [vocab.doc2bow(simple_preprocess(" ".join(line))) for line in words]
                for vocab, words in zip(dictionaries, lemmatized_sets)
            ]
            print("length of corpus_1 is : {}".format(len(corpus_1)))

            lda_model_1, lda_model_2, lda_model_3, lda_model_4 = [
                build_LDA_model(vocab, bow)
                for vocab, bow in zip(dictionaries, (corpus_1, corpus_2, corpus_3, corpus_4))
            ]
            print("---------------------start of topic # {}".format(i), "-------------------", file = f)
            # Optional diagnostics per class:
            # print_coherence_perplexity(lda_model_k, lemmatized_words_k, dict_words_k, corpus_k)

            gen_df_mid_1 = generate_data(lda_model_1, corpus_1)
            print("length of gen_df_mid_1 is : {}".format(len(gen_df_mid_1)))
            gen_df_mid_2, gen_df_mid_3, gen_df_mid_4 = [
                generate_data(lda, bow)
                for lda, bow in zip((lda_model_2, lda_model_3, lda_model_4), (corpus_2, corpus_3, corpus_4))
            ]
            feature_sets = (gen_df_mid_1, gen_df_mid_2, gen_df_mid_3, gen_df_mid_4)
            label_sets = (Y1, Y2, Y3, Y4)
            for class_frame, features in zip(class_frames, feature_sets):
                plot_topics_per_class(class_frame, features)

            # k-fold per class, with a fresh classifier for every class
            print("random forest classificaton:")
            rf_score_sum = sum(
                kfold_classification(features, labels, RandomForestClassifier(), zscore_normalization)
                for features, labels in zip(feature_sets, label_sets)
            )
            rf_score_avg = rf_score_sum / 4
            print("topics: {}, RF score average of 4 models: {}".format(i, rf_score_avg), file = f)

            print("complement naive bayes classifier:")
            cnb_score_sum = sum(
                kfold_classification(features, labels, ComplementNB(alpha= 0.1))
                for features, labels in zip(feature_sets, label_sets)
            )
            cnb_score_avg = cnb_score_sum / 4
            print("Complement NB score average of 4 models: {}".format(cnb_score_avg), file = f)
            print("---------------------end of topic # {}".format(i), "-------------------", file = f)
            f.flush()

                  song         artist  valence  arousal  \
0              Changes  Faul & Wad Ad    0.255    0.704   
1  Coat of Many Colors   Dolly Parton    0.681    0.257   

                                              lyrics  y  
0  baby i don't know just why i love you so maybe... -1  
1  back through the years i go wondering once aga... -1  
df_1.y.unique() is  [-1  1]
length of y1 is :
642
length of y2 is : 642
length of train_dflist_1 is : 642
length of lemmatized_words_1 is : 642
length of dict_words_1 is : 6925
length of corpus_1 is : 642
length of topics_features is : 642
length of gen_df_mid_1 is : 642
length of topics_features is : 642
length of topics_features is : 642
length of topics_features is : 642
[  4   6   8  10  11  12  13  16  21  22  23  26  27  28  29  32  35  36
  40  42  44  45  46  47  49  50  51  58  61  62  63  64  65  69  70  71
  74  76  78  82  83  86  88  95  96  99 102 105 107 108 109 110 111 113
 115 116 118 120 121 123 125 128 129 131 134 136 141



the average of 10 rounds is 0.4875
the average of 10 rounds is 0.5654086538461538




the average of 10 rounds is 0.48596153846153844
complement naive bayes classifier:
the average of 10 rounds is 0.5450721153846153
the average of 10 rounds is 0.5187980769230769
the average of 10 rounds is 0.5635817307692308
the average of 10 rounds is 0.48901442307692305
     song  artist  valence  arousal  \
0     You   Brika    0.389    0.431   
1  Sexual  NEIKED    0.809    0.569   

                                              lyrics  y  
0  here we go again classic independent woman tal... -1  
1  you got that thing that i been looking for bee...  1  
df_1.y.unique() is  [-1  1]
length of y1 is :
642
length of y2 is : 642
length of train_dflist_1 is : 642
length of lemmatized_words_1 is : 642
length of dict_words_1 is : 7093
length of corpus_1 is : 642
length of topics_features is : 642
length of gen_df_mid_1 is : 642
length of topics_features is : 642
length of topics_features is : 642
length of topics_features is : 642
[  1   2   3   6   8  11  13  14  15  18  19  20  21  22  2



the average of 10 rounds is 0.5451682692307692
the average of 10 rounds is 0.5404807692307692




the average of 10 rounds is 0.5326442307692307
complement naive bayes classifier:
the average of 10 rounds is 0.537451923076923
the average of 10 rounds is 0.47814903846153844
the average of 10 rounds is 0.5311538461538461
the average of 10 rounds is 0.5250240384615384
                  song       artist  valence  arousal  \
0  You've Got a Friend  Carole King    0.458    0.263   
1               Moscow       Jaykae    0.167    0.648   

                                              lyrics  y  
0  when you're down and troubled and you need som... -1  
1  you don't do that what you talkin' shit for yo... -1  
df_1.y.unique() is  [-1  1]
length of y1 is :
642
length of y2 is : 642
length of train_dflist_1 is : 642
length of lemmatized_words_1 is : 642
length of dict_words_1 is : 6998
length of corpus_1 is : 642
length of topics_features is : 642
length of gen_df_mid_1 is : 642
length of topics_features is : 642
length of topics_features is : 642
length of topics_features is : 642
[  2   



the average of 10 rounds is 0.5670673076923076
the average of 10 rounds is 0.5798076923076924




the average of 10 rounds is 0.5201923076923076
complement naive bayes classifier:
the average of 10 rounds is 0.5203846153846154
the average of 10 rounds is 0.5575240384615385
the average of 10 rounds is 0.506201923076923
the average of 10 rounds is 0.4843509615384615
           song      artist  valence  arousal  \
0          Mine       Bazzi    0.717    0.789   
1  Sinking Ship  Wild Child    0.181    0.174   

                                              lyrics  y  
0  huh yeah naw i just had a lil' bit too much of...  1  
1  feel like wasting time feel like wasting time ... -1  
df_1.y.unique() is  [ 1 -1]
length of y1 is :
642
length of y2 is : 642
length of train_dflist_1 is : 642
length of lemmatized_words_1 is : 642
length of dict_words_1 is : 7101
length of corpus_1 is : 642
length of topics_features is : 642
length of gen_df_mid_1 is : 642
length of topics_features is : 642
length of topics_features is : 642
length of topics_features is : 642
[  0   2   3   5   7   8  10  15



the average of 10 rounds is 0.5294711538461538
the average of 10 rounds is 0.5233413461538461




the average of 10 rounds is 0.5233413461538461
complement naive bayes classifier:
the average of 10 rounds is 0.5220192307692308
the average of 10 rounds is 0.5450721153846153
the average of 10 rounds is 0.5545913461538461
the average of 10 rounds is 0.5390625
               song        artist  valence  arousal  \
0  Missing My Idols  Trippie Redd    0.528    0.637   
1       High Enough  Damn Yankees    0.232    0.537   

                                              lyrics  y  
0  uh take a vivid picture no photos implement th...  1  
1  i don't wanna hear about it anymore it's a sha... -1  
df_1.y.unique() is  [ 1 -1]
length of y1 is :
642
length of y2 is : 642
length of train_dflist_1 is : 642
length of lemmatized_words_1 is : 642
length of dict_words_1 is : 7071
length of corpus_1 is : 642
length of topics_features is : 642
length of gen_df_mid_1 is : 642
length of topics_features is : 642
length of topics_features is : 642
length of topics_features is : 642
[  0   2   3   5   7  



the average of 10 rounds is 0.525
the average of 10 rounds is 0.5639903846153846




the average of 10 rounds is 0.5281009615384615
complement naive bayes classifier:
the average of 10 rounds is 0.5295913461538462
the average of 10 rounds is 0.5154567307692307
the average of 10 rounds is 0.573076923076923
the average of 10 rounds is 0.4813221153846154
      song        artist  valence  arousal  \
0   Recess  Golden Coast    0.658    0.832   
1  Tip Toe  Jason Derulo    0.620    0.709   

                                              lyrics  y  
0  i can hear the sandbox call my name wanna feel...  1  
1  derulo whine fa me darlin' way you move ya spi...  1  
df_1.y.unique() is  [ 1 -1]
length of y1 is :
642
length of y2 is : 642
length of train_dflist_1 is : 642
length of lemmatized_words_1 is : 642
length of dict_words_1 is : 6753
length of corpus_1 is : 642
length of topics_features is : 642
length of gen_df_mid_1 is : 642
length of topics_features is : 642
length of topics_features is : 642
length of topics_features is : 642
[  0   1   2   4   6   7   9  10  14  16 



the average of 10 rounds is 0.4860096153846154
the average of 10 rounds is 0.5185336538461538




the average of 10 rounds is 0.5997355769230769
the average of 10 rounds is 0.5748076923076922
complement naive bayes classifier:
the average of 10 rounds is 0.5483173076923077
the average of 10 rounds is 0.5014423076923077
the average of 10 rounds is 0.5637259615384616
the average of 10 rounds is 0.5203125
       song        artist   valence   arousal  \
0  The Road   Tenacious D  1.178325  1.183311   
1  The Heat  Toni Braxton  0.815393  0.662457   

                                              lyrics found_song  found_artist  \
0  the road is fuckin' hard the road is fuckin' t...   The Road   Tenacious D   
1  where's the sand i'm set my body thumping goin...   The Heat  Toni Braxton   

   y  
0  1  
1  1  
df_1.y.unique() is  [ 1 -1]
length of y1 is :
2114
length of y2 is : 2114
length of train_dflist_1 is : 2114
length of lemmatized_words_1 is : 2114
length of dict_words_1 is : 11698
length of corpus_1 is : 2114
length of topics_features is : 2114
length of gen_df_mid_1 is : 2114



the average of 10 rounds is 0.49857372797996957




the average of 10 rounds is 0.48912188142716617




the average of 10 rounds is 0.5303049271215237




the average of 10 rounds is 0.5156420459626219
complement naive bayes classifier:
the average of 10 rounds is 0.5165474380756506
the average of 10 rounds is 0.5203433783421264
the average of 10 rounds is 0.5028681927926317
the average of 10 rounds is 0.5150898685504784
               song       artist   valence   arousal  \
0  If I Had A Heart    Fever Ray  0.032224 -0.512921   
1    Beautiful Mess  Diamond Rio -1.935250 -0.655810   

                                              lyrics        found_song  \
0  this will never end 'cause i want more more gi...  If I Had a Heart   
1  going out of my mind these days like i'm walki...    Beautiful Mess   

  found_artist  y  
0    Fever Ray -1  
1  Diamond Rio -1  
df_1.y.unique() is  [-1  1]
length of y1 is :
2114
length of y2 is : 2114
length of train_dflist_1 is : 2114
length of lemmatized_words_1 is : 2114
length of dict_words_1 is : 11204
length of corpus_1 is : 2114
length of topics_features is : 2114
length of gen_df_mid_1 is : 211



the average of 10 rounds is 0.5155883930966646




the average of 10 rounds is 0.5269404453187874




the average of 10 rounds is 0.532191719574354




the average of 10 rounds is 0.47964097290530266
complement naive bayes classifier:
the average of 10 rounds is 0.5288875972458195
the average of 10 rounds is 0.5444961101672181
the average of 10 rounds is 0.5401859966019852
the average of 10 rounds is 0.5208128409192525
         song           artist   valence   arousal  \
0  Square Biz      Teena Marie  0.815393  0.662457   
1     Blister  Jimmy Eat World  0.979122  0.588708   

                                              lyrics  found_song  \
0  hee everybody get up   flash back who's that d...  Square Biz   
1  take advantage of these times you said you let...     Blister   

      found_artist  y  
0      Teena Marie  1  
1  Jimmy Eat World  1  
df_1.y.unique() is  [ 1 -1]
length of y1 is :
2114
length of y2 is : 2114
length of train_dflist_1 is : 2114
length of lemmatized_words_1 is : 2114
length of dict_words_1 is : 11148
length of corpus_1 is : 2114
length of topics_features is : 2114
length of gen_df_mid_1 is : 2114
length of



the average of 10 rounds is 0.5193642135384067




the average of 10 rounds is 0.48625815970669767




the average of 10 rounds is 0.5397277117052669




the average of 10 rounds is 0.512295448448538
complement naive bayes classifier:
the average of 10 rounds is 0.5643364928909953
the average of 10 rounds is 0.5269493874631136
the average of 10 rounds is 0.5501229544844853
the average of 10 rounds is 0.5297818116784405
                          song        artist   valence   arousal  \
0                       No One  Marc Anthony  1.071901  0.846830   
1  We Are Nowhere and It's Now   Bright Eyes -0.538098 -0.930065   

                                              lyrics  \
0  i whisper words about you endlessly mostly to ...   
1  if you hate the taste of wine why do you drink...   

                    found_song  found_artist  y  
0                       No One  Marc Anthony  1  
1  We Are Nowhere and It's Now   Bright Eyes -1  
df_1.y.unique() is  [ 1 -1]
length of y1 is :
2114
length of y2 is : 2114
length of train_dflist_1 is : 2114
length of lemmatized_words_1 is : 2114


KeyboardInterrupt: 

In [None]:


#read data
# For every dataset: build one LDA feature table per class (1..4) with six
# topics, then sweep the KNN neighbour count over 2..19 and write the 10-fold
# accuracy, averaged over the four classes, to the dataset's result file.
# The numbered names (df_1, Y1, corpus_1, ...) are kept because later cells
# read them.
data_file = [const.CLEAN_UNEVEN_SPOTIFY, const.CLEAN_UNEVEN_DEEZER, const.CLEAN_SPOTIFY, const.CLEAN_DEEZER]
for every in data_file:
    df_full = pd.read_csv(every)
    print(df_full.shape)
    # reset_index returns a new frame, so its result has to be kept; the bare
    # df_k.reset_index() calls used before had no effect.
    df_1, df_2, df_3, df_4 = [
        utils.get_class_based_data(df_full, class_id, limit_size = True).reset_index(drop = True)
        for class_id in (1, 2, 3, 4)
    ]
    class_frames = (df_1, df_2, df_3, df_4)
    TRAIN_SAMPLE = int(len(df_1))
    Y1, Y2, Y3, Y4 = [frame.iloc[:TRAIN_SAMPLE, -1] for frame in class_frames]
    train_dflist_1, train_dflist_2, train_dflist_3, train_dflist_4 = [
        frame.lyrics[:TRAIN_SAMPLE].tolist() for frame in class_frames
    ]

    lemmatized_words_1, lemmatized_words_2, lemmatized_words_3, lemmatized_words_4 = [
        df_to_lemmatized_words(lyrics)
        for lyrics in (train_dflist_1, train_dflist_2, train_dflist_3, train_dflist_4)
    ]
    lemmatized_sets = (lemmatized_words_1, lemmatized_words_2, lemmatized_words_3, lemmatized_words_4)

    #turn the 2d word list to dict
    dict_words_1, dict_words_2, dict_words_3, dict_words_4 = [
        words_2d_to_dict(words) for words in lemmatized_sets
    ]
    dictionaries = (dict_words_1, dict_words_2, dict_words_3, dict_words_4)

    #there's 2 input for lDA: dictinary and the corpus
    #create bag of words
    corpus_1, corpus_2, corpus_3, corpus_4 = [
        [vocab.doc2bow(simple_preprocess(" ".join(line))) for line in words]
        for vocab, words in zip(dictionaries, lemmatized_sets)
    ]
    corpora_by_class = (corpus_1, corpus_2, corpus_3, corpus_4)

    name = str(every).split('/')[-1].split('.')[0]
    #highest score is gotten when number of topics is 6.
    NUM_TOPICS = 6  # module-level global read by build_LDA_model

    # The LDA models and their feature tables depend only on the corpus,
    # NUM_TOPICS and the fixed seed inside build_LDA_model - not on the
    # neighbour count - so they are built once instead of once per KNN setting.
    lda_model_1, lda_model_2, lda_model_3, lda_model_4 = [
        build_LDA_model(vocab, bow) for vocab, bow in zip(dictionaries, corpora_by_class)
    ]
    gen_df_mid_1, gen_df_mid_2, gen_df_mid_3, gen_df_mid_4 = [
        generate_data(lda, bow)
        for lda, bow in zip((lda_model_1, lda_model_2, lda_model_3, lda_model_4), corpora_by_class)
    ]
    feature_sets = (gen_df_mid_1, gen_df_mid_2, gen_df_mid_3, gen_df_mid_4)
    label_sets = (Y1, Y2, Y3, Y4)

    # The handle is released even if the sweep is interrupted.
    with open('knn_'+ name +'count.txt', 'a') as f:
        for i in range(2, 20):
            n_neighbors = i
            # NOTE(review): the log lines say "topic #" / "topics:" but the
            # number written is the neighbour count; the wording is kept so
            # existing result files stay comparable.
            print("---------------------start of topic # {}".format(i), "-------------------", file = f)

            # k-fold per class, with a fresh classifier for every class
            print("KNN classificaton:")
            knn_score_sum = sum(
                kfold_classification(features, labels, KNeighborsClassifier(n_neighbors), zscore_normalization)
                for features, labels in zip(feature_sets, label_sets)
            )
            knn_score_avg = knn_score_sum / 4
            print("topics: {}, KNN score average of 4 models: {}".format(i, knn_score_avg), file = f)

In [None]:
# Smoke test for the per-dataset output naming: appends a marker string to
# test_<dataset name>.txt for every dataset path.
data_file = [const.CLEAN_UNEVEN_SPOTIFY, const.CLEAN_UNEVEN_DEEZER, const.CLEAN_SPOTIFY, const.CLEAN_DEEZER]
for every in data_file:
    name = str(every).split('/')[-1].split('.')[0]
    # the context manager closes the handle even if the write fails
    with open('test_'+ name + '.txt', 'a') as f:
        f.write('this is my name '+ name)

Several parameters to adjust:

the number of topics;

alpha and beta (named `eta` in gensim): hyperparameters that control the sparsity of the document-topic and topic-word distributions, respectively. Both default to 1.0/num_topics.

chunksize: the number of documents to be used in each training chunk

update_every: how often the model parameters should be updated

passes: the total number of training passes

In [None]:
# Create the TF-IDF model
# For each class: TF-IDF re-weight the bag-of-words corpus, then fit an LDA
# model on the weighted corpus. Relies on dict_words_k / corpus_k left behind
# by an earlier cell and on the module-level NUM_TOPICS.
tfidf_runs = []
for vocab, bow in zip(
    (dict_words_1, dict_words_2, dict_words_3, dict_words_4),
    (corpus_1, corpus_2, corpus_3, corpus_4),
):
    weighted = build_tfidf_model(bow)
    tfidf_runs.append((weighted, build_LDA_model(vocab, weighted)))
(
    (corpus_tfidf_1, lda_tfidf_model_1),
    (corpus_tfidf_2, lda_tfidf_model_2),
    (corpus_tfidf_3, lda_tfidf_model_3),
    (corpus_tfidf_4, lda_tfidf_model_4),
) = tfidf_runs

In [None]:
# Sweep the LDA topic count over 3..7 on the TF-IDF corpora. For every count:
#   1. fit one LDA model per dataset,
#   2. call print_coherence_perplexity for each model,
#   3. build one feature table per model with generate_data,
#   4. score a Random Forest and a Complement Naive Bayes classifier on each
#      table with kfold_classification,
#   5. append the score averaged over the four datasets to the results file.
# The results file is managed by a `with` block so the handle is released even
# if a training or scoring step raises part-way through the sweep.
with open('uneven_311_spotify_tfidf.txt', 'a') as f:
    for i in range(3, 8):
        # NOTE(review): build_LDA_model is not given a topic count, so it
        # presumably reads this global — confirm against its definition.
        NUM_TOPICS = i
        lda_model_1 = build_LDA_model(dict_words_1, corpus_tfidf_1)
        lda_model_2 = build_LDA_model(dict_words_2, corpus_tfidf_2)
        lda_model_3 = build_LDA_model(dict_words_3, corpus_tfidf_3)
        lda_model_4 = build_LDA_model(dict_words_4, corpus_tfidf_4)

        print("---------------------start of topic # {}".format(i), "-------------------", file = f)
        # NOTE(review): no file handle is passed to these calls, so their
        # output presumably goes to stdout rather than the results file.
        print_coherence_perplexity(lda_model_1, lemmatized_words_1, dict_words_1, corpus_tfidf_1)
        print_coherence_perplexity(lda_model_2, lemmatized_words_2, dict_words_2, corpus_tfidf_2)
        print_coherence_perplexity(lda_model_3, lemmatized_words_3, dict_words_3, corpus_tfidf_3)
        print_coherence_perplexity(lda_model_4, lemmatized_words_4, dict_words_4, corpus_tfidf_4)

        # The numbered names are kept as notebook globals because later cells
        # reference lda_model_1..4 directly.
        gen_df_mid_1 = generate_data(lda_model_1, corpus_tfidf_1)
        gen_df_mid_2 = generate_data(lda_model_2, corpus_tfidf_2)
        gen_df_mid_3 = generate_data(lda_model_3, corpus_tfidf_3)
        gen_df_mid_4 = generate_data(lda_model_4, corpus_tfidf_4)

        # (features, labels) pairs, in the order the classifiers are scored.
        scoring_inputs = [
            (gen_df_mid_1, Y1),
            (gen_df_mid_2, Y2),
            (gen_df_mid_3, Y3),
            (gen_df_mid_4, Y4),
        ]

        # Random Forest: a fresh classifier per dataset, scored with the
        # zscore_normalization argument.
        print("random forest classificaton:")
        rf_score_sum = 0
        for mid_features, mid_labels in scoring_inputs:
            rf_clf = RandomForestClassifier()
            rf_score_sum = rf_score_sum + kfold_classification(mid_features, mid_labels, rf_clf, zscore_normalization)
        rf_score_avg = rf_score_sum / len(scoring_inputs)
        print("topics: {}, RF score average of 4 models: {}".format(i, rf_score_avg), file = f)

        # Complement Naive Bayes: scored without the normalization argument.
        print("complement naive bayes classifier:")
        cnb_score_sum = 0
        for mid_features, mid_labels in scoring_inputs:
            cnb_clf = ComplementNB(alpha= 0.1)
            cnb_score_sum = cnb_score_sum + kfold_classification(mid_features, mid_labels, cnb_clf)
        cnb_score_avg = cnb_score_sum / len(scoring_inputs)
        print("Complement NB score average of 4 models: {}".format(cnb_score_avg), file = f)
        print("---------------------end of topic # {}".format(i), "-------------------", file = f)
        # Flush after each topic count so finished results reach disk while
        # the remaining iterations are still running.
        f.flush()

In [None]:
#visualize the topics
# Switch pyLDAvis to notebook rendering, then prepare the visualization for
# dataset 1; the trailing bare `vis` makes the cell display it.
# NOTE(review): the topic-sweep cell fits lda_model_1 on corpus_tfidf_1 and
# leaves the model from its last iteration bound to this name, while the
# corpus passed here is corpus_1 — confirm this pairing is intended.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model_1, corpus_1, dict_words_1)
vis

In [None]:
# Prepare and display the pyLDAvis visualization for dataset 2 (rebinds `vis`).
vis = pyLDAvis.gensim.prepare(lda_model_2, corpus_2, dict_words_2)
vis

In [None]:
# # Show the Word Weights in Corpus
# num_row = 0
# for doc in corpus:#every row in corpus: id of every word in the doc and the freq of that word in the doc
#     num_row += 1
#     print([[dict_words[id], freq] for id, freq in doc])
#     print("\n{}".format(num_row),"---------------------------------")

#Gensim creates a unique id for each word in the document. The produced corpus shown above is a mapping of (word_id, word_frequency).

In [None]:
# Prepare and display the pyLDAvis visualization for dataset 3 (rebinds `vis`).
vis = pyLDAvis.gensim.prepare(lda_model_3, corpus_3, dict_words_3)
vis

In [None]:
# Prepare the pyLDAvis visualization for dataset 4 (rebinds `vis`).
# NOTE(review): if the statements after `vis` belong to this same cell, the
# bare `vis` is not the cell's last expression and will not be rendered.
vis = pyLDAvis.gensim.prepare(lda_model_4, corpus_4, dict_words_4)
vis

# Save the dataset-1 LDA model under the relative file name "model_1".
model_1 = "model_1"

lda_model_1.save(model_1)

# print(corpus[0])
# for item in lda_model[corpus[0]]:
#     print("\n", item)

In [None]:
# model_2 = "model_2"
# model_3 = "model_3"
# model_4 = "model_4"

# lda_model_2.save(model_2)
# lda_model_3.save(model_3)
# lda_model_4.save(model_4)

from gensim.test.utils import datapath

#save the model to disk
# NOTE(review): gensim documents datapath() as returning a path inside
# gensim's own test-data directory, so the model is written into the installed
# package rather than the project tree — confirm this location is intended.
pretrained_model_1 = datapath("pretrained_model_1")
lda_model_1.save(pretrained_model_1)

#input new document to the LDA model, and see the score of classification

In [None]:
#view the topics in LDA model: the keyword in the num_topics topics
# from pprint import pprint
# pprint(lda_model.print_topics())
# doc_lda = lda_model[corpus]

#load the pretrained model from disk
# Rebinds lda_model_1 to the model read back from the path stored in
# pretrained_model_1.
lda_model_1 = LdaModel.load(pretrained_model_1)

In [None]:
#evaluate the model with tf-idf
# Runs print_coherence_perplexity (defined outside this view) on the dataset-1
# LDA model that was fitted on the TF-IDF corpus.
print_coherence_perplexity(lda_tfidf_model_1, lemmatized_words_1, dict_words_1, corpus_tfidf_1)

In [None]:
# Build the feature table for dataset 1 from the TF-IDF LDA model and classify
# it: once through randomForest_LDA, then with k-fold scoring for a Random
# Forest and for a Complement Naive Bayes classifier.
gen_df_tfidf_1 = generate_data(lda_tfidf_model_1, corpus_tfidf_1)
randomForest_LDA(gen_df_tfidf_1, df_1.iloc[:TRAIN_SAMPLE, -1])
print("Classify with Random Forest:")
# The class name must match the sklearn import exactly; a misspelling raises
# NameError and aborts the cell before any scoring happens.
rf_clf = RandomForestClassifier()
kfold_classification(gen_df_tfidf_1, Y1, rf_clf)
print("Classify with Complement Naive Bayes classifier:")
cnb_clf = ComplementNB(alpha = 0.1)
kfold_classification(gen_df_tfidf_1, Y1, cnb_clf)

## As we can see, after using the TF-IDF corpus as the model input, the coherence score increased by 0.01, while the classification score decreased by 0.15.

In [None]:
from sklearn.linear_model import LassoCV
#it's said the best model will be chosen by cross-validation


## Improve upon this model by using Mallet's version of the LDA algorithm, which often gives better-quality topics than Gensim's built-in version of the LDA algorithm.

In [None]:
#use mallet's implementation of LDA
# NOTE(review): NUM_TOPICS is whatever value the most recent cell left in the
# kernel; set it explicitly before running this cell on a fresh kernel.
mallet_path = "../mallet-2.0.8/bin/mallet"
ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus_1, num_topics=NUM_TOPICS, id2word=dict_words_1)
# evaluate the mallet model: Compute Coherence Score
coherence_model = CoherenceModel(model = ldamallet, texts = lemmatized_words_1, dictionary = dict_words_1, coherence = 'c_v')
coherence_lda = coherence_model.get_coherence()
print('\nCoherence Score: ', coherence_lda)

#classify the dataset using output of mallet's LDA model
# The LdaMallet wrapper has no get_document_topics method, so passing it to
# generate_data raises AttributeError. Convert it to a native gensim LdaModel
# first; the converted model exposes the method generate_data needs.
ldamallet_as_lda = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(ldamallet)
gen_df_mallet_1 = generate_data(ldamallet_as_lda, corpus_1)
randomForest_LDA(gen_df_mallet_1, df_1.iloc[:TRAIN_SAMPLE, -1])
print("Classfy using data generated by Mallet's LDA with Random Forest Classifier:")

In [None]:
#try tf-idf on mallet
# NOTE(review): gensim's Mallet wrapper serializes each document by repeating
# every token int(weight) times. If the TF-IDF weights in corpus_tfidf_1 are
# below 1 they truncate to 0 and Mallet would receive empty documents — verify
# the weights before trusting this model's coherence score.
ldamallet_tfidf = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus_tfidf_1, num_topics=NUM_TOPICS, id2word=dict_words_1)
# evaluate the mallet model: Compute Coherence Score
coherence_model = CoherenceModel(model = ldamallet_tfidf, texts = lemmatized_words_1, dictionary = dict_words_1, coherence = 'c_v')
coherence_lda = coherence_model.get_coherence()
print('\nCoherence Score: ', coherence_lda)

How to interpret pyLDAvis's output?

Each bubble represents a topic. The larger the bubble, the more prevalent is that topic.

A good topic model will have fairly big, non-overlapping bubbles scattered throughout the chart instead of being clustered in one quadrant.

A model with too many topics will typically have many overlapping, small bubbles clustered in one region of the chart.

So for this model, it may be better to use fewer topics.

You could determine the number of topics using prior knowledge of the number of natural topics in the document

In [None]:
#We will focus on how to arrive at the optimal number of topics given any large corpus of text
#TODO: I really want to know what can gridsearch do in finding the optimal number of topics

In [None]:
# Show the TF-IDF weights
# For every document in the TF-IDF view of the corpus, print a list of
# [word, weight] pairs with each weight rounded to two decimals.
for doc in tfidf[corpus]:
    print([
        [dict_words[token_id], np.around(weight, decimals=2)]
        for token_id, weight in doc
    ])