# Assignment 3

\- Si Nguyen Mai, May 26, 2018 -

In [28]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Doc2Vec

%run Doc2VecHelperFunctions.ipynb

from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier


## Import my pre-processed data frame

Code for preprocessing data is in "Cleaning_Song-dataset" notebook in the "root" directory

There are 2 pre-processed file:
- 'assignment-3_big-dataset.csv': contains lyrics for all kinds of mood
- 'assignment-3_small-dataset.csv': contains lyrics for only 'happy' and 'aggressive', and column 'Index' for corresponding indices in 'assignment-3_big-dataset.csv'

All the lyrics are clean (stop words, punctuation removed)

In [2]:
big_df = pd.read_csv('assignment-3_big-dataset.csv')
print(big_df.shape)
big_df.head()

(20930, 2)


Unnamed: 0,lyrics_features,moods
0,oppa gangnam style gangnam style najeneun ttas...,"['energetic', 'motivational']"
1,late ve ve lose sleep dream thing babi ve ve p...,['happy']
2,parti rock yeah woo let s parti rock hous toni...,"['happy', 'celebratory', 'rowdy']"
3,alagamun lan weh wakun heya hanun gon alagamun...,"['happy', 'energetic', 'celebratory']"
4,j lo s new generat mr worldwid parti peopl flo...,['energetic']


In [3]:
target_df = pd.read_csv('assignment-3_small-dataset.csv')
print(target_df.shape)
target_df.head()

(3440, 3)


Unnamed: 0,index,lyrics_features,moods
0,1,late ve ve lose sleep dream thing babi ve ve p...,happy
1,2,parti rock yeah woo let s parti rock hous toni...,happy
2,3,alagamun lan weh wakun heya hanun gon alagamun...,happy
3,5,today don t feel like do just wanna lay bed do...,happy
4,9,don t know turn head walk o don t need make co...,happy


Check the *equality* in sample size between two classes

In [4]:
# The number of 'happy' lyrics:
print(len(target_df[target_df['moods'] == 'happy']))

# The number of 'aggressive' lyrics:
print(len(target_df[target_df['moods'] == 'aggressive']))

1757
1683


In [26]:
labels = np.ravel(target_df['moods'])

labels

array(['happy', 'happy', 'happy', ..., 'happy', 'aggressive', 'happy'], dtype=object)

## Preparing features input 

### Bag of words

In [5]:
count_vect = CountVectorizer()

bow = count_vect.fit_transform(target_df['lyrics_features'])

bow.shape

(3440, 22570)

### TF - IDF

In [7]:
tfidf_vect = TfidfVectorizer()

tf_idf = tfidf_vect.fit_transform(target_df['lyrics_features'])

tf_idf.shape

(3440, 22570)

### Doc2Vec

In [10]:
all_lyrics = big_df['lyrics_features'].tolist()

print(type(all_lyrics))
len(all_lyrics)

<class 'list'>


20930

In [11]:
#convert_lyrics_to_d2v(all_lyrics)

In [13]:
doc2vec_model = Doc2Vec.load('./song_lyrics.d2v')

In [27]:
doc2vec = doc2vec_model[target_df['index']]

doc2vec

array([[ 1.23539937, -1.28101766, -1.48635399, ...,  1.57250249,
         1.25255394,  1.86322331],
       [ 2.20976281, -0.12866332, -2.41090131, ..., -0.2270617 ,
         0.7468949 ,  2.48212528],
       [ 3.03357291, -0.41046563, -0.86521643, ...,  0.09651477,
        -0.23563212,  0.65808886],
       ..., 
       [ 1.61306369,  0.28757182, -2.39796233, ..., -0.78782743,
        -0.6883263 , -0.12553684],
       [ 0.43706188,  0.17893283,  0.37903407, ..., -0.66263926,
         1.11233342,  0.1141317 ],
       [ 0.80377418, -0.89266783, -0.86730605, ..., -1.48051941,
        -0.44940835,  0.12566899]], dtype=float32)

OK. <br>
So now we get 3 sets of feature data from 3 methods: `bow`, `tf_idf`, and `doc2vec`. <br>
Also, our labels array named: `labels`

## Classification

In [49]:
np.random.seed(999)

In [50]:
feat_methods = ['bow', 'tf_idf', 'doc2vec']

In [51]:
train_test_sets = {
    'bow': train_test_split(bow, labels, test_size = 0.3),
    'tf_idf': train_test_split(tf_idf, labels, test_size = 0.3),
    'doc2vec': train_test_split(doc2vec, labels, test_size = 0.3)
}
# In a key - value pair, value is a list in format of [X_train, X_test, y_train, y_test]

In [68]:
def optimizing (estimator, param_grid, train_test_list, n_jobs):
    
    output = {}
    
    grid = GridSearchCV(estimator, param_grid, refit = True, n_jobs = n_jobs)
    grid.fit(train_test_list[0], train_test_list[2]) 
    # train_test_list is a list in form of [X_train, X_test, y_train, y_test]
    
    output['estimator'] = grid.best_estimator_
    output['params'] = grid.best_params_
    output['prediction'] = grid.predict(train_test_list[1])
    
    return output

In [73]:
def estimator_searching (init_classifiers, param_grids, train_test_list, n_jobs):
    
    classifiers = {method : {} for method in init_classifiers.keys()}
    
    for method in classifiers.keys():
        
        estimator = init_classifiers [method]
        param_grid = param_grids [method]
        
        classifiers[method] = optimizing(estimator, param_grid, train_test_list, n_jobs)
    
    return classifiers

In [88]:
init_classifiers_0 = {
        'knn': KNeighborsClassifier(),
        'logreg': LogisticRegression(),
        'svm': SVC(),
        'rfc': RandomForestClassifier()
    }
    
param_grids_0 = {
    'knn': {'n_neighbors': [5, 10, 25, 50]},
    'logreg': 
    {
        'solver': ['liblinear'], 
    }, 
    'svm': 
    {
        'C': [0.1, 1, 10],
        'gamma': [1, 0.1, 0.01, 0.001]
    },
    'rfc': 
    {
        'n_estimators': [5, 10, 100],
        'min_samples_split': [2, 3, 4, 5, 10],
        'max_features': ['sqrt', 'log2', 'auto']
    }
}

In [89]:
classifiers_0 = { method : estimator_searching(init_classifiers_0, param_grids_0, 
                                               train_test_sets[method], n_jobs = 2) 
                 for method in feat_methods }

In [90]:
for key in classifiers_0.keys():
    
    print(key)
    for model in init_classifiers_0.keys():
        print(model)
        print(precision_score(train_test_sets[key][3], classifiers_0[key][model]['prediction'], average='micro'))
    
    print("\n")

bow
knn
0.643410852713
logreg
0.747093023256
svm
0.768410852713
rfc
0.815891472868


tf_idf
knn
0.763565891473
logreg
0.801356589147
svm
0.798449612403
rfc
0.813953488372


doc2vec
knn
0.561046511628
logreg
0.789728682171
svm
0.790697674419
rfc
0.772286821705


