### SYS6016 Midterm Project

#### Rohan Bapat and Jack Prominski

##### Logistic Regression

In [405]:
import gensim
from gensim.models import Doc2Vec
import gensim.models.doc2vec
from gensim.models.doc2vec import TaggedDocument, TaggedLineDocument, LabeledSentence
import pandas as pd
import numpy as np
from time import time
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt

In [2]:
# Load data

df = pd.read_csv('clean_lyrics_all.csv')
df.head()

Unnamed: 0,song,year,artist,genre,lyrics,clean_lyrics
0,ego-remix,2009,beyonce-knowles,Pop,"Oh baby, how you doing?\nYou know I'm gonna cu...",oh babi know gon na cut right chase women made...
1,then-tell-me,2009,beyonce-knowles,Pop,"playin' everything so easy,\nit's like you see...",playin everyth easi like seem sure still way d...
2,honesty,2009,beyonce-knowles,Pop,If you search\nFor tenderness\nIt isn't hard t...,search tender hard find love need live look tr...
3,you-are-my-rock,2009,beyonce-knowles,Pop,"Oh oh oh I, oh oh oh I\n[Verse 1:]\nIf I wrote...",oh oh oh oh oh oh vers wrote book stand titl b...
4,black-culture,2009,beyonce-knowles,Pop,"Party the people, the people the party it's po...",parti peopl peopl parti pop sit around see loo...


In [4]:
# Dealing with missing data -- drop obs with missing lyrics and unknown/other genre
df = df.dropna()
df = df[(df.genre != 'Other') & (df.genre != 'Not Available')]

# create label -- unique id for each song
df['label'] = df['artist'].map(str) + df['song']

In [5]:
#downsample the dataset

genres = list(set(df.genre))
subdfs = []
nobs = df.groupby('genre')['genre'].count()

for genre in genres:
    if nobs[nobs.index == genre].values[0] > 10000:
        arr = np.array(df.index[df.genre == genre])
        samp = list(np.random.choice(arr, 10000, replace=False))
        subdfs.append(df.loc[df.index.isin(samp)])
    else:
        subdfs.append(df[df.genre == genre])

subdf = pd.concat(subdfs)

#### Logit w/ TFIDF

In [7]:
subdf = subdf.reset_index()
subdf.head()

Unnamed: 0,index,song,year,artist,genre,lyrics,clean_lyrics,label
0,358,white-trash,2004,borialis,Rock,Where should I begin cripplin' all you villain...,begin cripplin villain never injur civilian in...,borialiswhite-trash
1,359,don-t-mean-a-thing,2004,borialis,Rock,"Enough of all that, let's switch up the format...",enough let switch format talk trash get bore f...,borialisdon-t-mean-a-thing
2,382,surefire,2007,brightwood,Rock,"Looking back, no turning back.\nWouldn't take ...",look back turn back would take back second uns...,brightwoodsurefire
3,383,taken,2007,brightwood,Rock,I am taken\nI am not my own\nI am floating\nTe...,taken float teach fli solomon wait sing hi son...,brightwoodtaken
4,599,red-threat,2007,fang,Rock,"Charles Manson is god, he was a crazy sod\nHe ...",charl manson god wa crazi sod went deep end ki...,fangred-threat


In [8]:
# Apply CountVectorizer
cv = CountVectorizer()
cv_fit=cv.fit_transform(subdf.clean_lyrics)

In [9]:
# Get Vocabulary
tf_matrix_word = cv.get_feature_names()

In [10]:
# Get word frequency for each word in vocabulary
tf_matrix_freq = cv_fit.sum(axis = 0)
tf_matrix_freq = np.array(tf_matrix_freq).flatten()

In [12]:
# Create df with words and frequencies
tf_df = pd.DataFrame({'word':tf_matrix_word, 'freq':tf_matrix_freq})

In [13]:
# Sort the words in tf_df by word frequency in descending order
tf_df.sort_values(by='freq',ascending=False, inplace=True)

# The index column will be used will be used to remove head and tail words
tf_df.reset_index(inplace=True)

# The level_0 column will be used to assign rank to the word frequency, words with highest frequency
# get top rank
tf_df.reset_index(inplace=True)
tf_df.rename(columns={'level_0':'rank'}, inplace = True)

In [14]:
# Get head words
# Remove 20 words with highest frequency as head words
head_word_count = 20
head_words = list(tf_df['index'].head(head_word_count))

# Get tail words
# Remove 20 words with frequency less than 5 as tail words
tail_word_freq = 5
tail_words = list(tf_df.loc[tf_df['freq']<tail_word_freq,'index'])

# Combine head and tail words
head_words.extend(tail_words)

In [15]:
# Transform BOW to TFIDF
tfidf_vectorizer = TfidfVectorizer(use_idf=True)
tfidf = tfidf_vectorizer.fit_transform(subdf.clean_lyrics)

In [16]:
# Subset for to remove head and tail words
keep_words = list(set(np.arange(tfidf.shape[1])) - set(head_words))
X = tfidf[:,keep_words]
y = subdf[['genre']].as_matrix()

In [24]:
# Create training and testing sets
X_train_tfidf, X_test_tfidf, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [365]:
X_train_tfidf.shape

(59766, 37599)

In [366]:
X_train_tfidf

<59766x37599 sparse matrix of type '<class 'numpy.float64'>'
	with 3581292 stored elements in Compressed Sparse Row format>

In [None]:
# Fit logistic regression model
t0 = time()
m_logclf = LogisticRegression(multi_class='multinomial',solver='sag', n_jobs=7)
m_logclf.fit(X_train_tfidf, y_train)
print("--- %s seconds ---" % (time() - t0))

In [368]:
# Calculate accuracy
preds = m_logclf.predict(X_test_tfidf)
y_test_resh = y_test.reshape((1, -1))
y_list = list(y_test_resh[0])
sum(np.equal(preds,np.array(y_list)))/len(preds)

0.48581180564850757

In [369]:
# Generate classification report
print(classification_report(preds, y_test))

             precision    recall  f1-score   support

    Country       0.64      0.52      0.58      2568
 Electronic       0.37      0.34      0.35      1739
       Folk       0.13      0.74      0.23        81
    Hip-Hop       0.79      0.78      0.78      1992
      Indie       0.05      0.27      0.09       119
       Jazz       0.50      0.50      0.50      1633
      Metal       0.72      0.63      0.67      2256
        Pop       0.39      0.34      0.37      2288
        R&B       0.10      0.50      0.17       139
       Rock       0.30      0.28      0.29      2127

avg / total       0.53      0.49      0.50     14942



#### Logit w/ Doc2Vec

In [203]:
# Create lists of words (x), genres (y), and labels
labels = subdf['label'].tolist()
genre = subdf['genre'].tolist()
words = subdf['clean_lyrics'].tolist()

In [319]:
# Write words out to text file, one per line
file = open('d2v_docs_2.txt', 'w')
for doc in words:
    file.write("%s\n" % doc)

In [320]:
# Generate TaggedLineDocuments to feed into Doc2Vec
docs = [doc for doc in TaggedLineDocument('d2v_docs_2.txt')]

In [None]:
# Train Doc2Vec model
t0 = time()
model = Doc2Vec(docs, size=200, window=5, min_count=5, workers=7, iter=20)
print("--- %s seconds ---" % (time() - t0))

In [407]:
# Save model
model.save('model_200.doc2vec')

In [312]:
#model = Doc2Vec.load('model_200.doc2vec')

In [409]:
# Get document vectors to pass to Logit
vectors = []
for i in range(0,74708):
    vectors.append(model.docvecs[i])
vectors = np.asarray(vectors)

In [410]:
# Verify shape
vectors.shape

(74708, 200)

In [411]:
# Create training and testing sets
X_train_d2v, X_test_d2v, y_train_d2v, y_test_d2v = train_test_split(vectors, genre, test_size=0.2, random_state=42)

In [None]:
# Fit logistic regression model
t0 = time()
logclf = LogisticRegression(multi_class='multinomial',solver='sag', n_jobs=7)
logclf.fit(X_train_d2v, y_train)
print("--- %s seconds ---" % (time() - t0))

In [413]:
# Calculate accuracy
preds = logclf.predict(X_test_d2v)
y_test_resh = y_test.reshape((1, -1))
y_list = list(y_test_resh[0])
sum(np.equal(preds,np.array(y_list)))/len(preds)

0.24153393120064248

In [414]:
# Generate classification report
print(classification_report(preds, y_test))

             precision    recall  f1-score   support

    Country       0.31      0.22      0.26      2992
 Electronic       0.03      0.11      0.04       362
       Folk       0.05      0.42      0.09        52
    Hip-Hop       0.51      0.34      0.41      2964
      Indie       0.00      0.00      0.00         5
       Jazz       0.11      0.23      0.15       807
      Metal       0.50      0.26      0.34      3913
        Pop       0.19      0.18      0.18      2087
        R&B       0.00      0.20      0.01        15
       Rock       0.17      0.19      0.18      1745

avg / total       0.35      0.24      0.28     14942

