In [None]:
%pylab inline
import numpy 
import matplotlib.pyplot as plt
import sklearn

# Import all of the scikit learn stuff
from __future__ import print_function
from sklearn.decomposition import TruncatedSVD, PCA, NMF
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.preprocessing import Normalizer
from sklearn import metrics
from sklearn.cluster import KMeans, MiniBatchKMeans
import pandas as pd

# Others
from more_itertools import flatten
from sklearn.feature_extraction import text 
import plotly.plotly as py
import plotly.graph_objs as go

#use this format for working locally
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot, plot_mpl
# Initialise plotly's notebook mode so the offline `iplot` call further down can render in the notebook.
init_notebook_mode(connected=True)

In [None]:
# Setting up data
df = pd.read_csv('../Desktop/lyrics.csv', index_col=0)

# Songs without annotations get an empty string. The result is assigned back to
# the column instead of calling fillna(inplace=True) on an attribute-accessed
# Series, which is not guaranteed to write through to the frame.
df['annotations'] = df['annotations'].fillna('')

# NOTE(review): the two texts are joined without a separator, so the last lyric
# word and the first annotation word end up fused - confirm this is intended.
df['lyrics_anno'] = df.lyrics + df.annotations

# Every lower-cased word of every artist name is treated as a stop word so that
# artist names cannot dominate a topic. Rows with a missing artist are skipped.
artistname_stopwords = [
    word.lower()
    for artist in df.artist.dropna()
    for word in artist.split()
]

# Corpus-specific filler words. The comma between 'lil' and 'uh' matters:
# without it Python concatenates the adjacent literals into the single entry
# 'liluh' and neither word is filtered.
my_additional_stop_words = [
    'like', 'yeah', 'im', 'dont', 'just', 'got', 'verse', 'chorus', 'know',
    'lil', 'uh', 'ive', 'song', 'line', 'youre', 'hes', 'people', 'track',
    'drakes', 'niggas', 'shit', 'thats',
] + artistname_stopwords

# Final stop-word set: scikit-learn's built-in English list plus the extras above.
stop_words = text.ENGLISH_STOP_WORDS.union(my_additional_stop_words)



In [None]:
# scikit-learn >= 1.2 validates `stop_words` and accepts only 'english', a list
# or None, so the set built in the setup cell is converted to a list here.
# Older releases accept a list as well.
vectorizer = TfidfVectorizer(min_df=1, max_df=.5, stop_words=list(stop_words))

# NOTE(review): the matrix is built from `annotations` only, not from the
# combined `lyrics_anno` column created during setup - confirm this is intended.
dtm = vectorizer.fit_transform(df.annotations)

In [None]:
# Factorise the TF-IDF matrix into N_TOPICS non-negative topics.
# The model is an NMF despite the `lsa` name, which is kept because later
# cells refer to it.
# random_state is fixed so that a fresh run yields the same components in the
# same order; the topic meanings attached to component indices later in the
# notebook rely on that order. NOTE(review): re-check those meanings against
# the printed top words after the first seeded run.
N_TOPICS = 5
lsa = NMF(n_components=N_TOPICS, random_state=0)
dtm_lsa = lsa.fit_transform(dtm)
#dtm_lsa = Normalizer(copy=False).fit_transform(dtm_lsa)

In [None]:
# Create the `topic` column as missing values; it is filled in by get_topic().
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
df['topic'] = np.nan

In [None]:
# Preview the first rows, including the freshly added `topic` column.
df.head()

In [None]:
def get_topic(dtm_lsa, df):
    """Label every row of ``df`` with its dominant topic, in place.

    For each row of ``dtm_lsa`` the column holding the largest weight is
    located and its 1-based position is stored in ``df['topic']``
    (column 0 -> topic 1, column 1 -> topic 2, and so on).

    Topic meanings noted by the author for the five-topic model:
    1 = sex, drugs, rap; 2 = feel good love; 3 = spanish;
    4 = loved and lost; 5 = in a relationship.

    Parameters
    ----------
    dtm_lsa : array-like, shape (n_rows, n_topics)
        Per-row topic weights. Row ``i`` describes the ``i``-th row of ``df``.
    df : pandas.DataFrame
        Frame to annotate. Its ``topic`` column is created or overwritten.

    Returns
    -------
    None
        ``df`` is modified in place.

    Raises
    ------
    ValueError
        If ``dtm_lsa`` and ``df`` do not have the same number of rows.

    Notes
    -----
    Rows are matched by position, not by index label, so the function is
    correct for any index of ``df`` and never appends rows. The previous
    implementation passed the positional counter to ``DataFrame.set_value``
    (removed in pandas 1.0) as if it were a label.

    A row whose weights are all equal (for example all zeros) receives
    topic 1, because ``argmax`` returns the first maximum.

    With more than five weight columns, each column gets its own topic id
    instead of everything from the fifth column on being collapsed into 5.
    """
    weights = np.asarray(dtm_lsa)

    if len(weights) != len(df):
        raise ValueError(
            "dtm_lsa has %d rows but df has %d" % (len(weights), len(df))
        )

    # Nothing to label; leave the frame untouched.
    if len(weights) == 0:
        return

    # Assigning a plain array to a column is positional, so index labels
    # play no role here.
    df['topic'] = weights.argmax(axis=1) + 1


In [None]:
# Label the songs first and export afterwards. With the export ahead of the
# get_topic() call, a top-to-bottom run wrote a CSV whose `topic` column was
# still entirely missing.
get_topic(dtm_lsa, df)
df.to_csv('../Desktop/final_final_df.csv')

In [None]:
# Number of rows assigned to each topic id.
df.topic.value_counts()

In [None]:
# Mean of the index values of the rows in each topic, printed for topics 1 to 5
# in order. NOTE(review): what the index represents is not visible here - confirm
# that its mean is a meaningful quantity.
for topic_id in (1, 2, 3, 4, 5):
    rows_in_topic = df[df.topic == topic_id]
    print(mean(rows_in_topic.index))

In [None]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    """Print the highest-weighted features of every component of ``model``.

    Parameters
    ----------
    model : object
        Anything exposing ``components_`` as an iterable of weight arrays,
        one array per topic.
    feature_names : sequence of str
        Feature name for each position of a weight array.
    no_top_words : int
        Number of features to list per topic, highest weight first.
    topic_names : sequence of str, optional
        Heading to use for each topic. When omitted, or when the entry for a
        topic is empty, the topic's position is printed instead.
    """
    for ix, topic in enumerate(model.components_):
        label = topic_names[ix] if topic_names else None
        if label:
            print("\nTopic: '", label, "'")
        else:
            print("\nTopic ", ix)

        # argsort is ascending: reverse it, then keep the leading entries.
        top_positions = topic.argsort()[::-1][:no_top_words]
        print(", ".join(feature_names[pos] for pos in top_positions))

In [None]:
# get_feature_names() was removed from scikit-learn vectorizers in 1.2.
# Prefer get_feature_names_out() and fall back to the old method on releases
# that predate it.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Show the 20 highest-weighted terms of every topic.
display_topics(lsa, feature_names, 20)

In [None]:
# Reduce the per-song topic weights to three dimensions for the 3-D scatter plot.
pca = PCA(n_components=3)
pca.fit(dtm_lsa)

# Only the last expression of a cell is displayed automatically, so the
# variance ratios are printed explicitly; as a bare mid-cell expression they
# were computed and discarded.
print(pca.explained_variance_ratio_)

pca.components_



In [None]:
# Marker positions: every row of the topic-weight matrix projected onto the
# three fitted principal axes.
points_to_plot = pca.transform(dtm_lsa)

# The principal axes themselves, kept under their own name.
pca_components = pca.components_

In [None]:
# Scratch check: show the type of the object returned by pca.transform.
type(points_to_plot)

In [None]:
# Scratch cell: builds an empty Scatter3d trace and displays it; the result is not assigned to anything.
go.Scatter3d()

In [None]:
# One coordinate array per PCA axis (columns 0, 1 and 2 of the projection).
x = points_to_plot[:, 0]
y = points_to_plot[:, 1]
z = points_to_plot[:, 2]

# A single marker trace: one marker per row, coloured by topic id through the
# Viridis colour scale. Hovering shows only the value of the `track` column.
trace1 = go.Scatter3d(
    x=x,
    y=y,
    z=z,
    mode='markers',
    text=df.track,
    hoverinfo='text',
    marker={
        'size': 12,
        'color': df.topic,
        'colorscale': 'Viridis',
        'opacity': 0.8,
    },
)
data = [trace1]

# All four margins are zeroed so the 3-D scene fills the output area.
layout = go.Layout(
    margin={'l': 0, 'r': 0, 'b': 0, 't': 0},
    showlegend=True,
)

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='3d-scatter-colorscale')

In [None]:
# Blue is 'feel good love'
# Yellow is 'various stages in a relationship'
# Green is 'loved and lost'
# Purple is 'sex, drugs and rap'
# Aqua-green is 'espanol'


In [None]:
# Show the row(s) whose `track` value is exactly "Lucid Dreams".
df[df.track == "Lucid Dreams"]

In [None]:
# Topic-weight vector stored at position 12 of dtm_lsa.
# NOTE(review): presumably the "Lucid Dreams" row looked up just before - confirm that 12 is its position.
dtm_lsa[12]

In [None]:
# First 50 entries of the `track` column.
df.track.head(50)