In [None]:
import requests
import sys
import pandas as pd

# NOTE(review): machine-specific absolute path appended to the import search
# path - presumably where genius_credentials.py lives; confirm, and prefer a
# configurable location so the notebook can run on another machine.
sys.path.append('/Users/krishna/Desktop/')

from genius_credentials import credentials
from bs4 import BeautifulSoup
import pickle
import string
import re


import numpy 
import matplotlib.pyplot as plt
import sklearn

# Import all of the scikit learn stuff
from __future__ import print_function
from sklearn.decomposition import TruncatedSVD, PCA, NMF
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.preprocessing import Normalizer
from sklearn import metrics
from sklearn.cluster import KMeans, MiniBatchKMeans
import pandas as pd

# Others
from more_itertools import flatten
from sklearn.feature_extraction import text 
import plotly.plotly as py
import plotly.graph_objs as go

#use this format for working locally
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot, plot_mpl
init_notebook_mode(connected=True)

In [None]:
# Load the weekly chart export. header=1 takes the column names from the
# second line of the file. The 'URL' column is discarded and 'Track Name' is
# renamed so it can be reached with attribute access (df.Track_Name).
top_200 = (
    pd.read_csv('./Desktop/regional-us-weekly-latest.csv', header=1)
    .drop('URL', axis=1)
    .rename(columns={'Track Name': 'Track_Name'})
)

In [None]:
# Genius API bearer token taken from the locally imported `credentials` mapping.
# request_search_info() reads this module-level name when it builds its
# Authorization header.
access_token = credentials['client_access_token']

In [None]:
def request_search_info(artist, track):
    """Look up a track on Genius and return its lyrics and annotations.

    Searches the Genius API for ``track + ' ' + artist``, keeps the first hit
    whose primary artist name contains ``artist`` (case-insensitive), scrapes
    the lyrics from that song's web page and fetches its annotations through
    ``request_referents_info``.

    Parameters
    ----------
    artist : str
        Artist name; used in the search query and to validate the hits.
    track : str
        Track title; used in the search query.

    Returns
    -------
    tuple of (str, str)
        ``(lyrics, annotations)``. Newlines in the lyrics are replaced by
        spaces and the result is stripped.

    Raises
    ------
    TypeError
        If no search hit matches ``artist`` or the song page holds no
        ``<div class="lyrics">`` element. ``TypeError`` is used on purpose:
        it is the exception ``get_lyrics_annotations`` catches to skip a
        track, and it is what the previous implementation raised (by
        subscripting ``None``) when nothing matched.
    """
    # set up for search
    base_url = 'https://api.genius.com'
    headers = {'Authorization': 'Bearer ' + access_token}
    search_url = base_url + '/search'
    query = {'q': track + ' ' + artist}

    # GET request: the search term belongs in the query string (`params`),
    # not in a request body (`data`)
    response = requests.get(search_url, headers=headers, params=query).json()

    # accept the first hit whose primary artist contains the requested artist
    wanted_artist = artist.lower()
    remote_song_info = None
    for hit in response['response']['hits']:
        if wanted_artist in hit['result']['primary_artist']['name'].lower():
            remote_song_info = hit
            break

    if remote_song_info is None:
        raise TypeError(
            'no Genius search hit for {!r} by {!r}'.format(track, artist))

    # scrape the lyrics from the public song page
    song_url = remote_song_info['result']['url']
    page = requests.get(str(song_url))
    html = BeautifulSoup(page.text, 'html.parser')
    lyrics_container = html.find('div', class_='lyrics')
    if lyrics_container is None:
        # Previously this surfaced as an AttributeError on None, which the
        # caller does not catch and which aborted the whole scrape.
        raise TypeError(
            'no lyrics container found on {}'.format(song_url))
    lyrics = lyrics_container.get_text().replace('\n', ' ').strip()

    # api_path looks like '/songs/<number>'; keep only the number
    song_api_path = remote_song_info['result']['api_path'].split('/', maxsplit=2)[2]

    annotations = request_referents_info(song_api_path)

    return lyrics, annotations




def request_referents_info(song_api_path):
    """Fetch the annotation text attached to a song's referents on Genius.

    Calls ``GET https://api.genius.com/referents`` with ``song_id`` and
    ``text_format='plain'`` and collects the plain-text body of the first
    annotation of every referent in the response.

    Parameters
    ----------
    song_api_path : str or int
        Numeric Genius song id (the trailing part of ``/songs/<id>``).

    Returns
    -------
    str
        The annotation bodies joined by single spaces, with newlines turned
        into spaces and double spaces collapsed. Empty string when the song
        has no annotated referents.

    Notes
    -----
    Only the referents contained in this single response are read; no
    pagination is attempted.
    """
    client_access_token = credentials['client_access_token']

    params = {'song_id': str(song_api_path), 'text_format': 'plain'}
    headers = {'Authorization': 'Bearer {}'.format(client_access_token)}
    # The song id travels in `params`; it used to be appended to the URL as
    # well, producing a stray '?<id>' query fragment.
    reply = requests.get('https://api.genius.com/referents', headers=headers, params=params)
    payload = reply.json()

    bodies = []
    # Iterate over every referent (the old range(len(...) - 1) silently
    # dropped the last one).
    for referent in payload['response']['referents']:
        referent_annotations = referent['annotations']
        if not referent_annotations:
            # nothing attached to this referent; avoid IndexError on [0]
            continue
        bodies.append(referent_annotations[0]['body']['plain'])

    # Join with a space so the last word of one annotation does not fuse with
    # the first word of the next, then normalise line breaks. str.replace
    # returns a new string, so the result has to be kept.
    annotations = ' '.join(bodies)
    annotations = annotations.replace('\n', ' ').replace('  ', ' ')

    return annotations



def cleaning(text):
    """Normalise raw lyric/annotation text for bag-of-words modelling.

    Steps, in order: line breaks become spaces, digits are dropped, anything
    matching the URL pattern is replaced by a space, non-ASCII characters and
    ASCII punctuation are removed, whitespace runs are collapsed, and the
    result is lower-cased and stripped.

    Parameters
    ----------
    text : str
        Raw text.

    Returns
    -------
    str
        Lower-case text containing only ASCII letters and single spaces
        between words (plus any other ASCII characters that are neither
        digits, punctuation nor whitespace).
    """
    exclude = set(string.punctuation)

    # Turn line breaks into spaces. Deleting them outright glued the last word
    # of a line to the first word of the next ("hello\nworld" -> "helloworld")
    # and made "end.\nStart" look like a URL to the pattern below.
    text = re.sub(r'\n', ' ', text)
    # remove digits
    text = re.sub(r'\d', '', text)
    # remove patterns matching url format
    url_pattern = r'((http|ftp|https):\/\/)?[\w\-_]+(\.[\w\-_]+)+([\w\-\.,@?^=%&amp;:/~\+#]*[\w\-\@?^=%&amp;/~\+#])?'
    text = re.sub(url_pattern, ' ', text)
    # remove non-ascii characters
    text = ''.join(character for character in text if ord(character) < 128)
    # remove punctuations
    text = ''.join(character for character in text if character not in exclude)
    # standardize white space
    text = re.sub(r'\s+', ' ', text)
    # drop capitalization and surrounding white space
    return text.lower().strip()




def get_lyrics_annotations(df):
    """Scrape and clean lyrics and annotations for every row of ``df``.

    ``df`` must expose ``Artist`` and ``Track_Name`` columns. Each row is
    passed to ``request_search_info``; when that call raises ``TypeError``
    the track name is printed and the row is skipped. Successful rows
    contribute their ``cleaning``-normalised lyrics and annotations.

    Returns a new DataFrame with the columns ``artist``, ``track``,
    ``lyrics`` and ``annotations`` (one row per successfully scraped track).
    """
    columns = {'artist': [], 'track': [], 'lyrics': [], 'annotations': []}

    for label in df.index:
        try:
            lyrics, annotations = request_search_info(df.Artist[label], df.Track_Name[label])
        except TypeError:
            # report the track that could not be retrieved and move on
            print(df.Track_Name[label])
            continue

        columns['lyrics'].append(cleaning(lyrics))
        columns['annotations'].append(cleaning(annotations))
        columns['artist'].append(df.Artist[label])
        columns['track'].append(df.Track_Name[label])

    return pd.DataFrame(data=columns)



def cleaning2(text):
    """Lower-case ``text``, blank out URL-like spans and keep only runs of
    the characters a-z, '.' and ',' joined by single spaces (so commas and
    periods survive, unlike in ``cleaning``)."""
    lowered = text.lower()
    without_urls = re.sub(
        r'\b(?:(?:https?|ftp)://)?\w[\w-]*(?:\.[\w-]+)+\S*(?<![.,])',
        ' ',
        lowered,
    )
    kept_runs = [match.group(0) for match in re.finditer(r'[a-z.,]+', without_urls)]
    return ' '.join(kept_runs)


In [None]:
# Scrape lyrics and annotations for every charted track. Each row triggers a
# Genius search, a song-page download and a referents request, so this cell
# needs network access.
final_df = get_lyrics_annotations(top_200)
# Persist the scrape to disk.
# NOTE(review): the modelling section reloads '../Desktop/lyrics.csv' (a
# different relative prefix than the './Desktop/...' used here) - confirm both
# resolve to the same file.
final_df.to_csv('./Desktop/lyrics.csv')
# Read the file back; note the result is bound to `df`, while the check on the
# next line inspects the in-memory `final_df`.
df = pd.read_csv('./Desktop/lyrics.csv', index_col=0)
# Row labels of the tracks whose cleaned annotations are empty
final_df[final_df.annotations == ''].index

***NMF Modeling***

In [None]:
# Setting up data
df = pd.read_csv('../Desktop/lyrics.csv', index_col=0)
# Assign the filled column back: fillna(inplace=True) on an attribute-accessed
# column is not guaranteed to write through to `df`.
df['annotations'] = df['annotations'].fillna('')
# Keep a space between the two texts so the last lyric word and the first
# annotation word do not fuse into a single token.
df['lyrics_anno'] = df.lyrics + ' ' + df.annotations

# Every lower-cased word of every artist name is treated as a stop word
artistname_stopwords = [word.lower() for name in df.artist for word in name.split()]

# 'lil' and 'uh' are separate entries: without the comma Python concatenated
# the adjacent literals into the single (useless) stop word 'liluh'.
my_additional_stop_words = ['like', 'yeah', 'im', 'dont', 'just', 'got', 'verse', 'chorus', 'know', 'lil', 'uh',
                            'ive', 'song', 'line', 'youre', 'hes', 'people', 'track', 'drakes', 'niggas', 'shit', 'thats'] + artistname_stopwords

# scikit-learn's vectorizers document `stop_words` as 'english', a list or
# None; recent releases reject a frozenset, so hand them a (sorted, hence
# deterministic) list.
stop_words = sorted(text.ENGLISH_STOP_WORDS.union(my_additional_stop_words))



In [None]:
# TF-IDF weights over the annotations. Terms occurring in more than half of
# the documents (max_df=.5) and the custom stop words are ignored.
vectorizer = TfidfVectorizer(min_df=1, max_df=.5, stop_words = stop_words)
dtm = vectorizer.fit_transform(df.annotations) 

# Despite the `lsa` names this is an NMF factorisation with 5 components.
# random_state is pinned so that re-running the notebook reproduces the same
# factorisation, which the hand-written topic labels in get_topic() rely on.
lsa = NMF(n_components=5, random_state=42)
dtm_lsa = lsa.fit_transform(dtm)
#dtm_lsa = Normalizer(copy=False).fit_transform(dtm_lsa)

# Placeholder column, filled in by get_topic(). The notebook imports `numpy`
# without the `np` alias, and `numpy.nan` is the spelling that exists in every
# NumPy release (`NaN` was removed in NumPy 2.0).
df['topic'] = numpy.nan

In [None]:
def get_topic(dtm_lsa, df):
    """Label each row of ``df`` with its dominant topic (1-5), in place.

    For row number ``i`` of ``dtm_lsa`` the position of the largest weight is
    found and written to ``df.at[i, 'topic']`` as a 1-based topic id. Note
    that ``i`` is used as a row *label* of ``df``, exactly as the former
    ``df.set_value(i, ...)`` call did, so ``df`` is expected to carry the
    default 0..n-1 index.

    Topic ids and the labels the author gave them:
        1 - sex, drugs, rap
        2 - feel good love
        3 - spanish
        4 - loved and lost
        5 - in a relationship (also used for any component position >= 4)

    Parameters
    ----------
    dtm_lsa : sequence of 1-D weight vectors
        Document-by-topic weights, one row per row of ``df``.
    df : pandas.DataFrame
        Frame that receives the ``topic`` column; modified in place.

    Returns
    -------
    None
    """
    last_topic_position = 4

    for i, weights in enumerate(dtm_lsa):
        # `numpy` is referenced by its full name: the notebook never binds `np`
        index = int(numpy.argmax(weights))
        # DataFrame.set_value was removed in pandas 1.0; `.at` is its
        # label-based scalar replacement.
        df.at[i, 'topic'] = min(index, last_topic_position) + 1

            

            

In [None]:
# Fill df['topic'] in place from the NMF document-topic weights
get_topic(dtm_lsa, df)

In [None]:
# Finding the distribution of topics
# (printed explicitly: a bare expression in the middle of a cell shows nothing)
print(df.topic.value_counts())

# Finding the mean index of each topic
# `numpy.mean` replaces the bare `mean`, which is not defined anywhere in the
# notebook; one loop replaces the five copy-pasted prints.
for topic_id in range(1, 6):
    print(numpy.mean(df[df.topic == topic_id].index.values))

In [None]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    """Print the ``no_top_words`` highest-weighted terms of every topic.

    ``model`` must expose ``components_`` (one weight vector per topic) and
    ``feature_names`` maps a column position to its term. When
    ``topic_names`` supplies a truthy entry for a topic it is used in the
    heading; otherwise the topic's position is printed.
    """
    for ix, topic in enumerate(model.components_):
        label = topic_names[ix] if topic_names else None
        if label:
            print("\nTopic: '", label, "'")
        else:
            print("\nTopic ", ix)

        # argsort is ascending, so walk it backwards to get the largest first
        strongest_first = topic.argsort()[:-no_top_words - 1:-1]
        top_terms = [feature_names[i] for i in strongest_first]
        print(", ".join(top_terms))

In [None]:
# Show the 20 strongest terms of each NMF topic. scikit-learn 1.0 introduced
# get_feature_names_out() and 1.2 removed get_feature_names(), so use
# whichever accessor this installation provides.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
display_topics(lsa, feature_names, 20)

In [None]:
# Reduce the document-topic weights to three principal components so each
# track can be drawn as a point in 3-D. fit() returns the estimator itself.
# (pca.explained_variance_ratio_ is available afterwards for inspection.)
pca = PCA(n_components=3).fit(dtm_lsa)

points_to_plot = pca.transform(dtm_lsa)
pca_components = pca.components_

In [None]:
# One coordinate array per principal component
x, y, z = (points_to_plot[:, axis] for axis in range(3))

# One marker per track, coloured by its assigned topic; hovering shows the
# track title only.
trace1 = go.Scatter3d(
    x=x, y=y, z=z,
    text=df.track,
    mode='markers',
    marker={
        'size': 12,
        'color': df.topic,        # one colour value per point
        'colorscale': 'Viridis',
        'opacity': 0.8,
    },
    hoverinfo='text',
)

data = [trace1]
layout = go.Layout(margin={'l': 0, 'r': 0, 'b': 0, 't': 0}, showlegend=True)
fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='3d-scatter-colorscale')

***LDA Modeling***

In [None]:
import pandas as pd
from gensim import corpora, models, similarities, matutils
from sklearn.feature_extraction.text import CountVectorizer

from more_itertools import flatten

from collections import Counter
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)


from sklearn.feature_extraction import text 

In [None]:
# Raw term counts over the annotations: lower-case alphabetic tokens of at
# least two letters, minus the stop words and any term present in more than
# 40% of the documents.
count_vectorizer = CountVectorizer(
    analyzer='word',
    stop_words=stop_words,
    token_pattern='\\b[a-z][a-z]+\\b',
    max_df=.4,
)

# fit() returns the vectorizer, so fitting and transforming can be chained.
# The matrix is transposed before it is handed to matutils.Sparse2Corpus in
# the next cell.
counts = count_vectorizer.fit(df.annotations).transform(df.annotations).T

In [None]:
# Wrap the transposed count matrix as a gensim corpus
corpus = matutils.Sparse2Corpus(counts)
# gensim needs id -> token; scikit-learn's vocabulary_ maps token -> id
id2word = {token_id: token for token, token_id in count_vectorizer.vocabulary_.items()}

# random_state is pinned so repeated runs produce the same topics
lda = models.LdaModel(corpus=corpus, num_topics=5, id2word=id2word, passes=10, random_state=42)

In [None]:
# Display the fitted LDA topics (rendered as this cell's output)
lda.print_topics()

In [1]:
# Leftover debugging cell: prints the kernel's working directory, which is
# what the relative './Desktop' and '../Desktop' paths above are resolved
# against.
!pwd

/Users/krishna/Project_4
