In [964]:
# Import Modules
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
%matplotlib inline
import pyspark
import time
import lyricsgenius
import spotipy
import billboard
import time
import requests
import re
import GetOldTweets3 as got
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from langdetect import detect
from nltk.tokenize import word_tokenize
from string import punctuation

from pyspark.sql import SQLContext
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf
from pyspark.ml.feature import CountVectorizer , IDF
from pyspark.mllib.linalg import Vector, Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import RandomForest
from pyspark.mllib.evaluation import BinaryClassificationMetrics

import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)
import nltk
nltk.download('wordnet')
from gensim import corpora, models
from pprint import pprint
from collections import Counter

APP_NAME = "Random Forest"
SPARK_URL = "local[*]"

spark = SparkSession.builder \
    .appName(APP_NAME) \
    .master(SPARK_URL) \
    .getOrCreate()


[nltk_data] Downloading package stopwords to /Users/pk/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/pk/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Data Collection & Exploration

In [38]:
# Read in Spotify Features Dataset based on Spotify API
spotifyfeatures = pd.read_csv("Data/Hot_100_Audio_Features.csv")

In [39]:
# Read in Billboard Chart Dataset based on the Billboard Hot 100
chartinfo = pd.read_csv("Data/Hot_Stuff.csv")

In [4]:
print (chartinfo.shape)
print (chartinfo.dtypes)

(320495, 10)
url                        object
WeekID                     object
Week Position               int64
Song                       object
Performer                  object
SongID                     object
Instance                    int64
Previous Week Position    float64
Peak Position               int64
Weeks on Chart              int64
dtype: object


In [5]:
print (spotifyfeatures.shape)
print (spotifyfeatures.dtypes)

(28492, 22)
SongID                        object
Performer                     object
Song                          object
spotify_genre                 object
spotify_track_id              object
spotify_track_preview_url     object
spotify_track_album           object
spotify_track_explicit        object
spotify_track_duration_ms    float64
spotify_track_popularity     float64
danceability                 float64
energy                       float64
key                          float64
loudness                     float64
mode                         float64
speechiness                  float64
acousticness                 float64
instrumentalness             float64
liveness                     float64
valence                      float64
tempo                        float64
time_signature               float64
dtype: object


# Data Manipulation 

In [40]:
songinfo = pd.merge(chartinfo, spotifyfeatures, how='inner', on=['SongID', 'Song', 'Performer'])
songinfo.shape

(320900, 29)

In [41]:
songinfo['WeekID'] = songinfo['WeekID'].apply(lambda x: dt.datetime.strptime(x, "%m/%d/%Y"))

In [42]:
songinfo = songinfo.set_index('WeekID')

In [43]:
songinfo = songinfo.loc['2015-01-01':'2019-12-28']

In [11]:
print (songinfo.dtypes)

url                           object
Week Position                  int64
Song                          object
Performer                     object
SongID                        object
Instance                       int64
Previous Week Position       float64
Peak Position                  int64
Weeks on Chart                 int64
spotify_genre                 object
spotify_track_id              object
spotify_track_preview_url     object
spotify_track_album           object
spotify_track_explicit        object
spotify_track_duration_ms    float64
spotify_track_popularity     float64
danceability                 float64
energy                       float64
key                          float64
loudness                     float64
mode                         float64
speechiness                  float64
acousticness                 float64
instrumentalness             float64
liveness                     float64
valence                      float64
tempo                        float64
t

In [44]:
songfeatures = songinfo.drop(columns=['url', 'Week Position','Instance', 'Previous Week Position', 'spotify_track_id', 'spotify_track_preview_url'])
songfeatures = songfeatures.sort_values('Weeks on Chart', ascending=False).drop_duplicates(['Song', 'SongID'])
songfeatures = songfeatures.reset_index()
songfeatures

Unnamed: 0,WeekID,Song,Performer,SongID,Peak Position,Weeks on Chart,spotify_genre,spotify_track_album,spotify_track_explicit,spotify_track_duration_ms,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,2015-11-28,Thinking Out Loud,Ed Sheeran,Thinking Out LoudEd Sheeran,2,58,"['pop', 'uk pop']",x (Deluxe Edition),False,281560.0,...,2.0,-6.061,1.0,0.0295,0.47400,0.000000,0.1840,0.5910,78.998,4.0
1,2016-03-05,Uptown Funk!,Mark Ronson Featuring Bruno Mars,Uptown Funk!Mark Ronson Featuring Bruno Mars,22,56,"['dance pop', 'pop']",Uptown Special,False,269666.0,...,0.0,-7.223,1.0,0.0824,0.00801,0.000081,0.0344,0.9280,114.988,4.0
2,2018-10-06,Perfect,Ed Sheeran,PerfectEd Sheeran,1,56,"['pop', 'uk pop']",÷ (Deluxe),False,263400.0,...,8.0,-6.312,1.0,0.0232,0.16300,0.000000,0.1060,0.1680,95.050,3.0
3,2015-05-02,Stay With Me,Sam Smith,Stay With MeSam Smith,2,54,"['pop', 'uk pop']",In The Lonely Hour,False,172723.0,...,0.0,-6.444,1.0,0.0414,0.58800,0.000064,0.1100,0.1840,84.094,4.0
4,2018-05-05,Wake Me Up!,Avicii,Wake Me Up!Avicii,34,54,"['big room', 'edm', 'pop']",TRUE,False,247426.0,...,2.0,-5.659,1.0,0.0524,0.00346,0.001870,0.1710,0.5880,124.102,4.0
5,2015-11-28,Shut Up And Dance,WALK THE MOON,Shut Up And DanceWALK THE MOON,4,53,"['indie pop', 'indie poptimism', 'modern rock'...",TALKING IS HARD (Expanded Edition),False,199080.0,...,1.0,-3.804,1.0,0.0619,0.00701,0.000000,0.2570,0.6190,128.038,4.0
6,2019-11-02,Sunflower (Spider-Man: Into The Spider-Verse),Post Malone & Swae Lee,Sunflower (Spider-Man: Into The Spider-Verse)P...,1,53,"['dfw rap', 'melodic rap', 'rap']",,,,...,,,,,,,,,,
7,2019-10-12,Without Me,Halsey,Without MeHalsey,1,52,"['dance pop', 'electropop', 'etherpop', 'indie...",Without Me,True,201660.0,...,6.0,-7.050,1.0,0.0705,0.29700,0.000009,0.0936,0.5330,136.041,4.0
8,2017-11-04,Say You Won't Let Go,James Arthur,Say You Won't Let GoJames Arthur,11,52,"['pop', 'post-teen pop', 'talent show', 'uk pop']",Back from the Edge,False,211466.0,...,10.0,-7.398,1.0,0.0590,0.69500,0.000000,0.0902,0.4940,85.043,4.0
9,2019-07-20,Eastside,"benny blanco, Halsey & Khalid","Eastsidebenny blanco, Halsey & Khalid",9,52,"['dance pop', 'pop', 'pop rap', 'tropical house']",Eastside (with Halsey & Khalid),False,173799.0,...,6.0,-7.648,0.0,0.3210,0.55500,0.000000,0.1160,0.3190,89.391,4.0


In [45]:
debutposition = songinfo[songinfo['Weeks on Chart'] == 1]
debutposition = debutposition.drop_duplicates(['Song', 'SongID'])
debutposition = debutposition[["Song", "SongID", "Week Position"]]
songfeatures = pd.merge(songfeatures, debutposition, how='inner', on=['SongID', 'Song'])

In [46]:
songfeatures = songfeatures.rename(columns={"Week Position": "Debut Position", "WeekID": "Week"})

# Population Dataset with Lyrics from LyricGenius (Genius API Python Library) 

In [47]:
songfeatures["Lyrics"] = 0

In [194]:
genius = lyricsgenius.Genius("lRSmSx2mp4ZU4ZovVyII_bDmIN2ROTR_0yU9HDCZI47VKNkZw-sBf5b4pruTYMam")
genius.remove_section_headers = True
genius.verbose = False
genius.skip_non_songs = True

In [51]:
for idx, values in songfeatures.iterrows():
    try:
        lyrics = genius.search_song(values['Song'],values['Performer']).lyrics
        songfeatures["Lyrics"][idx] = lyrics
    except:
        print("An exception occurred")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
An exception occurred
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
An exception occurred
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
An exception occurred
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
An exception occurred
1
1
1
1
An exception occurred
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
An exception occurred
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1


In [79]:
songfeatures2 = songfeatures

In [65]:
deletes = songfeatures2[songfeatures2['Lyrics'] == 0].index
songfeatures2 = songfeatures2.drop(deletes, inplace=True)

In [78]:
# Export Dataset
songfeatures2.to_csv("Data/new.csv")

# Read in Cleaned Dataset from Excel & OpenRefine

In [45]:
cleaned = pd.read_csv("Data/new.csv")

In [46]:
songset = cleaned.iloc[:2306,1:26]

# Lyrics Sentiment Analysis 

In [509]:
songset["LyricPositive"] = 0
songset["LyricNeutral"] = 0
songset["LyricNegative"] = 0

In [512]:
sia = SentimentIntensityAnalyzer()
for idx, values in songset.iterrows():
    if detect(values['Lyrics']) == 'en':
        num_positive = 0
        num_neutral = 0
        num_negative = 0
        for line in values['Lyrics'].splitlines():
            if line != "":
                comp = sia.polarity_scores(line)   
                comp = comp['compound']
                if comp >= 0.5:
                    num_positive += 1
                elif comp > -0.5 and comp < 0.5:
                    num_neutral += 1
                else:
                    num_negative += 1
        num_total = num_negative + num_neutral + num_positive
        percent_negative = (num_negative/float(num_total))*100
        percent_neutral = (num_neutral/float(num_total))*100
        percent_positive = (num_positive/float(num_total))*100
        songset['LyricPositive'][idx] = percent_positive
        songset['LyricNeutral'][idx] = percent_neutral
        songset['LyricNegative'][idx] = percent_negative
songset

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,Week,Song,Performer,SongID,Peak Position,Weeks on Chart,spotify_genre,spotify_track_album,spotify_track_explicit,spotify_track_duration_ms,...,instrumentalness,liveness,valence,tempo,time_signature,Debut Position,Lyrics,LyricPositive,LyricNeutral,LyricNegative
0,10/6/18,Perfect,Ed Sheeran,PerfectEd Sheeran,1.0,56.0,"['pop', 'uk pop']",÷ (Deluxe),False,263400.0,...,0.000000,0.1060,0.1680,95.050,3.0,37.0,"I found a love for me\nOh darling, just dive r...",47,52,0
2,10/12/19,Without Me,Halsey,Without MeHalsey,1.0,52.0,"['dance pop', 'electropop', 'etherpop', 'indie...",Without Me,True,201660.0,...,0.000009,0.0936,0.5330,136.041,4.0,18.0,Found you when your heart was broke\nI filled ...,1,98,0
3,11/4/17,Say You Won't Let Go,James Arthur,Say You Won't Let GoJames Arthur,11.0,52.0,"['pop', 'post-teen pop', 'talent show', 'uk pop']",Back from the Edge,False,211466.0,...,0.000000,0.0902,0.4940,85.043,4.0,100.0,I met you in the dark\nYou lit me up\nYou made...,12,87,0
4,8/10/19,Sicko Mode,Travis Scott,Sicko ModeTravis Scott,1.0,52.0,['rap'],ASTROWORLD,True,312820.0,...,0.000000,0.1240,0.4460,155.008,4.0,4.0,"Astro, yeah\nSun is down, freezin' cold\nThat'...",13,77,9
5,8/10/19,High Hopes,Panic! At The Disco,High HopesPanic! At The Disco,4.0,52.0,"['baroque pop', 'emo', 'modern rock']",Pray for the Wicked,False,190946.0,...,0.000000,0.0640,0.6810,82.014,4.0,98.0,"High, high hopes\n\nHad to have high, high hop...",7,90,1
6,8/24/19,Happier,Marshmello & Bastille,HappierMarshmello & Bastille,2.0,52.0,"['brostep', 'progressive electro house']",Happier,False,214289.0,...,0.000000,0.1670,0.6710,100.015,4.0,63.0,"Lately, I've been, I've been thinking\nI want ...",11,88,0
7,8/12/17,Closer,The Chainsmokers Featuring Halsey,CloserThe Chainsmokers Featuring Halsey,1.0,52.0,"['electropop', 'pop', 'tropical house']",Closer,False,244960.0,...,0.000000,0.1110,0.6610,95.010,4.0,9.0,"Hey, I was doing just fine before I met you\nI...",0,98,1
8,2/25/17,Don't Let Me Down,The Chainsmokers Featuring Daya,Don't Let Me DownThe Chainsmokers Featuring Daya,3.0,52.0,"['electropop', 'pop', 'tropical house']",Don't Let Me Down,False,208373.0,...,0.005080,0.1360,0.4220,159.803,4.0,85.0,"Crashing, hit a wall\nRight now I need a mirac...",9,90,0
9,3/19/16,See You Again,Wiz Khalifa Featuring Charlie Puth,See You AgainWiz Khalifa Featuring Charlie Puth,1.0,52.0,"['hip hop', 'pittsburgh rap', 'pop rap', 'rap'...",See You Again (feat. Charlie Puth),False,229525.0,...,0.000001,0.0649,0.2830,80.025,4.0,100.0,"It's been a long day without you, my friend\nA...",6,93,0
10,5/4/19,Better Now,Post Malone,Better NowPost Malone,3.0,52.0,"['dfw rap', 'melodic rap', 'rap']",beerbongs & bentleys,True,231266.0,...,0.000000,0.1360,0.3740,145.028,4.0,7.0,"You prolly think that you are better now, bett...",16,79,3


In [514]:
songset['LyricSentimentTotal'] = songset.apply(lambda row: row.LyricPositive + row.LyricNeutral + row.LyricNegative, axis=1)
songset.drop(songset[songset['LyricSentimentTotal'] == 0].index, inplace = True)
songset = songset.drop(columns=['LyricSentimentTotal'])
songset

Unnamed: 0,Week,Song,Performer,SongID,Peak Position,Weeks on Chart,spotify_genre,spotify_track_album,spotify_track_explicit,spotify_track_duration_ms,...,liveness,valence,tempo,time_signature,Debut Position,Lyrics,LyricPositive,LyricNeutral,LyricNegative,LyricSentimentTotal
0,10/6/18,Perfect,Ed Sheeran,PerfectEd Sheeran,1.0,56.0,"['pop', 'uk pop']",÷ (Deluxe),False,263400.0,...,0.1060,0.1680,95.050,3.0,37.0,"I found a love for me\nOh darling, just dive r...",47,52,0,99
2,10/12/19,Without Me,Halsey,Without MeHalsey,1.0,52.0,"['dance pop', 'electropop', 'etherpop', 'indie...",Without Me,True,201660.0,...,0.0936,0.5330,136.041,4.0,18.0,Found you when your heart was broke\nI filled ...,1,98,0,99
3,11/4/17,Say You Won't Let Go,James Arthur,Say You Won't Let GoJames Arthur,11.0,52.0,"['pop', 'post-teen pop', 'talent show', 'uk pop']",Back from the Edge,False,211466.0,...,0.0902,0.4940,85.043,4.0,100.0,I met you in the dark\nYou lit me up\nYou made...,12,87,0,99
4,8/10/19,Sicko Mode,Travis Scott,Sicko ModeTravis Scott,1.0,52.0,['rap'],ASTROWORLD,True,312820.0,...,0.1240,0.4460,155.008,4.0,4.0,"Astro, yeah\nSun is down, freezin' cold\nThat'...",13,77,9,99
5,8/10/19,High Hopes,Panic! At The Disco,High HopesPanic! At The Disco,4.0,52.0,"['baroque pop', 'emo', 'modern rock']",Pray for the Wicked,False,190946.0,...,0.0640,0.6810,82.014,4.0,98.0,"High, high hopes\n\nHad to have high, high hop...",7,90,1,98
6,8/24/19,Happier,Marshmello & Bastille,HappierMarshmello & Bastille,2.0,52.0,"['brostep', 'progressive electro house']",Happier,False,214289.0,...,0.1670,0.6710,100.015,4.0,63.0,"Lately, I've been, I've been thinking\nI want ...",11,88,0,99
7,8/12/17,Closer,The Chainsmokers Featuring Halsey,CloserThe Chainsmokers Featuring Halsey,1.0,52.0,"['electropop', 'pop', 'tropical house']",Closer,False,244960.0,...,0.1110,0.6610,95.010,4.0,9.0,"Hey, I was doing just fine before I met you\nI...",0,98,1,99
8,2/25/17,Don't Let Me Down,The Chainsmokers Featuring Daya,Don't Let Me DownThe Chainsmokers Featuring Daya,3.0,52.0,"['electropop', 'pop', 'tropical house']",Don't Let Me Down,False,208373.0,...,0.1360,0.4220,159.803,4.0,85.0,"Crashing, hit a wall\nRight now I need a mirac...",9,90,0,99
9,3/19/16,See You Again,Wiz Khalifa Featuring Charlie Puth,See You AgainWiz Khalifa Featuring Charlie Puth,1.0,52.0,"['hip hop', 'pittsburgh rap', 'pop rap', 'rap'...",See You Again (feat. Charlie Puth),False,229525.0,...,0.0649,0.2830,80.025,4.0,100.0,"It's been a long day without you, my friend\nA...",6,93,0,99
10,5/4/19,Better Now,Post Malone,Better NowPost Malone,3.0,52.0,"['dfw rap', 'melodic rap', 'rap']",beerbongs & bentleys,True,231266.0,...,0.1360,0.3740,145.028,4.0,7.0,"You prolly think that you are better now, bett...",16,79,3,98


In [524]:
songset.to_csv("Data/UpdatedData.csv")

# Manual Lyric Data Cleaning

In [330]:
for idx, values in songset.iterrows():
    if (values['Lyrics'].count(' - ') > 10):
        print(idx, values['Song'])
    

In [698]:
songset.loc[songset['Song'] == "Work From Home"]
# check['Lyrics']

Unnamed: 0,Week,Song,Performer,SongID,Peak Position,Weeks on Chart,spotify_genre,spotify_track_album,spotify_track_explicit,spotify_track_duration_ms,...,nu gaze,chinese hip hop,chinese idol pop,k-hop,post-metal,progressive metal,progressive rock,anthem worship,world worship,pop emo
100,11/5/16,Work From Home,Fifth Harmony Featuring Ty Dolla $ign,Work From HomeFifth Harmony Featuring Ty Dolla...,4.0,34.0,"['dance pop', 'girl group', 'pop', 'post-teen ...",7/27 (Deluxe),False,214480.0,...,0,0,0,0,0,0,0,0,0,0


In [490]:
# Delete Song from Dataset
songset.drop(songset[songset['Song'] == "Make It Right"].index, inplace = True) 

In [344]:
# Delete Songs with Empty Spotify Data from Dataset
songset = songset.dropna(axis=0, subset=['spotify_track_duration_ms'])

In [345]:
# Checks Dataset for Empty Spotify Data
songset['spotify_track_duration_ms'].isnull().sum()

0

In [542]:
# Recheck Lyrics
genius.search_song("Despacito", 'Justin').lyrics

'Comin\' over in my direction\nSo thankful for that, it\'s such a blessin\', yeah\nTurn every situation into heaven, yeah\nOh-oh, you are\nMy sunrise on the darkest day\nGot me feelin\' some kind of way\nMake me wanna savor every moment slowly, slowly\nYou fit me tailor-made, love how you put it on\nGot the only key, know how to turn it on\nThe way you nibble on my ear, the only words I wanna hear\nBaby, take it slow so we can last long\n\n¡Oh! Tú, tú eres el imán y yo soy el metal\nMe voy acercando y voy armando el plan\nSólo con pensarlo se acelera el pulso (Oh, yeah)\nYa, ya me está gustando más de lo normal\nTodos mis sentidos van pidiendo más\nEsto hay que tomarlo sin ningún apuro\n\nDespacito\nQuiero respirar tu cuello despacito\nDeja que te diga cosas al oído\nPara que te acuerdes si no estás conmigo\nDespacito\nQuiero desnudarte a besos despacito\nFirmo en las paredes de tu laberinto\nY hacer de tu cuerpo todo un manuscrito\n(Sube, sube, sube\nSube, sube)\n\nQuiero ver bailar t

In [543]:
# Update Lyrics if previously incorrect
songset.at[21,'Lyrics']= genius.search_song("Despacito", 'Justin').lyrics

In [None]:
songset = songset.reset_index()
songset

# Twitter Data Collection

In [630]:
# Creates New Twitter Dataset with Sentiment Fields
twitter = songset[['Week', 'Song', 'Performer', 'SongID']]
twitter['Tweets'] = 0
twitter['TweetPositive'] = 0
twitter['TweetNeutral'] = 0
twitter['TweetNegative'] = 0
twitter

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the cavea

Unnamed: 0,Week,Song,Performer,SongID,Tweets,TweetPositive,TweetNeutral,TweetNegative
0,10/6/18,Perfect,Ed Sheeran,PerfectEd Sheeran,0,0,0,0
1,10/12/19,Without Me,Halsey,Without MeHalsey,0,0,0,0
2,11/4/17,Say You Won't Let Go,James Arthur,Say You Won't Let GoJames Arthur,0,0,0,0
3,8/10/19,Sicko Mode,Travis Scott,Sicko ModeTravis Scott,0,0,0,0
4,8/10/19,High Hopes,Panic! At The Disco,High HopesPanic! At The Disco,0,0,0,0
5,8/24/19,Happier,Marshmello & Bastille,HappierMarshmello & Bastille,0,0,0,0
6,8/12/17,Closer,The Chainsmokers Featuring Halsey,CloserThe Chainsmokers Featuring Halsey,0,0,0,0
7,2/25/17,Don't Let Me Down,The Chainsmokers Featuring Daya,Don't Let Me DownThe Chainsmokers Featuring Daya,0,0,0,0
8,3/19/16,See You Again,Wiz Khalifa Featuring Charlie Puth,See You AgainWiz Khalifa Featuring Charlie Puth,0,0,0,0
9,5/4/19,Better Now,Post Malone,Better NowPost Malone,0,0,0,0


In [705]:
# Twitter Data Collection
iteration = 0
for idx, values in twitter.iterrows():
    if iteration == 15:
        time.sleep(300) # Makes the operation sleep for 5 minutes as to not overload API with a large number of requests
        iteration = 0
        print('New Iteration')
    tweets = []
    d = dt.timedelta(days=14)
    enddate = dt.datetime.strptime(values['Week'], "%m/%d/%y")
    startdate = enddate-d
    tweetCriteria = got.manager.TweetCriteria().setQuerySearch(values['Song'] + " " + values['Performer']).setSince(startdate.strftime("%Y-%m-%d")).setUntil(enddate.strftime("%Y-%m-%d")).setMaxTweets(100)
    tweet = got.manager.TweetManager.getTweets(tweetCriteria)
    stopwords12 = set(stopwords.words('english') + list(punctuation))
    for x in tweet:
        x = x.text
        x = x.lower() # convert text to lower-case
        x = re.sub('((www\.[^\s]+)|(https?://[^\s]+))', '', x) # remove URLs
        x = re.sub('@[^\s]+', '', x) # remove usernames
        x = re.sub(r'#([^\s]+)', r'\1', x) # remove the # in #hashtag
        try:
            # Checks if the tweets are in english for sentiment analysis
            if detect(x) == 'en':
                tweets.append(x)
        except:
            print("LangDetect Error")
    twitter['Tweets'][idx] = tweets
    iteration+=1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Pay You Back
1938
Home
1939
Let It All Work Out
1940
On The Regular
1941
Jump Out The Window
1942
Issues
1943
Not Afraid Anymore
1944
Mood Swings
1945
Love Is Your Name
1946
Like That
1947
Jonestown (Interlude)
1948
Let Me
1949
Man Of The Woods
1950
Gang Gang
1951
Dreamcatcher
1952
New Iteration
Jet Lag
1953
If You Know You Know
1954
KIDS SEE GHOSTS
1955
Millidelphia
1956
Fr Fr
1957
Love Scars 3
1958
Halfway Off The Balcony
1959
Midnight
1960
Mess
1961
OMG
1962
Must've Never Met You
1963
False God
1964
Let It Sing
1965
No Name
1966
My Blood
1967
New Iteration
For Real
1968
Infinity
1969
Foot Fungus
1970
Nikes
1971
Hello Friday
1972
Let Me Know (I Wonder Why Freestyle)
1973
Liability
1974
Make Love
1975
Fire N Gold
1976
No More
1977
Nobody But You
1978
Failure
1979
Ivy
1980
Its Every Night Sis
1981
Found You
1982
New Iteration
My My My!
1983
New Man
1984
Get Right Witcha
1985
Duckworth.
1986
Forward
1987
Is That Alright?
1988
Body Say
1989
Saturday Nights
1990
Sangria Wine
1991
1942 Flo

In [726]:
for idx, values in twitter.iterrows():
    newtweets = []
    song = values['Song'].lower()
    performer = values['Performer'].lower()
    for x in values['Tweets']:
        text = x.replace(song, "").replace(performer, "").replace('-','').strip()
        if text:
            newtweets.append(text)
    twitter['Tweets'][idx] = newtweets
    num_positive = 0
    num_neutral = 0
    num_negative = 0
    for x in newtweets:
        if x != "":
            comp = sia.polarity_scores(x)   
            comp = comp['compound']
            if comp >= 0.5:
                num_positive += 1
            elif comp > -0.5 and comp < 0.5:
                num_neutral += 1
            else:
                num_negative += 1
    num_total = num_negative + num_neutral + num_positive
    if num_total != 0:
        percent_negative = (num_negative/float(num_total))*100
        percent_neutral = (num_neutral/float(num_total))*100
        percent_positive = (num_positive/float(num_total))*100
        twitter['TweetPositive'][idx] = percent_positive
        twitter['TweetNeutral'][idx] = percent_neutral
        twitter['TweetNegative'][idx] = percent_negative

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [745]:
twitter.to_csv("Data/TwitterData.csv")

In [752]:
twitter = pd.read_csv("Data/TwitterData.csv")

In [753]:
twitter['TweetSentimentTotal'] = twitter.apply(lambda row: row.TweetPositive + row.TweetNeutral + row.TweetNegative, axis=1)
zero = []
for idx, values in twitter.iterrows():
    if values['TweetSentimentTotal'] == 0:
        zero.append(values['TweetSentimentTotal'])
len(zero)


141

In [754]:
songset = pd.merge(songset, twitter, how='inner', on=['SongID'])
songset

Unnamed: 0.1,Week_x,Song_x,Performer_x,SongID,Peak Position,Weeks on Chart,spotify_genre,spotify_track_album,spotify_track_explicit,spotify_track_duration_ms,...,pop emo,Unnamed: 0,Week_y,Song_y,Performer_y,Tweets,TweetPositive,TweetNeutral,TweetNegative,TweetSentimentTotal
0,10/6/18,Perfect,Ed Sheeran,PerfectEd Sheeran,1.0,56.0,"['pop', 'uk pop']",÷ (Deluxe),False,263400.0,...,0,0,10/6/18,Perfect,Ed Sheeran,"['rfct d shran', 'rfct by d shran will lay at...",2,96,0,98
1,10/12/19,Without Me,Halsey,Without MeHalsey,1.0,52.0,"['dance pop', 'electropop', 'etherpop', 'indie...",Without Me,True,201660.0,...,0,1,10/12/19,Without Me,Halsey,['♫itout me by from soundound it livelyrics®'...,14,82,3,99
2,11/4/17,Say You Won't Let Go,James Arthur,Say You Won't Let GoJames Arthur,11.0,52.0,"['pop', 'post-teen pop', 'talent show', 'uk pop']",Back from the Edge,False,211466.0,...,0,2,11/4/17,Say You Won't Let Go,James Arthur,['ay you won’t let go by ame arthur never. get...,11,88,0,99
3,8/10/19,Sicko Mode,Travis Scott,Sicko ModeTravis Scott,1.0,52.0,['rap'],ASTROWORLD,True,312820.0,...,0,3,8/10/19,Sicko Mode,Travis Scott,['aroworld ravi co 1° icko mode 2° yoemie 3° ...,6,92,0,98
4,8/10/19,High Hopes,Panic! At The Disco,High HopesPanic! At The Disco,4.0,52.0,"['baroque pop', 'emo', 'modern rock']",Pray for the Wicked,False,190946.0,...,0,4,8/10/19,High Hopes,Panic! At The Disco,['for a camaign rally anic! at te disco ig oe...,7,89,2,98
5,8/24/19,Happier,Marshmello & Bastille,HappierMarshmello & Bastille,2.0,52.0,"['brostep', 'progressive electro house']",Happier,False,214289.0,...,0,5,8/24/19,Happier,Marshmello & Bastille,['appier (ikeii reix) arsello;bastille nowplay...,6,93,0,99
6,8/12/17,Closer,The Chainsmokers Featuring Halsey,CloserThe Chainsmokers Featuring Halsey,1.0,52.0,"['electropop', 'pop', 'tropical house']",Closer,False,244960.0,...,0,6,8/12/17,Closer,The Chainsmokers Featuring Halsey,['nowplaying loser he hainsmokers feauring ha...,0,14,85,99
7,2/25/17,Don't Let Me Down,The Chainsmokers Featuring Daya,Don't Let Me DownThe Chainsmokers Featuring Daya,3.0,52.0,"['electropop', 'pop', 'tropical house']",Don't Let Me Down,False,208373.0,...,0,7,2/25/17,Don't Let Me Down,The Chainsmokers Featuring Daya,"['he chainsmokers feauring aya, ‘on’ le me own...",38,61,0,99
8,3/19/16,See You Again,Wiz Khalifa Featuring Charlie Puth,See You AgainWiz Khalifa Featuring Charlie Puth,1.0,52.0,"['hip hop', 'pittsburgh rap', 'pop rap', 'rap'...",See You Again (feat. Charlie Puth),False,229525.0,...,0,8,3/19/16,See You Again,Wiz Khalifa Featuring Charlie Puth,['nu op klinkradio ee you again iz khalifa fe...,0,100,0,100
9,5/4/19,Better Now,Post Malone,Better NowPost Malone,3.0,52.0,"['dfw rap', 'melodic rap', 'rap']",beerbongs & bentleys,True,231266.0,...,0,9,5/4/19,Better Now,Post Malone,['𝙢𝙪́𝙨𝙞𝙘𝙖𝙨 𝙦𝙪𝙚 𝙚𝙪 𝙢𝙖𝙞𝙨 𝙚𝙨𝙘𝙪𝙩𝙤 1. etter now (os...,6,92,1,99


In [756]:
songset = songset.drop(columns=['Unnamed: 0', 'Week_y', 'Song_y', 'Performer_y'])
songset = songset.rename(columns={"Week_x": "Week", "Song_x": "Song", "Performer_x": "Performer"})
songset

Unnamed: 0,Week,Song,Performer,SongID,Peak Position,Weeks on Chart,spotify_genre,spotify_track_album,spotify_track_explicit,spotify_track_duration_ms,...,progressive metal,progressive rock,anthem worship,world worship,pop emo,Tweets,TweetPositive,TweetNeutral,TweetNegative,TweetSentimentTotal
0,10/6/18,Perfect,Ed Sheeran,PerfectEd Sheeran,1.0,56.0,"['pop', 'uk pop']",÷ (Deluxe),False,263400.0,...,0,0,0,0,0,"['rfct d shran', 'rfct by d shran will lay at...",2,96,0,98
1,10/12/19,Without Me,Halsey,Without MeHalsey,1.0,52.0,"['dance pop', 'electropop', 'etherpop', 'indie...",Without Me,True,201660.0,...,0,0,0,0,0,['♫itout me by from soundound it livelyrics®'...,14,82,3,99
2,11/4/17,Say You Won't Let Go,James Arthur,Say You Won't Let GoJames Arthur,11.0,52.0,"['pop', 'post-teen pop', 'talent show', 'uk pop']",Back from the Edge,False,211466.0,...,0,0,0,0,0,['ay you won’t let go by ame arthur never. get...,11,88,0,99
3,8/10/19,Sicko Mode,Travis Scott,Sicko ModeTravis Scott,1.0,52.0,['rap'],ASTROWORLD,True,312820.0,...,0,0,0,0,0,['aroworld ravi co 1° icko mode 2° yoemie 3° ...,6,92,0,98
4,8/10/19,High Hopes,Panic! At The Disco,High HopesPanic! At The Disco,4.0,52.0,"['baroque pop', 'emo', 'modern rock']",Pray for the Wicked,False,190946.0,...,0,0,0,0,0,['for a camaign rally anic! at te disco ig oe...,7,89,2,98
5,8/24/19,Happier,Marshmello & Bastille,HappierMarshmello & Bastille,2.0,52.0,"['brostep', 'progressive electro house']",Happier,False,214289.0,...,0,0,0,0,0,['appier (ikeii reix) arsello;bastille nowplay...,6,93,0,99
6,8/12/17,Closer,The Chainsmokers Featuring Halsey,CloserThe Chainsmokers Featuring Halsey,1.0,52.0,"['electropop', 'pop', 'tropical house']",Closer,False,244960.0,...,0,0,0,0,0,['nowplaying loser he hainsmokers feauring ha...,0,14,85,99
7,2/25/17,Don't Let Me Down,The Chainsmokers Featuring Daya,Don't Let Me DownThe Chainsmokers Featuring Daya,3.0,52.0,"['electropop', 'pop', 'tropical house']",Don't Let Me Down,False,208373.0,...,0,0,0,0,0,"['he chainsmokers feauring aya, ‘on’ le me own...",38,61,0,99
8,3/19/16,See You Again,Wiz Khalifa Featuring Charlie Puth,See You AgainWiz Khalifa Featuring Charlie Puth,1.0,52.0,"['hip hop', 'pittsburgh rap', 'pop rap', 'rap'...",See You Again (feat. Charlie Puth),False,229525.0,...,0,0,0,0,0,['nu op klinkradio ee you again iz khalifa fe...,0,100,0,100
9,5/4/19,Better Now,Post Malone,Better NowPost Malone,3.0,52.0,"['dfw rap', 'melodic rap', 'rap']",beerbongs & bentleys,True,231266.0,...,0,0,0,0,0,['𝙢𝙪́𝙨𝙞𝙘𝙖𝙨 𝙦𝙪𝙚 𝙚𝙪 𝙢𝙖𝙞𝙨 𝙚𝙨𝙘𝙪𝙩𝙤 1. etter now (os...,6,92,1,99


In [757]:
songset.drop(songset[songset['TweetSentimentTotal'] == 0].index, inplace = True)
songset = songset.drop(columns=['TweetSentimentTotal'])
songset = songset.reset_index()
songset = songset.drop(columns=['index'])
songset

Unnamed: 0,Week,Song,Performer,SongID,Peak Position,Weeks on Chart,spotify_genre,spotify_track_album,spotify_track_explicit,spotify_track_duration_ms,...,post-metal,progressive metal,progressive rock,anthem worship,world worship,pop emo,Tweets,TweetPositive,TweetNeutral,TweetNegative
0,10/6/18,Perfect,Ed Sheeran,PerfectEd Sheeran,1.0,56.0,"['pop', 'uk pop']",÷ (Deluxe),False,263400.0,...,0,0,0,0,0,0,"['rfct d shran', 'rfct by d shran will lay at...",2,96,0
1,10/12/19,Without Me,Halsey,Without MeHalsey,1.0,52.0,"['dance pop', 'electropop', 'etherpop', 'indie...",Without Me,True,201660.0,...,0,0,0,0,0,0,['♫itout me by from soundound it livelyrics®'...,14,82,3
2,11/4/17,Say You Won't Let Go,James Arthur,Say You Won't Let GoJames Arthur,11.0,52.0,"['pop', 'post-teen pop', 'talent show', 'uk pop']",Back from the Edge,False,211466.0,...,0,0,0,0,0,0,['ay you won’t let go by ame arthur never. get...,11,88,0
3,8/10/19,Sicko Mode,Travis Scott,Sicko ModeTravis Scott,1.0,52.0,['rap'],ASTROWORLD,True,312820.0,...,0,0,0,0,0,0,['aroworld ravi co 1° icko mode 2° yoemie 3° ...,6,92,0
4,8/10/19,High Hopes,Panic! At The Disco,High HopesPanic! At The Disco,4.0,52.0,"['baroque pop', 'emo', 'modern rock']",Pray for the Wicked,False,190946.0,...,0,0,0,0,0,0,['for a camaign rally anic! at te disco ig oe...,7,89,2
5,8/24/19,Happier,Marshmello & Bastille,HappierMarshmello & Bastille,2.0,52.0,"['brostep', 'progressive electro house']",Happier,False,214289.0,...,0,0,0,0,0,0,['appier (ikeii reix) arsello;bastille nowplay...,6,93,0
6,8/12/17,Closer,The Chainsmokers Featuring Halsey,CloserThe Chainsmokers Featuring Halsey,1.0,52.0,"['electropop', 'pop', 'tropical house']",Closer,False,244960.0,...,0,0,0,0,0,0,['nowplaying loser he hainsmokers feauring ha...,0,14,85
7,2/25/17,Don't Let Me Down,The Chainsmokers Featuring Daya,Don't Let Me DownThe Chainsmokers Featuring Daya,3.0,52.0,"['electropop', 'pop', 'tropical house']",Don't Let Me Down,False,208373.0,...,0,0,0,0,0,0,"['he chainsmokers feauring aya, ‘on’ le me own...",38,61,0
8,3/19/16,See You Again,Wiz Khalifa Featuring Charlie Puth,See You AgainWiz Khalifa Featuring Charlie Puth,1.0,52.0,"['hip hop', 'pittsburgh rap', 'pop rap', 'rap'...",See You Again (feat. Charlie Puth),False,229525.0,...,0,0,0,0,0,0,['nu op klinkradio ee you again iz khalifa fe...,0,100,0
9,5/4/19,Better Now,Post Malone,Better NowPost Malone,3.0,52.0,"['dfw rap', 'melodic rap', 'rap']",beerbongs & bentleys,True,231266.0,...,0,0,0,0,0,0,['𝙢𝙪́𝙨𝙞𝙘𝙖𝙨 𝙦𝙪𝙚 𝙚𝙪 𝙢𝙖𝙞𝙨 𝙚𝙨𝙘𝙪𝙩𝙤 1. etter now (os...,6,92,1


# LDA Topic Modeling for Lyrics

In [1237]:
topics = songset[['Week', 'Song', 'Performer', 'SongID', 'Lyrics']]

In [1238]:
topics.to_csv("Data/LDA.csv")

In [884]:
def lemmatize_stemming(text):
    stemmer = PorterStemmer()
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

In [885]:
words = []
for lyrics in topics['Lyrics']:
    for word in gensim.utils.simple_preprocess(lyrics):
        if word not in gensim.parsing.preprocessing.STOPWORDS and len(word) > 3:
            words.append(lemmatize_stemming(word))
add_stop_words = [word for word, count in Counter(words).most_common() if count > 750]
add_stop_words

['yeah',
 'like',
 'know',
 'love',
 'nigga',
 'bitch',
 'babi',
 'want',
 'wanna',
 'caus',
 'come',
 'fuck',
 'time',
 'tell',
 'feel',
 'need',
 'shit',
 'girl',
 'right',
 'think',
 'look',
 'night',
 'life',
 'money',
 'good',
 'go',
 'say',
 'gotta',
 'leav',
 'thing',
 'gonna',
 'break',
 'heart',
 'talk',
 'real',
 'littl',
 'light',
 'turn',
 'mind',
 'better',
 'bout',
 'hold',
 'tryna',
 'woah',
 'play',
 'lose',
 'away',
 'work',
 'hear',
 'stay',
 'high',
 'friend']

In [886]:
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            token = lemmatize_stemming(token)
            if token not in add_stop_words:
                result.append(token)
    return result

In [887]:
# Example
doc_sample = songset['Lyrics'][0]
print('original document: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

original document: 
['I', 'found', 'a', 'love', 'for', 'me\nOh', 'darling,', 'just', 'dive', 'right', 'in', 'and', 'follow', 'my', 'lead\nWell,', 'I', 'found', 'a', 'girl,', 'beautiful', 'and', 'sweet\nOh,', 'I', 'never', 'knew', 'you', 'were', 'the', 'someone', 'waiting', 'for', "me\n'Cause", 'we', 'were', 'just', 'kids', 'when', 'we', 'fell', 'in', 'love\nNot', 'knowing', 'what', 'it', 'was\nI', 'will', 'not', 'give', 'you', 'up', 'this', 'time\nBut', 'darling,', 'just', 'kiss', 'me', 'slow,', 'your', 'heart', 'is', 'all', 'I', 'own\nAnd', 'in', 'your', 'eyes,', "you're", 'holding', 'mine\n\nBaby,', "I'm", 'dancing', 'in', 'the', 'dark', 'with', 'you', 'between', 'my', 'arms\nBarefoot', 'on', 'the', 'grass,', 'listening', 'to', 'our', 'favourite', 'song\nWhen', 'you', 'said', 'you', 'looked', 'a', 'mess,', 'I', 'whispered', 'underneath', 'my', 'breath\nBut', 'you', 'heard', 'it,', 'darling,', 'you', 'look', 'perfect', 'tonight\n\nWell', 'I', 'found', 'a', 'woman,', 'stronger', 'than'

In [888]:
processed_docs = topics['Lyrics'].map(preprocess)

In [889]:
dictionary = gensim.corpora.Dictionary(processed_docs)
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 alright
1 angel
2 arm
3 barefoot
4 beauti
5 breath
6 carri
7 children
8 danc
9 dark
10 darl


In [890]:
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [891]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[1000]

[(14, 9),
 (50, 7),
 (53, 1),
 (55, 3),
 (57, 7),
 (60, 1),
 (90, 6),
 (95, 1),
 (101, 1),
 (121, 1),
 (122, 1),
 (124, 1),
 (170, 1),
 (171, 1),
 (195, 2),
 (227, 2),
 (252, 1),
 (259, 2),
 (280, 2),
 (288, 10),
 (293, 11),
 (331, 1),
 (359, 4),
 (363, 2),
 (389, 1),
 (392, 1),
 (419, 1),
 (457, 7),
 (509, 1),
 (515, 6),
 (581, 1),
 (612, 1),
 (613, 1),
 (644, 1),
 (660, 1),
 (708, 1),
 (733, 1),
 (775, 1),
 (780, 1),
 (786, 1),
 (823, 1),
 (870, 1),
 (981, 1),
 (984, 1),
 (1079, 7),
 (1116, 2),
 (1247, 1),
 (1265, 2),
 (1276, 1),
 (1280, 7),
 (1283, 1),
 (1343, 1),
 (1516, 1)]

In [892]:
# Example
bow_doc_1000 = bow_corpus[1000]
for i in range(len(bow_doc_1000)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_1000[i][0], 
                                               dictionary[bow_doc_1000[i][0]], 
bow_doc_1000[i][1]))

Word 14 ("eye") appears 9 time.
Word 50 ("feet") appears 7 time.
Word 53 ("knee") appears 1 time.
Word 55 ("live") appears 3 time.
Word 57 ("notic") appears 7 time.
Word 60 ("take") appears 1 time.
Word 90 ("pull") appears 6 time.
Word 95 ("show") appears 1 time.
Word 101 ("thank") appears 1 time.
Word 121 ("color") appears 1 time.
Word 122 ("comin") appears 1 time.
Word 124 ("cover") appears 1 time.
Word 170 ("pay") appears 1 time.
Word 171 ("pick") appears 1 time.
Word 195 ("skrrt") appears 2 time.
Word 227 ("vision") appears 2 time.
Word 252 ("citi") appears 1 time.
Word 259 ("lookin") appears 2 time.
Word 280 ("line") appears 2 time.
Word 288 ("ride") appears 10 time.
Word 293 ("vibe") appears 11 time.
Word 331 ("sell") appears 1 time.
Word 359 ("drop") appears 4 time.
Word 363 ("gettin") appears 2 time.
Word 389 ("talkin") appears 1 time.
Word 392 ("trap") appears 1 time.
Word 419 ("stage") appears 1 time.
Word 457 ("jewelri") appears 7 time.
Word 509 ("blessin") appears 1 time.
W

In [893]:
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]
for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.0644698216237131),
 (1, 0.08141084123965442),
 (2, 0.2350585332622832),
 (3, 0.1501578689656587),
 (4, 0.06342198662439767),
 (5, 0.27498066566868995),
 (6, 0.0952552338448384),
 (7, 0.16575373645465158),
 (8, 0.190265959873193),
 (9, 0.4795533250606754),
 (10, 0.1790648863128046),
 (11, 0.1019225437034953),
 (12, 0.05250122669940121),
 (13, 0.07147098836064643),
 (14, 0.08524413385145616),
 (15, 0.087578562443839),
 (16, 0.1642939057838116),
 (17, 0.08105110903044707),
 (18, 0.05978330373304647),
 (19, 0.0768089706262761),
 (20, 0.08902880895202042),
 (21, 0.30081634227937415),
 (22, 0.041502716952034016),
 (23, 0.043571756934430175),
 (24, 0.05688433421655866),
 (25, 0.14015418925110587),
 (26, 0.05093030740614705),
 (27, 0.08853555970161685),
 (28, 0.19782718081056805),
 (29, 0.07741461667443125),
 (30, 0.29386248649317054),
 (31, 0.0880522757673804),
 (32, 0.08450181082891385),
 (33, 0.18678051606602014),
 (34, 0.06145470812872682),
 (35, 0.1591459765561435),
 (36, 0.1099715

In [894]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.013*"gang" + 0.010*"babe" + 0.010*"blame" + 0.008*"care" + 0.007*"fall" + 0.007*"game" + 0.007*"pain" + 0.006*"save" + 0.006*"long" + 0.006*"gucci"
Topic: 1 
Words: 0.015*"everybodi" + 0.011*"kiss" + 0.011*"take" + 0.010*"crazi" + 0.008*"hell" + 0.008*"stick" + 0.007*"wait" + 0.006*"long" + 0.006*"nice" + 0.006*"cash"
Topic: 2 
Words: 0.012*"chang" + 0.010*"whip" + 0.009*"wrist" + 0.007*"parti" + 0.007*"aliv" + 0.007*"danc" + 0.007*"take" + 0.007*"stack" + 0.007*"skrrt" + 0.007*"differ"
Topic: 3 
Words: 0.013*"walk" + 0.013*"eye" + 0.012*"tonight" + 0.012*"alright" + 0.011*"home" + 0.010*"slide" + 0.008*"beauti" + 0.008*"tast" + 0.007*"wrong" + 0.007*"ride"
Topic: 4 
Words: 0.011*"rich" + 0.010*"wait" + 0.009*"young" + 0.008*"live" + 0.008*"level" + 0.007*"smile" + 0.007*"talkin" + 0.006*"wake" + 0.006*"stop" + 0.005*"best"
Topic: 5 
Words: 0.012*"pull" + 0.012*"bodi" + 0.009*"somethin" + 0.008*"get" + 0.008*"shoot" + 0.008*"hand" + 0.007*"miss" + 0.006*"float" + 0.0

In [895]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.005*"fall" + 0.005*"mean" + 0.005*"wast" + 0.005*"world" + 0.004*"live" + 0.004*"touch" + 0.004*"boy" + 0.004*"wouldn" + 0.004*"promis" + 0.004*"ocean"
Topic: 1 Word: 0.005*"alright" + 0.005*"danc" + 0.004*"kiss" + 0.004*"eye" + 0.004*"take" + 0.004*"chang" + 0.004*"feelin" + 0.003*"song" + 0.003*"hand" + 0.003*"slide"
Topic: 2 Word: 0.004*"believ" + 0.004*"watch" + 0.004*"worth" + 0.003*"long" + 0.003*"gang" + 0.003*"aliv" + 0.003*"fall" + 0.003*"walk" + 0.003*"live" + 0.003*"round"
Topic: 3 Word: 0.005*"blame" + 0.004*"hurt" + 0.004*"mean" + 0.004*"pretti" + 0.003*"shoot" + 0.003*"kiss" + 0.003*"stop" + 0.003*"breath" + 0.003*"drink" + 0.003*"fall"
Topic: 4 Word: 0.007*"hallelujah" + 0.005*"lie" + 0.004*"smoke" + 0.004*"home" + 0.004*"diamond" + 0.004*"touch" + 0.004*"drug" + 0.004*"long" + 0.003*"blow" + 0.003*"goodby"
Topic: 5 Word: 0.006*"sorri" + 0.005*"wake" + 0.005*"ghost" + 0.004*"whoa" + 0.004*"damn" + 0.004*"drink" + 0.004*"walk" + 0.004*"song" + 0.004*"wish

In [896]:
# Testing
unseen_document = "christmas cheer christmas christmas christmas"
bow_vector = dictionary.doc2bow(preprocess(unseen_document))
for index, score in sorted(lda_model_tfidf[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model_tfidf.print_topic(index, 5)))

Score: 0.8199850916862488	 Topic: 0.005*"alright" + 0.005*"danc" + 0.004*"kiss" + 0.004*"eye" + 0.004*"take"
Score: 0.020005928352475166	 Topic: 0.005*"world" + 0.005*"young" + 0.004*"blue" + 0.004*"fast" + 0.004*"okay"
Score: 0.020001839846372604	 Topic: 0.004*"believ" + 0.004*"watch" + 0.004*"worth" + 0.003*"long" + 0.003*"gang"
Score: 0.020001566037535667	 Topic: 0.005*"lone" + 0.005*"fall" + 0.004*"whoa" + 0.004*"touch" + 0.004*"save"
Score: 0.020001035183668137	 Topic: 0.008*"somebodi" + 0.004*"bodi" + 0.004*"everybodi" + 0.004*"tonight" + 0.004*"talkin"
Score: 0.02000098116695881	 Topic: 0.007*"hallelujah" + 0.005*"lie" + 0.004*"smoke" + 0.004*"home" + 0.004*"diamond"
Score: 0.02000093273818493	 Topic: 0.006*"sorri" + 0.005*"wake" + 0.005*"ghost" + 0.004*"whoa" + 0.004*"damn"
Score: 0.020000915974378586	 Topic: 0.005*"blame" + 0.004*"hurt" + 0.004*"mean" + 0.004*"pretti" + 0.003*"shoot"
Score: 0.02000085636973381	 Topic: 0.005*"fall" + 0.005*"mean" + 0.005*"wast" + 0.005*"world" 

In [897]:
for index, score in sorted(lda_model_tfidf[bow_vector], key=lambda tup: -1*tup[1]):
    print("Topic "+str(index), score)

Topic 1 0.8199851
Topic 8 0.02000594
Topic 2 0.020001838
Topic 6 0.020001564
Topic 9 0.020001033
Topic 4 0.02000098
Topic 5 0.02000093
Topic 3 0.020000914
Topic 0 0.020000853
Topic 7 0.020000849


In [906]:
songset.drop(columns=['Topic 1', 'Topic 2', 'Topic 3', 'Topic 4', 'Topic 5', 'Topic 6', 'Topic 7', 'Topic 8', 'Topic 9', 'Topic 0'])
songset['Topic 0'] = 0.0
songset['Topic 1'] = 0.0
songset['Topic 2'] = 0.0
songset['Topic 3'] = 0.0
songset['Topic 4'] = 0.0
songset['Topic 5'] = 0.0
songset['Topic 6'] = 0.0
songset['Topic 7'] = 0.0
songset['Topic 8'] = 0.0
songset['Topic 9'] = 0.0

In [913]:
for idx, values in songset.iterrows():
    bow_vector = dictionary.doc2bow(preprocess(values['Lyrics']))
    for indx, score in sorted(lda_model_tfidf[bow_vector], key=lambda tup: -1*tup[1]):
        songset['Topic ' + str(indx)][idx] = score

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [914]:
songset

Unnamed: 0,Week,Song,Performer,SongID,Peak Position,Weeks on Chart,spotify_genre,spotify_track_album,spotify_track_explicit,spotify_track_duration_ms,...,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6,Topic 7,Topic 8,Topic 9,Topic 0
0,10/6/18,Perfect,Ed Sheeran,PerfectEd Sheeran,1.0,56.0,"['pop', 'uk pop']",÷ (Deluxe),False,263400.0,...,0.338805,0.000000,0.000000,0.000000,0.422254,0.172768,0.057950,0.000000,0.000000,0.000000
1,10/12/19,Without Me,Halsey,Without MeHalsey,1.0,52.0,"['dance pop', 'electropop', 'etherpop', 'indie...",Without Me,True,201660.0,...,0.000000,0.819213,0.000000,0.000000,0.000000,0.165687,0.000000,0.000000,0.000000,0.000000
2,11/4/17,Say You Won't Let Go,James Arthur,Say You Won't Let GoJames Arthur,11.0,52.0,"['pop', 'post-teen pop', 'talent show', 'uk pop']",Back from the Edge,False,211466.0,...,0.280972,0.000000,0.000000,0.000000,0.126180,0.000000,0.309620,0.000000,0.273385,0.000000
3,8/10/19,Sicko Mode,Travis Scott,Sicko ModeTravis Scott,1.0,52.0,['rap'],ASTROWORLD,True,312820.0,...,0.336673,0.308320,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.350175,0.000000
4,8/10/19,High Hopes,Panic! At The Disco,High HopesPanic! At The Disco,4.0,52.0,"['baroque pop', 'emo', 'modern rock']",Pray for the Wicked,False,190946.0,...,0.736585,0.000000,0.000000,0.000000,0.000000,0.254216,0.000000,0.000000,0.000000,0.000000
5,8/24/19,Happier,Marshmello & Bastille,HappierMarshmello & Bastille,2.0,52.0,"['brostep', 'progressive electro house']",Happier,False,214289.0,...,0.245761,0.000000,0.000000,0.000000,0.000000,0.000000,0.737906,0.000000,0.000000,0.000000
6,8/12/17,Closer,The Chainsmokers Featuring Halsey,CloserThe Chainsmokers Featuring Halsey,1.0,52.0,"['electropop', 'pop', 'tropical house']",Closer,False,244960.0,...,0.072326,0.000000,0.000000,0.000000,0.335226,0.000000,0.557059,0.000000,0.029001,0.000000
7,2/25/17,Don't Let Me Down,The Chainsmokers Featuring Daya,Don't Let Me DownThe Chainsmokers Featuring Daya,3.0,52.0,"['electropop', 'pop', 'tropical house']",Don't Let Me Down,False,208373.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.939974,0.000000,0.000000
8,3/19/16,See You Again,Wiz Khalifa Featuring Charlie Puth,See You AgainWiz Khalifa Featuring Charlie Puth,1.0,52.0,"['hip hop', 'pittsburgh rap', 'pop rap', 'rap'...",See You Again (feat. Charlie Puth),False,229525.0,...,0.244157,0.000000,0.000000,0.740451,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
9,5/4/19,Better Now,Post Malone,Better NowPost Malone,3.0,52.0,"['dfw rap', 'melodic rap', 'rap']",beerbongs & bentleys,True,231266.0,...,0.010689,0.000000,0.847574,0.000000,0.133590,0.000000,0.000000,0.000000,0.000000,0.000000


# Random Forest Classification

In [915]:
songset.dtypes

Week                          object
Song                          object
Performer                     object
SongID                        object
Peak Position                float64
Weeks on Chart               float64
spotify_genre                 object
spotify_track_album           object
spotify_track_explicit        object
spotify_track_duration_ms    float64
spotify_track_popularity     float64
danceability                 float64
energy                       float64
key                          float64
loudness                     float64
mode                         float64
speechiness                  float64
acousticness                 float64
instrumentalness             float64
liveness                     float64
valence                      float64
tempo                        float64
time_signature               float64
Debut Position               float64
Lyrics                        object
LyricPositive                  int64
LyricNeutral                   int64
L

In [936]:
featureset = songset

In [937]:
featureset = featureset.drop(columns=['Week', 'SongID', 'Song','Performer', 'Tweets', 'spotify_genre', 'spotify_track_album'])

In [938]:
featureset['Classification'] = 'Top 100'

In [919]:
# Populates Classes for Ranks
for idx, values in featureset.iterrows():
    text = 'Top 100'
    if values['Peak Position'] == 1:
        text = 'Top 1'
    elif values['Peak Position'] < 6:
        text = 'Top 5'
    elif values['Peak Position'] < 16:
        text = 'Top 15'
    elif values['Peak Position'] < 31:
        text = 'Top 30'
    elif values['Peak Position'] < 51:
        text = 'Top 50'
    elif values['Peak Position'] < 76:
        text = 'Top 75'
    else:
        text = 'Top 100'
    featureset['Classification'][idx] = text

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [921]:
featureset['Classification'].value_counts() 

Top 100    600
Top 75     516
Top 50     355
Top 30     216
Top 15     184
Top 5       73
Top 1       40
Name: Classification, dtype: int64

In [939]:
for idx, values in featureset.iterrows():
    number = 10
    if values['Peak Position'] == 1:
        number = 0
    elif values['Peak Position'] < 6:
        number = 1
    elif values['Peak Position'] < 16:
        number = 2
    elif values['Peak Position'] < 31:
        number = 3
    elif values['Peak Position'] < 51:
        number = 4
    elif values['Peak Position'] < 76:
        number = 5
    else:
        number = 6
    featureset['Classification'][idx] = number

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [940]:
featureset = featureset.drop(columns=['Peak Position', 'Lyrics'])
featureset

Unnamed: 0,Weeks on Chart,spotify_track_explicit,spotify_track_duration_ms,spotify_track_popularity,danceability,energy,key,loudness,mode,speechiness,...,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6,Topic 7,Topic 8,Topic 9,Topic 0,Classification
0,56.0,False,263400.0,88.0,0.599,0.448,8.0,-6.312,1.0,0.0232,...,0.000000,0.000000,0.000000,0.422254,0.172768,0.057950,0.000000,0.000000,0.000000,0
1,52.0,True,201660.0,89.0,0.752,0.488,6.0,-7.050,1.0,0.0705,...,0.819213,0.000000,0.000000,0.000000,0.165687,0.000000,0.000000,0.000000,0.000000,0
2,52.0,False,211466.0,86.0,0.358,0.557,10.0,-7.398,1.0,0.0590,...,0.000000,0.000000,0.000000,0.126180,0.000000,0.309620,0.000000,0.273385,0.000000,2
3,52.0,True,312820.0,88.0,0.834,0.730,8.0,-3.714,1.0,0.2220,...,0.308320,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.350175,0.000000,0
4,52.0,False,190946.0,87.0,0.579,0.904,5.0,-2.729,1.0,0.0618,...,0.000000,0.000000,0.000000,0.000000,0.254216,0.000000,0.000000,0.000000,0.000000,1
5,52.0,False,214289.0,90.0,0.687,0.792,5.0,-2.749,1.0,0.0452,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.737906,0.000000,0.000000,0.000000,1
6,52.0,False,244960.0,86.0,0.748,0.524,8.0,-5.599,1.0,0.0338,...,0.000000,0.000000,0.000000,0.335226,0.000000,0.557059,0.000000,0.029001,0.000000,0
7,52.0,False,208373.0,81.0,0.532,0.869,11.0,-5.094,1.0,0.1720,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.939974,0.000000,0.000000,1
8,52.0,False,229525.0,82.0,0.689,0.481,10.0,-7.503,1.0,0.0815,...,0.000000,0.000000,0.740451,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0
9,52.0,True,231266.0,88.0,0.680,0.563,10.0,-5.843,1.0,0.0454,...,0.000000,0.847574,0.000000,0.133590,0.000000,0.000000,0.000000,0.000000,0.000000,1


In [1229]:
featureset = pd.read_csv("Data/Features.csv")
featureset2 = pd.read_csv("Data/FeaturesLimited.csv")

In [1230]:
# featureset = featureset.drop(columns=['Unnamed: 0'])
featureset2 = featureset2.drop(columns=['Unnamed: 0'])

In [1231]:
featureset2.dtypes

Weeks on Chart                 int64
spotify_track_explicit          bool
spotify_track_duration_ms      int64
spotify_track_popularity       int64
danceability                 float64
energy                       float64
key                          float64
loudness                     float64
mode                         float64
speechiness                  float64
acousticness                 float64
instrumentalness             float64
liveness                     float64
valence                      float64
tempo                        float64
time_signature               float64
Debut Position                 int64
LyricPositive                  int64
LyricNeutral                   int64
LyricNegative                  int64
TweetPositive                  int64
TweetNeutral                   int64
TweetNegative                  int64
Topic 1                      float64
Topic 2                      float64
Topic 3                      float64
Topic 4                      float64
T

In [972]:
featureset.to_csv("Data/Features.csv")

In [1232]:
# reading the data
# df=spark.createDataFrame(songset.astype(str))
df=spark.createDataFrame(featureset2)

In [1233]:
print("Total number of rows: %d" % df.count())

Total number of rows: 1984


In [1234]:
RANDOM_SEED = 13579
transformed_df = df.rdd.map(lambda row: LabeledPoint(row[-1], Vectors.dense(row[0:-1])))

splits = [0.8, 1.0 - 0.8]
training_data, test_data = transformed_df.randomSplit(splits, RANDOM_SEED)

In [1235]:
# Training the Classifier
rf = RandomForest.trainClassifier(training_data, numClasses=7, categoricalFeaturesInfo={}, \
    numTrees=25, featureSubsetStrategy="auto", impurity="gini", \
    maxDepth=10, maxBins= 32, seed=RANDOM_SEED)

predictions = rf.predict(test_data.map(lambda x: x.features))
labels_and_predictions = test_data.map(lambda x: x.label).zip(predictions)
acc = labels_and_predictions.filter(lambda x: x[0] == x[1]).count() / float(test_data.count())
print("Model accuracy: %.3f%%" % (acc * 100))

Model accuracy: 67.146%


In [1209]:
# 
metrics = BinaryClassificationMetrics(labels_and_predictions)
print("Area under Precision/Recall (PR) curve: %.f" % (metrics.areaUnderPR))
print("Area under Receiver Operating Characteristic (ROC) curve: %.3f" % (metrics.areaUnderROC))

Area under Precision/Recall (PR) curve: 1
Area under Receiver Operating Characteristic (ROC) curve: 0.990


In [1195]:
labels_and_predictions.take(20)

[(0.0, 1.0),
 (1.0, 2.0),
 (1.0, 2.0),
 (0.0, 2.0),
 (1.0, 1.0),
 (1.0, 2.0),
 (0.0, 2.0),
 (2.0, 2.0),
 (2.0, 2.0),
 (0.0, 2.0),
 (1.0, 1.0),
 (0.0, 2.0),
 (0.0, 0.0),
 (0.0, 4.0),
 (3.0, 2.0),
 (1.0, 1.0),
 (1.0, 3.0),
 (2.0, 2.0),
 (2.0, 2.0),
 (2.0, 1.0)]