# Sentiment analysis

First we'll start with processing the data and installing all needed modules so that sentiment analysis can be applied.

In [2]:
!pip install NRCLex

Collecting NRCLex
  Downloading NRCLex-3.0.0.tar.gz (396 kB)
[K     |████████████████████████████████| 396 kB 9.1 MB/s eta 0:00:01
[?25hCollecting textblob
  Downloading textblob-0.17.1-py2.py3-none-any.whl (636 kB)
[K     |████████████████████████████████| 636 kB 30.2 MB/s eta 0:00:01
Building wheels for collected packages: NRCLex
  Building wheel for NRCLex (setup.py) ... [?25ldone
[?25h  Created wheel for NRCLex: filename=NRCLex-3.0.0-py3-none-any.whl size=43328 sha256=1c197fccb3629572ea1c24d9e3c34ee4e7db73c132fae77f0fe68b15a9994269
  Stored in directory: /Users/clairek/Library/Caches/pip/wheels/68/c4/f2/c390dd3eac398fdf45f7a01c6516bc53fa7a9ab59c7d2ff518
Successfully built NRCLex
Installing collected packages: textblob, NRCLex
Successfully installed NRCLex-3.0.0 textblob-0.17.1


In [4]:
### Import required modules
from nrclex import NRCLex
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)
import nltk
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()
my_stop_words = STOPWORDS.union(set(['yeah', 'cause', 'wanna', 'gonna', 'nigga', 'fuckin', 'bitch', 'come'])) #extra stopwords 

[nltk_data] Downloading package wordnet to /Users/clairek/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [18]:
import sklearn
from sklearn.datasets import load_files
import pandas as pd
import glob
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string 
import operator

### Load dataset, adapt the path to point to your local copy of the dataset
path = "/Users/clairek/Desktop/Year 3/ba-text-mining-master/project/archive/song_info_csv/"
all_files = glob.glob(path + "*.csv")

### Instert dataset into dataframe
li = []
for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    li.append(df)  

### Take out only the lyrics from the full dataframe
df = pd.concat(li, axis=0, ignore_index=True)
data_text = df[['Lyric']]

data_text['index'] = data_text.index
documents = data_text

### Take out any empty lyrics rows 
df = pd.concat(li, axis=0, ignore_index=True)
df_new = df[df['Lyric'].notnull()]
df_new.loc[:, ~df_new.columns.isin(['Date', 'Year', 'Unnamed: 0', 'Album'])]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_text['index'] = data_text.index


Unnamed: 0,Artist,Title,Lyric
0,Dua Lipa,New Rules,one one one one one talkin' in my sleep at n...
1,Dua Lipa,Don’t Start Now,if you don't wanna see me did a full 80 craz...
2,Dua Lipa,IDGAF,you call me all friendly tellin' me how much y...
3,Dua Lipa,Blow Your Mind (Mwah),i know it's hot i know we've got something tha...
4,Dua Lipa,Be the One,i see the moon i see the moon i see the moon o...
...,...,...,...
5400,Khalid,Khalid - Vertigo (Tradução Português),será que é melhor apenas acreditar nas teorias...
5401,Khalid,Better (Miles Away Remix),i'm not really drunk i never get that fucked u...
5402,Khalid,Khalid - Better (Official Music Video),users considering it's a virus or malware must...
5403,Khalid,Perfect Lover,lyrics for this song have yet to be released p...


In [10]:
def lemmatize_stemming(text):
    return lemmatizer.lemmatize(text)
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in my_stop_words and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [12]:
### Make a list from the dataframe colummn
lyrics_list = df_new['Lyric'].values.tolist()

In [13]:
def get_emotions(text, dataframe):
    """
    Function that will take a text at return a dataframe of emotions (excluding positive and negative)
    Params:
        text (str): The song of interest
        dataframe (Pandas DataFrame): The emotions dataframe to add values
        
    Return:
        Updated dataframe with the new row of interest
    """
    nrc_text = NRCLex(text)
    nrc_dict = nrc_text.affect_frequencies
    nrc_dict.pop('positive', None)
    nrc_dict.pop('negative', None)
    nrc_dict.pop('anticip', None)
    nrc_max_emotion = max(nrc_dict.items(), key=operator.itemgetter(1))[0]
    nrc_dict["max_emotion"] = nrc_max_emotion
    dataframe = dataframe.append(nrc_dict, ignore_index= True)
    return dataframe

In [14]:
### Make a dataframe of the different emotions
emotion_df = pd.DataFrame(columns = ['fear', 'anger', 'trust', 'surprise', 'sadness', 'disgust', 'joy', 'anticipation', 'max_emotion'])

### Run lyrics through NRC to assign emotions
for text in lyrics_list:
    emotion_df = get_emotions(text, emotion_df)

In [15]:
### Make a list of all the emotions
label_gold = []

for emotion in emotion_df['max_emotion']:
    label_gold.append(emotion)

In [16]:
import pathlib
import sklearn
import numpy
import nltk
from nltk.corpus import stopwords
from collections import Counter
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

### Initialize lyric object, and then turn lyric train data into a vector 
lyric_vec = CountVectorizer(min_df=0, # If a token appears fewer times than this, across all documents, it will be ignored
                             tokenizer=nltk.word_tokenize, # we use the nltk tokenizer
                             stop_words=stopwords.words('english')) # stopwords are removed

lyric_counts = lyric_vec.fit_transform(lyrics_list)

### Convert raw frequency counts into TF-IDF values
tfidf_transformer = TfidfTransformer()
song_tfidf = tfidf_transformer.fit_transform(lyric_counts)

docs_train, docs_test, y_train, y_test = train_test_split(
    song_tfidf, # the tf-idf model
    label_gold, # the category values for each tweet 
    test_size = 0.20 # we use 80% for training and 20% for development
    ) 

### Train a multinomial Naive Bayes classifier
clf = MultinomialNB().fit(docs_train, y_train)

### Predicting the Test set results, find macro recall
y_pred = clf.predict(docs_test)

sklearn.metrics.recall_score(y_true=y_test,
                             y_pred=y_pred,
                             average='macro') 

print(classification_report(y_pred, y_test))



              precision    recall  f1-score   support

       anger       0.02      1.00      0.03         2
anticipation       0.01      1.00      0.02         1
     disgust       0.00      0.00      0.00         0
        fear       0.36      0.75      0.48       114
         joy       0.99      0.31      0.48       954
     sadness       0.05      1.00      0.09         5
    surprise       0.00      0.00      0.00         0
       trust       0.00      0.00      0.00         0

    accuracy                           0.36      1076
   macro avg       0.18      0.51      0.14      1076
weighted avg       0.92      0.36      0.47      1076



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
