# gyan kannur
## DSC550, Week 4

# 4.2: Sentiment Analysis

In [31]:
import pandas as pd

## 1) Load the data file DailyComments.csv from the Week 4 Data Files into a data frame.

In [32]:
commentsDF = pd.read_csv('./assignments-data/DailyComments.csv')

## 2) Identify a scheme to categorize each comment as positive or negative. You can devise your own scheme or find a commonly used scheme to perform this sentiment analysis. However you decide to do this, make sure to explain the scheme you decide to use.

### Manual Sentiment Encoding

In [33]:
# Assign sentiment for each comment based on my own judgement.
# The labels are positional: one entry per row of commentsDF, in row order
# (seven labels here, so the column assignment below needs a seven-row frame).
myScoring = ['Neutral', 'Positive', 'Positive', 'Neutral', 'Negative', 'Neutral', 'Positive']
commentsDF['mySentiment'] = myScoring
commentsDF

Unnamed: 0,Day of Week,comments,mySentiment
0,Monday,"Hello, how are you?",Neutral
1,Tuesday,Today is a good day!,Positive
2,Wednesday,It's my birthday so it's a really special day!,Positive
3,Thursday,Today is neither a good day or a bad day!,Neutral
4,Friday,I'm having a bad day.,Negative
5,Saturday,There' s nothing special happening today.,Neutral
6,Sunday,Today is a SUPER good day!,Positive


### Sentiment Analysis by Encoding Selected Words as Positive or Negative

In [34]:
# Create a copy of the data to work with
encodingDF = commentsDF.copy()

In [35]:
def manualEncoding(df):
    '''
    Count hand-picked positive and negative words in each comment and
    combine the counts into a single score.

    Adds four columns to df (in place) and returns the same DataFrame:
      positive1      - occurrences of the word "good"
      positive2      - occurrences of the word "special"
      negative       - occurrences of the word "bad"
      encoderScoring - positive1 + positive2 - negative

    Matching is case-insensitive and restricted to whole words, so "GOOD"
    is counted while "goodbye" or "badge" are not.
    '''
    # (?i) ignores case; the word-boundary anchors stop substring hits
    # such as "bad" inside "badge".
    df['positive1'] = df['comments'].str.count(r'(?i)\bgood\b')
    df['positive2'] = df['comments'].str.count(r'(?i)\bspecial\b')
    df['negative'] = df['comments'].str.count(r'(?i)\bbad\b')
    # Subtract negative words from positive words for an overall score
    df['encoderScoring'] = df['positive1'] + df['positive2'] - df['negative']
    return df

In [36]:
manualEncoding(encodingDF)

Unnamed: 0,Day of Week,comments,mySentiment,positive1,positive2,negative,encoderScoring
0,Monday,"Hello, how are you?",Neutral,0,0,0,0
1,Tuesday,Today is a good day!,Positive,1,0,0,1
2,Wednesday,It's my birthday so it's a really special day!,Positive,0,1,0,1
3,Thursday,Today is neither a good day or a bad day!,Neutral,1,0,1,0
4,Friday,I'm having a bad day.,Negative,0,0,1,-1
5,Saturday,There' s nothing special happening today.,Neutral,0,1,0,1
6,Sunday,Today is a SUPER good day!,Positive,1,0,0,1


### Sentiment Analysis using Vader

In [47]:
# import nltk
# nltk.download('vader_lexicon')

In [48]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [49]:
# Set up Vader sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

In [50]:
# Create a copy of the data to work with
vaderDF = commentsDF.copy()

In [51]:
def vaderScoring(df):
    '''
    Score every comment with the module-level Vader analyzer and store the
    results on df (in place), returning the same DataFrame.

    Columns added:
      negative     - the 'neg' value from polarity_scores
      neutral      - the 'neu' value from polarity_scores
      positive     - the 'pos' value from polarity_scores
      vaderScoring - the 'compound' value from polarity_scores
    '''
    # Run the analyzer once per comment and reuse the resulting dict for all
    # four columns instead of re-scoring the same text four times.
    scores = [analyzer.polarity_scores(text) for text in df['comments']]
    df['negative'] = [s['neg'] for s in scores]
    df['neutral'] = [s['neu'] for s in scores]
    df['positive'] = [s['pos'] for s in scores]
    df['vaderScoring'] = [s['compound'] for s in scores]
    return df

In [52]:
vaderScoring(vaderDF)

Unnamed: 0,Day of Week,comments,mySentiment,negative,neutral,positive,vaderScoring
0,Monday,"Hello, how are you?",Neutral,0.0,1.0,0.0,0.0
1,Tuesday,Today is a good day!,Positive,0.0,0.484,0.516,0.4926
2,Wednesday,It's my birthday so it's a really special day!,Positive,0.0,0.664,0.336,0.5497
3,Thursday,Today is neither a good day or a bad day!,Neutral,0.508,0.492,0.0,-0.735
4,Friday,I'm having a bad day.,Negative,0.538,0.462,0.0,-0.5423
5,Saturday,There' s nothing special happening today.,Neutral,0.361,0.639,0.0,-0.3089
6,Sunday,Today is a SUPER good day!,Positive,0.0,0.277,0.723,0.8327


### Sentiment Analysis using TextBlob

In [53]:
from textblob import TextBlob

In [54]:
# Create a copy of the data to work with
textBlobDF = commentsDF.copy()

In [55]:
# Score each comment's polarity with TextBlob and keep it in a new column
textBlobDF['textBlobScoring'] = [
    TextBlob(text).polarity for text in textBlobDF['comments']
]
textBlobDF

Unnamed: 0,Day of Week,comments,mySentiment,textBlobScoring
0,Monday,"Hello, how are you?",Neutral,0.0
1,Tuesday,Today is a good day!,Positive,0.875
2,Wednesday,It's my birthday so it's a really special day!,Positive,0.446429
3,Thursday,Today is neither a good day or a bad day!,Neutral,-0.0875
4,Friday,I'm having a bad day.,Negative,-0.7
5,Saturday,There' s nothing special happening today.,Neutral,0.357143
6,Sunday,Today is a SUPER good day!,Positive,0.604167


## 3) Implement your sentiment analysis with code and display the results. Note: DailyComments.csv is a purposely small file, so you will be able to clearly see why the results are what they are.

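Inspecting the results of each sentiment analysis, a threshold of ±0.4 appears to give the most accurate categorization across all three methods: scores of 0.4 or higher are labeled Positive, scores of -0.4 or lower are labeled Negative, and everything in between is labeled Neutral.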

In [58]:
import numpy as np

In [59]:
def overallSentiment(value, threshold=0.4):
    '''
    Returns a categorical sentiment value of "Positive", "Negative",
    or "Neutral" based on the provided sentiment score.

    value     - numeric sentiment score to categorize
    threshold - cutoff for a non-neutral label (default 0.4): scores at or
                above +threshold are "Positive", scores at or below
                -threshold are "Negative", anything else is "Neutral"
    '''
    if value >= threshold:
        return 'Positive'
    if value <= -threshold:
        return 'Negative'
    # Everything strictly inside the band (and NaN, which fails both
    # comparisons) falls through to Neutral.
    return 'Neutral'
    
def sentimentCol(df, column):
    '''
    Adds a 'Sentiment' column to df holding the categorical label that
    overallSentiment gives to each score in the named column, then
    returns the same DataFrame.
    '''
    labels = df[column].apply(overallSentiment)
    df['Sentiment'] = labels
    return df

def accuracy(df):
    '''
    Returns the percentage of rows whose 'Sentiment' matches 'mySentiment',
    rounded to the nearest whole percent (e.g. 86.0).

    df must contain both the 'mySentiment' and 'Sentiment' columns.
    '''
    matches = df['mySentiment'] == df['Sentiment']
    # Scale to a percentage before rounding: rounding the fraction first and
    # then multiplying by 100 reintroduces float error
    # (0.57 * 100 -> 56.99999999999999).
    return float(np.round(matches.mean() * 100))

### Manual Word Encoding Results

In [60]:
sentimentCol(encodingDF, 'encoderScoring')
print(f'Overall accuracy: {accuracy(encodingDF)}%')
encodingDF

Overall accuracy: 86.0%


Unnamed: 0,Day of Week,comments,mySentiment,positive1,positive2,negative,encoderScoring,Sentiment
0,Monday,"Hello, how are you?",Neutral,0,0,0,0,Neutral
1,Tuesday,Today is a good day!,Positive,1,0,0,1,Positive
2,Wednesday,It's my birthday so it's a really special day!,Positive,0,1,0,1,Positive
3,Thursday,Today is neither a good day or a bad day!,Neutral,1,0,1,0,Neutral
4,Friday,I'm having a bad day.,Negative,0,0,1,-1,Negative
5,Saturday,There' s nothing special happening today.,Neutral,0,1,0,1,Positive
6,Sunday,Today is a SUPER good day!,Positive,1,0,0,1,Positive


### Vader Analysis Results

In [61]:
sentimentCol(vaderDF, 'vaderScoring')
print(f'Overall accuracy: {accuracy(vaderDF)}%')
vaderDF

Overall accuracy: 86.0%


Unnamed: 0,Day of Week,comments,mySentiment,negative,neutral,positive,vaderScoring,Sentiment
0,Monday,"Hello, how are you?",Neutral,0.0,1.0,0.0,0.0,Neutral
1,Tuesday,Today is a good day!,Positive,0.0,0.484,0.516,0.4926,Positive
2,Wednesday,It's my birthday so it's a really special day!,Positive,0.0,0.664,0.336,0.5497,Positive
3,Thursday,Today is neither a good day or a bad day!,Neutral,0.508,0.492,0.0,-0.735,Negative
4,Friday,I'm having a bad day.,Negative,0.538,0.462,0.0,-0.5423,Negative
5,Saturday,There' s nothing special happening today.,Neutral,0.361,0.639,0.0,-0.3089,Neutral
6,Sunday,Today is a SUPER good day!,Positive,0.0,0.277,0.723,0.8327,Positive


### TextBlob Analysis Results

In [62]:
sentimentCol(textBlobDF, 'textBlobScoring')
print(f'Overall accuracy: {accuracy(textBlobDF)}%')
textBlobDF

Overall accuracy: 100.0%


Unnamed: 0,Day of Week,comments,mySentiment,textBlobScoring,Sentiment
0,Monday,"Hello, how are you?",Neutral,0.0,Neutral
1,Tuesday,Today is a good day!,Positive,0.875,Positive
2,Wednesday,It's my birthday so it's a really special day!,Positive,0.446429,Positive
3,Thursday,Today is neither a good day or a bad day!,Neutral,-0.0875,Neutral
4,Friday,I'm having a bad day.,Negative,-0.7,Negative
5,Saturday,There' s nothing special happening today.,Neutral,0.357143,Neutral
6,Sunday,Today is a SUPER good day!,Positive,0.604167,Positive


## 4) For up to 5% extra credit, find another set of comments, e.g., some tweets, and perform the same sentiment analysis.

In [67]:
# Load a saved 15-comment subset of the Week 2 'Controversial Comments' data set
# NOTE(review): with header=1 pandas treats the file's second line as the header
# row (replaced here by `names`) and skips the line above it -- confirm the CSV
# really has an extra leading line; otherwise header=0 would avoid losing a comment.
newCommentsDF = pd.read_csv('./assignments-data/comments_subset.csv', header=1, names=['con', 'comments'])

In [68]:
# Score every comment's polarity with TextBlob
newCommentsDF['textBlobScoring'] = [
    TextBlob(text).polarity for text in newCommentsDF['comments']
]
# Convert the numeric scores into categorical Sentiment labels
sentimentCol(newCommentsDF, 'textBlobScoring')
newCommentsDF

Unnamed: 0,con,comments,textBlobScoring,Sentiment
0,0,"didn't this guy start the hunt for WMDs, that ...",0.0,Neutral
1,0,I remembered when he said that at the debate. ...,0.299167,Neutral
2,0,"Sorry, but the media isn't the reason there's ...",-0.366667,Neutral
3,0,Censor everything I don't like!\n\nOnly leftis...,-0.025,Neutral
4,0,I think it's the pert where that's not true th...,0.24375,Neutral
5,0,"Oh, I get three guesses? That's great! I bet i...",0.4,Positive
6,0,Some of you people might have to get actual jo...,0.0,Neutral
7,0,"Oops, sorry. Thought they said stein for some...",-0.5,Negative
8,0,Maybe he was afraid of a Trump presidency targ...,-0.6,Negative
9,0,What is the false argument?,-0.4,Negative


Because only the first part of each comment is visible, it is difficult to judge the accuracy of each Sentiment assignment. Very few comments are labeled Positive, though, which seems plausible for a "Controversial Comments" data set.