# Satish Agrawal
## Bellevue University
### DSC 550

In [1]:
# import required libraries
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import numpy as np

## Load the data

In [2]:
# Load the data file DailyComments.csv from the Week 4 Data Files into a data frame.
file_path='week-4/DailyComments.csv'
dataframe = pd.read_csv(file_path)
print(dataframe)

  Day of Week                                        comments
0      Monday                             Hello, how are you?
1     Tuesday                            Today is a good day!
2   Wednesday  It's my birthday so it's a really special day!
3    Thursday       Today is neither a good day or a bad day!
4      Friday                           I'm having a bad day.
5    Saturday       There' s nothing special happening today.
6      Sunday                      Today is a SUPER good day!


## Results using scheme provided

In [3]:
# vectorize the comments from dataframe
corpus = dataframe['comments']
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
print("")
print("vectorized words")
print("")
print(vectorizer.get_feature_names())
print("")
print("Identify Feature Words - Matrix View")
print("")
print( X.toarray())


vectorized words

['are', 'bad', 'birthday', 'day', 'good', 'happening', 'having', 'hello', 'how', 'is', 'it', 'my', 'neither', 'nothing', 'or', 'really', 'so', 'special', 'super', 'there', 'today', 'you']

Identify Feature Words - Matrix View

[[1 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1]
 [0 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0]
 [0 0 1 1 0 0 0 0 0 0 2 1 0 0 0 1 1 1 0 0 0 0]
 [0 1 0 2 1 0 0 0 0 1 0 0 1 0 1 0 0 0 0 0 1 0]
 [0 1 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 1 0 1 1 0]
 [0 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 0]]


In [17]:
# trying out the example provided

df = pd.DataFrame({'text' : corpus})

#check for positive words and negative words
df['positive1'] = df.text.str.count('good')
df['positive2']= df.text.str.count('special')
df['negative'] = df.text.str.count('bad')
df['TotScore'] = df.positive1 + df.positive2 - df.negative

print("")
print(df)

Z = sum(df['TotScore'])
print("")
print("Overall Score:  ",Z)


                                             text  positive1  positive2  \
0                             Hello, how are you?          0          0   
1                            Today is a good day!          1          0   
2  It's my birthday so it's a really special day!          0          1   
3       Today is neither a good day or a bad day!          1          0   
4                           I'm having a bad day.          0          0   
5       There' s nothing special happening today.          0          1   
6                      Today is a SUPER good day!          1          0   

   negative  TotScore  
0         0         0  
1         0         1  
2         0         1  
3         1         0  
4         1        -1  
5         0         1  
6         0         1  

Overall Score:   3


## Analysis using NLTK VADER

In [5]:
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\SAgrawal\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [6]:
df1 = pd.DataFrame({'text' : corpus})

df1['scores'] = df1['text'].apply(lambda t: sid.polarity_scores(t))

df1['compound']  = df1['scores'].apply(lambda score_dict: score_dict['compound'])

In [7]:
df1

Unnamed: 0,text,scores,compound
0,"Hello, how are you?","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0
1,Today is a good day!,"{'neg': 0.0, 'neu': 0.484, 'pos': 0.516, 'comp...",0.4926
2,It's my birthday so it's a really special day!,"{'neg': 0.0, 'neu': 0.664, 'pos': 0.336, 'comp...",0.5497
3,Today is neither a good day or a bad day!,"{'neg': 0.508, 'neu': 0.492, 'pos': 0.0, 'comp...",-0.735
4,I'm having a bad day.,"{'neg': 0.538, 'neu': 0.462, 'pos': 0.0, 'comp...",-0.5423
5,There' s nothing special happening today.,"{'neg': 0.361, 'neu': 0.639, 'pos': 0.0, 'comp...",-0.3089
6,Today is a SUPER good day!,"{'neg': 0.0, 'neu': 0.277, 'pos': 0.723, 'comp...",0.8327


VADER (Valence Aware Dictionary and sEntiment Reasoner) is a lexicon and rule-based sentiment analysis tool that is specifically tuned to sentiments expressed in social media platforms.It works good for the daily comments files provided. 

## Using NLTK movie reviews to train and then run on daily comments
#### I tried this multiple times but it looks like all the comments are being cosidered negative if we use the model trained movie reivews.

In [8]:
import nltk
nltk.download('movie_reviews')
nltk.download('punkt')
from nltk.corpus import movie_reviews

[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\SAgrawal\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\SAgrawal\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [9]:
# Document is a list of (words of review, category of review)
document = [(movie_reviews.words(file_id),category) for file_id in movie_reviews.fileids() for category in movie_reviews.categories(file_id)]
document

[(['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...], 'neg'),
 (['the', 'happy', 'bastard', "'", 's', 'quick', 'movie', ...], 'neg'),
 (['it', 'is', 'movies', 'like', 'these', 'that', 'make', ...], 'neg'),
 (['"', 'quest', 'for', 'camelot', '"', 'is', 'warner', ...], 'neg'),
 (['synopsis', ':', 'a', 'mentally', 'unstable', 'man', ...], 'neg'),
 (['capsule', ':', 'in', '2176', 'on', 'the', 'planet', ...], 'neg'),
 (['so', 'ask', 'yourself', 'what', '"', '8mm', '"', '(', ...], 'neg'),
 (['that', "'", 's', 'exactly', 'how', 'long', 'the', ...], 'neg'),
 (['call', 'it', 'a', 'road', 'trip', 'for', 'the', ...], 'neg'),
 (['plot', ':', 'a', 'young', 'french', 'boy', 'sees', ...], 'neg'),
 (['best', 'remembered', 'for', 'his', 'understated', ...], 'neg'),
 (['janeane', 'garofalo', 'in', 'a', 'romantic', ...], 'neg'),
 (['and', 'now', 'the', 'high', '-', 'flying', 'hong', ...], 'neg'),
 (['a', 'movie', 'like', 'mortal', 'kombat', ':', ...], 'neg'),
 (['she', 'was', 'the', 'femme', 'in', 

In [10]:
# all_words is a dictionary which contains the frequency of words in ‘movie_reviews’
all_words = nltk.FreqDist(movie_reviews.words())

In [11]:
# Defining the feature_vector
feature_vector = list(all_words)
len(feature_vector)

39768

In [12]:
# we define a function that finds the features
def find_feature(word_list):
    return {x: True for x in word_list}

def find_feature2(word_list):
    # Initialization
    feature = {}
    # For loop to find the feature. ‘True’ is assigned if word in feature_vector can also be found in review. Otherwise ‘False’
    for x in feature_vector:
        feature[x] = x in word_list
    return feature

In [13]:
# Feature_sets stores the ‘feature’ of every review
# Below step take several hours ro finish. I had to let it run overnight
feature_sets = [(find_feature2(word_list),category) for (word_list,category) in document]

In [14]:
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import SVC
from sklearn import model_selection

model = SklearnClassifier(SVC(kernel = 'linear'))
model.train(feature_sets)

<SklearnClassifier(SVC(kernel='linear'))>

In [15]:
df2 = pd.DataFrame({'text' : corpus})
scores = pd.Series()
for text in df2.text:
    print(model.classify(find_feature2(text)))

scores

  scores = pd.Series()


neg
neg
neg
neg
neg
neg
neg


Series([], dtype: float64)