# Sentiment Analysis Notebook

#### This notebook contains a demonstration of the tools necessary for conducting descriptive analysis of the data. This includes things such as frequency analysis, descriptive statistics, and temporal frequency.

In [None]:
import trt_API.process as proc
import trt_API.analysis as ana
import trt_API.sentiment as sent

import datetime
# Flag handed to ana.sortByDate further down and overwritten with that call's
# second return value. Presumably records whether the dataframe has already
# been date-sorted -- confirm against trt_API.analysis.
SORTED = False

## Variables for Analysis

In [None]:
# Path to the parent directory containing all Tweets of interest; passed
# unchanged to proc.loadTweetObjects (the trailing '*' is presumably expanded
# as a glob there -- confirm).
DIRECTORY = './tweets/*'
# Set to True to isolate English-language tweets; forwarded to
# proc.convertTweetsToDataframe.
ENGLISH = False

## Load Tweets and Generate Dataframe

In [None]:
# Load the raw tweet objects located via DIRECTORY.
tweet_objects = proc.loadTweetObjects(DIRECTORY)
# Build the working dataframe from them; ENGLISH is the language-filter flag
# set in the variables cell.
df = proc.convertTweetsToDataframe(tweet_objects, ENGLISH)

## Extract Potential Cashtags

In [None]:
# Run cashtag extraction on the full, unfiltered dataframe. The result (ctdf)
# is kept for inspection; no later cell in this notebook section reads it.
ctdf = proc.extractPossibleCashtags(df)

## Removing Noisy Tweets

In [None]:
'''
*** Tweets often use popular hashtags with unrelated topics.
*** Noisy words can be identified to use to filter such tweets.
*** Enter these words below in the noisy_terms list.
'''
# Left empty by default: no filter terms are supplied until this list is
# populated.
noisy_terms = []
# cldf is the cleaned dataframe that the remaining cells operate on.
cldf = proc.removeNoisyTerms(df, noisy_terms)

## Remove Retweets

In [None]:
# removeRetweets returns two dataframes: the first is bound to cldf_no_RT and
# the second replaces cldf. Later cells keep working on cldf, not cldf_no_RT.
# NOTE(review): presumably the first has retweets dropped while the second
# keeps every row -- confirm against trt_API.process.removeRetweets.
cldf_no_RT, cldf = proc.removeRetweets(cldf)

In [None]:
# Report how many rows cldf_no_RT contains.
print(cldf_no_RT.shape[0])

## SPLITTING DATA

### By date

In [None]:
# Sort by date; sortByDate hands back the dataframe together with the updated
# SORTED flag.
cldf, SORTED = ana.sortByDate(cldf, SORTED, PRINT_TOP=False, TOP=10)

# Window bounds, given as (year, month, day, hour, minute).
begin = datetime.datetime(2018, 12, 31, 23, 59)
end = datetime.datetime(2019, 2, 24, 23, 59)

# Keep only the rows whose date lies strictly between begin and end.
specific_range_cldf = cldf[(cldf.date > begin) & (cldf.date < end)]

### By term

In [None]:
# Two independent term groups; each list holds the spaced and the run-together
# spelling of the same title.
terms_of_interest = ['beale street','bealestreet']
more_terms_of_interest = ['black panther','blackpanther']
# One subset dataframe per term group, both drawn from cldf. They are compared
# against each other in the final cell of the notebook.
bsdf = proc.findTerms(cldf, terms_of_interest)
bpdf = proc.findTerms(cldf, more_terms_of_interest)

## Convert Tweets to List

In [None]:
# Collapse the two text columns into `tweet`: use original_tweet wherever it
# holds real text, and keep the existing tweet text where original_tweet is
# the placeholder string 'None'.
#
# This is done with one Series.where assignment rather than the chained form
# cldf['original_tweet'][mask] = ..., which may write to a temporary copy
# (SettingWithCopyWarning) and leaves cldf unchanged under pandas
# copy-on-write.
cldf['tweet'] = cldf['tweet'].where(
    cldf['original_tweet'] == 'None',  # True -> keep the current tweet text
    cldf['original_tweet'],            # False -> take the original tweet text
)
# original_tweet is now redundant.
del cldf['original_tweet']
print(cldf.head())
# Plain Python list of tweet texts, consumed by the sentiment cells below.
tweets = list(cldf.tweet)

## Sentiment

### Total Sentiment

In [None]:
'''
*** First we can look at overall sentiment.
*** Here we will look at a histogram and a binary comparison (ignoring neutral).
'''
# (low, high) sentiment bounds handed to the plotting helper.
srange = (-8,10) # sentiment range (for plot)
# Plot sentiment for the full list of tweet texts built in the previous cell.
sent.plotTotalSentiment(tweets,srange)

### Hashtag & Username Raw Sentiment

In [None]:
# Shared switches for the hashtag/username cells that follow; they are passed
# as the third and fourth positional arguments to sent.computeHashtagSentiment.
# NORMALIZED = False selects the raw (non-normalized) variant for this section.
NORMALIZED = False
# PLOT is not reassigned later, so it also applies to the normalized section.
PLOT = True

#### Hashtags

In [None]:
'''
*** Here we compute sentiment for each hashtag of interest.
'''
# Fill in the hashtags to score; the list is empty by default.
hashtags = []
# Raw-sentiment result for the hashtags above (NORMALIZED is False here).
raw_htag_sent = sent.computeHashtagSentiment(tweets,hashtags,NORMALIZED,PLOT)

#### Usernames

In [None]:
'''
*** Here we compute sentiment for each username of interest.
'''
# Fill in the usernames to score; the list is empty by default.
usernames = []
# NOTE(review): usernames are scored through computeHashtagSentiment, the same
# helper used for hashtags. Presumably it matches arbitrary terms -- confirm
# that no username-specific helper was intended.
raw_user_sent = sent.computeHashtagSentiment(tweets,usernames,NORMALIZED,PLOT)

### Hashtag & Username Normalized Sentiment

In [None]:
# Flip the shared flag so the following hashtag/username cells request the
# normalized variant; PLOT keeps the value set in the raw-sentiment section.
NORMALIZED = True

#### Hashtags

In [None]:
'''
*** Here we compute sentiment for each hashtag of interest.
'''
# Rebinds hashtags to a fresh empty list; fill in the hashtags to score.
hashtags = []
# Same call as the raw section, now with NORMALIZED set to True.
norm_htag_sent = sent.computeHashtagSentiment(tweets,hashtags,NORMALIZED,PLOT)

#### Usernames

In [None]:
'''
*** Here we compute sentiment for each username of interest.
'''
# Rebinds usernames to a fresh empty list; fill in the usernames to score.
usernames = []
# NOTE(review): as in the raw section, usernames go through
# computeHashtagSentiment -- confirm this is the intended helper.
norm_user_sent = sent.computeHashtagSentiment(tweets,usernames,NORMALIZED,PLOT)

## View Sentiment over Time

In [None]:
# Rebind cldf to the result of computeDataframeSentiment. The plotting cells
# below read a `sentiment` column from it.
cldf = sent.computeDataframeSentiment(cldf)

In [None]:
# Average sentiment for every (year, month) pair, shown as a bar chart.
(
    cldf.sentiment
    .groupby([cldf['date'].dt.year, cldf["date"].dt.month])
    .mean()
    .plot(kind="bar")
)

### Subset Sentiment by Date

In [None]:
# Re-derive the date window from the current cldf. specific_range_cldf was
# originally sliced before computeDataframeSentiment ran, so that earlier copy
# presumably has no `sentiment` column and would fail on the access below.
# `begin` and `end` are the bounds defined in the "By date" cell.
specific_range_cldf = cldf[(cldf.date > begin) & (cldf.date < end)]

# Average sentiment for every (month, ISO week) pair inside the window.
# Series.dt.week was deprecated in pandas 1.1 and removed in 2.0;
# dt.isocalendar().week is its replacement.
(
    specific_range_cldf.sentiment
    .groupby([
        specific_range_cldf['date'].dt.month,
        specific_range_cldf['date'].dt.isocalendar().week,
    ])
    .mean()
    .plot(kind='bar')
)

## Compare Sentiment from Different Dataframes

In [None]:
# Dataframes to compare: the 'beale street' and 'black panther' term subsets
# built in the "By term" cell.
df_list = [bsdf,bpdf]
# Local lowercase flag, separate from the NORMALIZED constant used earlier.
normalized = True
# One label per dataframe -- presumably matched to df_list by position; confirm
# against sent.compareSentimentByDataframe.
labels = ['1','2']
sent.compareSentimentByDataframe(df_list,normalized,labels)