# Descriptive Analysis Notebook

#### This notebook demonstrates the tools needed to conduct a descriptive analysis of the data, including frequency analysis, descriptive statistics, and temporal frequency.

In [None]:
import trt_API.process as proc
import trt_API.analysis as ana
import pandas as pd
import matplotlib.pyplot as plt
# Flag handed to ana.sortByDate in the "Sort By Date" cell and rebound from its
# return value; presumably records whether the data is already date-sorted — confirm.
SORTED = False

## Variables for Analysis

In [None]:
# Set the path to the parent directory containing all Tweets of interest.
# This value is passed to proc.loadTweetObjects in the loading cell; the
# trailing '*' suggests a glob pattern — confirm against proc.loadTweetObjects.
DIRECTORY = './tweets/*'
# Set to True to isolate English-language tweets. This value is passed to
# proc.convertTweetsToDataframe when the dataframe is built.
ENGLISH = False

## Load Tweets and Generate Dataframe

In [None]:
# Encoding switch forwarded to the dataframe conversion below.
# Use False on Macs for better results.
encoding = True

# Load the raw tweet objects found under DIRECTORY, then convert them into the
# dataframe (df) that the rest of the notebook works from.
tweet_objects = proc.loadTweetObjects(DIRECTORY)
df = proc.convertTweetsToDataframe(tweet_objects, ENGLISH, encoding)

## Extract Potential Cashtags

In [None]:
# Run the cashtag extraction helper over the full dataframe and keep its result
# in ctdf. NOTE(review): ctdf is not referenced by any later cell visible in
# this notebook — confirm whether it is meant to feed the steps below.
ctdf = proc.extractPossibleCashtags(df)

## Removing Noisy Tweets

In [None]:
'''
*** Tweets often use popular hashtags with unrelated topics.
*** Noisy words can be identified to use to filter such tweets.
*** Enter these words below in the noisy_terms list.
'''
# Terms used to filter out off-topic tweets; left empty by default.
noisy_terms = []
# Filter df with the terms above. The result (cldf) is the dataframe the
# following cells operate on.
cldf = proc.removeNoisyTerms(df, noisy_terms)

## Remove Retweets

In [None]:
# removeRetweets returns two dataframes. The first is bound to cldf_no_RT
# (presumably the data with retweets excluded — confirm against
# proc.removeRetweets); the second rebinds cldf, which the later cells use.
cldf_no_RT, cldf = proc.removeRetweets(cldf)

In [None]:
# Report how many rows the retweet-free dataframe contains.
print(len(cldf_no_RT))

## Convert Tweets to List

In [None]:
# Where 'original_tweet' holds the placeholder string 'None', fall back to the
# text in 'tweet'. A single .loc assignment is used instead of the chained
# cldf['original_tweet'][mask] = ... form, which triggers pandas'
# SettingWithCopyWarning and does not update cldf under Copy-on-Write.
missing_original = cldf['original_tweet'] == 'None'
cldf.loc[missing_original, 'original_tweet'] = cldf.loc[missing_original, 'tweet']

# From here on 'tweet' carries the resolved text, so the helper column can go.
cldf['tweet'] = cldf['original_tweet']
del cldf['original_tweet']
print(cldf.head())

# Plain Python list of the tweet texts, consumed by the ana.* calls below.
tweets = list(cldf.tweet)

## Hashtag Frequencies

In [None]:
# Items passed as the first argument to ana.countItems in the next cell;
# left empty by default. Presumably the hashtags to count — fill in as needed.
hashtags = []

In [None]:
# Count the items in `hashtags` against the tweet texts. PLOT=True / PRINT=False
# presumably request a plot and suppress printed output — confirm against
# ana.countItems.
ana.countItems(hashtags,tweets,PLOT=True,PRINT=False)

In [None]:
# Items passed as the first argument to ana.countItems in the next cell;
# left empty by default. Presumably the usernames to count — fill in as needed.
usernames = []

In [None]:
# Count the items in `usernames` against the tweet texts. PLOT=True / PRINT=False
# presumably request a plot and suppress printed output — confirm against
# ana.countItems.
ana.countItems(usernames,tweets,PLOT=True,PRINT=False)

## Sort By Date

In [None]:
# sortByDate receives cldf and the current SORTED flag and returns a dataframe
# (bound to sorted_cldf, presumably the date-sorted data) together with an
# updated flag. PRINT_TOP / TOP presumably control an optional preview of the
# first TOP rows — confirm against ana.sortByDate.
sorted_cldf, SORTED = ana.sortByDate(cldf,SORTED,PRINT_TOP=False,TOP=10)

In [None]:
# Preview the first rows of cldf.
# NOTE(review): this shows cldf, not the sorted_cldf returned by the previous
# cell — confirm whether ana.sortByDate sorts cldf in place or whether
# sorted_cldf was intended here.
cldf.head()

## Plot by Month

In [None]:
# Count tweets per (year, month) of the 'date' column and draw the counts as a
# bar chart. The .dt accessor requires 'date' to hold datetime values.
tweet_dates = cldf['date'].dt
monthly_counts = cldf.tweet.groupby([tweet_dates.year, tweet_dates.month]).count()
monthly_counts.plot(kind="bar")

## Sort by Number of Followers

In [None]:
TOP_N = 20  ## USE THIS VARIABLE TO SET THE NUMBER OF USERS TO ANALYZE

# Convert the follower counts to integers using cldf's own column. The earlier
# form read the values from the unfiltered df and so depended on cldf keeping
# df's index through the filtering steps. Item assignment is used because
# attribute-style assignment only works when the column already exists.
cldf['followers'] = cldf['followers'].astype(int)

# Sort in place, largest follower count first; the bar chart in the next cell
# takes the first TOP_N rows of cldf and relies on this ordering.
cldf.sort_values(by='followers', ascending=False, inplace=True)
cldf.head(TOP_N)

In [None]:
# Bar chart of follower counts for the first TOP_N rows of cldf (already sorted
# by followers in the previous cell), one bar per username.
top_users = cldf.iloc[:TOP_N]
bar_labels = top_users.username.astype(str)
bar_heights = top_users.followers.astype(int)
plt.bar(bar_labels, bar_heights)
# Rotate the username labels so they do not overlap.
plt.xticks(rotation='vertical')
plt.show()

## Top Hashtags

In [None]:
# Note: this rebinds the notebook-wide TOP_N that the followers cells also use.
TOP_N = 20 ## USE THIS NUMBER TO SET THE NUMBER OF TOP HASHTAGS TO LOOK AT
# Pass the tweet texts and the cut-off to the top-hashtags helper.
ana.topHashtags(tweets,TOP_N)

## Top User Mentions

In [None]:
# Note: this rebinds the notebook-wide TOP_N that earlier cells also use.
TOP_N = 20 ## USE THIS NUMBER TO SET THE NUMBER OF TOP USER MENTIONS TO LOOK AT
# Pass the tweet texts and the cut-off to the top-user-mentions helper.
ana.topUserMentions(tweets,TOP_N)