# Clustering Analysis Notebook

#### This notebook demonstrates the tools needed to run a clustering analysis on Twitter data.

In [None]:
import trt_API.process as proc
import trt_API.analysis as ana
import trt_API.clustering as cluster
import trt_API.org_research as org 

import datetime
from sklearn.feature_extraction import text 

# Passed to ana.sortByDate in the "By date" cell and overwritten with that
# call's second return value.
# NOTE(review): presumably flags whether the data is already date-sorted -- confirm.
SORTED = False

## PREPROCESSING

### Variables for Analysis

In [None]:
# Set the path to the parent directory containing all Tweets of interest.
# This value is handed to proc.loadTweetObjects when the tweets are loaded.
DIRECTORY = './tweets/*'
# Set to True to isolate English-language tweets; forwarded to
# proc.convertTweetsToDataframe.
ENGLISH = True

### Load Tweets and Generate DataFrame

In [None]:
# Load the tweet objects found under DIRECTORY.
tweet_objects = proc.loadTweetObjects(DIRECTORY)
# Third argument to proc.convertTweetsToDataframe, disabled here.
# NOTE(review): its exact meaning is defined in trt_API.process -- confirm
# before changing it.
encoding = False
# Build the working DataFrame used by every later cell.
df = proc.convertTweetsToDataframe(tweet_objects, ENGLISH, encoding)

### Extract Potential Cashtags

In [None]:
# Extract possible cashtags from the tweet DataFrame.
# NOTE(review): ctdf is not referenced by any later cell shown in this
# notebook -- confirm whether it is still needed.
ctdf = proc.extractPossibleCashtags(df)

### Removing Noisy Tweets

In [None]:
# Tweets often reuse popular hashtags for unrelated topics. Noisy words can be
# identified and used to filter such tweets: enter them in `noisy_terms`.
noisy_terms = []
cldf = proc.removeNoisyTerms(df, noisy_terms)

# Rows whose 'original_tweet' holds the placeholder string 'None' fall back to
# the text in 'tweet'. This is done with one .loc assignment rather than
# chained indexing (cldf[col][mask] = ...): chained indexing may write to a
# temporary copy and leave cldf untouched, and it never updates the frame when
# pandas Copy-on-Write is enabled.
missing_original = cldf['original_tweet'] == 'None'
cldf.loc[missing_original, 'original_tweet'] = cldf['tweet']

# Keep the resolved text under 'tweet' and drop the helper column.
cldf['tweet'] = cldf['original_tweet']
del cldf['original_tweet']

### Remove Retweets

In [None]:
# proc.removeRetweets returns two values: the first is kept as cldf_no_RT and
# the second replaces cldf for the rest of the notebook.
# NOTE(review): presumably cldf_no_RT is the data with retweets removed --
# confirm against trt_API.process.
cldf_no_RT, cldf = proc.removeRetweets(cldf)

In [None]:
# Show the number of rows in cldf_no_RT.
print(cldf_no_RT.shape[0])

## SPLITTING DATA

### By date

In [None]:
# Sort the tweets by date and carry the updated SORTED flag forward.
cldf, SORTED = ana.sortByDate(cldf, SORTED, PRINT_TOP=False, TOP=10)

# Window bounds; datetime arguments are year, month, day, hour, minute.
begin = datetime.datetime(2018, 8, 1, 0, 0)
end = datetime.datetime(2019, 2, 24, 23, 59)

# Keep only tweets strictly inside the window (both bounds are exclusive).
in_range = (cldf.date > begin) & (cldf.date < end)
specific_range_cldf = cldf[in_range]

### By terms

In [None]:
# Terms handed to proc.findTerms together with the date-filtered tweets.
terms_of_interest = []
# NOTE(review): bsdf is not used by the clustering cells shown below, which
# operate on specific_range_cldf -- confirm whether that is intended.
bsdf = proc.findTerms(specific_range_cldf, terms_of_interest)

## CLUSTERING

In [None]:
# Enter additional stop words here. Unlike the noisy terms, these words are
# ignored; the tweets that contain them are not removed.
ADDITIONAL_STOP_WORDS = ['rt', 'wa', 'http', 'ha', '1', 'amp']

# Merge scikit-learn's English stop words with the org_research list and the
# extras above in one union call.
stop_words = text.ENGLISH_STOP_WORDS.union(
    org.STOP_WORDS,
    ADDITIONAL_STOP_WORDS,
)

In [None]:
'''
*** Clustering requires the use of a number of parameters for tuning.
*** These are included below and should be set based on your project.
'''
# Second argument to cluster.tfidf.
n_FEATURES = 300
# Passed to cluster.KMeans, cluster.printClusterResults and cluster.tSNE.
n_TOPICS = 10
# Passed to cluster.printClusterResults.
n_TOP_WORDS = 10
# Passed to cluster.printClusterResults.
n_TOP_TWEETS = 10
# Third argument to cluster.tfidf.
# NOTE(review): presumably the maximum n-gram length -- confirm in trt_API.clustering.
NGRAM = 3

In [None]:
# Run cluster.tfidf on the date-filtered tweets; the call yields the tfidf
# result and the matching feature names.
tfidf, tfidf_feature_names = cluster.tfidf(
    specific_range_cldf,
    n_FEATURES,
    NGRAM,
    stop_words,
)

In [None]:
# Run cluster.KMeans on the tfidf output with n_TOPICS clusters. km is later
# passed to cluster.printClusterResults and kmeans_embedding to cluster.tSNE.
km, kmeans_embedding = cluster.KMeans(tfidf, n_TOPICS)

In [None]:
# Report the clustering results for the date-filtered tweets.
cluster.printClusterResults(
    specific_range_cldf,
    km,
    tfidf,
    tfidf_feature_names,
    n_TOP_WORDS,
    n_TOPICS,
    n_TOP_TWEETS,
)

In [None]:
# Perplexity value handed to cluster.tSNE along with the k-means embedding.
PERPLEXITY = 2.0
cluster.tSNE(
    kmeans_embedding,
    PERPLEXITY,
    n_TOPICS,
    title='t-SNE Visualization of Data',
)

## Compute Inertia to Determine the Optimal Number of Clusters

In [None]:
# Arguments for cluster.optimalClustersKMeans, listed in call order after tfidf.
# NOTE(review): presumably LOOPS is the number of repeated runs and the
# START/END values bound the cluster counts that are tried -- confirm in
# trt_API.clustering.
LOOPS = 5
n_TOPICS_START = 5
n_TOPICS_END = 25
cluster.optimalClustersKMeans(tfidf, LOOPS, n_TOPICS_START, n_TOPICS_END)