# Mapping Analysis Notebook
# <font color='red'> *NOTE: Make sure to update the trt_API files!* </font>

#### This notebook contains a demonstration of the tools necessary for conducting descriptive analysis of the data. This includes things such as frequency analysis, descriptive statistics and temporal frequency.

In [None]:
import trt_API.process as proc
import trt_API.analysis as ana

import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Flag handed to ana.sortByDate in the "By date" cell and rebound to that
# call's second return value; presumably records whether the dataframe has
# already been sorted by date -- confirm in trt_API.analysis.
SORTED = False

In [None]:
!pip3 install folium
import folium
import json
from folium import plugins

## Variables for Analysis

In [None]:
# Language filter: switch to True to keep only English-language tweets.
ENGLISH = False
# Path pattern for the parent directory that holds every tweet of interest.
DIRECTORY = './tweets/*'

## Load Tweet and Generate Dataframe

In [None]:
# Encoding toggle forwarded to the dataframe conversion below.
# Use False on Macs for better results.
encoding = True

# Load the raw tweet objects found under DIRECTORY, then turn them into the
# dataframe (`df`) that the rest of the notebook works on. ENGLISH is passed
# through as the language-filter switch.
tweet_objects = proc.loadTweetObjects(DIRECTORY)
df = proc.convertTweetsToDataframe(tweet_objects, ENGLISH, encoding)

## Extract Potential Cashtags

In [None]:
# Run the cashtag extraction over the full tweet dataframe. The result is kept
# in `ctdf`; presumably these are '$TICKER'-style tokens -- confirm against
# trt_API.process.extractPossibleCashtags.
ctdf = proc.extractPossibleCashtags(df)

## Removing Noisy Tweets

In [None]:
# Build the cleaned dataframe `cldf` by passing `df` and the noisy_terms list
# to proc.removeNoisyTerms.
# NOTE(review): with the list left empty this is presumably a pass-through --
# confirm in trt_API.process.
'''
*** Tweets often use popular hashtags with unrelated topics.
*** Noisy words can be identified to use to filter such tweets.
*** Enter these words below in the noisy_terms list.
'''
noisy_terms = []
cldf = proc.removeNoisyTerms(df, noisy_terms)

## Remove Retweets

In [None]:
# proc.removeRetweets returns two values: the first is kept as `cldf_no_RT`
# and the second is rebound to `cldf`.
# NOTE(review): presumably the first has retweets dropped and the second is
# the frame with retweets retained -- confirm in trt_API.process.
cldf_no_RT, cldf = proc.removeRetweets(cldf)

In [None]:
# Show how many rows the retweet-free dataframe holds.
print(cldf_no_RT.shape[0])

## SPLITTING DATA

### By date

In [None]:
# Pass the cleaned tweets through ana.sortByDate (which also hands back the
# updated SORTED flag), then keep only the rows whose `date` lies strictly
# between `begin` and `end`.
cldf, SORTED = ana.sortByDate(cldf, SORTED, PRINT_TOP=False, TOP=10)

# Window bounds; both comparisons below are exclusive.
begin = datetime.datetime(year=2018, month=10, day=2, hour=23, minute=59)
end = datetime.datetime(year=2019, month=1, day=2, hour=23, minute=59)

after_begin = cldf.date > begin
before_end = cldf.date < end
specific_range_cldf = cldf[after_begin & before_end]

### By term

In [None]:
# Narrow the cleaned tweets to those selected by proc.findTerms for the given
# terms; the result (`bsdf`) is what the mapping cells read locations from.
# NOTE(review): the list is empty here -- confirm how findTerms treats an
# empty term list (all rows vs. no rows).
terms_of_interest = []
bsdf = proc.findTerms(cldf, terms_of_interest)

## MAPPING

In [None]:
# Read the US-states JSON document into `usa`; the map cell hands it to
# folium.GeoJson as an overlay. The file is decoded as latin1.
with open('./trt_API/us-states.json', encoding='latin1') as states_file:
    usa = json.loads(states_file.read())

In [None]:
# Parse the tab-separated cities file into `cities`: one list per row that
# holds the lower-cased name from column 2 and the lower-cased, comma-separated
# alternate names from column 3, followed by the raw strings from columns 4
# and 5. The location-matching cell converts those last two entries to float
# and uses them as the marker position (presumably latitude, longitude).
cities = []
# The context manager closes the file handle even if a malformed row raises.
with open('./../resources/cities.txt', 'r') as citiestxt:
    for line in citiestxt:
        if not line.strip():
            # A blank line has no columns to index; skip it instead of
            # failing with IndexError.
            continue
        fields = line.split('\t')
        names = [name.lower() for name in [fields[2], *fields[3].split(',')]]
        # Columns 4 and 5 are appended unparsed and are not lower-cased.
        names.append(fields[4])
        names.append(fields[5])
        cities.append(names)

In [None]:
# Resolve each tweet's free-text location to a coordinate pair. The text
# before the first comma is lower-cased and compared with the entries of each
# list in `cities`; the first city containing it supplies
# (float(city[-2]), float(city[-1])).

# Index every entry value by the first city list that contains it, so each
# tweet needs one dict lookup instead of a scan over all cities. setdefault
# keeps the earliest city, which is the one a first-match linear search over
# `cities` would return.
first_city_by_entry = {}
for city in cities:
    for entry in city:
        first_city_by_entry.setdefault(entry, city)

locations = []
for raw_location in bsdf.location:
    try:
        parts = raw_location.split(',')
    except (AttributeError, TypeError):
        # Non-string values (e.g. a missing location) cannot be split on ',';
        # print them for inspection and move on.
        print(raw_location)
        continue
    key = parts[0].lower()
    if not key.strip():
        # A blank location would otherwise equal the '' entry that
        # ''.split(',') yields for a city without alternate names, pinning
        # the tweet to an unrelated city.
        continue
    matched_city = first_city_by_entry.get(key)
    if matched_city is not None:
        locations.append((float(matched_city[-2]), float(matched_city[-1])))

In [None]:
# Convert the list of coordinate tuples to a NumPy array so the map cell can
# index it as locs[i, 0] / locs[i, 1] and pass it whole to the heat map.
locs = np.asarray(locations)

In [None]:
# Build the interactive map: a base map centred on (34.0522, -118.2437), the
# `usa` GeoJSON overlay, one small blue circle per resolved tweet location, and
# a heat-map layer over the same points. The result is written to
# ../tweetsMap.html.
# NOTE(review): recent folium releases may no longer bundle the 'Stamen Toner'
# tile set -- confirm against the installed folium version.
tweetsMap = folium.Map(location=[34.0522,-118.2437], tiles='Stamen Toner', zoom_start=9)
folium.GeoJson(usa).add_to(tweetsMap)
# Each row of `locs` is one coordinate pair; unpack it directly rather than
# indexing by position.
for lat, lon in locs:
    folium.CircleMarker((lat, lon), radius=3, weight=1, color='blue',
                        fill_color='blue', fill_opacity=.5).add_to(tweetsMap)
# `add_children` is a deprecated alias; attach the layer with add_to, the same
# way the other layers in this cell are attached.
plugins.HeatMap(data=locs, radius=5, blur=10).add_to(tweetsMap)
tweetsMap.save('../tweetsMap.html')