# Mapping Analysis Notebook
# <font color='red'> *NOTE: Make sure to update the trt_API files!* </font>

#### This notebook contains a demonstration of the tools necessary for conducting descriptive analysis of the data. This includes things such as frequency analysis, descripting statistics and temporal frequency.

In [1]:

import folium
import json
from folium import plugins



## Variables for Analysis

In [2]:
import trt_API.process as proc
import trt_API.process as cleanText
import trt_API.analysis as ana

import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

SORTED = False

In [3]:
# Set the path to the parent directory containing all Tweets of interest
DIRECTORY = './tweets/*'
# Set to True to isolate english language tweets
ENGLISH = False

## Load Tweet and Generate Dataframe

In [4]:
tweet_objects = proc.loadTweetObjects(DIRECTORY)
encoding = True #Use False on Macs for better results
df = proc.convertTweetsToDataframe(tweet_objects, ENGLISH, encoding)

Loaded utf-8 df.
Initial size: 16042
Dropping duplicates...
Final size: 10472


## Extract Potential Cashtags

In [5]:
ctdf = proc.extractPossibleCashtags(df)
pd.set_option('display.max_rows', None)

Total potential Cashtags: 25


## Removing Noisy Tweets

In [6]:
'''
*** Tweets often use popular hashtags with unrelated topics.
*** Noisy words can be identified to use to filter such tweets.
*** Enter these words below in the noisy_terms list.
'''
noisy_terms = []
cldf = proc.removeNoisyTerms(df, noisy_terms)
print(cldf)

Removed 0 noisy terms.


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [8]:
#removing unwanted characters from tweets
spec_chars = ["!",'"',"#","%","&","'","(",")",
              "*","+",",","-",".","/",":",";","<",
              "=",">","?","@","[","\\","]","^","_",
              "`","{","|","}","~","–", "ð", "https"]
for char in spec_chars:
    cldf['tweet'] = cldf['tweet'].str.replace(char, ' ')
cldf[500:2000]['tweet']

743     guys it s true  which of you did it happen  😋😜...
745     RT  MajidMorabbayan  Football festival in Russ...
747           EDWANmusic heyedwanEdwan    t co NEHI5yJfWz
749     RT RT RT lueurphoto  KCAMexico  DemiLovato  Ec...
755     RT  iHarryEmpires  Follow everyone who retweet...
757     RT  bembadiaby   nothing much to say   KimKard...
759     RT  90shope  spring day becomes work of art wi...
761     RT  Olympics  Happy  NationalSunglassesDay 😎  ...
763     RT  NICKIMINAJROYAL  Rap’s Biggest Acts On Spo...
765     RT  steveaoki  this video is AMAZING      Musi...
767     RT  Olympics  Happy  NationalSunglassesDay 😎  ...
769     RT  Rec7Media   KCAMexico  TheLastofUsPartII  ...
771      유겸  Yugyeom  Got7 “The New Era” Japan Tour Ph...
773     RT  Olympics  Happy  NationalSunglassesDay 😎  ...
775     RT  Olympics  Happy  NationalSunglassesDay 😎  ...
777     RT  NCTsmtown  EYES SEE A RAINY DAY  theEYESof...
779     RT  NCTsmtown  EYES SEE A RAINY DAY  theEYESof...
781     RT  NC

## Remove Retweets

In [9]:
cldf_no_RT, cldf = proc.removeRetweets(cldf)
print(cldf[300:500])

Removed 4110 duplicates.
                        date  followers         username  \
438  Jun 23 17:39:32 2018\t0        178         albukery   
440  Jun 23 17:48:48 2018\t0        117        p_pearl_l   
442  Jun 23 18:45:27 2018\t0         51        yeo081518   
444  Jun 23 20:44:14 2018\t0          1       billiezara   
450  Jun 23 21:42:40 2018\t0        118        1231Berry   
452  Jun 23 22:55:35 2018\t0          0     reyesvotesx2   
458  Jun 03 01:13:41 2018\t0       1580          jdbxgod   
459  Jun 03 02:10:56 2018\t0         73      Sseunie_328   
461  Jun 03 03:15:43 2018\t0        434     gomaengii624   
462  Jun 03 04:15:12 2018\t0        348       Rinrada061   
463  Jun 03 05:13:47 2018\t0         17       Rina417798   
465  Jun 03 05:21:24 2018\t0          5    Nongz_tizterz   
466  Jun 03 08:08:05 2018\t0          9          jb_jhob   
467  Jun 03 08:58:49 2018\t0         81   BarboniEdoardo   
468  Jun 03 10:14:44 2018\t0          5          PBKj_FK   
469  Jun 03 10:

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['RT'][df.tweet.astype(str).str[0:2] == 'RT'] = df.tweet.str.split(':',expand=True).iloc[:,0]


In [10]:
print(cldf_no_RT.shape[0])

6362


## SPLITTING DATA

### By date

In [11]:
cldf, SORTED = ana.sortByDate(cldf,SORTED,PRINT_TOP=False,TOP=10)
begin = datetime.datetime(2016,10,2,23,59) #year, month, day, hour, minute
end = datetime.datetime(2019,1,2,23,59)
specific_range_cldf = cldf[cldf.date > begin]
specific_range_cldf = specific_range_cldf[specific_range_cldf.date < end]

### By term

In [12]:
terms_of_interest = ["Trump"]
bsdf = proc.findTerms(cldf, terms_of_interest)

bsdf[:10]

Found 29 terms of interest.


  df.good = df['tweet'].str.lower().str.contains(\


Unnamed: 0,date,followers,username,location,tweet,id,original_tweet,RT
14003,2018-04-10 19:09:24,4213,Wriseup,"Vienna, Austria",RT annaleclaire oligarch MSM Congress Tr...,983784027592318976,#oligarch #MSM #Congress #TrumpAdministration ...,RT annaleclaire oligarch MSM Congress Tr...
13383,2018-04-23 21:48:50,1,LouiseCUSA,,RT Stump for Trump We love you too KanyeWes...,988535192557154304,We love you too @KanyeWest! https://t.co/yEdbZ...,RT Stump for Trump We love you too KanyeWes...
14186,2018-04-25 23:33:41,225,Ze_Sequela,olha pra trás,kanyewest good kayne I don t support Donald...,989286354675752960,Kanye will never run in the race of popular op...,
14187,2018-04-25 23:49:28,373,SpaceKujira,Outer Space,RT Mamesdaughter dylanmsmitty kanyewest r...,989290326668947456,@dylanmsmitty @kanyewest @realDonaldTrump Trum...,RT Mamesdaughter dylanmsmitty kanyewest r...
13919,2018-04-26 01:43:48,223,_Christianrgs,"Pinhalzinho, Brasil",RT kanyewest You don t have to agree with tr...,989319099607076864,You don't have to agree with trump but the mob...,RT kanyewest You don t have to agree with tr...
14059,2018-05-01 07:52:08,1254,K1NDA0UTTALUCK,||170815||190817||300618||,RT kanyewest You don t have to agree with tr...,991223733032554497,You don't have to agree with trump but the mob...,RT kanyewest You don t have to agree with tr...
2858,2018-05-31 00:04:03,0,SJones014,,realDonaldTrump KimKardashian Well He do ...,1001977571976806400,,
2859,2018-05-31 01:05:00,346,1DGROWTH,,RT realDonaldTrump Great meeting with KimKa...,1001992910550814720,"Great meeting with @KimKardashian today, talke...",RT realDonaldTrump Great meeting with KimKa...
2862,2018-05-31 06:29:45,2681,AndreaMinuz,,RT realDonaldTrump Great meeting with KimKa...,1002074636559974400,"Great meeting with @KimKardashian today, talke...",RT realDonaldTrump Great meeting with KimKa...
5132,2018-06-01 19:08:13,4104,sweevtener,honeymoon ãve,RT realDonaldTrump Great meeting with KimKa...,1002627898837266432,"Great meeting with @KimKardashian today, talke...",RT realDonaldTrump Great meeting with KimKa...


## MAPPING

In [13]:
with open('./trt_API/us-states.json',encoding='latin1') as f:
    usa = json.load(f)

In [14]:
citiestxt = open('./../resources/cities.txt','r')
cities = []
for line in citiestxt:
    line = line.split('\t')
    names = []
    names.append(line[2])
    names = names + line[3].split(',')
    for i in range(len(names)):
        names[i] = names[i].lower()
    names.append(line[4])
    names.append(line[5])    
    cities.append(names)

In [15]:
def ngrams(string, n=3):
    string = re.sub(r'[,-./]|\sBD',r'', string)
    ngrams = zip(*[string[i:] for i in range(n)])
    return [''.join(ngram) for ngram in ngrams]

In [16]:
import pandas as pd 
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from scipy.sparse import csr_matrix
import sparse_dot_topn.sparse_dot_topn as ct  # Leading Juice for us
import time

pd.set_option('display.max_colwidth', -1)

print(bsdf)
vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)
tf_idf_matrix = vectorizer.fit_transform(location)
# print(tf_idf_matrix[0])

ValueError: numpy.ndarray size changed, may indicate binary incompatibility. Expected 88 from C header, got 80 from PyObject

In [None]:
def awesome_cossim_top(A, B, ntop, lower_bound=0):
    # force A and B as a CSR matrix.
    # If they have already been CSR, there is no overhead
    A = A.tocsr()
    B = B.tocsr()
    M, _ = A.shape
    _, N = B.shape
 
    idx_dtype = np.int32
 
    nnz_max = M*ntop
 
    indptr = np.zeros(M+1, dtype=idx_dtype)
    indices = np.zeros(nnz_max, dtype=idx_dtype)
    data = np.zeros(nnz_max, dtype=A.dtype)
    ct.sparse_dot_topn(
        M, N, np.asarray(A.indptr, dtype=idx_dtype),
        np.asarray(A.indices, dtype=idx_dtype),
        A.data,
        np.asarray(B.indptr, dtype=idx_dtype),
        np.asarray(B.indices, dtype=idx_dtype),
        B.data,
        ntop,
        lower_bound,
        indptr, indices, data)
    return csr_matrix((data,indices,indptr),shape=(M,N))

In [None]:
t1 = time.time()
matches = awesome_cossim_top(tf_idf_matrix, tf_idf_matrix.transpose(), 10, 0.8)
t = time.time()-t1
print("SELFTIMED:", t)

In [None]:
def get_matches_df(sparse_matrix, name_vector, top=100):
    non_zeros = sparse_matrix.nonzero()

    sparserows = non_zeros[0]
    sparsecols = non_zeros[1]
    
    if top:
        nr_matches = top
    else:
        nr_matches = sparsecols.size
    
    left_side = np.empty([nr_matches], dtype=object)
    right_side = np.empty([nr_matches], dtype=object)
    similairity = np.zeros(nr_matches)
    
    for index in range(0, nr_matches):
        left_side[index] = name_vector[sparserows[index]]
        right_side[index] = name_vector[sparsecols[index]]
        similairity[index] = sparse_matrix.data[index]
    
    return pd.DataFrame({'left_side': left_side,
                          'right_side': right_side,
                           'similairity': similairity})

matches_df = get_matches_df(matches, location, top=200)
matches_df = matches_df[matches_df['similairity'] < 0.99999] # For removing all exact matches
matches_df.sample(1000)

In [None]:
count=0
locData={}
print(bsdf.location)
for i, ele  in enumerate(bsdf.location):
    
    count=0
    for j, item in enumerate(bsdf.location):
        if(ele == item and (item!="None" or ele!="None")):
            count+=1
            
        if count>=2 and j<=len(bsdf.location)-1:
            locData[ele]=count           
       
group_data = list(locData.values())
group_names = list(locData.keys())
group_mean = np.mean(group_data)

fig, ax = plt.subplots()
ax.barh(group_names, group_data)

for l in bsdf.location:
    try:
        l = l.split(',')
    except:
        continue
    for city in cities:
        if l[0].lower() in city:
            locations.append((float(city[-2]),float(city[-1])))
            break

In [None]:
locs = np.asarray(locations)

In [None]:
tweetsMap = folium.Map(location=[34.0522,-118.2437], tiles='Stamen Toner', zoom_start=9)
folium.GeoJson(usa).add_to(tweetsMap)
for i in range(0,len(locs)):
    folium.CircleMarker((locs[i,0],locs[i,1]), radius=3, weight=1, color='blue', \
                        fill_color='blue', fill_opacity=.5).add_to(tweetsMap)  
tweetsMap.add_child(plugins.HeatMap(data=locs, radius=5, blur=10))
tweetsMap.save('../tweetsMap.html')