In [1]:
import os

import numpy as np
import pandas as pd
from pymongo import MongoClient

%matplotlib inline

# Connection settings can be overridden through the MONGO_HOST / MONGO_PORT
# environment variables, so the notebook can be pointed at another server
# without editing code. The defaults are the values originally hard-coded here.
mongo_client = MongoClient(
    os.environ.get('MONGO_HOST', '18.236.138.158'),
    int(os.environ.get('MONGO_PORT', '27016')),
)
# Attribute access on the client selects the database named "twitter".
database_reference = mongo_client.twitter

In [2]:
from mongo_aggregation_verbs import *

In [61]:
# Database.collection_names() was deprecated and then removed from PyMongo;
# list_collection_names() is its replacement and also returns a list of names.
database_reference.list_collection_names()

['users', 'instructor_test_group', 'tweets']

In [3]:
# Handle on the collection that every query in this notebook runs against.
collection_reference = database_reference['instructor_test_group']

In [4]:
# Total number of documents in the collection. Collection.count() was
# deprecated and then removed from PyMongo; count_documents({}) with an empty
# filter counts every document.
collection_reference.count_documents({})

20000

In [5]:
# Filter: the tweet's "source" field is exactly the Instagram client string.
source_is_instagram = dict(
    source='<a href="http://instagram.com" rel="nofollow">Instagram</a>',
)
# Complementary filter: any other "source" value ($ne on the same string).
source_is_not_instagram = {'source': {'$ne': source_is_instagram['source']}}

In [6]:
# (Instagram tweets, non-Instagram tweets). Cursor.count() was deprecated and
# then removed from PyMongo; count_documents(filter) is the replacement.
(collection_reference.count_documents(source_is_instagram),
 collection_reference.count_documents(source_is_not_instagram))

(1907, 18093)

## Tweet Locations

In [7]:
# Query fragments shared by the location cells below.
not_null = {'$ne': None}                              # operator: value is not null
nonnull_geo = {'geo': not_null}                       # filter: tweet has a geo value
keep_geo_and_text = {'geo': 1, 'text': 1, '_id': 0}   # projection: geo and text, no _id

# Ready-made MATCH stages for the two tweet populations.
match_insta = {MATCH: source_is_instagram}

match_not_insta = {MATCH: source_is_not_instagram}

In [8]:
# Count the Instagram-sourced tweets that carry a geo value. The COUNT stage
# names its single output field "geo".
cursor = collection_reference.aggregate([
    match_insta,
    {MATCH: nonnull_geo},
    {COUNT: "geo"},
])
next(cursor)

{'geo': 1907}

In [9]:
# Same count for tweets from every other source. The COUNT stage names its
# single output field "geo".
cursor = collection_reference.aggregate([
    match_not_insta,
    {MATCH: nonnull_geo},
    {COUNT: "geo"},
])
next(cursor)

{'geo': 1045}

In [10]:
def group_and_count(key):
    """Build a GROUP aggregation stage that tallies documents per ``key``.

    Parameters
    ----------
    key
        Grouping expression placed in the stage's ``_id`` slot. Callers in
        this notebook pass a field path such as ``'$geo'`` or a dict of
        field paths.

    Returns
    -------
    dict
        ``{GROUP: {"_id": key, "count": {"$sum": 1}}}``, i.e. each group
        gets a ``count`` field incremented by one per document.
    """
    stage_body = {"_id": key, "count": {"$sum": 1}}
    return {GROUP: stage_body}

In [11]:
def parse_geo_from_tweets(tweets):
    """Flatten grouped-by-geo aggregation results into one table.

    Parameters
    ----------
    tweets : list of dict
        Records carrying an ``_id`` entry (itself a dict) and a ``count``
        entry, as produced by a ``group_and_count`` stage.

    Returns
    -------
    pandas.DataFrame
        One column per key found in the ``_id`` dicts, plus a trailing
        ``count`` column.
    """
    grouped = pd.DataFrame(tweets)
    # Expand every ``_id`` dict into its own columns, then attach the tallies.
    flattened = pd.DataFrame(grouped['_id'].tolist())
    return flattened.assign(count=grouped['count'])

In [12]:
# Exact geo values shared by more than 14 non-Instagram tweets, most
# frequent first.
cursor = collection_reference.aggregate([
    match_not_insta,
    {MATCH: nonnull_geo},
    group_and_count('$geo'),
    {MATCH: {"count": {"$gt": 14}}},   # keep only locations seen 15+ times
    {SORT: {"count": -1}},
])
not_insta = parse_geo_from_tweets(list(cursor))
not_insta

Unnamed: 0,coordinates,type,count
0,"[34.0522342, -118.2436849]",Point,206
1,"[37.3813444, -122.1802812]",Point,39
2,"[34.1425078, -118.255075]",Point,31
3,"[36.778261, -119.4179324]",Point,21
4,"[35.426667, -116.89]",Point,17
5,"[34.0508369, -118.263032]",Point,16
6,"[34.0194543, -118.4911912]",Point,15


In [13]:
# Exact geo values shared by more than 14 Instagram tweets, most frequent
# first.
cursor = collection_reference.aggregate([
    match_insta,
    {MATCH: nonnull_geo},
    group_and_count('$geo'),
    {MATCH: {"count": {"$gt": 14}}},   # keep only locations seen 15+ times
    {SORT: {"count": -1}},
])
insta = parse_geo_from_tweets(list(cursor))
insta

Unnamed: 0,coordinates,type,count
0,"[34.0522, -118.243]",Point,465
1,"[36.48863024, -119.72972051]",Point,37
2,"[34.09799334, -118.33866453]",Point,35
3,"[34.07305556, -118.39944444]",Point,29
4,"[34.0221, -118.481]",Point,27
5,"[34.0402214, -118.2545227]",Point,16
6,"[33.9442368, -118.3975983]",Point,15


In [14]:
import folium
# Centre point and initial zoom for the hotspot map; the markers are added in
# the following cell.
starting_loc = [34.0689, -118.4452]
la_map = folium.Map(location=starting_loc, zoom_start=12)

In [15]:
# One marker per hotspot; the popup shows how many tweets share that exact
# location. Red = non-Instagram sources, blue = Instagram.
for loc, count in zip(not_insta['coordinates'], not_insta['count']):
    popup = folium.Popup(str(count), parse_html=True)
    red_pin = folium.Icon(color='red')
    folium.Marker(loc, popup=popup, icon=red_pin).add_to(la_map)
for loc, count in zip(insta['coordinates'], insta['count']):
    popup = folium.Popup(str(count), parse_html=True)
    blue_pin = folium.Icon(color='blue')
    folium.Marker(loc, popup=popup, icon=blue_pin).add_to(la_map)



In [16]:
# Render the map with the red (non-Instagram) and blue (Instagram) hotspot markers.
la_map

In [17]:
def parse_geo_from_tweets(tweets):
    """Flatten grouped-by-geo aggregation results into one table.

    NOTE(review): this cell re-declares the helper with the same behaviour as
    the definition given earlier in the notebook, so it is redundant.

    Parameters
    ----------
    tweets : list of dict
        Records carrying an ``_id`` entry (itself a dict) and a ``count``
        entry.

    Returns
    -------
    pandas.DataFrame
        One column per key found in the ``_id`` dicts, plus a trailing
        ``count`` column.
    """
    grouped = pd.DataFrame(tweets)
    flattened = pd.DataFrame(grouped['_id'].tolist())
    return flattened.assign(count=grouped['count'])

In [18]:
# Top 10 non-Instagram users by number of geotagged tweets, restricted to
# users with more than 14 such tweets.
cursor = collection_reference.aggregate([
    match_not_insta,
    {MATCH: nonnull_geo},
    group_and_count('$user.id'),
    {MATCH: {"count": {"$gt": 14}}},
    {SORT: {"count": -1}},
    {LIMIT: 10},
])
not_insta_top_users = pd.DataFrame(list(cursor))
not_insta_top_users

Unnamed: 0,_id,count
0,4549072827,29
1,787687147,29
2,1414684496,27
3,3066057658,27
4,789990810,27
5,4191239027,25
6,21298660,21
7,3864064936,19
8,21298373,19
9,3380828067,17


In [19]:
# Top 10 Instagram users by number of geotagged tweets. Unlike the
# non-Instagram query, no minimum-count filter is applied (the stage below is
# left disabled).
cursor = collection_reference.aggregate([
    match_insta,
    {MATCH: nonnull_geo},
    group_and_count('$user.id'),
#     { MATCH : { "count" : { "$gt" : 10 } } },
    {SORT: {"count": -1}},
    {LIMIT: 10},
])
insta_top_users = pd.DataFrame(list(cursor))
insta_top_users

Unnamed: 0,_id,count
0,1455659006,10
1,613833206,8
2,843390093012353024,6
3,4561143733,6
4,19640448,5
5,226456467,5
6,37016954,4
7,760160463833313280,4
8,30723561,4
9,2267807461,4


In [20]:
# Pull the grouped user ids (the `_id` column) out as arrays.
not_insta_top_users_ids = not_insta_top_users['_id'].values
insta_top_users_ids = insta_top_users['_id'].values

In [21]:
# Convert the ids to plain Python ints for use in the "$in" filters below.
# NOTE(review): presumably needed because the array elements are numpy
# integers, which the Mongo driver may not accept - confirm.
not_insta_top_users_ids_list = [int(user_id) for user_id in not_insta_top_users_ids]
insta_top_users_ids_list = [int(user_id) for user_id in insta_top_users_ids]

In [22]:
def parse_geo_from_tweets(tweets):
    """Flatten ``group_and_count`` results into one table of locations.

    This definition replaces the geo-only parser declared earlier in the
    notebook under the same name. To keep the earlier cells re-runnable after
    this cell has executed, both grouping shapes are accepted:

    * compound key - ``_id`` is ``{"user_id": ..., "geo": {...}}``; the geo
      dict is expanded into columns and ``user_id`` is carried along;
    * geo-only key - ``_id`` is the geo dict itself; it is expanded as is.

    Parameters
    ----------
    tweets : list of dict
        Records with an ``_id`` entry (a dict, see above) and a ``count``
        entry.

    Returns
    -------
    pandas.DataFrame
        The geo fields as columns, then ``user_id`` (compound keys only),
        then ``count``.
    """
    tweets = pd.DataFrame(tweets)
    group_keys = pd.DataFrame(list(tweets['_id'].values))
    if 'geo' in group_keys.columns:
        # Compound key: unpack the nested geo dict and keep the user id.
        geo = pd.DataFrame(list(group_keys['geo'].values))
        geo['user_id'] = group_keys['user_id']
    else:
        # Geo-only key: the expanded `_id` already holds the geo fields.
        geo = group_keys
    geo['count'] = tweets['count']
    return geo

In [23]:
# For the top non-Instagram users: how many tweets each user posted from each
# exact location. The PROJECT stage lifts user.id to a top-level "user_id"
# field so it can be filtered and grouped on.
cursor = collection_reference.aggregate([
    match_not_insta,
    {MATCH: nonnull_geo},
    {PROJECT: {"user_id": "$user.id", "geo": 1, "text": 1, "_id": 0}},
    {MATCH: {"user_id": {"$in": not_insta_top_users_ids_list}}},
    group_and_count({"user_id": "$user_id", "geo": "$geo"}),
])

not_insta_top_user_geo = parse_geo_from_tweets(list(cursor))

In [24]:
# For the top Instagram users: how many tweets each user posted from each
# exact location. The PROJECT stage lifts user.id to a top-level "user_id"
# field so it can be filtered and grouped on.
cursor = collection_reference.aggregate([
    match_insta,
    {MATCH: nonnull_geo},
    {PROJECT: {"user_id": "$user.id", "geo": 1, "text": 1, "_id": 0}},
    {MATCH: {"user_id": {"$in": insta_top_users_ids_list}}},
    group_and_count({"user_id": "$user_id", "geo": "$geo"}),
])
insta_top_user_geo = parse_geo_from_tweets(list(cursor))

In [25]:
# Preview the per-user, per-location counts for the non-Instagram top users.
not_insta_top_user_geo.head()

Unnamed: 0,coordinates,type,user_id,count
0,"[34.19743613, -118.58178967]",Point,4549072827,1
1,"[34.03491, -118.27746]",Point,4191239027,1
2,"[35.7476654, -118.060997]",Point,1414684496,1
3,"[34.0995, -118.32813]",Point,4191239027,1
4,"[34.187044, -118.3812562]",Point,789990810,1


In [26]:
# Preview the per-user, per-location counts for the Instagram top users.
insta_top_user_geo.head()

Unnamed: 0,coordinates,type,user_id,count
0,"[34.04453451, -118.26677639]",Point,226456467,1
1,"[34.06895637, -118.40267947]",Point,1455659006,1
2,"[34.0221, -118.481]",Point,1455659006,1
3,"[34.07305556, -118.39944444]",Point,1455659006,5
4,"[34.08718311, -118.46354276]",Point,19640448,1


In [27]:
# Marker colour assigned to each of the top non-Instagram user ids; the
# values are later passed as the `color` of a folium.Icon.
colors_not_insta = dict([
    (4549072827, 'red'),
    (3066057658, 'blue'),
    (1414684496, 'green'),
    (21298660, 'purple'),
    (3380828067, 'orange'),
    (787687147, 'darkred'),
    (21298373, 'lightred'),
    (3864064936, 'beige'),
    (4191239027, 'darkblue'),
    (789990810, 'darkgreen'),
])

# Disabled: equivalent colour table for the top Instagram users.
# colors_insta = {
#     760160463833313280 : '#0000ff',
#     30723561 : '#0010ff',
#     613833206 : '#0020ff',
#     2267807461 : '#0030ff',
#     4561143733 : '#0040ff',
#     1455659006 : '#0050ff',
#     37016954 : '#0060ff',
#     19640448 : '#0070ff',
#     843390093012353024 : '#0080ff',
#     226456467 : '#0090ff',
# }

In [28]:
# Tag every row with its user's marker colour. A user id missing from
# colors_not_insta raises KeyError rather than producing a blank colour.
not_insta_top_user_geo['color'] = [
    colors_not_insta[user_id] for user_id in not_insta_top_user_geo.user_id
]
# insta_top_user_geo['color'] = insta_top_user_geo.user_id.apply(lambda x: colors_insta[x])

In [29]:
# Spot-check ten rows; random_state pins the draw so re-running the notebook
# shows the same rows.
not_insta_top_user_geo.sample(10, random_state=0)

Unnamed: 0,coordinates,type,user_id,count,color
112,"[33.9850469, -118.4694832]",Point,789990810,1,darkgreen
18,"[38.7841682, -122.7385025]",Point,1414684496,1,green
75,"[34.03453, -118.31303]",Point,4191239027,1,darkblue
89,"[34.04821, -118.29924]",Point,4191239027,1,darkblue
43,"[38.7985001, -122.7646637]",Point,1414684496,1,green
9,"[34.17288, -118.48278]",Point,4191239027,1,darkblue
63,"[34.0617664, -118.4447863]",Point,21298660,1,purple
7,"[34.09425, -118.30502]",Point,4191239027,1,darkblue
54,"[37.6346664, -119.0261688]",Point,1414684496,1,green
91,"[34.2007012, -118.5670141]",Point,4549072827,1,red


In [30]:
# Rebind la_map to a fresh map so the per-user markers added below are not
# mixed with the hotspot markers placed on the previous map object.
starting_loc = [34.0689, -118.4452]
la_map = folium.Map(location=starting_loc, zoom_start=12)

In [42]:
# One marker per (user, location) pair, coloured by user. Only pairs seen
# fewer than three times are drawn; the rest are skipped.
for loc, color, count in not_insta_top_user_geo[['coordinates','color','count']].values:
    popup = folium.Popup(str(count), parse_html=True)
    if not count < 3:
        continue
    user_pin = folium.Icon(color=color)
    folium.Marker(loc, popup=popup, icon=user_pin).add_to(la_map)
# Disabled alternatives: a 'warning' icon for the skipped pairs, and blue
# markers for the Instagram hotspots.
#     else:
#         folium.Marker(loc, popup=popup, icon=folium.Icon(color=color, icon='warning')).add_to(la_map)
# for loc, count in insta[['coordinates','count']].values:
#     folium.Marker(loc, popup=popup, icon=folium.Icon(color='blue')).add_to(la_map)



In [32]:
# Render the per-user map.
# NOTE(review): the recorded execution counts show the marker cell (In [42])
# was last run after this cell (In [32]); re-run top to bottom to confirm the
# displayed map includes the markers.
la_map

In [54]:
# Fetch every tweet from one specific user id (it appears in the top
# non-Instagram users table above) to inspect what that account posts.
cur = collection_reference.find({"user.id" : 1414684496})

In [55]:
# Exhaust the cursor into a list so the documents can be reused by later cells.
tw = list(cur)

In [57]:
# Tabulate the raw tweet documents, one row per tweet.
pd.DataFrame(tw)

Unnamed: 0,_id,contributors,coordinates,created_at,entities,favorite_count,favorited,filter_level,geo,id,...,possibly_sensitive,quote_count,reply_count,retweet_count,retweeted,source,text,timestamp_ms,truncated,user
0,5a6e640336dd5f000169666a,,"{'type': 'Point', 'coordinates': [-119.0261688...",Mon Jan 29 00:00:03 +0000 2018,"{'hashtags': [{'text': 'earthquake', 'indices'...",0,False,low,"{'type': 'Point', 'coordinates': [37.6346664, ...",957765246336172033,...,False,0,0,0,False,"<a href=""http://everyearthquake.com"" rel=""nofo...",USGS reports a M1.29 #earthquake 4km W of Mamm...,1517184003801,False,"{'id': 1414684496, 'id_str': '1414684496', 'na..."
1,5a5464b247b2a100b5089689,,"{'type': 'Point', 'coordinates': [-116.6846667...",Tue Jan 09 06:44:02 +0000 2018,"{'hashtags': [{'text': 'earthquake', 'indices'...",0,False,low,"{'type': 'Point', 'coordinates': [33.6255, -11...",950619154695909376,...,False,0,0,0,False,"<a href=""http://everyearthquake.com"" rel=""nofo...",USGS reports a M0.56 #earthquake 8km N of Anza...,1515480242786,False,"{'id': 1414684496, 'id_str': '1414684496', 'na..."
2,5a532aa247b2a100b507c97d,,"{'type': 'Point', 'coordinates': [-116.8243333...",Mon Jan 08 08:24:02 +0000 2018,"{'hashtags': [{'text': 'earthquake', 'indices'...",0,False,low,"{'type': 'Point', 'coordinates': [33.568, -116...",950281931865731074,...,False,0,0,0,False,"<a href=""http://everyearthquake.com"" rel=""nofo...",USGS reports a M0.51 #earthquake 14km W of Anz...,1515399842598,False,"{'id': 1414684496, 'id_str': '1414684496', 'na..."
3,5a77660615ba4c0001556e66,,"{'type': 'Point', 'coordinates': [-122.8183365...",Sun Feb 04 19:59:01 +0000 2018,"{'hashtags': [{'text': 'earthquake', 'indices'...",0,False,low,"{'type': 'Point', 'coordinates': [38.8153343, ...",960241303769399296,...,False,0,0,0,False,"<a href=""http://everyearthquake.com"" rel=""nofo...",USGS reports a M0.52 #earthquake 7km NW of The...,1517774341896,False,"{'id': 1414684496, 'id_str': '1414684496', 'na..."
4,5a70e57e36dd5f00016aecf2,,"{'type': 'Point', 'coordinates': [-122.7996674...",Tue Jan 30 21:37:02 +0000 2018,"{'hashtags': [{'text': 'earthquake', 'indices'...",0,False,low,"{'type': 'Point', 'coordinates': [38.8264999, ...",958454029066973184,...,False,0,0,0,False,"<a href=""http://everyearthquake.com"" rel=""nofo...",USGS reports a M1 #earthquake 7km NW of The Ge...,1517348222401,False,"{'id': 1414684496, 'id_str': '1414684496', 'na..."
5,5a5704e236dd5f00015bac38,,"{'type': 'Point', 'coordinates': [-122.7646637...",Thu Jan 11 06:32:02 +0000 2018,"{'hashtags': [{'text': 'earthquake', 'indices'...",0,False,low,"{'type': 'Point', 'coordinates': [38.7985001, ...",951340908959551488,...,False,0,0,0,False,"<a href=""http://everyearthquake.com"" rel=""nofo...",USGS reports a M0.47 #earthquake 2km NNW of Th...,1515652322412,False,"{'id': 1414684496, 'id_str': '1414684496', 'na..."
6,5a69afca36dd5f000166c852,,"{'type': 'Point', 'coordinates': [-117.4976654...",Thu Jan 25 10:22:01 +0000 2018,"{'hashtags': [{'text': 'earthquake', 'indices'...",0,False,low,"{'type': 'Point', 'coordinates': [33.7456665, ...",956472218208256001,...,False,0,0,0,False,"<a href=""http://everyearthquake.com"" rel=""nofo...",USGS reports a M2.13 #earthquake Greater Los A...,1516875721872,False,"{'id': 1414684496, 'id_str': '1414684496', 'na..."
7,5a7ccd1315ba4c000158eed8,,"{'type': 'Point', 'coordinates': [-122.8151703...",Thu Feb 08 22:20:02 +0000 2018,"{'hashtags': [{'text': 'earthquake', 'indices'...",0,False,low,"{'type': 'Point', 'coordinates': [38.821167, -...",961726343133081600,...,False,0,0,0,False,"<a href=""http://everyearthquake.com"" rel=""nofo...",USGS reports a M0.56 #earthquake 7km NW of The...,1518128402871,False,"{'id': 1414684496, 'id_str': '1414684496', 'na..."
8,5a7b456f15ba4c000157ec4b,,"{'type': 'Point', 'coordinates': [-116.0821686...",Wed Feb 07 18:29:03 +0000 2018,"{'hashtags': [{'text': 'earthquake', 'indices'...",0,False,low,"{'type': 'Point', 'coordinates': [33.7556648, ...",961305824005054465,...,False,0,0,0,False,"<a href=""http://everyearthquake.com"" rel=""nofo...",USGS reports a M1.1 #earthquake 11km NE of Coa...,1518028143298,False,"{'id': 1414684496, 'id_str': '1414684496', 'na..."
9,5a52f13647b2a100b5079d74,,"{'type': 'Point', 'coordinates': [-118.9056702...",Mon Jan 08 04:19:02 +0000 2018,"{'hashtags': [{'text': 'earthquake', 'indices'...",0,False,low,"{'type': 'Point', 'coordinates': [37.6741676, ...",950220274464522241,...,False,0,0,0,False,"<a href=""http://everyearthquake.com"" rel=""nofo...",USGS reports a M1 #earthquake 8km ENE of Mammo...,1515385142328,False,"{'id': 1414684496, 'id_str': '1414684496', 'na..."


In [58]:
# Show only the tweet text for this user.
pd.DataFrame(tw)['text']

0     USGS reports a M1.29 #earthquake 4km W of Mamm...
1     USGS reports a M0.56 #earthquake 8km N of Anza...
2     USGS reports a M0.51 #earthquake 14km W of Anz...
3     USGS reports a M0.52 #earthquake 7km NW of The...
4     USGS reports a M1 #earthquake 7km NW of The Ge...
5     USGS reports a M0.47 #earthquake 2km NNW of Th...
6     USGS reports a M2.13 #earthquake Greater Los A...
7     USGS reports a M0.56 #earthquake 7km NW of The...
8     USGS reports a M1.1 #earthquake 11km NE of Coa...
9     USGS reports a M1 #earthquake 8km ENE of Mammo...
10    USGS reports a M1.06 #earthquake 8km NE of Agu...
11    USGS reports a M0.68 #earthquake 10km ENE of M...
12    USGS reports a M1 #earthquake 7km NW of The Ge...
13    USGS reports a M1.04 #earthquake 6km NW of The...
14    USGS reports a M3.07 #earthquake 43km NW of St...
15    USGS reports a M0.78 #earthquake 5km WNW of Th...
16    USGS reports a M0.81 #earthquake 15km SSW of L...
17    USGS reports a M1.09 #earthquake 9km ENE o