In [1]:
import sys

import pymongo
import geopandas as gpd
import matplotlib.pyplot as plt
import pandas as pd
from h3 import h3

# Project-local modules live one directory up; extend the path before importing them.
sys.path.append("../")
import databasepopulation
import communicationwmongo as commu
import home_location as home
import analysis as a
import my_h3_functions as myh3

# 0. Connect to Mongo and define a specific database

In [2]:
# MongoDB database to analyse; change this one constant to point the notebook elsewhere.
DATABASE_NAME = 'twitter_bog'
db = commu.connecttoLocaldb(database=DATABASE_NAME)

# 1. Obtaining hexcounts from database

In [3]:
import time

# Load the hex counts from the database into a DataFrame (nothing is saved to disk)
# and report how many seconds the load took.
start = time.time()
df = a.hexcountsresults_to_df(db, save=False)
elapsed_seconds = time.time() - start
print(elapsed_seconds)

1.4988741874694824


In [14]:
# Rows per timestamp, keeping missing timestamps (NaT) as their own bucket.
df['time'].value_counts(dropna=False)

NaT           20340
2012-12-31    13432
2015-03-31    12487
2015-06-30    11154
2013-03-31     9420
2014-12-31     9297
2014-06-30     9297
2013-09-30     9297
2014-09-30     9297
2013-12-31     9297
2014-03-31     9297
2013-06-30     9297
2012-09-30     8888
2015-09-30     8721
2015-12-31     6968
2016-03-31      570
Name: time, dtype: int64

In [15]:
# Inspect column dtypes and non-null counts of the hex-count frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 157059 entries, 0 to 157058
Data columns (total 8 columns):
 #   Column                       Non-Null Count   Dtype         
---  ------                       --------------   -----         
 0   _id                          157059 non-null  object        
 1   level_1                      157059 non-null  object        
 2   nonresidents                 136719 non-null  float64       
 3   nonresidentsandnonneighbors  136719 non-null  float64       
 4   residents                    20340 non-null   object        
 5   totalcounts                  136719 non-null  float64       
 6   time                         136719 non-null  datetime64[ns]
 7   period                       157059 non-null  int32         
dtypes: datetime64[ns](1), float64(3), int32(1), object(3)
memory usage: 10.2+ MB


# 2. Divide the sample time period in two and compute time changes

We split the available time period in two, average the counts within each of the two resulting subperiods, and compute the changes between them. The result is a GeoDataFrame at the hex_id level, ready to plot.

In [5]:
# Tweets are counted over the time intervals set in the tweet-counts function
# (quarterly data by default).
# 1. First check which dates are available in the counts.
df['time'].unique()

array(['2012-12-31T00:00:00.000000000', '2013-03-31T00:00:00.000000000',
       '2013-06-30T00:00:00.000000000', '2013-09-30T00:00:00.000000000',
       '2013-12-31T00:00:00.000000000', '2014-03-31T00:00:00.000000000',
       '2014-06-30T00:00:00.000000000', '2014-09-30T00:00:00.000000000',
       '2014-12-31T00:00:00.000000000', '2015-03-31T00:00:00.000000000',
       '2015-06-30T00:00:00.000000000', '2015-09-30T00:00:00.000000000',
       '2015-12-31T00:00:00.000000000', '2012-09-30T00:00:00.000000000',
       '2016-03-31T00:00:00.000000000',                           'NaT'],
      dtype='datetime64[ns]')

In [6]:
# 2 Define a midpoint date to separate the two periods in the data. 
import datetime
gdfchanges=a.percent_change_two_periods_df(df, datebeforeandafterperiod=datetime.datetime(2013,6,30))

  df2dif=df2.groupby('_id')['nonresidents', 'nonresidentsandnonneighbors', 'residents', 'totalcounts'].diff(1)


KeyError: "Columns not found: 'residents'"

Column-name coding:

- `_p0` is the average level of tweet counts in the first period
- `_dif` is the difference between periods
- `_ch` stands for % change
- `_chb` is an alternative % change computed only for hexes with more than 50 tweets

In [None]:
# Preview the first 10 rows of the per-hex changes.
gdfchanges.head(10)

In [None]:
# Plot example: map coloured by the between-period difference in non-resident
# counts, followed by the coordinate reference system of the result.
gdfchanges.plot(column='nonresidents_dif')
gdfchanges.crs

## 3. Spatial join with census tracts (typologies) data

This requires:

    1. Converting the geometry of the gdf to the hexagon centroids (a helper function is available for this)
    2. Loading your typologies data into a GeoDataFrame
    3. Running the spatial join

In [None]:
# 1. Get centroids: build a GeoDataFrame from the hex ids stored in the `_id` column
#    (presumably one centroid point per H3 hexagon -- TODO confirm in my_h3_functions).
gdfchanges_points = myh3.df_with_hexid_to_centroids_gdf(gdfchanges, hexcolname='_id')

In [None]:
# Quick visual check of the centroid geometries.
gdfchanges_points.plot()

In [None]:
# 2. Load your typologies data as a GeoDataFrame.
from pathlib import Path, PureWindowsPath

# NOTE(review): this relative path points into a local Box folder and only resolves
# when the notebook is run from its original location.
shp_path = Path("../../../../Box/Twitter data/Observation Shapefiles")

# NOTE(review): the database selected earlier is 'twitter_bog' but the file loaded
# here is "syd.geojson" -- confirm that these refer to the same city.
tracts = gpd.read_file(shp_path/"syd.geojson")

# Reproject to EPSG:4326. The `epsg=` keyword replaces the deprecated
# {'init': 'epsg:4326'} dict form, which warns on current pyproj/geopandas.
tracts = tracts.to_crs(epsg=4326)

In [None]:
# Quick visual check of the tract geometries, then show their CRS.
tracts.plot()
tracts.crs

In [None]:
# 3. Spatial join between the hex centroids and the census tract shapes.
# 'intersects' is sjoin's default spatial test, so it is left implicit: the `op=`
# keyword was deprecated in geopandas 0.10 in favour of `predicate=` and dropped in
# later releases, whereas relying on the default works on old and new versions alike.
tweets_tracts = gpd.sjoin(gdfchanges_points, tracts, how="inner")

In [None]:
# Preview the joined centroid/tract rows.
tweets_tracts.head()

In [None]:
# Per-typology mean, count and standard error of the non-resident metrics.
typology_columns = ['nonresidents_dif', 'nonresidents_ch', 'nonresidents_p0', 'standardized_types']
typo_stats = (
    tweets_tracts[typology_columns]
    .groupby('standardized_types')
    .agg(['mean', 'count', 'sem'])
)
typo_stats

In [None]:
# Mean between-period difference in non-resident counts per typology, with
# 1.96 * SEM error bars. `plt` is provided by the notebook's import cell, so this
# cell also runs on a fresh kernel (Restart & Run All).
fig, ax = plt.subplots()
ax.bar(
    typo_stats.index,
    typo_stats[('nonresidents_dif', 'mean')],
    yerr=1.96 * typo_stats[('nonresidents_dif', 'sem')],
    alpha=0.2,
)
ax.set_ylabel('nonresidents_dif (mean)')
ax.set_title('Non residents differences by typology')
plt.xticks(rotation=45)
plt.show()

In [None]:
import matplotlib.pyplot as plt

def barchart(tweets_tracts, metric, title):
    """Bar chart of the per-typology mean of ``metric`` with 1.96 * SEM error bars.

    ``nonresidents_dif`` is winsorized to [-60, 1000] and ``nonresidents_ch`` is
    capped at 6 before aggregating. The statistics are grouped by
    ``standardized_types``; the 'NON_POP', 'NON POP' and 'OTHER' groups are left
    out and the remaining bars are ordered by decreasing mean.

    Parameters
    ----------
    tweets_tracts : DataFrame
        Must contain the columns 'nonresidents_dif', 'nonresidents_ch',
        'nonresidents_p0' and 'standardized_types'. It is NOT modified: all
        trimming happens on an internal copy.
    metric : str
        Which of the three numeric columns above to plot.
    title : str
        Title of the figure.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    stat_columns = ['nonresidents_dif', 'nonresidents_ch', 'nonresidents_p0']
    excluded_types = ['NON_POP', 'NON POP', 'OTHER']

    # Copy first: assigning the clipped series straight into the caller's frame
    # would silently change `tweets_tracts` for every later cell.
    data = tweets_tracts[stat_columns + ['standardized_types']].copy()

    # Winsorize the series to trim outliers.
    data['nonresidents_dif'] = data['nonresidents_dif'].clip(lower=-60, upper=1000)
    data['nonresidents_ch'] = data['nonresidents_ch'].clip(upper=6)

    typo_stats = data.groupby('standardized_types').agg(['mean', 'count', 'sem', 'std'])

    # Drop the typologies that should not be charted, then sort by the plotted mean.
    typo_stats = typo_stats.loc[~typo_stats.index.isin(excluded_types)]
    typo_stats = typo_stats.sort_values(by=(metric, 'mean'), ascending=False)

    fig, ax = plt.subplots()
    ax.bar(
        typo_stats.index,
        typo_stats[(metric, 'mean')],
        yerr=1.96 * typo_stats[(metric, 'sem')],
        alpha=0.2,
    )
    plt.title(title)
    plt.xticks(rotation=45)
    plt.show()

In [None]:
# Distribution of the non-resident % change, restricted to values below 8.
tweets_tracts.loc[tweets_tracts['nonresidents_ch'] < 8, 'nonresidents_ch'].hist()

In [None]:
# Per-typology chart of the % change in non-resident counts.
barchart(tweets_tracts, metric='nonresidents_ch', title="Non residents percent change")

In [None]:
# Per-typology chart of the between-period difference in non-resident counts.
barchart(tweets_tracts, metric='nonresidents_dif', title="Non residents differences")