# General data questions and exploration


In [3]:
##TODO:
# Try ML with and without network metrics
# Test at different time periods
# Test other event datasets
# Check steps from book - change to best practice sklearn


# What correlates with witness label
# GPS accounts tend to be spam/businesses?
# Compare GPS from stream / profile / hand coding
# Explore age of account
# Is detecting co-occuring tags viable?
# What kind of data/user is likely to be deleted?
# Is user name change / user deletion/protection a useful predictor
# Compare Change in network, whether it's useful to collect.
# Check gps count, location in profile
# Check timezone distribution
# 'Ordinary person' vs bot/celeb/business/news -- using source field, tweet rate, timezone

# prop of gps -- users and tweets. Automated, instagram sourced?
# prop of sources
# prop of media/urls
# users with location on profile? Some set 'in solidarity'?
# Cycadian posting rythym - can identify real people vs bots?
# location via friend network?
# language

# \item Tweets which were automatically generated from Instagram posts were much more likely to include GPS coordinates, and as media, more likely to represent a ground truth. Therefore this content may be worth focusing on.
# \item Aid requests were very rare. Those that were identified were often reposts rather than originals, and are often referring to the same original message which begins to trend.
# \item Info for affected class should differentiate between immediate and non-immediate content. E.g. a call to mobilise a clean-up or rescue crew vs. a link to an insurance claim form.
# \item For `unrelated' messages, those which matched the keyword stream were highly represented by automated messages coming from a particular set of sources which presumably uses trending tags to gain exposure. This is easy to pre-filter.
# \item Geographically-tagged Tweets are predominantly either: Instagram cross-posts, or automatically generated job listings from a small set of sources (and therefore easy to pre-filter).
        

# Sum of network edge reciprocity
# k-cohesiveness -- Structural cohesion

In [29]:
### Initialisation ###
import os
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline
plt.rcParams['figure.figsize'] = [6, 4]

EVENT_NAME = Event.objects.all()[0].name.replace(' ', '')
DIR = './data/harvey_user_location/'
DF_FILENAME = 'df_users.csv'

# Confirm correct database is set in Django settings.py
if 'Harvey' not in EVENT_NAME:
    raise Exception('Event name mismatch -- check database set in Django')

In [30]:
# Open original Dataframe
users_df = pd.read_csv(DIR + DF_FILENAME, index_col=0)
users_df.shape

(1500, 46)

## Geographic Metadata and Manual Coding
Manual coding of users targetted the perceived locality of the user to the event. We can compare the geographic metadata provided by Twitter to these codes to determine their usefulness as a predictor for this value.

In [52]:
# has_tweet_from_locality
# is_local_profile_location

# is_local_timezone
# is_lang_en

# is_coded_as_witness
# is_coded_as_non_witness

In [277]:
# users_df.loc[users_df["is_local_profile_location"] == 1]["is_coded_as_witness"]

vals = users_df.loc[users_df["is_local_profile_location"] == 1]["is_coded_as_witness"].value_counts()
vals2 = users_df.loc[users_df["is_coded_as_witness"] == 1]["is_local_profile_location"].value_counts()


print('{:.4}% of {} users with local profile locations were coded as witness'.format(vals[1]/sum(vals)*100, sum(vals)))
print('{:.4}% of {} users were classified as having a local profile'.format(sum(vals)/len(users_df)*100, len(users_df)))
print('{:.4}% of {} witness codes had a local profile'.format(vals[1]/sum(vals)*100, sum(vals2)))
#vals2

64.99% of 397 users with local profile locations were coded as witness
26.47% of 1500 users were classified as having a local profile
64.99% of 386 witness codes had a local profile


In [223]:
import pandas as pd

def confusion_matrix(df: pd.DataFrame, col1: str, col2: str):
    """
    Given a dataframe with at least
    two categorical columns, create a 
    confusion matrix of the count of the columns
    cross-counts
    """
    return (
            df
            .groupby([col1, col2])
            .size()
            .unstack(fill_value=0)
            )


def calc_agreement_coefs(df: pd.DataFrame):
    """
    Calculates Cohen's Kappa and
    Krippendorff's Alpha for a
    given confusion matrix.
    """
    arr = df.to_numpy()
    n = arr.sum()
    p_o = 0
    for i in range(len(arr)):
        p_o += arr[i][i]/n
    p_e = 0
    for i in range(len(arr)):
        p_e += (arr.sum(axis=1)[i] *
                arr.sum(axis=0)[i]) / (n*n)
    kappa = (p_o-p_e)/(1-p_e)
    
    coin_arr = np.transpose(arr) + arr
    exp_distribution = [sum(x) for x in coin_arr]
    p_e_krippendorf = sum([a * (a-1) for a in exp_distribution])/(2*n*((2*n)-1))
    alpha = (p_o - p_e_krippendorf) / (1-p_e_krippendorf)
    
    return p_o, kappa, alpha


def calc_prec_recall(df: pd.DataFrame):
    """
    Calculates precision, recall and
    f-score for a given confusion matrix.
    
    Assumes true condition as ROW heading and
    ascending integer labels.
    """
    arr = df.to_numpy()
    if len(arr) != 2:
        return null
    results = {}
    results['Prevalence'] = arr.sum(axis=0)[1]/arr.sum()
    results['Accuracy'] = (arr[0][0] + arr[1][1])/arr.sum()
    results['Prec'] = arr[1][1]/arr.sum(axis=1)[1]
    results['Recall'] = arr[1][1]/arr.sum(axis=0)[1]
    results['f1Score'] = (2 * results['Prec'] * results['Recall'])/(results['Prec']+results['Recall'])
    results['Specificity'] = arr[0][0]/arr.sum(axis=0)[0]
    results['FalseNegRate'] = arr[0][1]/arr.sum(axis=0)[1]
    return results
    

In [227]:
conf = confusion_matrix(users_df, 'is_local_profile_location', 'is_coded_as_witness')
conf

is_coded_as_witness,0,1
is_local_profile_location,Unnamed: 1_level_1,Unnamed: 2_level_1
0,975,128
1,139,258


In [230]:
results = calc_prec_recall(conf)

p_o, kappa, alpha = calc_agreement_coefs(conf)
results['Cohen\'s Kappa'] = kappa
results['Krippendorff\'s Alpha'] = alpha

pd.DataFrame.from_dict(results, orient='index')

Unnamed: 0,0
Prevalence,0.257333
Accuracy,0.822
Prec,0.649874
Recall,0.668394
f1Score,0.659004
Specificity,0.875224
FalseNegRate,0.331606
Cohen's Kappa,0.538603
Krippendorff's Alpha,0.538725


In [244]:
conf = confusion_matrix(users_df, 'is_non_local_profile_location', 'is_coded_as_non_witness')
results = calc_prec_recall(conf)
p_o, kappa, alpha = calc_agreement_coefs(conf)
results['Cohen\'s Kappa'] = kappa
results['Krippendorff\'s Alpha'] = alpha
pd.DataFrame.from_dict(results, orient='index')

Unnamed: 0,0
Prevalence,0.722
Accuracy,0.694
Prec,0.892947
Recall,0.654663
f1Score,0.755461
Specificity,0.796163
FalseNegRate,0.345337
Cohen's Kappa,0.371632
Krippendorff's Alpha,0.346952


In [270]:
# users_df.loc[users_df["is_local_profile_location"] == 1]["is_coded_as_witness"]

vals = users_df.loc[users_df["is_local_profile_location"] == 1]["is_coded_as_witness"].value_counts()
vals2 = users_df.loc[users_df["is_coded_as_witness"] == 1]["is_local_profile_location"].value_counts()


print('{:.3}% of {} users with local profile locations were coded as witness'.format(vals[1]/sum(vals)*100, sum(vals)))
print('{:.3}% of {} users were classified as having a local profile'.format(sum(vals)/len(users_df)*100, len(users_df)))
print('{:.3}% of {} witness codes had a local profile'.format(vals[1]/sum(vals)*100, sum(vals2)))
#vals2

65.0% of 397 users with local profile locations were coded as witness
26.5% of 1500 users were classified as having a local profile
65.0% of 386 witness codes had a local profile


In [276]:
users_df.loc[users_df["is_coded_as_witness"] == 1]
#users_df.loc[users_df["is_local_profile_location"] == 1]

Unnamed: 0,added_at,centrality_betweenness,centrality_closeness,centrality_degree,centrality_eigenvector,centrality_load,centrality_undirected_eigenvector,created_at,default_profile,default_profile_image,...,account_age,day_of_detection,description_length,is_lang_en,has_translator_type,has_url,has_changed_screen_name,is_data_source_3,is_coded_as_witness,is_coded_as_non_witness
5,2017-08-26 02:13:06.809104+00:00,5.731425e-07,0.141014,0.000304,8.062092e-07,4.541555e-07,2.957769e-04,2012-08-23 18:34:19+00:00,1,0,...,1835,1,,1,0,0,0,1,1,0
7,2017-08-27 12:23:23.280713+00:00,,,,,,,2010-08-25 17:57:36+00:00,0,0,...,2564,2,130.0,1,0,1,0,1,1,0
17,2017-08-26 18:51:13.614464+00:00,0.000000e+00,0.000000,0.000061,1.764451e-52,0.000000e+00,6.314252e-04,2009-04-24 04:21:28+00:00,0,0,...,3053,1,154.0,1,0,1,0,1,1,0
32,2017-08-29 22:42:56.193578+00:00,,,,,,,2014-03-08 21:17:11+00:00,1,0,...,1273,4,125.0,1,0,0,0,1,1,0
33,2017-08-26 12:26:48.742614+00:00,2.589952e-03,0.197807,0.005528,3.330289e-03,2.527955e-03,9.876560e-03,2013-08-29 21:44:12+00:00,0,0,...,1464,1,125.0,1,0,0,0,0,1,0
40,2017-08-26 15:03:37.124131+00:00,1.021308e-04,0.176893,0.000243,1.058691e-04,1.066500e-04,5.427109e-04,2015-02-18 23:21:10+00:00,0,0,...,926,1,34.0,1,0,1,0,1,1,0
44,2017-08-27 04:47:31.659585+00:00,6.128523e-05,0.142966,0.001033,1.492774e-06,6.602367e-05,3.808750e-03,2010-12-21 05:00:23+00:00,1,0,...,2447,2,118.0,1,0,0,0,0,1,0
47,2017-08-29 00:35:24.310316+00:00,6.792954e-05,0.179065,0.001033,2.100564e-04,7.441400e-05,1.077327e-03,2009-05-26 22:02:47+00:00,0,0,...,3020,4,157.0,1,0,0,0,1,1,0
51,2017-08-26 14:59:23.625120+00:00,2.462279e-04,0.145853,0.000972,3.079477e-06,2.612296e-04,1.813336e-03,2013-08-22 14:25:25+00:00,1,0,...,1471,1,120.0,1,0,1,0,1,1,0
57,2017-08-26 12:54:18.582976+00:00,1.862631e-04,0.158961,0.001033,1.465418e-05,2.042437e-04,3.921818e-04,2014-09-22 03:30:33+00:00,0,0,...,1076,1,129.0,1,0,0,0,0,1,0


1    258
0    139
Name: is_coded_as_witness, dtype: int64
1    258
0    128
Name: is_local_profile_location, dtype: int64
