# RevealMobile Data Analysis

RevealMobile is a third-party data provider for X-Mode. I conducted an audit on a sample of data to verify that their data quality was up to X-Mode's standards.

In [1]:
import pandas as pd
import numpy as np
from geopy import distance
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv("bf90ccb07e861a6390d95b90db91f5f6.csv", names=["app_id", "ad_id", 'platform', 'latitude', 'longitude', 'horizontal_accuracy', 'location_at', 'email', 'ipv_4', 'user_agent', 'country', 'battery', 'background', 'network', 'venue_name', 'venue_category', 'vertical_accuracy', 'speed', 'heading', 'publisher_id', 'dwell_time', 'person_id', 'client_id', 'sdk_version', 'altitude', 'ipv_6', 'mobile_number', 'captured_at', 'created_at', 'floor', 'carrier', 'manufacturer', 'device_model', 'wifi_ssid', 'wifi_bssid', 'decorated_at', 'dist_moved', 'day_number', 'day_type', 'time_type', 'dwell_type', 'confidence', 'brand_name', 'source', 'tech_signals', 'geo_country', 'misc'])

In [3]:
df['captured_at'] = pd.to_datetime(df['captured_at'])
df['created_at'] = pd.to_datetime(df['created_at'])

In [4]:
df.head(3)

Unnamed: 0,app_id,ad_id,platform,latitude,longitude,horizontal_accuracy,location_at,email,ipv_4,user_agent,...,day_number,day_type,time_type,dwell_type,confidence,brand_name,source,tech_signals,geo_country,misc
0,32,E031FB25-1BF4-48F6-80AD-B4CB7D61F5A1,android,46.56025,-87.61569,4.0,2018-05-08T11:35:51.000Z,,166.137.12.52,android,...,,,,,,,,,36,E031FB25-1BF4-48F6-80AD-B4CB7D61F5A1
1,32,D9D309B3-B2BB-4C7D-AA26-AB31BC62DE65,android,46.438576,-87.59077,11.0,2018-05-08T10:10:20.000Z,,174.255.9.57,android,...,,,,,,,,,36,D9D309B3-B2BB-4C7D-AA26-AB31BC62DE65
2,32,7706F01D-0FCD-4134-95F3-0E52F0484751,android,45.709415,-87.57934,7.0,2018-05-08T02:05:07.000Z,,71.86.176.236,android,...,,,,,,,,,36,7706F01D-0FCD-4134-95F3-0E52F0484751


In [5]:
df.shape

(1747, 47)

In [6]:
trash = []
for column in df.columns:
    values = df[column].value_counts()
    if values.size == 0:
        trash.append(column)
small_df = df.drop(trash, axis=1)

In [7]:
small_df['captured_at_diff'] = small_df.apply(lambda x: x['captured_at'] - x['created_at'], axis=1)
small_df[small_df['captured_at_diff'] > pd.Timedelta(0)] # no difference between captured_at and created_at

Unnamed: 0,app_id,ad_id,platform,latitude,longitude,horizontal_accuracy,location_at,ipv_4,user_agent,country,...,dist_moved,day_number,day_type,time_type,dwell_type,confidence,source,geo_country,misc,captured_at_diff


In [8]:
small_df.drop(['captured_at_diff', 'captured_at', 'created_at'], axis=1, inplace=True)
small_df.head()

Unnamed: 0,app_id,ad_id,platform,latitude,longitude,horizontal_accuracy,location_at,ipv_4,user_agent,country,...,client_id,dist_moved,day_number,day_type,time_type,dwell_type,confidence,source,geo_country,misc
0,32,E031FB25-1BF4-48F6-80AD-B4CB7D61F5A1,android,46.56025,-87.61569,4.0,2018-05-08T11:35:51.000Z,166.137.12.52,android,US,...,19,,,,,,,,36,E031FB25-1BF4-48F6-80AD-B4CB7D61F5A1
1,32,D9D309B3-B2BB-4C7D-AA26-AB31BC62DE65,android,46.438576,-87.59077,11.0,2018-05-08T10:10:20.000Z,174.255.9.57,android,US,...,19,,,,,,,,36,D9D309B3-B2BB-4C7D-AA26-AB31BC62DE65
2,32,7706F01D-0FCD-4134-95F3-0E52F0484751,android,45.709415,-87.57934,7.0,2018-05-08T02:05:07.000Z,71.86.176.236,android,US,...,19,,,,,,,,36,7706F01D-0FCD-4134-95F3-0E52F0484751
3,32,E031FB25-1BF4-48F6-80AD-B4CB7D61F5A1,android,46.551,-87.61912,3.0,2018-05-08T15:10:52.000Z,166.137.12.52,android,US,...,19,,,,,,,,36,E031FB25-1BF4-48F6-80AD-B4CB7D61F5A1
4,32,AE6FC5FF-937B-4269-BD52-23430249603E,android,46.558456,-87.41477,17.538,2018-05-08T08:05:44.000Z,71.87.135.133,android,US,...,19,,,,,,,,36,AE6FC5FF-937B-4269-BD52-23430249603E


The dataframe is much narrower now.

In [9]:
small_df.columns

Index(['app_id', 'ad_id', 'platform', 'latitude', 'longitude',
       'horizontal_accuracy', 'location_at', 'ipv_4', 'user_agent', 'country',
       'network', 'venue_name', 'venue_category', 'publisher_id', 'client_id',
       'dist_moved', 'day_number', 'day_type', 'time_type', 'dwell_type',
       'confidence', 'source', 'geo_country', 'misc'],
      dtype='object')

In [10]:
small_df.describe()

Unnamed: 0,app_id,latitude,longitude,horizontal_accuracy,client_id,dist_moved,confidence,source
count,1747.0,1747.0,1747.0,1745.0,1747.0,1.0,2.0,341.0
mean,32.0,46.11981,-87.6263,13.038237,19.0,112241.46,0.591209,36.0
std,0.0,1.724508,0.904636,20.224357,0.0,,0.578118,0.0
min,32.0,27.80111,-92.46636,3.0,19.0,112241.46,0.182418,36.0
25%,32.0,46.533707,-87.615944,6.0,19.0,112241.46,0.386814,36.0
50%,32.0,46.560123,-87.61572,10.0,19.0,112241.46,0.591209,36.0
75%,32.0,46.56025,-87.613754,14.0,19.0,112241.46,0.795605,36.0
max,32.0,46.59148,-82.33359,324.638,19.0,112241.46,1.0,36.0


All points have an app_id, latitude, longitude, and horizontal_accuracy.

In [11]:
df[~df['venue_category'].isnull()]

Unnamed: 0,app_id,ad_id,platform,latitude,longitude,horizontal_accuracy,location_at,email,ipv_4,user_agent,...,day_number,day_type,time_type,dwell_type,confidence,brand_name,source,tech_signals,geo_country,misc
190,32,E031FB25-1BF4-48F6-80AD-B4CB7D61F5A1,android,46.5496,-87.4606,7.0,2018-05-13T22:17:04.000Z,,166.137.12.63,android,...,,,,,,,,,36,E031FB25-1BF4-48F6-80AD-B4CB7D61F5A1
930,32,E031FB25-1BF4-48F6-80AD-B4CB7D61F5A1,android,46.5491,-87.4602,7.0,2018-05-13T22:17:04.000Z,,166.137.12.63,android,...,,,,,,,,,36,E031FB25-1BF4-48F6-80AD-B4CB7D61F5A1
1696,32,BC6C6C8F-67D2-405A-8EC6-C34AF2B4DF52,android,40.638493,-82.33366,7.585,2018-05-25T19:26:47.000Z,,174.233.12.30,android,...,FRIDAY,WEEKDAY,EVENING,MOVING,0.182418,,,,36,BC6C6C8F-67D2-405A-8EC6-C34AF2B4DF52


## Duplicate lat-lon Analysis

There were some suspicious centroids of data. I look into it further here.

In [12]:
bad_longs = df['longitude'].value_counts().head(15).index
bad_lats = df['latitude'].value_counts().head(15).index

In [13]:
bad_ids = set()
for i in range(len(bad_longs)):
    temp = df[(df['longitude'] == bad_longs[i])]
    print(temp['ad_id'].value_counts())
    bad_ids.add(temp['ad_id'].value_counts().index[0])
    print(temp['longitude'].value_counts())
    print(temp['latitude'].value_counts())

E031FB25-1BF4-48F6-80AD-B4CB7D61F5A1    84
Name: ad_id, dtype: int64
-87.61571    84
Name: longitude, dtype: int64
46.560116    69
46.560250    10
46.560177     4
46.560303     1
Name: latitude, dtype: int64
E031FB25-1BF4-48F6-80AD-B4CB7D61F5A1    80
Name: ad_id, dtype: int64
-87.616066    80
Name: longitude, dtype: int64
46.560417    80
Name: latitude, dtype: int64
E031FB25-1BF4-48F6-80AD-B4CB7D61F5A1    65
Name: ad_id, dtype: int64
-87.615845    65
Name: longitude, dtype: int64
46.560100    54
46.560280     6
46.559914     3
46.560184     2
Name: latitude, dtype: int64
E031FB25-1BF4-48F6-80AD-B4CB7D61F5A1    64
Name: ad_id, dtype: int64
-87.61568    64
Name: longitude, dtype: int64
46.560196    45
46.560207     5
46.560314     5
46.560184     4
46.560270     3
46.559900     1
46.560110     1
Name: latitude, dtype: int64
E031FB25-1BF4-48F6-80AD-B4CB7D61F5A1    63
Name: ad_id, dtype: int64
-87.61578    63
Name: longitude, dtype: int64
46.560200    43
46.560123    14
46.560345     4
46.

In [14]:
print(bad_ids)

{'E031FB25-1BF4-48F6-80AD-B4CB7D61F5A1'}


In [15]:
df['location_at'] = pd.to_datetime(df['location_at'])

In [16]:
foo = df['location_at'].value_counts()
foo = foo[foo > 1]

In [17]:
bad_timestamp_ids = []
for timestamp in foo.values:
    bar = df[df['location_at'] == pd.to_datetime(timestamp)]
    baz = bar['ad_id'].value_counts()
    bad_timestamp_ids.extend(baz[baz > 1].values)
print(bad_timestamp_ids)

[]


In [18]:
bad_point = df[(df['longitude'] == -83.362072) & (df['latitude'] == 35.916396)]

In [19]:
bad_point['ad_id'].value_counts()

Series([], Name: ad_id, dtype: int64)

In [20]:
bad_guy = df[df['ad_id'] == "F6FA3BC6-CBEF-4B82-8FC7-F9D9B5ED5747"]

In [21]:
longs = bad_guy['longitude'].value_counts()
bad_guy['count'] = bad_guy['longitude'].apply(lambda x: longs[x])