In [1]:
import os

# Switch the working directory to src/. The relative cache path handed to
# getJoinedData ('../data/intermediate/...') is resolved against this directory;
# presumably it also lets the `modules` package import resolve (TODO confirm).
os.chdir('../src/')

In [2]:
from modules.dataProcessing import getJoinedData

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
# Location of the cached joined dataset, relative to the current working directory.
cachedJumboFile = '../data/intermediate/jumboData.snappy.parquet'

# Data will be fetched if the cached file is not found.
jumboData = getJoinedData(cachedFile=cachedJumboFile)

In [4]:
# Preview the first five rows of the joined dataset.
jumboData.head(5)

Unnamed: 0,capturedDate,business_vertical,country,region,city_code,strategy_id,channel_name,goal_type,total_spend_cpm,impressions,clicks,conversions
0,2018-09-17,Finance,Indonesia,Aceh Sm,idn-ac-banda aceh,3714868,Display,1.0,149.565425,48.0,0.0,0.0
1,2018-09-17,Finance,Indonesia,Aceh Sm,idn-ac-banda aceh,3715603,Display,1.0,291.516169,124.0,1.0,0.0
2,2018-09-17,Finance,Indonesia,Aceh Sm,idn-ac-banda aceh,3716560,Display,2.0,509.019147,415.0,0.0,0.0
3,2018-09-17,Finance,Indonesia,Aceh Sm,idn-ac-banda aceh,3718685,Mobile,2.0,134.542628,218.0,0.0,0.0
4,2018-09-17,Finance,Indonesia,Aceh Sm,idn-ac-banda aceh,3718702,Mobile,1.0,3.761886,1.0,0.0,0.0


In [6]:
# Parse the capture date once, then derive both period columns with the vectorised
# .dt accessor instead of a per-row Python lambda. pd.to_datetime leaves datetime64
# data as it is and also accepts date objects / ISO strings. Missing dates propagate
# as nulls rather than raising inside str.format.
capturedDates = pd.to_datetime(jumboData['capturedDate'])

jumboData['yearCaptured']   = capturedDates.dt.year
jumboData['YYYYMMCaptured'] = capturedDates.dt.strftime('%Y%m')   # e.g. '201809'

# Columns profiled by value counts in the cells below.
# NOTE(review): 'conversions' is a count but is profiled as categorical here -- confirm intended.
categoricalCols = ['business_vertical', 'country', 'region', 'city_code', 'strategy_id', 'channel_name', 
                   'goal_type', 'yearCaptured', 'YYYYMMCaptured', 'conversions']
# Columns profiled with describe() in the cells below.
numericalCols   = ['total_spend_cpm', 'impressions', 'clicks']

In [7]:
print('printing top 10 value counts for each categorical col..')
print('')

banner = '-'*15

for colName in categoricalCols:
    # Missing values are bucketed under an explicit 'NA' label so they show up in the counts.
    valueCounts = (
        jumboData[colName]
        .fillna('NA')
        .value_counts()
        .rename_axis(colName)
        .reset_index(name='nCounts')
    )

    # The header shows the column name and the (distinct values, 2) shape of the count table.
    header = banner + colName + ' -> {}'.format(valueCounts.shape) + banner
    print(header)
    print(valueCounts[:10])
    print('')

printing top 10 value counts for each categorical col..

---------------business_vertical -> (2, 2)---------------
  business_vertical  nCounts
0           Finance    43062
1           Unknown    32143

---------------country -> (8, 2)---------------
            country  nCounts
0         Australia    34634
1         Indonesia    28378
2       New Zealand     6173
3          Malaysia     3203
4         Hong Kong     2375
5       Philippines      428
6  reserved/private       13
7           Somalia        1

---------------region -> (127, 2)---------------
              region  nCounts
0    New South Wales    12682
1           Victoria     8368
2         Queensland     6550
3         Jawa Timur     4398
4        Jawa Tengah     4304
5         Jawa Barat     3639
6       Jakarta Raya     3417
7  Western Australia     2805
8    South Australia     2676
9          hong kong     2375

---------------city_code -> (3894, 2)---------------
            city_code  nCounts
0     idn-jt-semarang  

In [8]:
print('describe stats value for each numerical col..')
print('')

banner = '-'*15

for colName in numericalCols:
    column    = jumboData[colName]
    nullCount = column.isnull().sum()   # number of rows where this column is missing

    # The header carries the null count, followed by the describe() summary of the column.
    print(banner + colName + ' -> null: {}'.format(nullCount) + banner)
    print(column.describe())
    print('')

describe stats value for each numerical col..

---------------total_spend_cpm -> null: 0---------------
count     75205.000000
mean        424.826770
std        5275.184032
min           0.242250
25%           2.815800
50%           8.621250
75%          32.941426
max      374622.032272
Name: total_spend_cpm, dtype: float64

---------------impressions -> null: 1502---------------
count     73703.000000
mean        291.512313
std        4179.601279
min           0.000000
25%           1.000000
50%           4.000000
75%          18.000000
max      374343.000000
Name: impressions, dtype: float64

---------------clicks -> null: 1522---------------
count    73683.000000
mean         0.371049
std          7.118569
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max        661.000000
Name: clicks, dtype: float64



The cell below checks whether each region is listed under the correct country.

In [9]:
# Bucket missing countries under an explicit 'NA' label ONCE and reuse the filled
# series for both the list of countries and the row filter. Filtering on the raw
# column never matches the 'NA' label (NaN == 'NA' is False), so rows with a missing
# country would otherwise be reported as an empty table.
countryLabels   = jumboData['country'].fillna('NA')
listOfCountries = countryLabels.unique()

# Every region is printed (not only the top 10): the purpose is to eyeball each
# country's complete region list for regions assigned to the wrong country.
print('printing region counts for each country..')
print('')

for country in listOfCountries:
    tmpDF         = jumboData[ countryLabels == country ]
    tmpDF         = tmpDF['region'].fillna('NA').value_counts().reset_index()
    tmpDF.columns = ['region', 'nCounts']
    print('-'*15 + country + '-'*15)
    print(tmpDF)
    print('')

printing top 10 regions for each country..

---------------Indonesia---------------
                 region  nCounts
0            Jawa Timur     4398
1           Jawa Tengah     4304
2            Jawa Barat     3639
3          Jakarta Raya     3417
4                Banten     1297
5         Yogyakarta Jw     1163
6                  Bali     1117
7      Kalimantan Timur      681
8        Sumatera Barat      668
9      Sulawesi Selatan      666
10       Sumatera Utara      650
11     Sumatera Selatan      584
12       Kepulauan Riau      557
13                 Risu      514
14   Kalimantan Selatan      511
15              Aceh Sm      481
16       Sulawesi Utara      428
17  Nusa Tenggara Timur      418
18     Kalimantan Barat      390
19              Lampung      342
20    Kalimantan Tengah      300
21                Jambi      239
22      Sulawesi Tengah      198
23  Nusa Tenggara Barat      190
24     kalimantan utara      176
25             Bengkulu      169
26            Gorontalo  

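Treat any city_code that does not follow the three-part XX-XXX-XXXX format (i.e. does not split into exactly three hyphen-separated parts) as non-compliant, and list every non-compliant city_code with its count.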

In [10]:
# A city_code counts as compliant when it contains exactly two hyphens, i.e. it
# splits into three parts. The vectorised .str accessor propagates missing values
# instead of raising AttributeError the way `c.split('-')` does on a NaN.
cityCodes   = jumboData['city_code']
isCompliant = cityCodes.str.count('-').eq(2).fillna(False)

# Collapse compliant codes into a single label; non-compliant codes keep their raw
# value so that each offending code is listed with its frequency.
cityCodes.mask(isCompliant, 'compliant').value_counts()

compliant                                  74889
idn-la-tanjungkarang-telukbetung             185
aus-qld-kippa-ring                            32
mys-10-subang jaya - usj 12 - 18              18
mys-10-subang jaya - usj 5 - 8                18
mys-10-subang jaya - usj 19 - 24              18
Unknown                                       13
aus-nsw-brighton-le-sands                      7
mys-05-jalan mutiara 1 - 3                     5
phl-cav-alapan i-a                             4
mys-12-tanjung aru - peti surat 1 - 690        4
phl-man-ayala-paseo de roxas                   4
phl-cav-balsahan-bisita                        2
phl-ceb-lapu-lapu                              2
mys-13-awat-awat                               1
phl-ils-an-annam                               1
phl-ceb-can-asujan                             1
mys-10-batu 9 cheras - peti surat              1
Name: city_code, dtype: int64