In [1]:
import pandas as pd
import numpy as np

from matplotlib import pyplot as plt
from datetime import datetime,timedelta
import re
import time

from scipy import stats
import csv

from sklearn.decomposition import PCA
from sklearn.preprocessing import scale, normalize
import seaborn.apionly as sns

In [2]:
data = pd.read_csv('balanced_merged.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
data.shape

(700000, 92)

In [4]:
data.ix[0,:]

Unnamed: 0                                                                                  0
advertiser_app_store_id                                                            1009442510
country_code                                                                               CN
city                                                                                      NaN
campaign_id                                                          56fc3b248409c5677800005c
creative_id                                                          582efacc4156ab1024000034
device_language                                                                            zh
device_make                                                                             iPad3
device_model                                                                                4
device_platform                                                                           iOS
device_connection                                           

### Convert timestamps to date format

In [None]:
cols = ['time_of_last_delivery_this_campaign', 'time_of_last_delivery_this_creative', 'time_of_last_delivery_any_installed_app',
       'time_of_last_delivery_any_installed_app','time_of_last_vungle_delivery', 'time_of_this_impression','time_of_this_request']
for col in cols:
#     print col
    name = col + "_weekday"
    data[name] =  pd.to_datetime(data[col]).apply(lambda x: x.weekday())


### get continent info from time zone

In [6]:
def get_region(x):
    if len(str(x).split('/')) > 1:
        return str(x).split('/')[0]
    else:
        return None

In [7]:
data['time_zone_region'] = data['time_zone'].apply(lambda x: get_region(x))

### standardize the format of device_language

In [8]:
data['device_language'] = data['device_language'].apply(lambda x: str(x).lower().split('-')[0])

### grouping insignificant data (sig level = 0.98)

In [9]:
def new_col_sig98(df, col):
    sig_level = 0.02
    df = df.copy()
    items = df[col].unique()
    for i in items:
        a = df[df[col] == i]['is_install']
        b = df[df[col] != i]['is_install']
        t, p = stats.ttest_ind(a, b, equal_var=False)
        if p >= sig_level:
            df[col][df[col] == i] = 'other'
    return df[col]

In [10]:
data.ix[0,:]

Unnamed: 0                                                                                         0
advertiser_app_store_id                                                                   1009442510
country_code                                                                                      CN
city                                                                                             NaN
campaign_id                                                                 56fc3b248409c5677800005c
creative_id                                                                 582efacc4156ab1024000034
device_language                                                                                   zh
device_make                                                                                    iPad3
device_model                                                                                       4
device_platform                                                                            

In [12]:
cols = ['advertiser_app_store_id', 'country_code', 'device_language', 'device_platform',
       'device_connection', 'time_zone_region', 'is_publisher', 'title', 'developer',
        'content_rating', 'current_version_user_ratings', 'user_rating', 
        'screenshot_urls', 'package_name', 'primary_category', 
        'has_in_app_purchases']
for col in cols:
    print col
    name = col + '_98'
    data[name] = new_col_sig98(data, col)

In [41]:
data

Unnamed: 0,advertiser_app_store_id,country_code,city,campaign_id,creative_id,device_language,device_make,device_model,device_platform,device_connection,...,time_of_this_request,time_zone,is_install,time_of_last_delivery_this_campaign_hour,time_zone_continent,time_of_last_delivery_this_creative_hour,time_of_last_delivery_any_installed_app_hour,time_of_last_vungle_delivery_hour,time_of_this_impression_hour,time_of_this_request_hour
0,1009442510,CN,,56fc3b248409c5677800005c,582efacc4156ab1024000034,zh,iPad3,4,iOS,wifi,...,2016-12-31 05:24:16,Asia/Shanghai,0,11.0,Asia,10.0,,4.0,11,5
1,5743f032a5a36ff4300000a5,ID,,581a0af506c6c9996c000073,58465ef52c19fc6503000081,id,LENOVO,Lenovo A6010,android,mobile,...,2016-12-29 10:56:03,Asia/Makassar,0,,Asia,,,10.0,4,10
2,727296976,CA,,554d597375fbbeb654000163,57767bf0ef4e66dc5f000088,en,iPad6,3,iOS,wifi,...,2016-12-29 01:19:54,America/Toronto,0,,America,,,1.0,1,1
3,57a28ffbb019f8257c00021d,LA,,580fe05d6000a03f2d00015f,584267696dea79c175000030,th,iPhone7,2,iOS,mobile,...,2016-12-30 06:17:36,Asia/Bangkok,0,4.0,Asia,4.0,,4.0,6,6
4,com.plarium.vikings,CZ,,585435605c9661f31200003c,585aef35a76e13401e0038c7,cs,Archos,Archos 55 diamond Selfie,android,wifi,...,2016-12-30 08:31:14,Europe/Prague,0,22.0,Europe,,,22.0,8,8
5,558bdb44bbed958866000191,CN,,56e93be0eccafb7e3500006b,57bc42dc627003b41f00006b,zh,iPhone7,2,iOS,,...,2016-12-29 01:33:40,Asia/Shanghai,0,1.0,Asia,,,1.0,1,1
6,com.skout.android,BR,,5260798dc76408c54f00000f,5600615480fbbf1028000184,pt,asus,ASUS_Z00VD,android,wifi,...,2016-12-29 05:53:27,America/Sao_Paulo,0,4.0,America,4.0,,4.0,5,5
7,58534782688691e41e000011,US,,5853f8293adda7a60c000024,5860c91d156bf31f650001b0,en,iPhone8,2,iOS,,...,2016-12-30 21:31:26,America/New_York,0,17.0,America,,,5.0,21,21
8,585215f054dbb89701000ba2,US,,585247d492f308b72a00015a,5860bde8156bf31f65000198,en,samsung,SM-T357T,android,wifi,...,2016-12-30 03:36:34,America/New_York,0,2.0,America,,,2.0,3,3
9,937718942,US,,582f9de0740cf4426b1d5338,5848d7a6b942517e03000113,en,iPad3,4,iOS,wifi,...,2016-12-30 22:21:11,America/Detroit,0,20.0,America,20.0,,20.0,2,22


In [14]:
data.columns

Index([u'Unnamed: 0', u'advertiser_app_store_id', u'country_code', u'city',
       u'campaign_id', u'creative_id', u'device_language', u'device_make',
       u'device_model', u'device_platform', u'device_connection',
       u'device_os_version', u'device_screen_height', u'device_screen_width',
       u'device_volume', u'event_id', u'n_campaign_views',
       u'n_vungle_installs', u'publisher_app_store_id',
       u'time_of_last_delivery_this_campaign',
       u'time_of_last_delivery_this_creative',
       u'time_of_last_delivery_any_installed_app',
       u'time_of_last_vungle_delivery', u'time_of_this_impression',
       u'timestamp_at_install', u'time_of_this_request', u'time_zone',
       u'is_install', u'time_of_last_delivery_this_campaign_date',
       u'time_of_last_delivery_this_creative_date',
       u'time_of_last_delivery_any_installed_app_date', u'time_zone_continent',
       u'time_of_last_vungle_delivery_date', u'time_of_this_impression_date',
       u'time_of_this_request

In [15]:
for i in data.columns[28:]:
    print 'column', i,'contains',sum(data[i].str.contains("other")),' others'

column time_of_last_delivery_this_campaign_date contains nan  others
column time_of_last_delivery_this_creative_date contains nan  others
column time_of_last_delivery_any_installed_app_date contains nan  others
column time_zone_continent contains nan  others
column time_of_last_vungle_delivery_date contains nan  others
column time_of_this_impression_date contains 0  others
column time_of_this_request_date contains 0  others
column advertiser_app_store_id_95 contains 47507  others
column country_code_95 contains nan  others
column device_language_95 contains 26706  others
column device_platform_95 contains 0  others
column device_connection_95 contains nan  others
column time_zone_continent_95 contains nan  others


In [17]:
data['time_of_last_delivery_this_campaign_date contains']

KeyError: 'time_of_last_delivery_this_campaign_date contains'