## Read the user data file

- if no user data, then read the original data and create it

In [1]:
import pandas as pd
import numpy as np
import pickle

location='./data/'
clickfilename = 'train'
userdatafilename = 'train_users_only'

converters = {"site_id": lambda x: int(x, 16),
              "site_domain": lambda x: int(x, 16),
              "site_category": lambda x: int(x, 16),
              "app_id": lambda x: int(x, 16),
              "app_domain": lambda x: int(x, 16),
              "app_category": lambda x: int(x, 16),
              "device_id": lambda x: int(x, 16),
              "device_model": lambda x: int(x, 16),
              "device_type": lambda x: int(x, 16),
              "device_ip": lambda x: int(x, 16),
             }

clickcsvpath = location+clickfilename+'.csv'
clickpicklepath = location+clickfilename+'.pkl'
userdatapath = location+userdatafilename+'.pkl'

# read the full dataset, saved as pickled dataframe
try:
    print('reading original pickled data...')
    with open(clickpicklepath, 'rb') as handle:
        data = pickle.load(handle)

except:
    # couldn't read the pickled version, so read the csv
    print('error: reading original csv file')
    #Import csv file, converting hex vals to decimal
    data=pd.read_csv(clickcsvpath, converters=converters) 
    # save data as pickle
    with open(clickpicklepath, 'wb') as handle:
        pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)
            
# try to read the pickled user grouped data file version
try:
    print('reading pickled user grouped data...')
    with open(userdatapath, 'rb') as handle:
        userdata = pickle.load(handle)

# no user data file, so read original data and create user data
except:      
    print("error: couldn't read pickled user grouped data, create that")
    
    # group data by device_IP, get count of each unique device IP
    # userdata = data.groupby('device_ip').count().reset_index()
    userdata = data.groupby('device_ip').size().reset_index(name='counts')
    
    print('userdata.shape=',userdata.shape)

    # save data
    with open(userdatapath, 'wb') as handle:
        pickle.dump(userdata, handle, protocol=pickle.HIGHEST_PROTOCOL)
        
print('done getting data')


reading original pickled data...
reading pickled user grouped data...
done getting data


In [5]:
# temp data save path
tempdatapath = location+'tempdata.pkl'

# read the last saved temp file, if there was one
try:
    print('reading temp data...')
    with open(tempdatapath, 'rb') as handle:
        repeats,starti = pickle.load(handle)

# no user data file, so create new user count data and start from scratch
except:
    print("can't find temp file, starting at beginning")
    repeats = pd.Series(np.zeros([data.shape[0],]))
    starti = 0

# for each device IP (subst for user ID), 
#   fill all data rows with this IP with the click count for this IP
for ip,count,i in zip(userdata.device_ip[starti:],
                      userdata.counts[starti:],
                      range(starti,userdata.shape[0])):
    repeats.loc[data['device_ip']==ip] = count
    
    # periodically save the results to temp file
    if i and i % 1000 == 0:
        numchanged = (repeats>0).sum() 
        print("%d/%d: %d/%d changed = %2.3f %%"%(
            i,userdata.shape[0],
            numchanged,repeats.shape[0],
            100*numchanged/repeats.shape[0]) )
        # save data
        with open(tempdatapath, 'wb') as handle:
            pickle.dump((repeats,i), handle, protocol=pickle.HIGHEST_PROTOCOL)
        handle.close()

# save data
with open(tempdatapath, 'wb') as handle:
    pickle.dump((repeats,i), handle, protocol=pickle.HIGHEST_PROTOCOL)
handle.close()


reading temp data...
3907000/6729486: 23482643/40428967 changed = 58.084 %
3908000/6729486: 23489322/40428967 changed = 58.100 %
3909000/6729486: 23494219/40428967 changed = 58.112 %
3910000/6729486: 23498927/40428967 changed = 58.124 %
3911000/6729486: 23503604/40428967 changed = 58.136 %
3912000/6729486: 23509267/40428967 changed = 58.150 %
3913000/6729486: 23514435/40428967 changed = 58.162 %
3914000/6729486: 23519385/40428967 changed = 58.175 %
3915000/6729486: 23524898/40428967 changed = 58.188 %
3916000/6729486: 23529826/40428967 changed = 58.200 %
3917000/6729486: 23535650/40428967 changed = 58.215 %
3918000/6729486: 23540066/40428967 changed = 58.226 %
3919000/6729486: 23546665/40428967 changed = 58.242 %
3920000/6729486: 23552144/40428967 changed = 58.256 %
3921000/6729486: 23559539/40428967 changed = 58.274 %
3922000/6729486: 23566093/40428967 changed = 58.290 %
3923000/6729486: 23570807/40428967 changed = 58.302 %
3924000/6729486: 23575543/40428967 changed = 58.313 %
3925000

4059000/6729486: 24372434/40428967 changed = 60.285 %
4060000/6729486: 24380161/40428967 changed = 60.304 %
4061000/6729486: 24386531/40428967 changed = 60.319 %
4062000/6729486: 24393617/40428967 changed = 60.337 %
4063000/6729486: 24399291/40428967 changed = 60.351 %
4064000/6729486: 24404377/40428967 changed = 60.364 %
4065000/6729486: 24409398/40428967 changed = 60.376 %
4066000/6729486: 24414661/40428967 changed = 60.389 %
4067000/6729486: 24422330/40428967 changed = 60.408 %
4068000/6729486: 24435524/40428967 changed = 60.441 %
4069000/6729486: 24440206/40428967 changed = 60.452 %
4070000/6729486: 24445095/40428967 changed = 60.464 %
4071000/6729486: 24449466/40428967 changed = 60.475 %
4072000/6729486: 24454137/40428967 changed = 60.487 %
4073000/6729486: 24460495/40428967 changed = 60.502 %
4074000/6729486: 24465262/40428967 changed = 60.514 %
4075000/6729486: 24470013/40428967 changed = 60.526 %
4076000/6729486: 24477642/40428967 changed = 60.545 %
4077000/6729486: 24482531/40

4211000/6729486: 25299376/40428967 changed = 62.577 %
4212000/6729486: 25306082/40428967 changed = 62.594 %
4213000/6729486: 25310128/40428967 changed = 62.604 %
4214000/6729486: 25315220/40428967 changed = 62.617 %
4215000/6729486: 25320745/40428967 changed = 62.630 %
4216000/6729486: 25325796/40428967 changed = 62.643 %
4217000/6729486: 25330684/40428967 changed = 62.655 %
4218000/6729486: 25337068/40428967 changed = 62.671 %
4219000/6729486: 25342104/40428967 changed = 62.683 %
4220000/6729486: 25348200/40428967 changed = 62.698 %
4221000/6729486: 25353691/40428967 changed = 62.712 %
4222000/6729486: 25362450/40428967 changed = 62.733 %
4223000/6729486: 25369267/40428967 changed = 62.750 %
4224000/6729486: 25373185/40428967 changed = 62.760 %
4225000/6729486: 25379959/40428967 changed = 62.777 %
4226000/6729486: 25384554/40428967 changed = 62.788 %
4227000/6729486: 25389754/40428967 changed = 62.801 %
4228000/6729486: 25396264/40428967 changed = 62.817 %
4229000/6729486: 25401503/40

4363000/6729486: 26165351/40428967 changed = 64.719 %
4364000/6729486: 26171733/40428967 changed = 64.735 %
4365000/6729486: 26178728/40428967 changed = 64.752 %
4366000/6729486: 26182384/40428967 changed = 64.761 %
4367000/6729486: 26187205/40428967 changed = 64.773 %
4368000/6729486: 26192069/40428967 changed = 64.785 %
4369000/6729486: 26196277/40428967 changed = 64.796 %
4370000/6729486: 26201334/40428967 changed = 64.808 %
4371000/6729486: 26206849/40428967 changed = 64.822 %
4372000/6729486: 26212812/40428967 changed = 64.837 %
4373000/6729486: 26217568/40428967 changed = 64.848 %
4374000/6729486: 26224554/40428967 changed = 64.866 %
4375000/6729486: 26229789/40428967 changed = 64.879 %
4376000/6729486: 26234086/40428967 changed = 64.889 %
4377000/6729486: 26239171/40428967 changed = 64.902 %
4378000/6729486: 26243942/40428967 changed = 64.914 %
4379000/6729486: 26248638/40428967 changed = 64.925 %
4380000/6729486: 26252970/40428967 changed = 64.936 %
4381000/6729486: 26260208/40

4515000/6729486: 27136399/40428967 changed = 67.121 %
4516000/6729486: 27141691/40428967 changed = 67.134 %
4517000/6729486: 27148078/40428967 changed = 67.150 %
4518000/6729486: 27154403/40428967 changed = 67.166 %
4519000/6729486: 27159637/40428967 changed = 67.179 %
4520000/6729486: 27166331/40428967 changed = 67.195 %
4521000/6729486: 27171722/40428967 changed = 67.209 %
4522000/6729486: 27175776/40428967 changed = 67.219 %
4523000/6729486: 27180330/40428967 changed = 67.230 %
4524000/6729486: 27186181/40428967 changed = 67.244 %
4525000/6729486: 27191387/40428967 changed = 67.257 %
4526000/6729486: 27196663/40428967 changed = 67.270 %
4527000/6729486: 27201303/40428967 changed = 67.282 %
4528000/6729486: 27205854/40428967 changed = 67.293 %
4529000/6729486: 27211792/40428967 changed = 67.308 %
4530000/6729486: 27221756/40428967 changed = 67.332 %
4531000/6729486: 27228910/40428967 changed = 67.350 %
4532000/6729486: 27234061/40428967 changed = 67.363 %
4533000/6729486: 27239125/40

4667000/6729486: 28250194/40428967 changed = 69.876 %
4668000/6729486: 28255771/40428967 changed = 69.890 %
4669000/6729486: 28263405/40428967 changed = 69.909 %
4670000/6729486: 28268534/40428967 changed = 69.921 %
4671000/6729486: 28274005/40428967 changed = 69.935 %
4672000/6729486: 28279536/40428967 changed = 69.949 %
4673000/6729486: 28284660/40428967 changed = 69.961 %
4674000/6729486: 28289359/40428967 changed = 69.973 %
4675000/6729486: 28294010/40428967 changed = 69.984 %
4676000/6729486: 28299327/40428967 changed = 69.998 %
4677000/6729486: 28308383/40428967 changed = 70.020 %
4678000/6729486: 28312984/40428967 changed = 70.031 %
4679000/6729486: 28317937/40428967 changed = 70.044 %
4680000/6729486: 28322586/40428967 changed = 70.055 %
4681000/6729486: 28329010/40428967 changed = 70.071 %
4682000/6729486: 28332986/40428967 changed = 70.081 %
4683000/6729486: 28341411/40428967 changed = 70.102 %
4684000/6729486: 28348253/40428967 changed = 70.119 %
4685000/6729486: 28352486/40

4819000/6729486: 29107092/40428967 changed = 71.996 %
4820000/6729486: 29111600/40428967 changed = 72.007 %
4821000/6729486: 29118091/40428967 changed = 72.023 %
4822000/6729486: 29122423/40428967 changed = 72.034 %
4823000/6729486: 29127094/40428967 changed = 72.045 %
4824000/6729486: 29131919/40428967 changed = 72.057 %
4825000/6729486: 29137477/40428967 changed = 72.071 %
4826000/6729486: 29143232/40428967 changed = 72.085 %
4827000/6729486: 29147448/40428967 changed = 72.095 %
4828000/6729486: 29154097/40428967 changed = 72.112 %
4829000/6729486: 29167849/40428967 changed = 72.146 %
4830000/6729486: 29173244/40428967 changed = 72.159 %
4831000/6729486: 29179207/40428967 changed = 72.174 %
4832000/6729486: 29183466/40428967 changed = 72.185 %
4833000/6729486: 29188539/40428967 changed = 72.197 %
4834000/6729486: 29193910/40428967 changed = 72.210 %
4835000/6729486: 29199122/40428967 changed = 72.223 %
4836000/6729486: 29203791/40428967 changed = 72.235 %
4837000/6729486: 29210128/40

4971000/6729486: 29987110/40428967 changed = 74.172 %
4972000/6729486: 29993094/40428967 changed = 74.187 %
4973000/6729486: 30001025/40428967 changed = 74.207 %
4974000/6729486: 30005323/40428967 changed = 74.217 %
4975000/6729486: 30011699/40428967 changed = 74.233 %
4976000/6729486: 30017522/40428967 changed = 74.248 %
4977000/6729486: 30023670/40428967 changed = 74.263 %
4978000/6729486: 30029968/40428967 changed = 74.278 %
4979000/6729486: 30035874/40428967 changed = 74.293 %
4980000/6729486: 30040948/40428967 changed = 74.306 %
4981000/6729486: 30045530/40428967 changed = 74.317 %
4982000/6729486: 30064156/40428967 changed = 74.363 %
4983000/6729486: 30073991/40428967 changed = 74.387 %
4984000/6729486: 30082563/40428967 changed = 74.408 %
4985000/6729486: 30088134/40428967 changed = 74.422 %
4986000/6729486: 30093161/40428967 changed = 74.435 %
4987000/6729486: 30098600/40428967 changed = 74.448 %
4988000/6729486: 30105146/40428967 changed = 74.464 %
4989000/6729486: 30109896/40

5123000/6729486: 30861965/40428967 changed = 76.336 %
5124000/6729486: 30868042/40428967 changed = 76.351 %
5125000/6729486: 30872863/40428967 changed = 76.363 %
5126000/6729486: 30877048/40428967 changed = 76.374 %
5127000/6729486: 30885175/40428967 changed = 76.394 %
5128000/6729486: 30889984/40428967 changed = 76.406 %
5129000/6729486: 30895604/40428967 changed = 76.419 %
5130000/6729486: 30900461/40428967 changed = 76.431 %
5131000/6729486: 30904916/40428967 changed = 76.443 %
5132000/6729486: 30909877/40428967 changed = 76.455 %
5133000/6729486: 30914686/40428967 changed = 76.467 %
5134000/6729486: 30919251/40428967 changed = 76.478 %
5135000/6729486: 30925794/40428967 changed = 76.494 %
5136000/6729486: 30931431/40428967 changed = 76.508 %
5137000/6729486: 30936799/40428967 changed = 76.521 %
5138000/6729486: 30941313/40428967 changed = 76.533 %
5139000/6729486: 30945113/40428967 changed = 76.542 %
5140000/6729486: 30950908/40428967 changed = 76.556 %
5141000/6729486: 30954543/40

5275000/6729486: 31837831/40428967 changed = 78.750 %
5276000/6729486: 31842749/40428967 changed = 78.762 %
5277000/6729486: 31847600/40428967 changed = 78.774 %
5278000/6729486: 31851701/40428967 changed = 78.784 %
5279000/6729486: 31857114/40428967 changed = 78.798 %
5280000/6729486: 31861218/40428967 changed = 78.808 %
5281000/6729486: 31865490/40428967 changed = 78.818 %
5282000/6729486: 31870248/40428967 changed = 78.830 %
5283000/6729486: 31875036/40428967 changed = 78.842 %
5284000/6729486: 31880322/40428967 changed = 78.855 %
5285000/6729486: 31885057/40428967 changed = 78.867 %
5286000/6729486: 31890484/40428967 changed = 78.880 %
5287000/6729486: 31896373/40428967 changed = 78.895 %
5288000/6729486: 31904345/40428967 changed = 78.915 %
5289000/6729486: 31907957/40428967 changed = 78.924 %
5290000/6729486: 31916817/40428967 changed = 78.945 %
5291000/6729486: 31924820/40428967 changed = 78.965 %
5292000/6729486: 31929141/40428967 changed = 78.976 %
5293000/6729486: 31932803/40

5427000/6729486: 32668211/40428967 changed = 80.804 %
5428000/6729486: 32673413/40428967 changed = 80.817 %
5429000/6729486: 32679876/40428967 changed = 80.833 %
5430000/6729486: 32684935/40428967 changed = 80.845 %
5431000/6729486: 32689785/40428967 changed = 80.857 %
5432000/6729486: 32695022/40428967 changed = 80.870 %
5433000/6729486: 32700337/40428967 changed = 80.883 %
5434000/6729486: 32705553/40428967 changed = 80.896 %
5435000/6729486: 32710607/40428967 changed = 80.909 %
5436000/6729486: 32716681/40428967 changed = 80.924 %
5437000/6729486: 32720725/40428967 changed = 80.934 %
5438000/6729486: 32726361/40428967 changed = 80.948 %
5439000/6729486: 32730247/40428967 changed = 80.957 %
5440000/6729486: 32734501/40428967 changed = 80.968 %
5441000/6729486: 32740061/40428967 changed = 80.982 %
5442000/6729486: 32745621/40428967 changed = 80.995 %
5443000/6729486: 32758717/40428967 changed = 81.028 %
5444000/6729486: 32832596/40428967 changed = 81.211 %
5445000/6729486: 32838505/40

5579000/6729486: 33641466/40428967 changed = 83.211 %
5580000/6729486: 33647665/40428967 changed = 83.227 %
5581000/6729486: 33655801/40428967 changed = 83.247 %
5582000/6729486: 33660728/40428967 changed = 83.259 %
5583000/6729486: 33666931/40428967 changed = 83.274 %
5584000/6729486: 33671727/40428967 changed = 83.286 %
5585000/6729486: 33675825/40428967 changed = 83.296 %
5586000/6729486: 33680386/40428967 changed = 83.308 %
5587000/6729486: 33686400/40428967 changed = 83.322 %
5588000/6729486: 33690291/40428967 changed = 83.332 %
5589000/6729486: 33696749/40428967 changed = 83.348 %
5590000/6729486: 33708270/40428967 changed = 83.377 %
5591000/6729486: 33713601/40428967 changed = 83.390 %
5592000/6729486: 33720410/40428967 changed = 83.407 %
5593000/6729486: 33725346/40428967 changed = 83.419 %
5594000/6729486: 33730441/40428967 changed = 83.431 %
5595000/6729486: 33736569/40428967 changed = 83.447 %
5596000/6729486: 33740997/40428967 changed = 83.457 %
5597000/6729486: 33745079/40

5731000/6729486: 34582244/40428967 changed = 85.538 %
5732000/6729486: 34587186/40428967 changed = 85.551 %
5733000/6729486: 34593477/40428967 changed = 85.566 %
5734000/6729486: 34599061/40428967 changed = 85.580 %
5735000/6729486: 34604441/40428967 changed = 85.593 %
5736000/6729486: 34610040/40428967 changed = 85.607 %
5737000/6729486: 34615121/40428967 changed = 85.620 %
5738000/6729486: 34620670/40428967 changed = 85.633 %
5739000/6729486: 34626740/40428967 changed = 85.648 %
5740000/6729486: 34631733/40428967 changed = 85.661 %
5741000/6729486: 34638627/40428967 changed = 85.678 %
5742000/6729486: 34643259/40428967 changed = 85.689 %
5743000/6729486: 34648901/40428967 changed = 85.703 %
5744000/6729486: 34653615/40428967 changed = 85.715 %
5745000/6729486: 34658381/40428967 changed = 85.727 %
5746000/6729486: 34663157/40428967 changed = 85.738 %
5747000/6729486: 34667404/40428967 changed = 85.749 %
5748000/6729486: 34673351/40428967 changed = 85.764 %
5749000/6729486: 34678444/40

5883000/6729486: 35497910/40428967 changed = 87.803 %
5884000/6729486: 35502086/40428967 changed = 87.813 %
5885000/6729486: 35510902/40428967 changed = 87.835 %
5886000/6729486: 35515892/40428967 changed = 87.848 %
5887000/6729486: 35520112/40428967 changed = 87.858 %
5888000/6729486: 35525378/40428967 changed = 87.871 %
5889000/6729486: 35535584/40428967 changed = 87.896 %
5890000/6729486: 35555613/40428967 changed = 87.946 %
5891000/6729486: 35560971/40428967 changed = 87.959 %
5892000/6729486: 35567182/40428967 changed = 87.975 %
5893000/6729486: 35570915/40428967 changed = 87.984 %
5894000/6729486: 35575624/40428967 changed = 87.995 %
5895000/6729486: 35580630/40428967 changed = 88.008 %
5896000/6729486: 35584570/40428967 changed = 88.018 %
5897000/6729486: 35589502/40428967 changed = 88.030 %
5898000/6729486: 35600862/40428967 changed = 88.058 %
5899000/6729486: 35605581/40428967 changed = 88.069 %
5900000/6729486: 35609749/40428967 changed = 88.080 %
5901000/6729486: 35614447/40

6035000/6729486: 36433801/40428967 changed = 90.118 %
6036000/6729486: 36437951/40428967 changed = 90.128 %
6037000/6729486: 36442837/40428967 changed = 90.140 %
6038000/6729486: 36448457/40428967 changed = 90.154 %
6039000/6729486: 36453173/40428967 changed = 90.166 %
6040000/6729486: 36457292/40428967 changed = 90.176 %
6041000/6729486: 36463060/40428967 changed = 90.190 %
6042000/6729486: 36468541/40428967 changed = 90.204 %
6043000/6729486: 36472735/40428967 changed = 90.214 %
6044000/6729486: 36482265/40428967 changed = 90.238 %
6045000/6729486: 36487933/40428967 changed = 90.252 %
6046000/6729486: 36494525/40428967 changed = 90.268 %
6047000/6729486: 36499973/40428967 changed = 90.282 %
6048000/6729486: 36508136/40428967 changed = 90.302 %
6049000/6729486: 36517906/40428967 changed = 90.326 %
6050000/6729486: 36523047/40428967 changed = 90.339 %
6051000/6729486: 36527517/40428967 changed = 90.350 %
6052000/6729486: 36532290/40428967 changed = 90.362 %
6053000/6729486: 36536940/40

6187000/6729486: 37324322/40428967 changed = 92.321 %
6188000/6729486: 37332462/40428967 changed = 92.341 %
6189000/6729486: 37340850/40428967 changed = 92.362 %
6190000/6729486: 37346484/40428967 changed = 92.376 %
6191000/6729486: 37351298/40428967 changed = 92.387 %
6192000/6729486: 37358501/40428967 changed = 92.405 %
6193000/6729486: 37363089/40428967 changed = 92.417 %
6194000/6729486: 37367844/40428967 changed = 92.428 %
6195000/6729486: 37373979/40428967 changed = 92.444 %
6196000/6729486: 37380119/40428967 changed = 92.459 %
6197000/6729486: 37386354/40428967 changed = 92.474 %
6198000/6729486: 37390929/40428967 changed = 92.485 %
6199000/6729486: 37397712/40428967 changed = 92.502 %
6200000/6729486: 37404308/40428967 changed = 92.519 %
6201000/6729486: 37411255/40428967 changed = 92.536 %
6202000/6729486: 37415429/40428967 changed = 92.546 %
6203000/6729486: 37420442/40428967 changed = 92.558 %
6204000/6729486: 37425105/40428967 changed = 92.570 %
6205000/6729486: 37430881/40

6339000/6729486: 38170212/40428967 changed = 94.413 %
6340000/6729486: 38175598/40428967 changed = 94.426 %
6341000/6729486: 38180578/40428967 changed = 94.439 %
6342000/6729486: 38186389/40428967 changed = 94.453 %
6343000/6729486: 38190909/40428967 changed = 94.464 %
6344000/6729486: 38199061/40428967 changed = 94.484 %
6345000/6729486: 38206614/40428967 changed = 94.503 %
6346000/6729486: 38210383/40428967 changed = 94.512 %
6347000/6729486: 38215752/40428967 changed = 94.526 %
6348000/6729486: 38221305/40428967 changed = 94.539 %
6349000/6729486: 38233565/40428967 changed = 94.570 %
6350000/6729486: 38241724/40428967 changed = 94.590 %
6351000/6729486: 38248957/40428967 changed = 94.608 %
6352000/6729486: 38253343/40428967 changed = 94.619 %
6353000/6729486: 38259819/40428967 changed = 94.635 %
6354000/6729486: 38264697/40428967 changed = 94.647 %
6355000/6729486: 38273232/40428967 changed = 94.668 %
6356000/6729486: 38279185/40428967 changed = 94.683 %
6357000/6729486: 38284428/40

6491000/6729486: 39036828/40428967 changed = 96.557 %
6492000/6729486: 39042388/40428967 changed = 96.570 %
6493000/6729486: 39046843/40428967 changed = 96.581 %
6494000/6729486: 39051017/40428967 changed = 96.592 %
6495000/6729486: 39057145/40428967 changed = 96.607 %
6496000/6729486: 39065176/40428967 changed = 96.627 %
6497000/6729486: 39070230/40428967 changed = 96.639 %
6498000/6729486: 39081225/40428967 changed = 96.666 %
6499000/6729486: 39085781/40428967 changed = 96.678 %
6500000/6729486: 39090798/40428967 changed = 96.690 %
6501000/6729486: 39094636/40428967 changed = 96.700 %
6502000/6729486: 39100285/40428967 changed = 96.714 %
6503000/6729486: 39107006/40428967 changed = 96.730 %
6504000/6729486: 39111325/40428967 changed = 96.741 %
6505000/6729486: 39115574/40428967 changed = 96.751 %
6506000/6729486: 39121007/40428967 changed = 96.765 %
6507000/6729486: 39143337/40428967 changed = 96.820 %
6508000/6729486: 39148114/40428967 changed = 96.832 %
6509000/6729486: 39154812/40

6643000/6729486: 39911183/40428967 changed = 98.719 %
6644000/6729486: 39915628/40428967 changed = 98.730 %
6645000/6729486: 39922848/40428967 changed = 98.748 %
6646000/6729486: 39927599/40428967 changed = 98.760 %
6647000/6729486: 39932224/40428967 changed = 98.771 %
6648000/6729486: 39938104/40428967 changed = 98.786 %
6649000/6729486: 39943675/40428967 changed = 98.800 %
6650000/6729486: 39948010/40428967 changed = 98.810 %
6651000/6729486: 39953151/40428967 changed = 98.823 %
6652000/6729486: 39958877/40428967 changed = 98.837 %
6653000/6729486: 39965370/40428967 changed = 98.853 %
6654000/6729486: 39970235/40428967 changed = 98.865 %
6655000/6729486: 39978071/40428967 changed = 98.885 %
6656000/6729486: 39983856/40428967 changed = 98.899 %
6657000/6729486: 39988379/40428967 changed = 98.910 %
6658000/6729486: 39994632/40428967 changed = 98.926 %
6659000/6729486: 40000609/40428967 changed = 98.940 %
6660000/6729486: 40005786/40428967 changed = 98.953 %
6661000/6729486: 40013565/40

In [9]:
# create new path for modified data
clickpicklepath_rpt = location+clickfilename+'_repeatcount.pkl'

# create new feature column
data['userclickcount'] = repeats

# save data as pickle
with open(clickpicklepath_rpt, 'wb') as handle:
    pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)
    handle.close()

data.head()

Unnamed: 0,id,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category,device_id,device_ip,device_model,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21,userclickcount
0,1.000009e+18,0,14102100,1005,0,532546046,4085536615,680550077,3970769798,2013391065,131587874,2845778250,3721564782,1150642724,1,2,15706,320,50,1722,0,35,-1,79,69816.0
1,1.000017e+19,0,14102100,1005,0,532546046,4085536615,680550077,3970769798,2013391065,131587874,2845778250,2525010632,1897849120,1,0,15704,320,50,1722,0,35,100084,79,7.0
2,1.000037e+19,0,14102100,1005,0,532546046,4085536615,680550077,3970769798,2013391065,131587874,2845778250,3016723951,2320004541,1,0,15704,320,50,1722,0,35,100084,79,4.0
3,1.000064e+19,0,14102100,1005,0,532546046,4085536615,680550077,3970769798,2013391065,131587874,2845778250,3894893455,1664238106,1,0,15706,320,50,1722,0,35,100084,79,48.0
4,1.000068e+19,0,14102100,1005,1,4270638152,2439430497,90831144,3970769798,2013391065,131587874,2845778250,2521092287,2006814914,1,0,18993,320,50,2161,0,35,-1,157,31.0
