### Segregating unique identifiers (GPID and IDFA), storing them in chunks of about 10 MB, and posting them

#### -by Prateek Agrawal

In [13]:
# libraries imported useful for the analysis
import pandas as pd
import numpy as np
import sys
import os
import requests

#### Functions used in Part 1:--

In [2]:
# Building a function to find the type of identifier
def test_GPID_IDFA(string):
    condition1 = 'FALSE'
    condition2 = 'FALSE'
    value = "others"
    for c in list(string):
        if c in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
            condition1 = 'TRUE'
            break
        if c in 'abcdefghijklmnopqrstuvwxyz':
            condition2 = 'TRUE'
            break
    if (condition1 == 'TRUE' and condition2 == "FALSE"):
        value = 'IDFA'
    if (condition1 == 'FALSE' and condition2 == "TRUE"):
        value = 'GPID'
    return(value)

#### Note: 
For 10 MB storage, a number of factors affect the size of a file, e.g. the number of columns, the size and type of the data stored in each column, and whether the computer has a 64-bit or 32-bit processor. Taking all such factors into account, we determine the in-memory size threshold (`max_size` in `find_index`) by trial and error for the current data format.

In [78]:
# function to find the index of the dataframe at which the subset should be done so that the files created will be 
# of size 10Mb on the disk

def find_index(df, max_size=36097846, step=100):
    """Find how many leading rows of ``df`` stay below an in-memory size limit.

    The frame is probed in increments of ``step`` rows; the size of the first
    ``i + 1`` rows is measured with ``sys.getsizeof`` (which reports bytes).
    As soon as that size reaches ``max_size`` the previous probe point is
    returned, i.e. the largest multiple of ``step`` known to be under the limit.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose rows will later be written out in chunks.
    max_size : int, optional
        In-memory size threshold. The default is the value the notebook
        calibrated by trial and error for ~10 MB CSV files.
    step : int, optional
        Probe granularity in rows (default 100).

    Returns
    -------
    int
        Row count to use as chunk size. ``len(df)`` when the whole frame is
        below the limit (0 only for an empty frame); never less than 1 for a
        non-empty frame, so it is always usable as a ``range`` step.

    Raises
    ------
    ValueError
        If ``step`` is not positive.
    """
    if step <= 0:
        raise ValueError("step must be a positive integer")

    for i in range(0, len(df), step):
        # .iloc[:i + 1] selects rows 0..i inclusive, the same rows the removed
        # label-based ``.ix[0:i]`` returned on a default integer index.
        if sys.getsizeof(df.iloc[:i + 1]) >= max_size:
            # Step back to the last probe that was still under the limit;
            # clamp to 1 so the result is never zero or negative.
            return max(i - step, 1)

    # The limit was never reached: the whole frame fits in a single chunk.
    return len(df)

In [29]:
# function to store file containing GPID id in local storage
def saving_GPID_10mb(index, df=None, out_dir="GPID"):
    """Write the GPID frame to ``out_dir`` as consecutive CSV chunks.

    Parameters
    ----------
    index : int
        Number of rows per chunk (must be positive).
    df : pandas.DataFrame, optional
        Frame to split. Defaults to the notebook-level ``GPID_df``.
    out_dir : str, optional
        Directory receiving ``GPID_0.csv``, ``GPID_1.csv``, ... It is created
        when missing and reused when it already exists.

    Returns
    -------
    str
        The literal ``"files created"``.

    Raises
    ------
    ValueError
        If ``index`` is not positive.
    """
    if df is None:
        df = GPID_df  # notebook-level frame built in Part 1
    if index <= 0:
        raise ValueError("index must be a positive number of rows per chunk")

    # exist_ok lets the cell be re-run without failing on an existing folder.
    os.makedirs(out_dir, exist_ok=True)

    # Files are written through an explicit path, so the working directory is
    # never changed and cannot be left pointing at the wrong place on error.
    for chunk_no, start in enumerate(range(0, len(df), index)):
        filename = "GPID_" + str(chunk_no) + '.csv'
        # Positional, end-exclusive slice: consecutive chunks never share a
        # row (the old label-inclusive slice repeated each boundary row).
        chunk = df.iloc[start:start + index]
        chunk.to_csv(os.path.join(out_dir, filename), index=False)

    return("files created")
    

In [30]:
# function to store file containing IDFA id in local storage
def saving_IDFA_10mb(index, df=None, out_dir="IDFA"):
    """Write the IDFA frame to ``out_dir`` as consecutive CSV chunks.

    Parameters
    ----------
    index : int
        Number of rows per chunk (must be positive).
    df : pandas.DataFrame, optional
        Frame to split. Defaults to the notebook-level ``IDFA_df``.
    out_dir : str, optional
        Directory receiving ``IDFA_0.csv``, ``IDFA_1.csv``, ... It is created
        when missing and reused when it already exists.

    Returns
    -------
    str
        The literal ``"files created"``.

    Raises
    ------
    ValueError
        If ``index`` is not positive.
    """
    if df is None:
        df = IDFA_df  # notebook-level frame built in Part 1
    if index <= 0:
        raise ValueError("index must be a positive number of rows per chunk")

    # exist_ok lets the cell be re-run without failing on an existing folder.
    os.makedirs(out_dir, exist_ok=True)

    # Files are written through an explicit path, so the working directory is
    # never changed and cannot be left pointing at the wrong place on error.
    for chunk_no, start in enumerate(range(0, len(df), index)):
        filename = "IDFA_" + str(chunk_no) + '.csv'
        # Positional, end-exclusive slice: consecutive chunks never share a
        # row (the old label-inclusive slice repeated each boundary row).
        chunk = df.iloc[start:start + index]
        chunk.to_csv(os.path.join(out_dir, filename), index=False)

    return("files created")

### Part 1:

In [6]:
# Load the raw identifier dump (one identifier per line, no header row) and
# label its single column 'Identifiers' directly while reading.
inmobi_data = pd.read_csv(
    "tpr_inactive_maids.txt",
    header=None,
    names=['Identifiers'],
)

In [7]:
# Preview the first rows to confirm the file was read into the single
# 'Identifiers' column (both upper- and lower-case identifiers are present).
inmobi_data.head()

Unnamed: 0,Identifiers
0,000236f4-cbf7-47d6-95cd-d175c2f5b9b6
1,00024E74-4BE2-471E-A8FB-7D5AEB5B1B74
2,00037512-0f03-4a61-aeff-811f5c97a823
3,0003A812-6237-439F-8F99-9C5EEB27CCEC
4,0003B41E-0882-4A21-A933-76859FED095D


In [8]:
# Check the dimensions of the loaded data as (rows, columns); the row count is
# the total number of identifiers that must be accounted for after the split.
inmobi_data.shape

(22363980, 1)

In [9]:
# Label every identifier as 'GPID', 'IDFA' or 'others' using test_GPID_IDFA
# and keep the label next to the identifier in a new column.
inmobi_data['Type_of_Value'] = inmobi_data['Identifiers'].map(test_GPID_IDFA)

In [10]:
# Count the identifiers of each type in a single pass over the label column.
# (`.ix` is no longer available in current pandas, so the counts come from
# value_counts(); a label that never occurs is reported as 0.)
type_counts = inmobi_data.Type_of_Value.value_counts()
print("Number of GPID Identifiers:" , int(type_counts.get("GPID", 0)))
print("Number of IDFA Identifiers:" , int(type_counts.get("IDFA", 0)))
print("Number of 'other' Identifiers:" , int(type_counts.get("others", 0)))
print("Total number of Identifiers:", inmobi_data.shape[0])

Number of GPID Identifiers: 6327153
Number of IDFA Identifiers: 16036818
Number of 'other' Identifiers: 9
Total number of Identifiers: 22363980


In [11]:
# Split the identifiers into one Series per detected type; each Series keeps
# the row labels of inmobi_data at this point.
GPID_identifiers = inmobi_data.loc[inmobi_data['Type_of_Value'] == "GPID", 'Identifiers']
IDFA_identifiers = inmobi_data.loc[inmobi_data['Type_of_Value'] == "IDFA", 'Identifiers']
other_identifiers = inmobi_data.loc[inmobi_data['Type_of_Value'] == "others", 'Identifiers']

In [12]:
# Renumber each Series from 0 so positions and labels coincide again.
# drop=True discards the old labels instead of turning them into a column.
# No `level` argument is passed: it selects index levels to remove, and the
# range object previously given there is rejected by current pandas.
GPID_identifiers = GPID_identifiers.reset_index(drop=True)
IDFA_identifiers = IDFA_identifiers.reset_index(drop=True)
other_identifiers = other_identifiers.reset_index(drop=True)

In [109]:
# Turn each identifier Series into a frame with the columns used for upload.

def _to_event_frame(identifiers, property_id):
    """Return a frame with the seven upload columns for ``identifiers``.

    The identifiers go into 'gpId'; 'propertyId' is set to ``property_id``;
    'um5' and 'o1' receive the fixed values used throughout this notebook.
    Columns left unset ('eventTime', 'ida', 'idv') are filled with a single
    space in place of NaN.
    """
    frame = pd.DataFrame(
        index=identifiers.index,
        columns=['eventTime', 'propertyId','o1','um5', 'ida', 'idv', 'gpId'],
    )
    frame['gpId'] = identifiers
    frame['propertyId'] = property_id
    frame['um5'] = '202cb962ac59075b964b07152d234b70'
    frame['o1'] = '40bd001563085fc35165329ea1ff5c5ecbdbbeef'
    return frame.replace(np.nan,' ', regex=True)

# GPID
GPID_df = _to_event_frame(GPID_identifiers, '59188fbcceb9464989aec3f03d5f29f2')

# IDFA
# NOTE(review): the IDFA identifiers are also written to the 'gpId' column
# while 'ida' stays blank -- confirm this is what the upload endpoint expects.
IDFA_df = _to_event_frame(IDFA_identifiers, 'b8c093a2cd8f4bc4938796115ef40c09')


As both the GPID and IDFA identifiers are stored in the same format, the index at which the data should be split so that each stored file is about 10 MB will be the same for both.

In [110]:
# Find the row count at which the data must be split so that each chunk stays
# under the size limit used by find_index. It is computed on GPID_df only and
# reused for IDFA_df below, since both frames have the same columns.
index = find_index(GPID_df)
print(index)

66500


In [111]:
# Create the "GPID" and "IDFA" folders and write the chunked CSV files into
# them. Only the value of the last call is displayed as the cell output.
saving_GPID_10mb(index)
saving_IDFA_10mb(index)

'files created'

### Part 2:

In [112]:
# List the chunk files written in Part 1. os.listdir returns every directory
# entry in arbitrary order, so keep only the .csv chunks (hidden or temporary
# files must not be uploaded) and sort them for a reproducible upload order.
files_inIDFA = sorted(
    entry for entry in os.listdir(os.path.join(os.getcwd(), "IDFA"))
    if entry.endswith(".csv")
)
files_inGPID = sorted(
    entry for entry in os.listdir(os.path.join(os.getcwd(), "GPID"))
    if entry.endswith(".csv")
)

### Posting each file from GPID and IDFA, and checking the status after posting it.

In [108]:
# Upload every GPID chunk as a multipart-encoded file and print the response.
# NOTE(review): the endpoint is plain http -- confirm whether https is available.
url = 'http://advertiser-content.inmobiapis.com/tpce/v1/upload/events/download'
current_dir = os.getcwd()                   # storing the current directory
gpid_dir = os.path.join(current_dir, "GPID")
for name in files_inGPID:
    print(name)                             # printing the file name
    # Open through the full path instead of os.chdir, so the working directory
    # is never left changed if a request fails; `with` closes each handle as
    # soon as its upload finishes instead of leaking one handle per file.
    with open(os.path.join(gpid_dir, name), 'rb') as fh:
        # The explicit (filename, fileobj) tuple keeps the uploaded file name
        # equal to the bare chunk name, without the directory part.
        r = requests.post(url, files={'file': (name, fh)})  # posting the request
    print(r.text)                           # checking the status

GPID_0.csv
{"status":"OK","message":"check the jobStatus via GET call of given jobUrl","code":200,"jobUrl":"advertiser-content.inmobiapis.com/tpce/v1/upload/events/download/jobs/file?job_id=GPID_0.csv_j5ca88003-efab-4bee-957b-46a3295e6962"}
GPID_1.csv
{"status":"OK","message":"check the jobStatus via GET call of given jobUrl","code":200,"jobUrl":"advertiser-content.inmobiapis.com/tpce/v1/upload/events/download/jobs/file?job_id=GPID_1.csv_j57505f9f-5382-44bf-832e-01bc80176b5a"}
GPID_10.csv
{"status":"OK","message":"check the jobStatus via GET call of given jobUrl","code":200,"jobUrl":"advertiser-content.inmobiapis.com/tpce/v1/upload/events/download/jobs/file?job_id=GPID_10.csv_jc34c8d62-6a6b-4228-876a-b69fd0241aeb"}
GPID_11.csv
{"status":"OK","message":"check the jobStatus via GET call of given jobUrl","code":200,"jobUrl":"advertiser-content.inmobiapis.com/tpce/v1/upload/events/download/jobs/file?job_id=GPID_11.csv_je66a52f7-3b77-4fee-b100-f5173fdff4de"}
GPID_12.csv
{"status":"OK","mess

In [113]:
# Upload every IDFA chunk as a multipart-encoded file and print the response.
# NOTE(review): the endpoint is plain http -- confirm whether https is available.
url = 'http://advertiser-content.inmobiapis.com/tpce/v1/upload/events/download'
current_dir = os.getcwd()                   # storing the current directory
idfa_dir = os.path.join(current_dir, "IDFA")
for name in files_inIDFA:
    print(name)                             # printing the file name
    # Open through the full path instead of os.chdir, so the working directory
    # is never left changed if a request fails; `with` closes each handle as
    # soon as its upload finishes instead of leaking one handle per file.
    with open(os.path.join(idfa_dir, name), 'rb') as fh:
        # The explicit (filename, fileobj) tuple keeps the uploaded file name
        # equal to the bare chunk name, without the directory part.
        r = requests.post(url, files={'file': (name, fh)})  # posting the request
    print(r.text)                           # checking the status


IDFA_0.csv
{"status":"OK","message":"check the jobStatus via GET call of given jobUrl","code":200,"jobUrl":"advertiser-content.inmobiapis.com/tpce/v1/upload/events/download/jobs/file?job_id=IDFA_0.csv_jf573c4fa-cb05-4554-8ae7-f1ca5cb9f199"}
IDFA_1.csv
{"status":"OK","message":"check the jobStatus via GET call of given jobUrl","code":200,"jobUrl":"advertiser-content.inmobiapis.com/tpce/v1/upload/events/download/jobs/file?job_id=IDFA_1.csv_j885e8f76-a08f-4824-9120-64219a896b42"}
IDFA_10.csv
{"status":"OK","message":"check the jobStatus via GET call of given jobUrl","code":200,"jobUrl":"advertiser-content.inmobiapis.com/tpce/v1/upload/events/download/jobs/file?job_id=IDFA_10.csv_j55a81011-fd50-4527-9c2c-1e41c6194560"}
IDFA_100.csv
{"status":"OK","message":"check the jobStatus via GET call of given jobUrl","code":200,"jobUrl":"advertiser-content.inmobiapis.com/tpce/v1/upload/events/download/jobs/file?job_id=IDFA_100.csv_j825e04c1-7f14-4abe-9644-f26f2f4d8a64"}
IDFA_101.csv
{"status":"OK","m

In [None]:
# The multipart posting code is adapted from the Requests quickstart: http://docs.python-requests.org/en/latest/user/quickstart/#post-a-multipart-encoded-file

In [None]:
# ---------------------------------------------- Rough Work -----------------------------------------------------

In [107]:
#os.chdir('/Users/DataFlash/Documents/Inmobi Data Challenge')
#os.getcwd()

'/Users/DataFlash/Documents/Inmobi Data Challenge'

In [93]:
# Posting each of the csv files
# url = 'http://advertiser-content.inmobiapis.com/tpce/v1/upload/events/download'
# files = {'file': open('GPID.csv', 'rb')}

# posting the request
# r = requests.post(url, files=files)
# checking the status
# r.text

'{"status":"OK","message":"check the jobStatus via GET call of given jobUrl","code":200,"jobUrl":"advertiser-content.inmobiapis.com/tpce/v1/upload/events/download/jobs/file?job_id=GPID.csv_j805c4538-8810-4ff5-a2c1-93d0c2cad49f"}'

In [81]:
#sys.getsizeof(GPID_df.ix[0:66600,])

36043646

In [88]:
#GPID_df.ix[0:66500,].to_csv("GPID1" ,index=False)

In [89]:
#r = requests.post('http://advertiser-content.inmobiapis.com/tpce/v1/upload/events/download', 
#                  files={'GPID1.zip': open('GPID1.zip', 'rb')})

In [94]:
#r = requests.post('http://advertiser-content.inmobiapis.com/tpce/v1/upload/events/download', 
#                  files=dict(GPID.csv))

In [95]:
#sys.getsizeof(GPID_df.ix[0:125000,])

In [316]:
#chunks = [GPID_identifiers[x:x+index] for x in range(0, len(GPID_identifiers), index)]

In [None]:
# function to find the 10mb file
# def create_10mb(df_GPID):
#     max_size = 21370574
#     os.makedirs(df.columns[0])
#     current_dir = os.getcwd()
#     os.chdir(os.path.join(current_dir, df.columns[0]))
#     curr_size = 0
#     for i in range(len(df)):
#         obj = df.ix[0:i]
#         curr_size = sys.getsizeof(obj)
#         if curr_size > max_size:
#             obj.to_csv()
        

In [52]:
#GPID_df.ix[0:125000,].to_csv("GPID.csv", index = False)