### II. Analysis

#### Checking gender_age tables for uniqueness of device ID

In [1]:
# Set up
import numpy as np
import pandas as pd
import os

# Load data
datadir = "../data/extract/"
ga_train = pd.read_csv(os.path.join(datadir, 'gender_age_train.csv'), dtype='str')
ga_test = pd.read_csv(os.path.join(datadir, 'gender_age_test.csv'), dtype='str')

# Define a function that tells you how many values are in an array and how many are unique.
def checkUnique(arr):
    ids = np.array(arr)
    idsUnique = np.unique(ids)
    print "There are %d values in the array and %d are unique." % (len(ids), len(idsUnique))

# Train IDs
checkUnique(ga_train.device_id)

# Test IDs
checkUnique(ga_test.device_id)

# Combined IDs
checkUnique(np.concatenate((ga_train.device_id, ga_test.device_id)))

There are 74645 values in the array and 74645 are unique.
There are 112071 values in the array and 112071 are unique.
There are 186716 values in the array and 186716 are unique.


#### Checking phone_brand_device_model

In [2]:
# Load data
pbdm = pd.read_csv(os.path.join(datadir, 'phone_brand_device_model.csv'), dtype='str')

# Check device_id uniqueness
checkUnique(pbdm.device_id)

There are 187245 values in the array and 186716 are unique.


In [3]:
# Investigating duplicate device IDs in pbdm

# How many duplicate device IDs do we have?
print "There are %d duplicate device IDs." % pbdm.groupby(by='device_id').count()[pbdm.groupby(by='device_id').count()['phone_brand']>1].shape[0]

# How many are just duplicate rows (phone_brand and device_model do not change across rows)
print "Of those, %d are duplicates across the entire row." % len(np.unique(np.array(pbdm[pbdm.duplicated(keep=False)].device_id)))

# Print out the remaining bad dups
dups = pbdm.groupby(by='device_id').count()[pbdm.groupby(by='device_id').count()['phone_brand']>1].index.values
gooddups = np.unique(np.array(pbdm[pbdm.duplicated(keep=False)].device_id))
ind = np.invert(np.in1d(dups, gooddups))
baddups = dups[ind]

print "These are the remaining 6:"
pbdm[np.in1d(pbdm.device_id, baddups)].sort_values('device_id')

There are 529 duplicate device IDs.
Of those, 523 are duplicates across the entire row.
These are the remaining 6:


Unnamed: 0,device_id,phone_brand,device_model
93020,-3004353610608679970,酷派,5891
128599,-3004353610608679970,酷派,7296
140501,-5269721363279128080,三星,Galaxy Core Advance
185896,-5269721363279128080,小米,MI 3
69302,-6590454305031525112,小米,MI 3
140520,-6590454305031525112,华为,荣耀6 Plus
81222,-7059081542575379359,LG,Nexus 5
93035,-7059081542575379359,魅族,魅蓝Note 2
45947,-7297178577997113203,华为,荣耀畅玩5X
112903,-7297178577997113203,华为,荣耀3C


#### Checking events

In [4]:
# Load data
events = pd.read_csv(os.path.join(datadir, 'events.csv'), dtype='str')

# Check device_id uniqueness
checkUnique(events.event_id)

There are 3252950 values in the array and 3252950 are unique.


#### Checking label categories

In [5]:
# Load data
label_categories = pd.read_csv(os.path.join(datadir, 'label_categories.csv'), dtype='str')

# Check unique label_id
checkUnique(label_categories.label_id)
checkUnique(label_categories.category)

There are 930 values in the array and 930 are unique.
There are 930 values in the array and 836 are unique.


  flag = np.concatenate(([True], aux[1:] != aux[:-1]))


In [6]:
# View label categories that have duplicates
label_categories.groupby('category').count()[np.array(label_categories.groupby('category').count().label_id>=2)].head()

Unnamed: 0_level_0,label_id
category,Unnamed: 1_level_1
ARPG,2
Accounting,2
Animation,2
Car,2
Crowdfunding,2


#### Distribution of Group in Labeled Data

In [7]:
group_dist = (ga_train.groupby('group').count()/ga_train.shape[0])[['gender']]
group_dist.rename(columns={'gender':'distribution'}, inplace=True)

# Print out the distribution and transpose it for easier interpretation
group_dist.transpose() * 100

group,F23-,F24-26,F27-28,F29-32,F33-42,F43+,M22-,M23-26,M27-28,M29-31,M32-38,M39+
distribution,6.765356,5.613236,4.177105,6.200013,7.44993,5.618595,10.031482,12.867573,7.294527,9.791681,12.694755,11.495747


In [8]:
# Distribution of gender in labeled data
gender_dist = (ga_train.groupby('gender').count()/ga_train.shape[0])[['device_id']]
gender_dist.rename(columns={'device_id':'distribution'}, inplace=True)

gender_dist.transpose()

gender,F,M
distribution,0.358242,0.641758


### III. Methodology

#### Data Cleaning and Encoding

In [2]:
# Create gender_age_train_final and gender_age_test_final
# train columns: device_id_enc (index), group_enc
# test column: device_id_enc (index)

from sklearn.preprocessing import LabelEncoder

all_device_ids = np.concatenate((ga_train.device_id, ga_test.device_id))

# Encode device IDs
dev_id_enc = LabelEncoder().fit(all_device_ids)
ga_train['device_id_enc'] = dev_id_enc.transform(ga_train['device_id'])
ga_test['device_id_enc'] = dev_id_enc.transform(ga_test['device_id'])

# Encode group
group_enc = LabelEncoder().fit(ga_train.group)
ga_train['group_enc'] = group_enc.transform(ga_train['group'])

# Create final dataframes
ga_train_final = ga_train[['device_id_enc', 'group_enc']].set_index('device_id_enc')
ga_test_final = ga_test[['device_id_enc']].set_index('device_id_enc')

# Write to csv
ga_train_final.to_csv(path_or_buf='../data/final-data-files/ga_train_final.csv')
ga_test_final.to_csv(path_or_buf='../data/final-data-files/ga_test_final.csv')

In [18]:
# Create pbdm_final
# Columns: device_id_enc (index), phone_brand_enc, device_model_enc

# Drop duplicates in PBDM, encode device_id, set device_id_enc as the index
pbdm_final = pbdm.drop_duplicates(subset='device_id', keep='first')
pbdm_final['device_id_enc'] = dev_id_enc.transform(pbdm_final['device_id'])

pbdm_final = pbdm_final.set_index('device_id_enc')

# Encode phone brand
pb_enc = LabelEncoder().fit(pbdm_final.phone_brand)
pbdm_final['phone_brand_enc'] = pb_enc.transform(pbdm_final['phone_brand'])

# Encode device model
dm_enc = LabelEncoder().fit(pbdm_final.device_model)
pbdm_final['device_model_enc'] = dm_enc.transform(pbdm_final['device_model'])

# Keep only encoded columns
pbdm_final = pbdm_final[['phone_brand_enc', 'device_model_enc']]

# Write to csv
pbdm_final.to_csv(path_or_buf='../data/final-data-files/pbdm_final.csv')

pbdm_final.head(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0_level_0,phone_brand_enc,device_model_enc
device_id_enc,Unnamed: 1_level_1,Unnamed: 2_level_1
89024,51,1517
96788,51,749
140101,15,560


In [19]:
# Create events_final
# Columns: event_id (index), device_id_enc

# Filter out device_ids that are not in ga_train or ga_test
events_final = events[np.in1d(events.device_id, all_device_ids, invert=False)]

# Encode device_id
events_final = events_final[['event_id', 'device_id']]
events_final['device_id_enc'] = dev_id_enc.transform(events_final['device_id'])

# Keep only event_id, device_id_enc columns. Set index.
events_final = events_final[['event_id', 'device_id_enc']].set_index('event_id')

# Write to csv
events_final.to_csv(path_or_buf='../data/final-data-files/events_final.csv')

### Model Evaluation and Validation

#### Unencoded Submissions

In [3]:
sub_enc = pd.read_csv('../submissions/encoded/prediction-0149.csv')

In [4]:
# Unencode device ID
sub_unenc = pd.DataFrame(sub_enc)
sub_unenc['device_id'] = dev_id_enc.inverse_transform(sub_unenc['device_id_enc'])

# Rearrange columns and drop encoded device ID
sub_unenc = sub_unenc[['device_id', '0','1','2','3','4','5','6','7','8','9','10','11']].set_index('device_id')

# Rename columns
sub_unenc = sub_unenc.rename(columns={'0':'F23-','1':'F24-26','2':'F27-28','3':'F29-32','4':'F33-42','5':'F43+',
                                      '6':'M22-','7':'M23-26','8':'M27-28','9':'M29-31','10':'M32-38','11':'M39+'})

# Output to csv
sub_unenc.to_csv('../submissions/prediction-on-sparse-matrices.csv')

### ==================================================================

### This all gets done in R now

In [128]:
# Create app_labels_final
# Columns: app_id, app_id_enc, label_id_enc

# Load app_labels
app_labels = pd.read_csv(os.path.join(datadir, 'app_labels.csv'), dtype={'app_id':'str', 'label_id':'str'})

# Create a primary key field that concatenates app_id and app_label
app_labels_pk = []

for i in zip(app_labels.app_id, app_labels.label_id):
    app_labels_pk.append(i[0]+'_'+i[1])
    
app_labels['app_label_pk'] = app_labels_pk

# Drop duplicates using the primary key
app_labels = app_labels.drop_duplicates(subset='app_label_pk', keep='first')

# Encode app_ids
app_id_enc = LabelEncoder().fit(app_labels.app_id)
app_labels['app_id_enc'] = app_id_enc.transform(app_labels['app_id'])

# Encode label_ids
label_id_enc = LabelEncoder().fit(app_labels.label_id)
app_labels['label_id_enc'] = label_id_enc.transform(app_labels['label_id'])

# Keep only app_id, app_id_enc, label_id_enc columns
app_labels_final = app_labels[['app_id', 'app_id_enc', 'label_id_enc']]

# Write to csv
app_labels_final.to_csv(path_or_buf='../data/final-data-files/app_labels_final.csv', index=False)

In [147]:
# Create label_categories_final
# Columns: label_id_enc, category_enc

# Filter out label_ids that are not in app_labels_final
label_categories_final = label_categories[np.in1d(label_categories.label_id, np.unique(app_labels.label_id))]

# Encode label_id
label_categories_final['label_id_enc'] = label_id_enc.transform(label_categories_final['label_id'])

# Encode category
category_enc = LabelEncoder().fit(label_categories_final.category)
label_categories_final['category_enc'] = category_enc.transform(label_categories_final['category'])

# Keep only label_id_enc, category_enc
label_categories_final = label_categories_final[['label_id_enc', 'category_enc']]

# Write to csv
label_categories_final.to_csv(path_or_buf='../data/final-data-files/label_categories_final.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [151]:
label_categories_final.head()

Unnamed: 0,label_id_enc,category_enc
1,100,375
3,203,349
4,217,360
5,242,355
6,261,356


In [145]:
category_enc.classes_

array(['1 free', '1 reputation', '1 vitality', '3 kindom game',
       '80s Japanese comic', '90s Japanese comic', 'A beauty care',
       'A shares', 'ARPG', 'Academic Information', 'Accounting',
       'Adventure Game', 'Aeronautical Information Service', 'Air Travel',
       'Airline Alliances', 'Airline type', 'And the Church', 'Animation',
       'Animation aggregate class', 'Antique collection', 'App Store',
       'Appliances', 'Astrology Horoscope', 'Audiobooks',
       'Automotive News', 'Aviation Integrated Services',
       'Bank Credit Card', 'Bank financing', 'Beach landing game',
       'Beauty Nail', 'Behalf of the drive', 'Bond Fund',
       'Book hotel complex', 'Booking channels', 'Browser', 'Bus',
       'Business Office', 'Business simulation', 'Buy', 'Buy class',
       'Calendar', 'Car', 'Car Owners', 'Card Game', 'Cards RPG',
       'Casual puzzle categories', 'Chess categories', 'Chess game',
       'Children puzzle game', 'Chinese Classical Mythology',
       '

# ================== Scratch, delete later ==================

In [63]:
len(np.unique(app_labels.app_id))

113211

In [40]:
app_labels[app_labels.app_id=='-1035709620851836587']

# -1035709620851836587_204

Unnamed: 0,app_id,label_id,app_label_pk
430731,-1035709620851836587,204,-1035709620851836587_204
435491,-1035709620851836587,204,-1035709620851836587_204


In [35]:
len(np.unique(app_labels_pk))

459452

In [57]:
len(np.unique(pbdm.device_model))
np.unique(pbdm.phone_brand_enc)

array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
       104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
       117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130])

In [4]:
app_events = pd.read_csv(os.path.join(datadir, 'app_events.csv'), dtype='str', usecols=['app_id'])

In [5]:
app_ids_unique = np.unique(app_events.app_id)

In [6]:
app_id_encoder_AE = LabelEncoder().fit(app_ids_unique)

In [7]:
app_events['app_id_enc'] = app_id_encoder_AE.transform(app_events['app_id'])

In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
datadir = "../data/extract/"

In [3]:
from sklearn.preprocessing import LabelEncoder

In [9]:
app_events.to_csv(path_or_buf='../data/processed/app_events_enc.csv', index=False)

In [10]:
app_labels = pd.read_csv(os.path.join(datadir, 'app_labels.csv'), dtype={'app_id':'str', 'label_id':'str'})

In [12]:
app_labels['app_id_enc'] = app_id_encoder_AE.transform(app_labels['app_id'])

ValueError: y contains new labels: ['-1000044011832266039' '-1000044012126315025' '-1000437729193132121' ...,
 '999394088888896055' '999559308866387142' '999834977830134602']

In [10]:
ga_train_final.head()

Unnamed: 0_level_0,group_enc
device_id_enc,Unnamed: 1_level_1
79861,10
21314,10
81925,10
44323,9
109823,9


In [13]:
ga_train.sort_values('device_id_enc').head()

Unnamed: 0,device_id,gender,age,group,device_id_enc,group_enc
68487,-1000369272589010951,F,26,F24-26,4,1
1280,-1000572055892391496,F,27,F27-28,5,2
56524,-1000643208750517791,M,29,M29-31,8,9
39902,-1001337759327042486,M,30,M29-31,14,9
72600,-1001949518704267063,M,22,M22-,20,6


In [17]:
ga_test.sort_values('device_id_enc').head()

Unnamed: 0,device_id,device_id_enc
45469,-1000025442746372936,0
56617,-1000030473234264316,1
55532,-1000146476441213272,2
11627,-100015673884079572,3
18155,-1000633257325356587,6


In [16]:
ga_test_final.head()

93645
6025
165147
58765
55140
