In [1]:
import pandas as pd
import numpy as np
from config import DATA_PATH
import os

In [2]:
# Directory holding the input CSV files; the value comes from the local
# `config` module imported above.
data_path = DATA_PATH

In [3]:
# Load the three input tables from `data_path`.
#
# Only the columns that the rest of the notebook uses are parsed: the next
# cell keeps just (device_id, gender) and (event_id, device_id), so reading
# every column of these files first only costs time and memory.
gender_age_train = pd.read_csv(os.path.join(data_path, 'gender_age_train.csv'),
                               usecols=['device_id', 'gender'])
events = pd.read_csv(os.path.join(data_path, 'events.csv'),
                     usecols=['event_id', 'device_id'])
# app_events is the largest table; compact dtypes are requested for the two
# flag columns to keep its memory footprint down.
app_events = pd.read_csv(os.path.join(data_path, 'app_events.csv'),
                         usecols=['is_installed', 'is_active', 'event_id', 'app_id'],
                         dtype={'is_installed': np.int8, 'is_active': np.int8,
                                'event_id': np.int64, 'app_id': np.int64})

In [4]:
# Keep only the columns needed for the gender/app analysis.
# NOTE: both names are rebound here, so the wider frames loaded earlier are
# no longer reachable after this cell runs.
gender_age_train = gender_age_train[['device_id', 'gender']]
events = events[['event_id', 'device_id']]

In [5]:
# Preview the first rows of the trimmed training table (device_id, gender).
gender_age_train.head()

Unnamed: 0,device_id,gender
0,-8076087639492063270,M
1,-2897161552818060146,M
2,-8260683887967679142,M
3,-4938849341048082022,M
4,245133531816851882,M


In [6]:
# Preview the first rows of the trimmed events table (event_id, device_id).
events.head()

Unnamed: 0,event_id,device_id
0,1,29182687948017175
1,2,-6401643145415154744
2,3,-4833982096941402721
3,4,-6815121365017318426
4,5,-5373797595892518570


In [7]:
# Report (rows, columns) for each input table.
# Single-argument print(...) produces the same text under Python 2 and
# Python 3, whereas the bare `print x` statement is a SyntaxError on Python 3.
print(gender_age_train.shape)
print('#########################')
print(events.shape)
print('#########################')
# Last expression: shown as the cell's output value rather than printed.
app_events.shape

(74645, 2)
#########################
(3252950, 2)
#########################


(32473067, 4)

In [8]:
# Attach the gender label to every event via its device_id.
# An inner join (pandas' default, spelled out here) drops events whose device
# has no row in the training table.
events_gender = pd.merge(events, gender_age_train, on='device_id', how='inner')
# print(...) with one argument behaves identically on Python 2 and Python 3.
print('#########################')
print(events_gender.shape)

#########################
(1215595, 3)


In [9]:
# Preview the events joined with their device's gender.
events_gender.head()

Unnamed: 0,event_id,device_id,gender
0,1,29182687948017175,M
1,7104,29182687948017175,M
2,29661,29182687948017175,M
3,33133,29182687948017175,M
4,38980,29182687948017175,M


In [10]:
# Join the labelled events with the per-event app records on event_id.
# An inner join (pandas' default, spelled out here) keeps only events that
# appear in both tables; one event can match several app rows.
gender_app = pd.merge(events_gender, app_events, on='event_id', how='inner')
# print(...) with one argument behaves identically on Python 2 and Python 3.
print('#########################')
print(gender_app.shape)

#########################
(12237197, 6)


In [11]:
# Preview the event/app join: one row per (event, app) pair with the gender label.
gender_app.head()

Unnamed: 0,event_id,device_id,gender,app_id,is_installed,is_active
0,38980,29182687948017175,M,6666573791286858743,1,1
1,38980,29182687948017175,M,8693964245073640147,1,1
2,38980,29182687948017175,M,5551704094286985613,1,1
3,38980,29182687948017175,M,2689721421138748406,1,1
4,38980,29182687948017175,M,4377590530406372538,1,1


In [12]:
# Count non-null values per gender and keep only the first resulting column
# as a one-column DataFrame indexed by gender.
gender_stat = (
    gender_age_train
    .groupby(by=['gender'])
    .count()
    .iloc[:, :1]
)

In [13]:
# Show the per-gender counts computed from the training table.
gender_stat

Unnamed: 0_level_0,device_id
gender,Unnamed: 1_level_1
F,26741
M,47904


In [14]:
# Fraction of training devices belonging to each gender.
#
# The counts are looked up by their index label ('F' / 'M'). Integer keys such
# as [0] on a label-indexed Series depend on pandas' deprecated positional
# fallback and silently assume that 'F' sorts before 'M'.
female_devices = gender_stat['device_id'].loc['F']
male_devices = gender_stat['device_id'].loc['M']
f_rank = float(female_devices) / (female_devices + male_devices)
m_rank = float(male_devices) / (female_devices + male_devices)
# One pre-formatted argument keeps the Python 2 output text unchanged
# (including the double space) and also runs on Python 3.
print('man rank:  %s' % m_rank)
print('female rank:  %s' % f_rank)

man rank:  0.64175765289
female rank:  0.35824234711


In [15]:
# Reduce the join to (app_id, gender) and add a constant `counter` column so
# that summing it later yields row counts.
# .copy() makes the new frame independent of `gender_app`, so adding a column
# cannot raise SettingWithCopyWarning; the scalar assignment broadcasts 1 to
# every row without first building a Python list with one element per row.
gender_app_short = gender_app.loc[:, ['app_id', 'gender']].copy()
gender_app_short['counter'] = 1

In [16]:
# Preview the (app_id, gender, counter) frame that feeds the pivot.
gender_app_short.head()

Unnamed: 0,app_id,gender,counter
0,6666573791286858743,M,1
1,8693964245073640147,M,1
2,5551704094286985613,M,1
3,2689721421138748406,M,1
4,4377590530406372538,M,1


In [17]:
# Per-app row counts split by gender: one row per app_id, one column per
# gender value, with 0 where an app has no rows for that gender.
# aggfunc is given as the string 'sum': it selects the same aggregation as
# np.sum, but passing the NumPy callable emits a FutureWarning on current pandas.
gender_app_pivot = gender_app_short.pivot_table(values='counter', index='app_id',
                                                columns=['gender'], fill_value=0,
                                                aggfunc='sum')

In [18]:
# Preview the per-app counts by gender (columns F and M, indexed by app_id).
gender_app_pivot.head()

gender,F,M
app_id,Unnamed: 1_level_1,Unnamed: 2_level_1
-9221156934682287334,13,3
-9220899153371182692,0,14
-9218487885271516150,0,2
-9218310540360546691,0,20
-9217104312935103667,0,38


In [19]:
# Inspect the 'F' column of the pivot as a raw NumPy array.
gender_app_pivot['F'].values

array([13,  0,  0, ...,  0,  0, 18], dtype=int64)

In [20]:
# Flatten the pivot into a plain frame with a default integer index and the
# columns app_id, F_count, M_count.
# The frame is built directly from the column arrays: this avoids creating one
# Python tuple per app and does not depend on zip() returning a list, which is
# only true on Python 2. `columns=` fixes the column order explicitly.
gender_app_df = pd.DataFrame(
    {
        'app_id': gender_app_pivot.index.values,
        'F_count': gender_app_pivot['F'].values,
        'M_count': gender_app_pivot['M'].values,
    },
    columns=['app_id', 'F_count', 'M_count'],
)

In [21]:
# Female share of each app's rows: F_count divided by (F_count + M_count).
gender_app_df['F_%'] = gender_app_df['F_count'].div(
    gender_app_df['F_count'].add(gender_app_df['M_count'])
)

In [22]:
# Male share is the complement of the female share, since only the F and M
# counts enter the denominator of 'F_%'.
gender_app_df['M_%'] = 1 - gender_app_df['F_%']

In [23]:
# Preview per-app counts together with the derived F_% / M_% shares.
gender_app_df.head()

Unnamed: 0,app_id,F_count,M_count,F_%,M_%
0,-9221156934682287334,13,3,0.8125,0.1875
1,-9220899153371182692,0,14,0.0,1.0
2,-9218487885271516150,0,2,0.0,1.0
3,-9218310540360546691,0,20,0.0,1.0
4,-9217104312935103667,0,38,0.0,1.0


In [24]:
# Two independent copies of the per-app table, one for each ranking below.
# NOTE(review): sort_values() already returns a new frame, so these copies
# look unnecessary unless something outside this view mutates them — confirm.
gender_app_f_top = gender_app_df.copy()
gender_app_m_top = gender_app_df.copy()

In [29]:
# Rank apps from highest to lowest female share, and separately from highest
# to lowest male share. Each name is rebound to its sorted frame.
gender_app_f_top = gender_app_f_top.sort_values(by=['F_%'], ascending=False)
gender_app_m_top = gender_app_m_top.sort_values(by=['M_%'], ascending=False)

In [28]:
# Apps with the highest female share. Several rows here have a total count of
# 1, so a share of 1.0 can rest on a single observation.
gender_app_f_top.head()

Unnamed: 0,app_id,F_count,M_count,F_%,M_%
12413,7324914992979938040,10,0,1.0,0.0
6878,-4986139885441051,1,0,1.0,0.0
9820,4249252995056517148,3,0,1.0,0.0
215,-8890263377208798081,1,0,1.0,0.0
218,-8885691848766129496,1,0,1.0,0.0


In [30]:
# Apps with the highest male share.
gender_app_m_top.head()

Unnamed: 0,app_id,F_count,M_count,F_%,M_%
4463,-2711636555607861009,0,20,0.0,1.0
4242,-3073821809521644847,0,13,0.0,1.0
11451,6355520611494162802,0,6,0.0,1.0
9584,3875084989078273801,0,8,0.0,1.0
11452,6356902592769167940,0,34,0.0,1.0


In [31]:
# (rows, columns) of the ranked table; there is one row per app_id.
gender_app_m_top.shape

(13762, 5)