<a href="https://colab.research.google.com/github/shravan1994/Talking-data-User-demographic-prediction/blob/main/model_training_experiments.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Our Approach 

<p>We did not get good accuracy when using a single model to predict the user group.</p>
<p>
In this notebook we try another approach: we first predict the gender and then use the predicted gender as a new feature to predict the user demographic group.</p>

<p>
We also trained separate models for devices that have events and for devices that don't have any event data.
</p>

In [None]:
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides pandas / scikit-learn deprecation warnings that may point at real problems.
import warnings
warnings.filterwarnings('ignore')

In [None]:
!pip install xgboost --upgrade

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting xgboost
  Downloading xgboost-1.6.1-py3-none-manylinux2014_x86_64.whl (192.9 MB)
[K     |████████████████████████████████| 192.9 MB 86 kB/s 
Installing collected packages: xgboost
  Attempting uninstall: xgboost
    Found existing installation: xgboost 0.90
    Uninstalling xgboost-0.90:
      Successfully uninstalled xgboost-0.90
Successfully installed xgboost-1.6.1


## Loading the data into memory

In [None]:
# Mount Google Drive; the feature helpers below persist their fitted
# vectorizers/encoders under drive_path, which lives on this mount.
# The __main__ guard skips the mount when this code is imported as a module.
if __name__ == '__main__':
  from google.colab import drive
  drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Download the competition archive with the Kaggle CLI and unpack every
# per-table zip into ./data, which is where the pd.read_csv calls below look.
if __name__ == '__main__':
  import os
  # presumably the Kaggle CLI reads its credentials (kaggle.json) from this
  # directory, i.e. the current working directory — confirm
  os.environ['KAGGLE_CONFIG_DIR'] = "."

  !kaggle competitions download -c talkingdata-mobile-user-demographics

  # The outer archive only contains further zips, one per CSV table.
  !unzip '/content/talkingdata-mobile-user-demographics.zip'
  !unzip '/content/app_events.csv.zip' -d data
  !unzip '/content/app_labels.csv.zip' -d data
  !unzip '/content/events.csv.zip' -d data
  !unzip '/content/gender_age_test.csv.zip' -d data
  !unzip '/content/gender_age_train.csv.zip' -d data
  !unzip '/content/label_categories.csv.zip' -d data
  !unzip '/content/phone_brand_device_model.csv.zip' -d data
  # sample_submission.csv.zip is intentionally (?) left unextracted.

Downloading talkingdata-mobile-user-demographics.zip to /content
 94% 267M/283M [00:02<00:00, 125MB/s]
100% 283M/283M [00:02<00:00, 106MB/s]
Archive:  /content/talkingdata-mobile-user-demographics.zip
  inflating: app_events.csv.zip      
  inflating: app_labels.csv.zip      
  inflating: events.csv.zip          
  inflating: gender_age_test.csv.zip  
  inflating: gender_age_train.csv.zip  
  inflating: label_categories.csv.zip  
  inflating: phone_brand_device_model.csv.zip  
  inflating: sample_submission.csv.zip  
Archive:  /content/app_events.csv.zip
  inflating: data/app_events.csv     
Archive:  /content/app_labels.csv.zip
  inflating: data/app_labels.csv     
Archive:  /content/events.csv.zip
  inflating: data/events.csv         
Archive:  /content/gender_age_test.csv.zip
  inflating: data/gender_age_test.csv  
Archive:  /content/gender_age_train.csv.zip
  inflating: data/gender_age_train.csv  
Archive:  /content/label_categories.csv.zip
  inflating: data/label_categories.csv  


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

from sklearn.feature_extraction.text import CountVectorizer

from geopy.distance import geodesic, great_circle
from scipy.sparse import hstack, save_npz
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier, XGBRegressor
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import log_loss
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import log_loss, mean_squared_error
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import LabelEncoder

In [None]:
# Google Drive directory where the helpers below persist fitted vectorizers,
# encoders and feature selectors when training=True and reload them otherwise.
drive_path = '/content/drive/MyDrive/Colab Notebooks/Self case study/Talking Data User Demographics Prediction/models'

## Getting different features, using below methods

### 1. Bag of categories

In [None]:
def get_bag_of_categories(devices, all_events_df, all_app_events_df, app_labels_df, label_categories_df, training=False):
  """Count, per device, how often each app category occurs in its app events.

  Args:
    devices: DataFrame with a `device_id` column; the result has one row per
      row of `devices`, in the same order.
    all_events_df: events table with `event_id` and `device_id`.
    all_app_events_df: app-events table with `event_id` and `app_id`.
    app_labels_df: app -> label table (`app_id`, `label_id`).
    label_categories_df: label -> category table (`label_id`, `category`).
    training: if True, build the CountVectorizer and pickle it under
      `drive_path`; otherwise load the previously pickled vectorizer.

  Returns:
    Sparse count matrix with one column per vocabulary term (all lower-cased
    categories plus 'na' for devices without app events).
  """
  import pickle
  filename = f'{drive_path}/categories_vectorizer.sav'

  app_categories_df = app_labels_df.merge(label_categories_df, on='label_id', how='left')
  # Normalise exactly like the vocabulary below; a NaN category would
  # otherwise reach str.join and raise TypeError.
  categories = app_categories_df['category'].fillna('unknown').str.lower()
  # An app may carry several labels; as before, the last one listed wins.
  app_categories_dict = dict(zip(app_categories_df['app_id'], categories))

  device_list = devices['device_id']
  events = all_events_df[all_events_df['device_id'].isin(device_list)][['event_id', 'device_id']]
  app_events = all_app_events_df[all_app_events_df['event_id'].isin(events['event_id'])]
  app_events = events.merge(app_events, on='event_id', how='right')
  # One category per line: categories may contain spaces or punctuation, so a
  # newline is the only separator that keeps each category a single token.
  # Apps missing from app_labels_df map to 'unknown' instead of None.
  app_events = app_events.groupby('device_id').app_id.apply(
      lambda app_ids: '\n'.join(app_categories_dict.get(app_id, 'unknown') for app_id in app_ids)
  ).to_frame()

  app_events = devices.merge(app_events, on='device_id', how='left')
  app_events['app_id'] = app_events['app_id'].fillna('na')

  if training:
    vocabulary = label_categories_df['category'].fillna('unknown').str.lower().unique().tolist() + ['na']
    # CountVectorizer rejects duplicate vocabulary terms; keep first occurrences.
    vocabulary = list(dict.fromkeys(vocabulary))
    # The default token pattern splits on whitespace/punctuation, so a
    # multi-word vocabulary entry could never match; tokenise per line instead.
    vectorizer = CountVectorizer(vocabulary=vocabulary, token_pattern=r'[^\n]+')
    categories_bow_vector = vectorizer.fit_transform(app_events['app_id'])

    # saving the vectorizer to drive
    with open(filename, 'wb') as fh:
      pickle.dump(vectorizer, fh)
    return categories_bow_vector

  # Only load pickles written by this notebook: unpickling runs arbitrary code.
  with open(filename, 'rb') as fh:
    vectorizer = pickle.load(fh)
  return vectorizer.transform(app_events['app_id'])

### 2. bag of installed app_ids

In [None]:
def get_bag_of_apps(devices, app_labels_df, all_events_df, all_app_events_df, label_categories_df, training=False):
  """Binary bag-of-words over the app ids that appear in each device's app events.

  Args:
    devices: DataFrame with a `device_id` column; output rows follow its order.
    app_labels_df: table whose `app_id` column defines the vocabulary.
    all_events_df: events table with `event_id` and `device_id`.
    all_app_events_df: app-events table with `event_id` and `app_id`.
    label_categories_df: unused; kept so existing callers keep working.
    training: if True, build the vectorizer and pickle it under `drive_path`;
      otherwise load the pickled vectorizer.

  Returns:
    Sparse 0/1 matrix with one column per known app id plus 'na' (devices
    without app events).
  """
  import pickle
  filename = f'{drive_path}/apps_vectorizer.sav'

  device_list = devices['device_id']
  events = all_events_df[all_events_df['device_id'].isin(device_list)][['event_id', 'device_id']]
  app_events = all_app_events_df[all_app_events_df['event_id'].isin(events['event_id'])]
  app_events = events.merge(app_events, on='event_id', how='right')
  app_events = app_events.groupby('device_id').app_id.apply(
      lambda app_ids: ' '.join(str(app_id) for app_id in app_ids)
  ).to_frame()

  app_events = devices.merge(app_events, on='device_id', how='left')
  app_events['app_id'] = app_events['app_id'].fillna('na')

  if training:
    # A left merge with label_categories_df cannot add or remove app ids, so
    # the vocabulary is taken from app_labels_df directly.
    vocabulary = list(app_labels_df['app_id'].astype(str).unique()) + ['na']
    # Whitespace tokenisation: the default pattern only matches word
    # characters, which would turn '-123' into '123' and never match the
    # vocabulary entry of a negative app id.
    vectorizer = CountVectorizer(vocabulary=vocabulary, binary=True, token_pattern=r'\S+')
    apps_bow_vector = vectorizer.fit_transform(app_events['app_id'])

    # saving the vectorizer to drive
    with open(filename, 'wb') as fh:
      pickle.dump(vectorizer, fh)
    return apps_bow_vector

  # Only load pickles written by this notebook: unpickling runs arbitrary code.
  with open(filename, 'rb') as fh:
    vectorizer = pickle.load(fh)
  return vectorizer.transform(app_events['app_id'])

### 3. Bag of active apps

In [None]:
def get_bag_of_active_apps(devices, app_labels_df, all_events_df, all_app_events_df, label_categories_df, training=False):
  """Bag-of-words counts of the app ids that were active (`is_active == 1`) per device.

  Args:
    devices: DataFrame with a `device_id` column; output rows follow its order.
    app_labels_df: table whose `app_id` column defines the vocabulary.
    all_events_df: events table with `event_id` and `device_id`.
    all_app_events_df: app-events table with `event_id`, `app_id`, `is_active`.
    label_categories_df: unused; kept so existing callers keep working.
    training: if True, build the vectorizer and pickle it under `drive_path`;
      otherwise load the pickled vectorizer.

  Returns:
    Sparse count matrix with one column per known app id plus 'na' (devices
    without active app events).
  """
  import pickle
  filename = f'{drive_path}/active_apps_vectorizer.sav'
  print('bag of apps')

  device_list = devices['device_id']
  events = all_events_df[all_events_df['device_id'].isin(device_list)][['event_id', 'device_id']]
  app_events = all_app_events_df[all_app_events_df['event_id'].isin(events['event_id'])]
  app_events = events.merge(app_events, on='event_id', how='right')
  app_events = app_events[app_events['is_active'] == 1]
  app_events = app_events.groupby('device_id').app_id.apply(
      lambda app_ids: ' '.join(str(app_id) for app_id in app_ids)
  ).to_frame()

  app_events = devices.merge(app_events, on='device_id', how='left')
  app_events['app_id'] = app_events['app_id'].fillna('na')

  if training:
    # A left merge with label_categories_df cannot add or remove app ids, so
    # the vocabulary is taken from app_labels_df directly.
    vocabulary = list(app_labels_df['app_id'].astype(str).unique()) + ['na']
    # Whitespace tokenisation: the default pattern only matches word
    # characters, which would turn '-123' into '123' and never match the
    # vocabulary entry of a negative app id.
    vectorizer = CountVectorizer(vocabulary=vocabulary, token_pattern=r'\S+')
    apps_bow_vector = vectorizer.fit_transform(app_events['app_id'])

    # saving the vectorizer to drive
    with open(filename, 'wb') as fh:
      pickle.dump(vectorizer, fh)
    return apps_bow_vector

  # Only load pickles written by this notebook: unpickling runs arbitrary code.
  with open(filename, 'rb') as fh:
    vectorizer = pickle.load(fh)
  return vectorizer.transform(app_events['app_id'])

### 3. phone brand response encoding

In [None]:
def get_phone_brand_features_response_enc(devices, phone_brand_df, y_train, label=None, training=False):
  """Response-encode each device's phone brand as per-class frequencies.

  Args:
    devices: DataFrame with a `device_id` column; output rows follow its order.
    phone_brand_df: table with `device_id` and `phone_brand`.
    y_train: integer class label per row of `devices` (only read when
      training=True). Taken positionally, so a Series with a shuffled index
      is handled correctly.
    label: tag used in the file name of the saved encoding; when it contains
      'gender' two classes (0, 1) are encoded, otherwise twelve (0..11).
    training: if True, fit the encoding on (`devices`, `y_train`) and save it
      as JSON under `drive_path`; otherwise load the saved encoding.

  Returns:
    Float array of shape (len(devices), n_classes). Row i holds, for the brand
    of device i, the share of training devices of that brand in each class.
    Brands that are missing or absent from the encoding give a NaN row.
  """
  import json

  # if we want features for gender prediction
  # then labels should be only 0, 1
  # and otherwise, it would be 12 labels 0 to 11
  n_classes = 2 if (label is not None and 'gender' in label) else 12
  labels = np.arange(n_classes)

  filename = f'{drive_path}/response_encoding_phone_brand{label}.pkl'

  phone_brand_df = phone_brand_df.groupby('device_id').agg(
      phone_brand = ('phone_brand', lambda x: ' '.join(np.unique(x)))
  ).reset_index()
  brand_and_model = devices.merge(phone_brand_df, on='device_id', how='left')

  if training:
    # NOTE(review): the encoding is fitted on the very rows it is then applied
    # to, without smoothing or out-of-fold estimates — possible target leakage.
    # crosstab drops rows with a missing brand, so they cannot cause a
    # division by zero; classes never seen for a brand get a share of 0.
    class_shares = pd.crosstab(
        brand_and_model['phone_brand'], np.asarray(y_train), normalize='index'
    ).reindex(columns=labels, fill_value=0.0)
    encoding_dict = {brand: shares.tolist() for brand, shares in class_shares.iterrows()}

    # save to disk (JSON content despite the .pkl extension)
    with open(filename, 'w') as fh:
      json.dump(encoding_dict, fh)
  else:
    with open(filename) as fh:
      encoding_dict = json.load(fh)

  # One vectorised lookup instead of building a pd.Series per row.
  lookup = pd.DataFrame.from_dict(encoding_dict, orient='index', columns=list(range(n_classes)))
  return lookup.reindex(brand_and_model['phone_brand']).to_numpy(dtype=float)

### 4. device model response encoding

In [None]:
def get_device_model_features_response_enc(devices, phone_brand_df, y_train, label=None, training=False):
  """Response-encode each device's model as per-class frequencies.

  Args:
    devices: DataFrame with a `device_id` column; output rows follow its order.
    phone_brand_df: table with `device_id` and `device_model`.
    y_train: integer class label per row of `devices` (only read when
      training=True). Taken positionally, so a Series with a shuffled index
      is handled correctly.
    label: tag used in the file name of the saved encoding; when it contains
      'gender' two classes (0, 1) are encoded, otherwise twelve (0..11).
    training: if True, fit the encoding on (`devices`, `y_train`) and save it
      as JSON under `drive_path`; otherwise load the saved encoding.

  Returns:
    Float array of shape (len(devices), n_classes). Row i holds, for the model
    of device i, the share of training devices of that model in each class.
    Models that are missing or absent from the encoding give a NaN row.
  """
  import json

  # if we want features for gender prediction
  # then labels should be only 0, 1
  # and otherwise, it would be 12 labels 0 to 11
  n_classes = 2 if (label is not None and 'gender' in label) else 12
  labels = np.arange(n_classes)

  filename = f'{drive_path}/response_encoding_device_model_{label}.pkl'

  phone_brand_df = phone_brand_df.groupby('device_id').agg(
    device_model = ('device_model', lambda x: ' '.join(np.unique(x)))
  ).reset_index()
  brand_and_model = devices.merge(phone_brand_df, on='device_id', how='left')

  if training:
    # NOTE(review): the encoding is fitted on the very rows it is then applied
    # to, without smoothing or out-of-fold estimates — possible target leakage.
    # crosstab drops rows with a missing model, so they cannot cause a
    # division by zero; classes never seen for a model get a share of 0.
    class_shares = pd.crosstab(
        brand_and_model['device_model'], np.asarray(y_train), normalize='index'
    ).reindex(columns=labels, fill_value=0.0)
    encoding_dict = {model: shares.tolist() for model, shares in class_shares.iterrows()}

    # save to disk (JSON content despite the .pkl extension)
    with open(filename, 'w') as fh:
      json.dump(encoding_dict, fh)
  else:
    with open(filename) as fh:
      encoding_dict = json.load(fh)

  # One vectorised lookup instead of building a pd.Series per row.
  lookup = pd.DataFrame.from_dict(encoding_dict, orient='index', columns=list(range(n_classes)))
  return lookup.reindex(brand_and_model['device_model']).to_numpy(dtype=float)

### 5. Bag of Phone brands

In [None]:
def get_phone_brand_features_bow(devices, phone_brand_df, training=False):
  """Bag-of-words encoding of the phone brand(s) recorded for each device.

  Rows follow the order of `devices`. The previous implementation returned
  them in sorted device_id order (a side effect of groupby), which did not
  line up with the labels or with the features that merge back onto `devices`.

  Args:
    devices: DataFrame with a `device_id` column.
    phone_brand_df: table with `device_id` and `phone_brand`.
    training: if True, build the vectorizer and pickle it under `drive_path`;
      otherwise load the pickled vectorizer.

  Returns:
    Sparse count matrix of shape (len(devices), number of distinct
    lower-cased brands). Devices without a known brand get an all-zero row.
  """
  import pickle
  filename = f'{drive_path}/bow_phone_brand.pkl'

  # One brand per line: brand names may contain spaces, so newline is the only
  # separator that keeps each brand a single token. Missing brands are dropped
  # before joining because str.join cannot handle NaN.
  known_brands = phone_brand_df.dropna(subset=['phone_brand'])
  brand_docs = known_brands.groupby('device_id')['phone_brand'].agg('\n'.join)
  documents = devices['device_id'].map(brand_docs).fillna('')

  if training:
    vocabulary = phone_brand_df['phone_brand'].dropna().str.lower().unique().tolist()
    # The default token pattern splits on whitespace and skips one-character
    # tokens, so such vocabulary entries could never match; tokenise per line.
    vectorizer = CountVectorizer(vocabulary=vocabulary, token_pattern=r'[^\n]+')
    bow_vector = vectorizer.fit_transform(documents)

    # saving the vectorizer to drive
    with open(filename, 'wb') as fh:
      pickle.dump(vectorizer, fh)
    return bow_vector

  # Only load pickles written by this notebook: unpickling runs arbitrary code.
  with open(filename, 'rb') as fh:
    vectorizer = pickle.load(fh)
  return vectorizer.transform(documents)

### 6. label encoding phone brand

In [None]:
def get_phone_brand_features_label_encoded(devices, phone_brand_df, training=False):
  """Integer-encode the (first recorded) phone brand of each device.

  Rows follow the order of `devices`. The previous implementation returned
  them in sorted device_id order (a side effect of groupby), which did not
  line up with the labels or with the features that merge back onto `devices`.

  Args:
    devices: DataFrame with a `device_id` column.
    phone_brand_df: table with `device_id` and `phone_brand`.
    training: if True, fit a LabelEncoder on every brand in `phone_brand_df`
      and pickle it under `drive_path`; otherwise load the pickled encoder.

  Returns:
    Integer array of shape (len(devices), 1).

  Raises:
    ValueError: from LabelEncoder.transform when a device's brand was not seen
      when the encoder was fitted.
  """
  import pickle
  filename = f'{drive_path}/label_encode_phone_brand.pkl'

  # First row per device, as before, then looked up in `devices` order.
  first_brand = phone_brand_df.drop_duplicates('device_id').set_index('device_id')['phone_brand']
  brands = devices['device_id'].map(first_brand)

  if training:
    le = LabelEncoder()
    le.fit(phone_brand_df['phone_brand'])
    label_encoded = le.transform(brands)

    # saving the encoder to drive
    with open(filename, 'wb') as fh:
      pickle.dump(le, fh)
  else:
    # Only load pickles written by this notebook: unpickling runs arbitrary code.
    with open(filename, 'rb') as fh:
      le = pickle.load(fh)
    label_encoded = le.transform(brands)

  return label_encoded.reshape(-1, 1)

In [None]:
# phone_brand_df = pd.read_csv('data/phone_brand_device_model.csv')
# encoded = get_phone_brand_features_label_encoded(devices, phone_brand_df, training=False)
# print(devices.shape)
# encoded.shape

### 6. Bag of device model

In [None]:
def get_device_model_features_bow(devices, phone_brand_df, training=False):
  """Bag-of-words encoding of the device model(s) recorded for each device.

  Rows follow the order of `devices`. The previous implementation returned
  them in sorted device_id order (a side effect of groupby), which did not
  line up with the labels or with the features that merge back onto `devices`.

  Args:
    devices: DataFrame with a `device_id` column.
    phone_brand_df: table with `device_id` and `device_model`.
    training: if True, build the vectorizer and pickle it under `drive_path`;
      otherwise load the pickled vectorizer.

  Returns:
    Sparse count matrix of shape (len(devices), number of distinct
    lower-cased models). Devices without a known model get an all-zero row.
  """
  import pickle
  filename = f'{drive_path}/bow_device_model.pkl'

  # One model per line: model names may contain spaces, so newline is the only
  # separator that keeps each model a single token. Missing models are dropped
  # before joining because str.join cannot handle NaN.
  known_models = phone_brand_df.dropna(subset=['device_model'])
  model_docs = known_models.groupby('device_id')['device_model'].agg('\n'.join)
  documents = devices['device_id'].map(model_docs).fillna('')

  if training:
    vocabulary = phone_brand_df['device_model'].dropna().str.lower().unique().tolist()
    # The default token pattern splits on whitespace and skips one-character
    # tokens, so such vocabulary entries could never match; tokenise per line.
    vectorizer = CountVectorizer(vocabulary=vocabulary, token_pattern=r'[^\n]+')
    bow_vector = vectorizer.fit_transform(documents)

    # saving the vectorizer to drive
    with open(filename, 'wb') as fh:
      pickle.dump(vectorizer, fh)
    return bow_vector

  # Only load pickles written by this notebook: unpickling runs arbitrary code.
  with open(filename, 'rb') as fh:
    vectorizer = pickle.load(fh)
  return vectorizer.transform(documents)

### 7. Label encoded device model

In [None]:
def get_device_model_features_label_encoded(devices, phone_brand_df, training=False):
  """Integer-encode the (first recorded) device model of each device.

  Rows follow the order of `devices`. The previous implementation returned
  them in sorted device_id order (a side effect of groupby), which did not
  line up with the labels or with the features that merge back onto `devices`.

  Args:
    devices: DataFrame with a `device_id` column.
    phone_brand_df: table with `device_id` and `device_model`.
    training: if True, fit a LabelEncoder on every model in `phone_brand_df`
      and pickle it under `drive_path`; otherwise load the pickled encoder.

  Returns:
    Integer array of shape (len(devices), 1).

  Raises:
    ValueError: from LabelEncoder.transform when a device's model was not seen
      when the encoder was fitted.
  """
  import pickle
  filename = f'{drive_path}/label_encode_device_model.pkl'

  # First row per device, as before, then looked up in `devices` order.
  first_model = phone_brand_df.drop_duplicates('device_id').set_index('device_id')['device_model']
  models = devices['device_id'].map(first_model)

  if training:
    le = LabelEncoder()
    le.fit(phone_brand_df['device_model'])
    label_encoded = le.transform(models)

    # saving the encoder to drive
    with open(filename, 'wb') as fh:
      pickle.dump(le, fh)
  else:
    # Only load pickles written by this notebook: unpickling runs arbitrary code.
    with open(filename, 'rb') as fh:
      le = pickle.load(fh)
    label_encoded = le.transform(models)

  return label_encoded.reshape(-1, 1)

In [None]:
# phone_brand_df = pd.read_csv('data/phone_brand_device_model.csv')
# get_device_model_features_label_encoded(devices, phone_brand_df, training=False)

### 7. Total events

In [None]:
def get_total_events(devices, all_events_df, training=False):
  """Number of event rows recorded for each device.

  Rows follow the order of `devices`. The previous implementation returned
  them in sorted device_id order and reported 1 for devices without events,
  because it counted the NaN row produced by the left merge.

  Args:
    devices: DataFrame with a `device_id` column.
    all_events_df: events table with `device_id` (one row per event).
    training: unused; kept for a uniform feature-function signature.

  Returns:
    int64 array of shape (len(devices), 1); 0 for devices without events.
  """
  events_per_device = all_events_df.groupby('device_id').size()
  totals = devices['device_id'].map(events_per_device).fillna(0).astype('int64')
  return totals.values.reshape(-1, 1)

### 8. Total app events per device

In [None]:
def get_total_app_events(devices, all_events_df, all_app_events_df):
  """Number of app-event rows linked to each device through its events.

  Args:
    devices: DataFrame with a `device_id` column; output rows follow its order.
    all_events_df: events table with `event_id` and `device_id`.
    all_app_events_df: app-events table with `event_id`.

  Returns:
    Array of shape (len(devices), 1). Devices without app events get 0
    (previously NaN, unlike the sibling count features, which fill with 0).
  """
  print('app events')
  events = all_events_df[all_events_df['device_id'].isin(devices['device_id'])]
  app_events = all_app_events_df[all_app_events_df['event_id'].isin(events['event_id'])]
  app_events = events.merge(app_events, on='event_id', how='right')
  app_events_per_device = app_events.groupby('device_id').size()
  totals = devices['device_id'].map(app_events_per_device).fillna(0)
  return totals.values.reshape(-1, 1)

### 8. Total apps

In [None]:
def total_apps_installed(devices, all_events_df, all_app_events_df, training=False):
  """Number of distinct app ids seen in each device's app events.

  Args:
    devices: DataFrame with a `device_id` column; output rows follow its order.
    all_events_df: events table with `event_id` and `device_id`.
    all_app_events_df: app-events table with `event_id` and `app_id`.
    training: unused; kept for a uniform feature-function signature.

  Returns:
    Array of shape (len(devices), 1); 0 for devices without app events.
  """
  device_list = devices['device_id']
  events = all_events_df[all_events_df['device_id'].isin(device_list)][['event_id', 'device_id']]
  app_events = all_app_events_df[all_app_events_df['event_id'].isin(events['event_id'])]
  app_events = events.merge(app_events, on='event_id', how='right')
  apps_per_device = app_events.groupby('device_id')['app_id'].nunique()

  total_apps = devices['device_id'].map(apps_per_device).fillna(0)
  # Single replace for both infinities: np.Infinity and np.NINF no longer
  # exist in NumPy 2.0, so the old chained replace raised AttributeError there.
  total_apps = total_apps.replace([np.inf, -np.inf], 0)
  return total_apps.values.reshape(-1, 1)

In [None]:
# arr = total_apps_installed(devices, all_events_df, all_app_events_df)
# # np.isinf(arr).any()
# # list(arr)
# print(arr.shape)
# hourly_events = get_hourly_events(devices, all_events_df, training=True)
# hstack(arr, hourly_events)

### 9. distance travelled

In [None]:
def distance_traveled_per_day(devices, all_events_df, training=False):
  """Great-circle distance (km) covered by each device on each day of the month.

  Args:
    devices: DataFrame with a `device_id` column; output rows follow its
      order. Other columns of `devices` are ignored (they used to be copied
      into the result).
    all_events_df: events table with `device_id`, `timestamp`, `latitude`
      and `longitude`.
    training: if True, the set of day-of-month columns is taken from the data
      and pickled under `drive_path`; otherwise the pickled list is loaded so
      the output always has the same columns as at training time.

  Returns:
    Float array of shape (len(devices), number of day columns); 0 where a
    device has no events on that day.
  """
  import pickle
  filename = f'{drive_path}/distance_travelled_columns.sav'

  def get_distance_travelled(day_events):
    # Sum the distances between consecutive events in chronological order.
    ordered = day_events.sort_values('datetime')
    points = list(zip(ordered['latitude'], ordered['longitude']))
    total_km = 0
    for (prev_lat, prev_lon), (lat, lon) in zip(points, points[1:]):
      # ignoring the invalid lat and long co-ordinates
      if prev_lat in (0.0, 1.0) or lat in (0.0, 1.0):
        continue
      total_km += great_circle((prev_lat, prev_lon), (lat, lon)).km
    return total_km

  device_list = devices['device_id']
  device_events = all_events_df[all_events_df['device_id'].isin(device_list)].copy()
  device_events['datetime'] = pd.to_datetime(device_events['timestamp'])
  device_events['day'] = device_events['datetime'].dt.day

  if training:
    columns = [int(day) for day in device_events['day'].unique()]

    # saving the day columns to drive
    with open(filename, 'wb') as fh:
      pickle.dump(columns, fh)
  else:
    # Only load pickles written by this notebook: unpickling runs arbitrary code.
    with open(filename, 'rb') as fh:
      columns = [int(day) for day in pickle.load(fh)]

  print(columns)

  if device_events.empty:
    # No events for any requested device: nothing to group or pivot.
    return np.zeros((len(devices), len(columns)))

  daily = device_events.groupby(['device_id', 'day'])[['datetime', 'latitude', 'longitude']].apply(
      get_distance_travelled
  ).rename('distance').reset_index()

  per_device = daily.pivot(index='device_id', columns='day', values='distance')
  # Restrict to the training-time day columns (a day unseen in training must
  # not add a feature) and align the rows with `devices`.
  per_device = per_device.reindex(index=devices['device_id'], columns=columns)
  return per_device.fillna(0).values

In [None]:
# devices = pd.read_csv('data/gender_age_train.csv')
# distance_travelled = distance_traveled_per_day(devices, all_events_df, training=False)

In [None]:
# distance_travelled.shape

### 10. Has events

In [None]:
def has_events(devices, all_events_df):
  """Flag which devices appear in the events table.

  Args:
    devices: DataFrame with a `device_id` column; output follows its order.
    all_events_df: events table with a `device_id` column.

  Returns:
    1-D integer array of length len(devices): 1 if the device has at least one
    row in `all_events_df`, else 0.
  """
  # astype(int) gives a stable 0/1 dtype; replacing True/False in a bool
  # Series yields a dtype that depends on the pandas version.
  return devices['device_id'].isin(all_events_df['device_id']).astype(int).values

### 11. hourly events

In [None]:
def get_hourly_events(devices, all_events_df, training=False):
  """Bag-of-words counts of '<day>_<hour>' tokens for each device's events.

  Args:
    devices: DataFrame with a `device_id` column; output rows follow its order.
    all_events_df: events table with `device_id` and `timestamp`.
    training: if True, fit a CountVectorizer on the tokens and pickle it under
      `drive_path`; otherwise load the pickled vectorizer.

  Returns:
    Sparse count matrix with one row per device; devices without events are
    represented by the single token 'na'.
  """
  import pickle
  filename = f'{drive_path}/hourly_events.pkl'

  in_scope = all_events_df['device_id'].isin(devices['device_id'])
  # Explicit copy: adding a column to a filtered slice triggers
  # SettingWithCopyWarning and may write into a temporary.
  events = all_events_df.loc[in_scope, ['device_id', 'timestamp']].copy()
  events['hour_day'] = pd.to_datetime(events['timestamp']).apply(lambda x: f"{x.day}_{x.hour}")
  tokens_per_device = events.groupby('device_id')['hour_day'].agg(' '.join)
  documents = devices['device_id'].map(tokens_per_device).fillna('na')

  if training:
    vectorizer = CountVectorizer()
    bow_vector = vectorizer.fit_transform(documents)
    with open(filename, 'wb') as fh:
      pickle.dump(vectorizer, fh)
    return bow_vector

  # Only load pickles written by this notebook: unpickling runs arbitrary code.
  with open(filename, 'rb') as fh:
    vectorizer = pickle.load(fh)
  return vectorizer.transform(documents)

### Hour-of-day statistics per device (median, mean, min, max)

In [None]:
def get_min_max_hourly_events(devices, all_events_df):
  """Median, mean, min and max hour-of-day of each device's events.

  The previous implementation aggregated with a list inside a dict, which
  produces two-level column names, and then merged that frame with the
  single-level `devices` frame; recent pandas versions reject such a merge.
  It also added a column to a filtered slice and copied every extra column of
  `devices` into the result.

  Args:
    devices: DataFrame with a `device_id` column; output rows follow its order.
    all_events_df: events table with `device_id` and `timestamp`.

  Returns:
    Float array of shape (len(devices), 4) with columns
    [median, mean, min, max]; all zeros for devices without events.
  """
  in_scope = all_events_df['device_id'].isin(devices['device_id'])
  hours = pd.DataFrame({
      'device_id': all_events_df.loc[in_scope, 'device_id'],
      'hour': pd.to_datetime(all_events_df.loc[in_scope, 'timestamp']).dt.hour,
  })
  # Aggregating a single column with a list keeps flat column names.
  hour_stats = hours.groupby('device_id')['hour'].agg(['median', 'mean', 'min', 'max'])
  return hour_stats.reindex(devices['device_id']).fillna(0).values

In [None]:
# events = get_hourly_events(devices, all_events_df)
# events

### 11. median latitude and longitude

In [None]:
def get_median_lat_long(devices, all_events_df):
  """Median latitude and longitude of each device's events.

  Args:
    devices: DataFrame with a `device_id` column; output rows follow its order.
    all_events_df: events table with `device_id`, `latitude`, `longitude`.

  Returns:
    Array of shape (len(devices), 2) holding [latitude, longitude]; missing
    and infinite values are replaced by 0.
  """
  coord_cols = ['latitude', 'longitude']

  wanted = all_events_df['device_id'].isin(devices['device_id'])
  device_coords = all_events_df[wanted][['device_id'] + coord_cols]
  medians = device_coords.groupby('device_id').agg(
      {col: 'median' for col in coord_cols}
  ).reset_index()

  merged = devices.merge(medians, on='device_id', how='left')
  # Devices without events come out of the left merge as NaN.
  cleaned = merged[coord_cols].fillna(0)
  cleaned = cleaned.replace(np.inf, 0).replace(-np.inf, 0)
  merged[coord_cols] = cleaned
  return merged[coord_cols].values

### 12. average age per phone brand

In [None]:
def get_phone_brand_avg_age(devices, phone_brand_df, train_data_devices, training=False):
  """Average training-set age for each device's phone brand and device model.

  Args:
    devices: DataFrame with a `device_id` column; output rows follow its order.
    phone_brand_df: table with `device_id`, `phone_brand`, `device_model`.
    train_data_devices: DataFrame with `device_id` and `age`, used to compute
      the averages (only read when training=True).
    training: if True, compute the per-brand and per-model average ages and
      save them as JSON under `drive_path`; otherwise load the saved averages.

  Returns:
    Array of shape (len(devices), 2): [average age for the brand, average age
    for the model]. Unknown brands/models map to 0.
  """
  import json

  filename_brand = f'{drive_path}/phone_brand_avg_age.pkl'
  filename_model = f'{drive_path}/device_model_avg_age.pkl'

  def unique_values_per_device(column):
    # One row per device holding its distinct values of `column`, space-joined.
    return phone_brand_df.groupby('device_id').agg(
        **{column: (column, lambda x: ' '.join(np.unique(x)))}
    ).reset_index()

  def mean_age_by(per_device_df, column):
    # {value of `column`: mean age of the training devices having it}; 0 when
    # no training device has that value.
    with_age = per_device_df.merge(train_data_devices[['device_id', 'age']], on='device_id', how='left')
    return with_age.groupby(column)['age'].mean().fillna(0).to_dict()

  brand_df = unique_values_per_device('phone_brand')
  device_df = unique_values_per_device('device_model')

  if training:
    brand_averages = mean_age_by(brand_df, 'phone_brand')
    model_averages = mean_age_by(device_df, 'device_model')

    # JSON content despite the .pkl extension.
    with open(filename_brand, 'w') as fh:
      json.dump(brand_averages, fh)
    with open(filename_model, 'w') as fh:
      json.dump(model_averages, fh)
  else:
    with open(filename_brand, 'r') as fh:
      brand_averages = json.load(fh)
    with open(filename_model, 'r') as fh:
      model_averages = json.load(fh)

  res_df = pd.DataFrame()
  brand_and_model = devices.merge(brand_df, on='device_id', how='left')
  res_df['avg_brand'] = brand_and_model['phone_brand'].apply(lambda x: brand_averages.get(x, 0))

  brand_and_model = devices.merge(device_df, on='device_id', how='left')
  res_df['avg_model'] = brand_and_model['device_model'].apply(lambda x: model_averages.get(x, 0))

  return res_df[['avg_brand', 'avg_model']].values

In [None]:
# devices = pd.read_csv('data/gender_age_train.csv')
# phone_brand_df = pd.read_csv('data/phone_brand_device_model.csv')
# # train_data_devices = pd.read_csv('data/gender_age_train.csv')

# get_phone_brand_avg_age(devices, phone_brand_df, devices, training=True)

### 13. App id proportion: distinct app ids / total app events per device

In [None]:
def get_app_id_proportion(devices, all_events_df, all_app_events_df, training=False):
  """Share of distinct app ids among all app-event rows of each device.

  Args:
    devices: DataFrame with a `device_id` column; output rows follow its order.
    all_events_df: events table with `event_id` and `device_id`.
    all_app_events_df: app-events table with `event_id` and `app_id`.
    training: unused; kept for a uniform feature-function signature.

  Returns:
    Array of shape (len(devices), 1); 0 for devices without app events.
  """
  print('proportion appp_id')

  def distinct_share(app_ids):
    # Guard against an empty group so we never divide by zero.
    n_rows = len(app_ids)
    if not n_rows:
      return 0
    return len(np.unique(app_ids)) / n_rows

  device_events = all_events_df[all_events_df['device_id'].isin(devices['device_id'])]
  linked = all_app_events_df['event_id'].isin(device_events['event_id'])
  joined = device_events.merge(all_app_events_df[linked], on='event_id', how='right')

  per_device = joined.groupby('device_id').agg(
      app_proportion = ('app_id', distinct_share)
  ).reset_index()

  merged = devices.merge(per_device, how='left', on='device_id')
  return merged['app_proportion'].fillna(0).values.reshape(-1, 1)

### Extracting features in training data and save intermediate models to use on test data.

In [None]:
# all_events_df = pd.read_csv('data/events.csv')
# all_app_events_df = pd.read_csv('data/app_events.csv')
# app_labels_df = pd.read_csv('data/app_labels.csv')
# label_categories_df = pd.read_csv('data/label_categories.csv')
# phone_brand_df = pd.read_csv('data/phone_brand_device_model.csv')

## Get features for training data.
<p>We will also save the count vectorizers fitted for several of the features, so that we can reuse them when preparing features for the test data.</p>

In [None]:
def select_k_best(features, y_train, label, n=1000, training=False):
  """Keep the `n` features with the highest chi-squared score.

  Args:
    features: non-negative feature matrix (dense or sparse).
    y_train: class labels, one per row (only read when training=True).
    label: tag used in the file name of the pickled selector.
    n: number of features to keep; capped at the number of available columns.
    training: if True, fit the selector and pickle it under `drive_path`;
      otherwise load the pickled selector.

  Returns:
    The feature matrix restricted to the selected columns.
  """
  from sklearn.feature_selection import SelectKBest, chi2
  import pickle
  filename = f'{drive_path}/select_k_best_{label}.pkl'

  if training:
    # Asking for more features than exist is an error in older scikit-learn.
    kbest = SelectKBest(chi2, k=min(n, features.shape[1]))
    best_features = kbest.fit_transform(features, y_train)
    with open(filename, 'wb') as fh:
      pickle.dump(kbest, fh)
    return best_features

  # Only load pickles written by this notebook: unpickling runs arbitrary code.
  with open(filename, 'rb') as fh:
    kbest = pickle.load(fh)
  return kbest.transform(features)

In [None]:
def remove_features_with_no_variance(features, training=False, label='with_events'):
  """Drop the feature columns that are constant.

  Args:
    features: feature matrix (dense or sparse).
    training: if True, fit a VarianceThreshold(0.0) selector and pickle it
      under `drive_path`; otherwise load the pickled selector.
    label: tag used in the file name of the pickled selector.

  Returns:
    The feature matrix without the columns that were constant in the data the
    selector was fitted on.
  """
  from sklearn.feature_selection import VarianceThreshold
  import pickle
  filename = f'{drive_path}/no_variance_model_{label}.pkl'

  if training:
    selector = VarianceThreshold(threshold=0.0)
    best_features = selector.fit_transform(features)
    with open(filename, 'wb') as fh:
      pickle.dump(selector, fh)
    return best_features

  # Only load pickles written by this notebook: unpickling runs arbitrary code.
  with open(filename, 'rb') as fh:
    selector = pickle.load(fh)
  return selector.transform(features)

In [None]:
def get_features_with_events(train_data_devices, y_train, training=False, label='gender'):
  """Build the feature matrix used for devices that have event data.

  Reads the raw CSV tables from ./data, computes every individual feature
  block and horizontally stacks the selected ones, then drops constant columns.

  Args:
    train_data_devices: DataFrame with a `device_id` column (one row per
      device to featurise).
    y_train: labels aligned with `train_data_devices`; used by the response
      encodings and the chi-squared selection when training=True.
    training: passed through to the helpers; True fits and saves their
      vectorizers/encoders/selectors, False reloads them.
    label: tag ('gender', ...) used in the names of the saved artefacts.

  Returns:
    Sparse feature matrix with one row per device.
  """
  all_events_df = pd.read_csv('data/events.csv')
  all_app_events_df = pd.read_csv('data/app_events.csv')
  app_labels_df = pd.read_csv('data/app_labels.csv')
  label_categories_df = pd.read_csv('data/label_categories.csv')
  phone_brand_df = pd.read_csv('data/phone_brand_device_model.csv')

  categories_bow = get_bag_of_categories(
    train_data_devices, all_events_df, all_app_events_df,
    app_labels_df, label_categories_df, training=training
  )
  apps_bow = get_bag_of_apps(train_data_devices, app_labels_df, all_events_df, all_app_events_df, label_categories_df, training=training)
  # NOTE(review): active_app_bow, app_id_proportion and hourly_events are
  # computed (and their vectorizers saved) but are not part of X below.
  active_app_bow = get_bag_of_active_apps(train_data_devices, app_labels_df, all_events_df, all_app_events_df, label_categories_df, training=training)
  total_app_events = get_total_app_events(train_data_devices, all_events_df, all_app_events_df)
  app_id_proportion = get_app_id_proportion(train_data_devices, all_events_df, all_app_events_df)

  phone_brand_resp_enc = get_phone_brand_features_response_enc(
      train_data_devices, phone_brand_df, y_train, label=f'{label}_with_events', training=training)
  device_model_resp_enc = get_device_model_features_response_enc(
      train_data_devices, phone_brand_df, y_train, label=f'{label}_with_events', training=training)

  bow_phone_brand = get_phone_brand_features_bow(train_data_devices, phone_brand_df, training=training)
  bow_device_model = get_device_model_features_bow(train_data_devices, phone_brand_df, training=training)
  total_events = get_total_events(train_data_devices, all_events_df, training=training)
  total_apps = total_apps_installed(train_data_devices, all_events_df, all_app_events_df, training=training)
  distance_travelled = distance_traveled_per_day(train_data_devices, all_events_df, training=training)
  median_lat_long = get_median_lat_long(train_data_devices, all_events_df)
  hourly_events = get_hourly_events(train_data_devices, all_events_df, training=training)
  median_hour = get_min_max_hourly_events(train_data_devices, all_events_df)

  gender_age_train = pd.read_csv('data/gender_age_train.csv')
  avg_age = get_phone_brand_avg_age(train_data_devices, phone_brand_df, gender_age_train, training=training).astype(float)
  # These two names used to be swapped relative to the helper that fills them;
  # the call order and the column order in X are unchanged.
  label_encoded_device_model = get_device_model_features_label_encoded(train_data_devices, phone_brand_df, training=training)
  label_encoded_ph_brand = get_phone_brand_features_label_encoded(train_data_devices, phone_brand_df, training=training)

  apps_bow_best = select_k_best(apps_bow, y_train, label=label, n=2000, training=training)

  # Shape of every block, for a quick sanity check of the row counts.
  for block in (
      categories_bow, apps_bow_best, bow_phone_brand, bow_device_model,
      total_events, total_apps, distance_travelled, median_lat_long,
      hourly_events, total_app_events, active_app_bow, app_id_proportion,
      avg_age, median_hour, label_encoded_device_model, label_encoded_ph_brand,
  ):
    print(block.shape)

  X = hstack([categories_bow, apps_bow_best, bow_phone_brand, bow_device_model,
            total_events, phone_brand_resp_enc, device_model_resp_enc,
            total_apps, distance_travelled, avg_age,
            total_app_events, median_lat_long, median_hour, label_encoded_device_model, label_encoded_ph_brand
  ])
  # , app_id_proportion, hourly_events, total_app_events, active_app_bow, ,   # median_lat_long, total_app_events
  # bad perf: active_app_bow
  print('data shape before removing no variance data: ', X.shape)
  print('training: ', training)
  X = remove_features_with_no_variance(X, training=training, label=f'{label}_with_events')
  print('data shape', X.shape)

  return X

In [None]:
def get_features_with_no_events(train_data_devices, y_train, training=False, label='gender'):
  """Assemble the feature matrix for devices that have no event data.

  Only features derived from data/phone_brand_device_model.csv (plus the
  per-brand average age computed with data/gender_age_train.csv) are used.
  The individual feature helpers are defined elsewhere in this notebook.

  Args:
    train_data_devices: devices to featurise; passed through to every helper.
    y_train: targets, forwarded to the response-encoding helpers.
    training: forwarded to every helper; presumably selects fit-and-save vs.
      load-and-transform behaviour - confirm against the helpers.
    label: name of the prediction task; f'{label}_no_events' is the tag handed
      to the response encoders and to remove_features_with_no_variance.

  Returns:
    The horizontally stacked features after remove_features_with_no_variance.
  """
  artefact_tag = f'{label}_no_events'
  brand_model_df = pd.read_csv('data/phone_brand_device_model.csv')

  brand_bow = get_phone_brand_features_bow(train_data_devices, brand_model_df, training=training)
  model_bow = get_device_model_features_bow(train_data_devices, brand_model_df, training=training)
  brand_resp_enc = get_phone_brand_features_response_enc(
      train_data_devices, brand_model_df, y_train, label=artefact_tag, training=training)
  model_resp_enc = get_device_model_features_response_enc(
      train_data_devices, brand_model_df, y_train, label=artefact_tag, training=training)

  demographics_df = pd.read_csv('data/gender_age_train.csv')
  brand_avg_age = get_phone_brand_avg_age(train_data_devices, brand_model_df, demographics_df, training=training)

  # The device-model encoding is computed (and stacked) before the phone-brand
  # one; keep this order so the column layout stays the same between runs.
  model_label_enc = get_device_model_features_label_encoded(train_data_devices, brand_model_df, training=training)
  brand_label_enc = get_phone_brand_features_label_encoded(train_data_devices, brand_model_df, training=training)

  for feature_block in (brand_resp_enc, model_resp_enc, brand_avg_age, model_label_enc, brand_label_enc):
    print(feature_block.shape)

  stacked = hstack([
      brand_resp_enc, model_resp_enc, brand_bow, model_bow,
      model_label_enc, brand_label_enc, brand_avg_age,
  ])

  reduced = remove_features_with_no_variance(stacked, training=training, label=artefact_tag)
  print('data shape', reduced.shape)

  return reduced

In [None]:
def get_labels(y, training=False, label='group'):
  """Integer-encode target values with a LabelEncoder persisted on disk.

  Args:
    y: target values to encode.
    training: when True a new LabelEncoder is fit on ``y`` and pickled to
      f'{drive_path}/label_encoder_{label}.pkl' (overwriting any existing
      file); when False the previously pickled encoder is loaded and applied.
    label: suffix used in the encoder's file name.

  Returns:
    The encoded labels produced by the encoder.
  """
  import pickle

  filename = f'{drive_path}/label_encoder_{label}.pkl'

  if training:
    enc = LabelEncoder()
    encoded = enc.fit_transform(y)
    # Context manager guarantees the file is flushed and closed.
    with open(filename, 'wb') as enc_file:
      pickle.dump(enc, enc_file)
    return encoded

  with open(filename, 'rb') as enc_file:
    enc = pickle.load(enc_file)
  return enc.transform(y)

## Confusion matrix

In [None]:
def plot_confusion_matrix(test_y, predict_y, class_labels):
    """Print the misclassification rate and plot confusion/precision/recall heatmaps.

    Args:
        test_y: true class labels.
        predict_y: predicted class labels.
        class_labels: tick labels used on both axes of every heatmap.
    """
    C = confusion_matrix(test_y, predict_y)
    # The printed value is a percentage (fraction of off-diagonal points * 100).
    print("Percentage of misclassified points ", (len(test_y) - np.trace(C)) / len(test_y) * 100)

    # Recall matrix: every row of C divided by its row sum (rows = original class).
    A = (C.T / C.sum(axis=1)).T

    # Precision matrix: every column of C divided by its column sum
    # (columns = predicted class). A class that is never predicted has a zero
    # column sum, which leaves NaN entries in that column.
    B = C / C.sum(axis=0)

    cmap = sns.light_palette("green")

    for title, matrix in (("Confusion matrix", C), ("Precision matrix", B), ("Recall matrix", A)):
        print("-"*50, title, "-"*50)
        plt.figure(figsize=(10, 5))
        sns.heatmap(matrix, annot=True, cmap=cmap, fmt=".3f", xticklabels=class_labels, yticklabels=class_labels)
        plt.xlabel('Predicted Class')
        plt.ylabel('Original Class')
        plt.show()

## Predicting gender for devices with events

In [None]:
def train_gender_predictor_with_events(devices_events_gen, y_gen_label, training=False):
  """Train (or load) the gender classifier for devices that have events.

  In training mode an XGBClassifier is tuned with a 4-fold GridSearchCV
  (scored by neg_log_loss), the best estimator is wrapped in a sigmoid
  CalibratedClassifierCV, fit on the same data and pickled to
  f"{drive_path}/best_model_gender_with_events.pkl". Otherwise the pickled
  model is loaded and applied.

  Args:
    devices_events_gen: devices to featurise via get_features_with_events.
    y_gen_label: encoded gender labels for those devices.
    training: True to fit and save, False to load and predict.

  Returns:
    Tuple (predicted classes, predicted probabilities) for the given devices.
  """
  import pickle
  file_name = f"{drive_path}/best_model_gender_with_events.pkl"

  X_train = get_features_with_events(devices_events_gen, y_gen_label, training=training, label='gender')
  if training:
    print('train dataset\'s shape')
    print(X_train.shape)
    print(y_gen_label.shape)

    gpu_dict = {
      'objective': 'binary:logistic',
      'tree_method': 'gpu_hist',
      "eval_metric": "logloss"
    }

    param_grid = {
        'max_depth': [2, 3, 4],
        'n_estimators': [200, 300, 400, 500],
        'scale_pos_weight': [10, 66]
        }
    xgb = XGBClassifier(**gpu_dict, random_state=42)

    gs = GridSearchCV(
            estimator=xgb,
            param_grid=param_grid,
            cv=4,
            n_jobs=-1,
            scoring='neg_log_loss',
            verbose=10
        )
    gs.fit(X_train, y_gen_label)

    print('Best params ', gs.best_params_)
    x_cfl = gs.best_estimator_

    cali_cfl = CalibratedClassifierCV(x_cfl, method="sigmoid")
    cali_cfl.fit(X_train, y_gen_label)

    predict_y_train_proba = cali_cfl.predict_proba(X_train)
    predict_y_train = cali_cfl.predict(X_train)

    # saving the model to disk for later usage; `with` closes the file handle
    with open(file_name, "wb") as model_file:
      pickle.dump(cali_cfl, model_file)
    return predict_y_train, predict_y_train_proba
  else:
    print('testing')
    with open(file_name, "rb") as model_file:
      cali_cfl = pickle.load(model_file)
    # X_train holds the features of the devices passed in, not training data here.
    predict_y_test_proba = cali_cfl.predict_proba(X_train)
    predict_y_test = cali_cfl.predict(X_train)
    return predict_y_test, predict_y_test_proba

## Predicting gender based on devices with no events

In [None]:
def train_gender_predictor_with_no_events(devices_no_events_gen, y_gen_label, training=False):
  """Train (or load) the gender classifier for devices without events.

  In training mode an XGBClassifier is tuned with a 4-fold GridSearchCV
  (scored by neg_log_loss), the best estimator is wrapped in a sigmoid
  CalibratedClassifierCV, fit on the same data and pickled to
  f"{drive_path}/best_model_gender_with_no_events.pkl". Otherwise the pickled
  model is loaded and applied.

  Args:
    devices_no_events_gen: devices to featurise via get_features_with_no_events.
    y_gen_label: encoded gender labels for those devices.
    training: True to fit and save, False to load and predict.

  Returns:
    Tuple (predicted classes, predicted probabilities) for the given devices.
  """
  import pickle
  file_name = f"{drive_path}/best_model_gender_with_no_events.pkl"
  X_train = get_features_with_no_events(devices_no_events_gen, y_gen_label, training=training, label='gender')
  if training:
    print('train dataset\'s shape')
    print(X_train.shape)
    print(y_gen_label.shape)

    gpu_dict = {
      'objective': 'binary:logistic',
      'tree_method': 'gpu_hist',
      "eval_metric": "logloss"
    }

    param_grid = {
        'max_depth': [2, 3, 4],
        'n_estimators': [200, 300, 400, 500],
        'scale_pos_weight': [10, 20, 33, 60]
        }
    xgb = XGBClassifier(**gpu_dict, random_state=42)

    gs = GridSearchCV(
            estimator=xgb,
            param_grid=param_grid,
            cv=4,
            n_jobs=-1,
            scoring='neg_log_loss',
            verbose=10
        )
    gs.fit(X_train, y_gen_label)

    print('Best params ', gs.best_params_)
    x_cfl = gs.best_estimator_

    cali_cfl = CalibratedClassifierCV(x_cfl, method="sigmoid")
    cali_cfl.fit(X_train, y_gen_label)

    predict_y_train_proba = cali_cfl.predict_proba(X_train)
    predict_y_train = cali_cfl.predict(X_train)

    # saving the model to disk for later usage; `with` closes the file handle
    with open(file_name, "wb") as model_file:
      pickle.dump(cali_cfl, model_file)
    return predict_y_train, predict_y_train_proba
  else:
    with open(file_name, "rb") as model_file:
      cali_cfl = pickle.load(model_file)
    # X_train holds the features of the devices passed in, not training data here.
    predict_y_test_proba = cali_cfl.predict_proba(X_train)
    predict_y_test = cali_cfl.predict(X_train)
    return predict_y_test, predict_y_test_proba

## Predicting age on devices with events

In [None]:
def train_age_predictor_with_events(devices_events, y_tr_events, training=False):
  """Train (or load) the XGBoost age regressor for devices that have events.

  In training mode an XGBRegressor is tuned with a 4-fold GridSearchCV
  (scored by neg_mean_squared_error) and the best estimator is pickled to
  f"{drive_path}/best_model_age_predictor_with_events.pkl". Otherwise the
  pickled model is loaded and applied.

  Args:
    devices_events: devices to featurise via get_features_with_events.
    y_tr_events: age targets for those devices.
    training: True to fit and save, False to load and predict.

  Returns:
    Predicted ages for the given devices.
  """
  import pickle
  file_name = f"{drive_path}/best_model_age_predictor_with_events.pkl"

  X_events = get_features_with_events(devices_events, y_tr_events, training=training, label='age')

  if training:
    gpu_dict = {
      'objective': 'reg:squarederror',
      'tree_method': 'gpu_hist'
    }

    param_grid = {'max_depth': [2, 3, 4], 'n_estimators': [200, 300, 400, 500]}
    xgb = XGBRegressor(**gpu_dict, random_state=42)

    gs = GridSearchCV(
            estimator=xgb,
            param_grid=param_grid,
            cv=4,
            n_jobs=-1,
            scoring='neg_mean_squared_error',
            verbose=10
        )
    gs.fit(X_events, y_tr_events)
    print('Best params ', gs.best_params_)
    x_cfl = gs.best_estimator_

    predict_y = x_cfl.predict(X_events)

    # saving the model to disk for later usage; `with` closes the file handle
    with open(file_name, "wb") as model_file:
      pickle.dump(x_cfl, model_file)
    return predict_y
  else:
    # loading the model
    with open(file_name, "rb") as model_file:
      x_cfl = pickle.load(model_file)
    predict_y = x_cfl.predict(X_events)
    return predict_y

In [None]:
def train_age_predictor_with_events_1(devices_events, y_tr_events, training=False):
  """Linear-regression variant of the age predictor for devices with events.

  Features are scaled with MaxAbsScaler before a LinearRegression is fit.
  Both the model and the scaler are pickled under drive_path in training mode
  and loaded again otherwise.

  Args:
    devices_events: devices to featurise via get_features_with_events.
    y_tr_events: age targets for those devices.
    training: True to fit and save, False to load and predict.

  Returns:
    Predicted ages for the given devices.
  """
  import pickle
  from sklearn.linear_model import LinearRegression

  # NOTE: this path is also used by train_age_predictor_with_events and
  # train_age_predictor_with_events_classif, so each variant overwrites the
  # model saved by the others.
  file_name = f"{drive_path}/best_model_age_predictor_with_events.pkl"
  normaliser_file = f"{drive_path}/predict_age_normaliser_events_file.pkl"

  X_events = get_features_with_events(devices_events, y_tr_events, training=training, label='age')

  if training:
    scaler = MaxAbsScaler()
    X_events = scaler.fit_transform(X_events)

    lr = LinearRegression()
    lr.fit(X_events, y_tr_events)
    predict_y = lr.predict(X_events)

    # saving the model and scaler to disk for later usage
    with open(file_name, "wb") as model_file:
      pickle.dump(lr, model_file)
    with open(normaliser_file, "wb") as scaler_file:
      pickle.dump(scaler, scaler_file)

    return predict_y
  else:
    # loading the scaler and the model
    with open(normaliser_file, "rb") as scaler_file:
      scaler = pickle.load(scaler_file)
    with open(file_name, "rb") as model_file:
      lr = pickle.load(model_file)
    X_events = scaler.transform(X_events)
    predict_y = lr.predict(X_events)
    return predict_y

In [None]:
def train_age_predictor_with_events_classif(devices_events, y_tr_events, training=False):
  """Classification variant of the age predictor for devices with events.

  In training mode a multi-class XGBClassifier is tuned with a 4-fold
  GridSearchCV (scored by neg_log_loss), wrapped in a sigmoid
  CalibratedClassifierCV, fit on the same data and pickled. Otherwise the
  pickled model is loaded and applied.

  Args:
    devices_events: devices to featurise via get_features_with_events.
    y_tr_events: encoded age-class targets for those devices.
    training: True to fit and save, False to load and predict.

  Returns:
    Tuple (predicted classes, predicted probabilities) for the given devices.
  """
  import pickle
  # NOTE: this path is also used by train_age_predictor_with_events and
  # train_age_predictor_with_events_1, so each variant overwrites the model
  # saved by the others.
  file_name = f"{drive_path}/best_model_age_predictor_with_events.pkl"

  X_events = get_features_with_events(devices_events, y_tr_events, training=training, label='age')

  if training:
    gpu_dict = {
      'objective': 'multi:softproba',
      'tree_method': 'gpu_hist',
      "eval_metric": "mlogloss"
    }

    param_grid = {'max_depth': [2, 3, 4], 'n_estimators': [200, 300, 400, 500]}
    xgb = XGBClassifier(**gpu_dict, random_state=42)

    gs = GridSearchCV(
            estimator=xgb,
            param_grid=param_grid,
            cv=4,
            n_jobs=-1,
            scoring='neg_log_loss',
            verbose=10
        )
    gs.fit(X_events, y_tr_events)
    print('Best params ', gs.best_params_)
    x_cfl = gs.best_estimator_

    cali_cfl = CalibratedClassifierCV(x_cfl, method="sigmoid")
    cali_cfl.fit(X_events, y_tr_events)

    predict_y_proba = cali_cfl.predict_proba(X_events)
    predict_y = cali_cfl.predict(X_events)

    # saving the model to disk for later usage; `with` closes the file handle
    with open(file_name, "wb") as model_file:
      pickle.dump(cali_cfl, model_file)
    return predict_y, predict_y_proba
  else:
    # loading the (calibrated) model
    with open(file_name, "rb") as model_file:
      cali_cfl = pickle.load(model_file)
    predict_y = cali_cfl.predict(X_events)
    predict_y_proba = cali_cfl.predict_proba(X_events)
    return predict_y, predict_y_proba

## Predicting age on devices with no events

In [None]:
def train_age_predictor_without_events(devices_events, y_tr_events, training=False):
  """Train (or load) the XGBoost age regressor for devices without events.

  In training mode an XGBRegressor is tuned with a 4-fold GridSearchCV
  (scored by neg_mean_squared_error) and the best estimator is pickled to
  f"{drive_path}/best_model_age_predictor_without_events.pkl". Otherwise the
  pickled model is loaded and applied.

  Args:
    devices_events: devices to featurise via get_features_with_no_events.
    y_tr_events: age targets for those devices.
    training: True to fit and save, False to load and predict.

  Returns:
    Predicted ages for the given devices.
  """
  import pickle
  file_name = f"{drive_path}/best_model_age_predictor_without_events.pkl"

  X_events = get_features_with_no_events(devices_events, y_tr_events, training=training, label='age')

  if training:
    gpu_dict = {
      'objective': 'reg:squarederror',
      'tree_method': 'gpu_hist'
    }

    param_grid = {'max_depth': [2, 3, 4], 'n_estimators': [200, 300, 400, 500]}
    xgb = XGBRegressor(**gpu_dict, random_state=42)

    gs = GridSearchCV(
            estimator=xgb,
            param_grid=param_grid,
            cv=4,
            n_jobs=-1,
            scoring='neg_mean_squared_error',
            verbose=10
        )
    gs.fit(X_events, y_tr_events)
    print('Best params ', gs.best_params_)
    x_cfl = gs.best_estimator_

    predict_y = x_cfl.predict(X_events)

    # saving the model to disk for later usage; `with` closes the file handle
    with open(file_name, "wb") as model_file:
      pickle.dump(x_cfl, model_file)
    return predict_y
  else:
    # loading the model
    with open(file_name, "rb") as model_file:
      x_cfl = pickle.load(model_file)
    predict_y = x_cfl.predict(X_events)
    return predict_y

In [None]:
def train_age_predictor_without_events_1(devices_events, y_tr_events, training=False):
  """Linear-regression variant of the age predictor for devices without events.

  Features are scaled with MaxAbsScaler before a LinearRegression is fit.
  Both the model and the scaler are pickled under drive_path in training mode
  and loaded again otherwise.

  Args:
    devices_events: devices to featurise via get_features_with_no_events.
    y_tr_events: age targets for those devices.
    training: True to fit and save, False to load and predict.

  Returns:
    Predicted ages for the given devices.
  """
  import pickle
  from sklearn.linear_model import LinearRegression

  # NOTE: this path is also used by train_age_predictor_without_events and
  # train_age_predictor_without_events_classif, so each variant overwrites the
  # model saved by the others.
  file_name = f"{drive_path}/best_model_age_predictor_without_events.pkl"
  normaliser_file = f"{drive_path}/predict_age_normaliser_no_events_file.pkl"

  X_no_events = get_features_with_no_events(devices_events, y_tr_events, training=training, label='age')

  if training:
    scaler = MaxAbsScaler()
    X_no_events = scaler.fit_transform(X_no_events)

    lr = LinearRegression()
    lr.fit(X_no_events, y_tr_events)
    predict_y = lr.predict(X_no_events)

    # saving the model and scaler to disk for later usage
    with open(file_name, "wb") as model_file:
      pickle.dump(lr, model_file)
    with open(normaliser_file, "wb") as scaler_file:
      pickle.dump(scaler, scaler_file)

    return predict_y
  else:
    # loading the scaler and the model
    with open(normaliser_file, "rb") as scaler_file:
      scaler = pickle.load(scaler_file)
    with open(file_name, "rb") as model_file:
      lr = pickle.load(model_file)
    X_no_events = scaler.transform(X_no_events)
    predict_y = lr.predict(X_no_events)
    return predict_y

In [None]:
def train_age_predictor_without_events_classif(devices_events, y_tr_events, training=False):
  """Classification variant of the age predictor for devices without events.

  In training mode a multi-class XGBClassifier is tuned with a 4-fold
  GridSearchCV (scored by neg_log_loss), wrapped in a sigmoid
  CalibratedClassifierCV, fit on the same data and pickled. Otherwise the
  pickled model is loaded and applied.

  Args:
    devices_events: devices to featurise via get_features_with_no_events.
    y_tr_events: encoded age-class targets for those devices.
    training: True to fit and save, False to load and predict.

  Returns:
    Tuple (predicted classes, predicted probabilities) for the given devices.
  """
  import pickle

  # NOTE: this path is also used by train_age_predictor_without_events and
  # train_age_predictor_without_events_1, so each variant overwrites the model
  # saved by the others.
  file_name = f"{drive_path}/best_model_age_predictor_without_events.pkl"

  X_no_events = get_features_with_no_events(devices_events, y_tr_events, training=training, label='age')

  if training:
    gpu_dict = {
      'objective': 'multi:softproba',
      'tree_method': 'gpu_hist',
      "eval_metric": "mlogloss"
    }

    param_grid = {'max_depth': [2, 3, 4], 'n_estimators': [200, 300, 400, 500]}
    xgb = XGBClassifier(**gpu_dict, random_state=42)

    gs = GridSearchCV(
            estimator=xgb,
            param_grid=param_grid,
            cv=4,
            n_jobs=-1,
            scoring='neg_log_loss',
            verbose=10
        )
    gs.fit(X_no_events, y_tr_events)
    print('Best params ', gs.best_params_)
    x_cfl = gs.best_estimator_

    cali_cfl = CalibratedClassifierCV(x_cfl, method="sigmoid")
    cali_cfl.fit(X_no_events, y_tr_events)

    predict_y_proba = cali_cfl.predict_proba(X_no_events)
    predict_y = cali_cfl.predict(X_no_events)

    # saving the model to disk for later usage; `with` closes the file handle
    with open(file_name, "wb") as model_file:
      pickle.dump(cali_cfl, model_file)
    return predict_y, predict_y_proba
  else:
    # loading the (calibrated) model
    with open(file_name, "rb") as model_file:
      cali_cfl = pickle.load(model_file)
    predict_y = cali_cfl.predict(X_no_events)
    predict_y_proba = cali_cfl.predict_proba(X_no_events)
    return predict_y, predict_y_proba

## Predicting user group based on the gender and age predicted by the other models
<p>We used naive Bayes to predict the final user group.</p>

In [None]:
def train_user_group_events_nb(X_data, y_tr_events, training=False):
  """Train (or load) a calibrated MultinomialNB user-group classifier.

  In training mode ``alpha`` is tuned with a 5-fold GridSearchCV (scored by
  neg_log_loss), the best estimator is wrapped in a sigmoid
  CalibratedClassifierCV, fit on the same data and pickled to
  f"{drive_path}/best_model_group_predictor_with_events_nb.pkl". Otherwise
  the pickled model is loaded and applied.

  Args:
    X_data: ready-made feature matrix (no featurisation happens here).
    y_tr_events: encoded group labels; only used when training.
    training: True to fit and save, False to load and predict.

  Returns:
    Tuple (predicted classes, predicted probabilities) for ``X_data``.
  """
  import pickle
  file_name = f"{drive_path}/best_model_group_predictor_with_events_nb.pkl"

  if training:
    parameters = {'alpha': [0.00001, 0.0005, 0.0001, 0.005, 0.001, 0.05, 0.01, 0.1, 0.5, 1, 5, 10, 50, 100, 1000, 10000]}

    nb_clf = MultinomialNB()
    grid_search = GridSearchCV(nb_clf, parameters, cv=5, scoring='neg_log_loss', verbose=10, n_jobs=-1)
    grid_search.fit(X_data, y_tr_events)

    print('best params: ', grid_search.best_params_)

    nb_model = grid_search.best_estimator_

    sig_clf = CalibratedClassifierCV(nb_model, method="sigmoid")
    sig_clf.fit(X_data, y_tr_events)

    y_pred = sig_clf.predict(X_data)
    y_pred_proba = sig_clf.predict_proba(X_data)

    # saving the model to disk for later usage; `with` closes the file handle
    with open(file_name, "wb") as model_file:
      pickle.dump(sig_clf, model_file)
    return y_pred, y_pred_proba
  else:
    with open(file_name, "rb") as model_file:
      sig_clf = pickle.load(model_file)
    y_pred = sig_clf.predict(X_data)
    y_pred_proba = sig_clf.predict_proba(X_data)
    return y_pred, y_pred_proba

In [None]:
def train_user_group_events_xgb_with_events(
    devices_events,
    predicted_age_gender,
    y_tr_label,
    training=False
  ):
  """Train (or load) the XGBoost user-group classifier for devices with events.

  The features from get_features_with_events are horizontally stacked with
  ``predicted_age_gender``. In training mode an XGBClassifier is tuned with a
  4-fold GridSearchCV (scored by neg_log_loss), wrapped in a sigmoid
  CalibratedClassifierCV, fit on the same data and pickled to
  f"{drive_path}/best_model_group_predictor_with_events_xgb.pkl". Otherwise
  the pickled model is loaded and applied.

  Args:
    devices_events: devices to featurise.
    predicted_age_gender: extra feature columns appended to the device
      features (presumably outputs of the age/gender models - confirm with caller).
    y_tr_label: encoded group labels for those devices.
    training: True to fit and save, False to load and predict.

  Returns:
    Tuple (predicted classes, predicted probabilities) for the given devices.
  """
  import pickle
  file_name = f"{drive_path}/best_model_group_predictor_with_events_xgb.pkl"

  X_events = get_features_with_events(devices_events, y_tr_label, training=training, label='group')
  X_events = hstack([X_events, predicted_age_gender])
  if training:
    gpu_dict = {
      'objective': 'multi:softproba',
      'tree_method': 'gpu_hist',
      "eval_metric": "mlogloss"
    }

    param_grid = {'max_depth': [2, 3, 4], 'n_estimators': [200, 300, 400, 500]}
    xgb = XGBClassifier(**gpu_dict, random_state=42)

    gs = GridSearchCV(
            estimator=xgb,
            param_grid=param_grid,
            cv=4,
            n_jobs=-1,
            scoring='neg_log_loss',
            verbose=10
        )
    gs.fit(X_events, y_tr_label)
    print('Best params ', gs.best_params_)
    x_cfl = gs.best_estimator_

    cali_cfl = CalibratedClassifierCV(x_cfl, method="sigmoid")
    cali_cfl.fit(X_events, y_tr_label)

    predict_y_proba = cali_cfl.predict_proba(X_events)
    predict_y = cali_cfl.predict(X_events)

    # saving the model to disk for later usage; `with` closes the file handle
    with open(file_name, "wb") as model_file:
      pickle.dump(cali_cfl, model_file)
    return predict_y, predict_y_proba
  else:
    # loading the model
    print('testing part')
    with open(file_name, "rb") as model_file:
      cali_cfl = pickle.load(model_file)
    predict_y_proba = cali_cfl.predict_proba(X_events)
    predict_y = cali_cfl.predict(X_events)
    return predict_y, predict_y_proba

In [None]:
def train_user_group_events_xgb_without_events(
    devices_no_events,
    predicted_age_gender_no_events,
    y_tr_no_events_label,
    training=False
  ):
  """Train (or load) the XGBoost user-group classifier for devices without events.

  The features from get_features_with_no_events are horizontally stacked with
  ``predicted_age_gender_no_events``. In training mode an XGBClassifier is
  tuned with a 4-fold GridSearchCV (scored by neg_log_loss), wrapped in a
  sigmoid CalibratedClassifierCV, fit on the same data and pickled to
  f"{drive_path}/best_model_group_predictor_without_events_xgb.pkl".
  Otherwise the pickled model is loaded and applied.

  Args:
    devices_no_events: devices to featurise.
    predicted_age_gender_no_events: extra feature columns appended to the
      device features (presumably outputs of the age/gender models - confirm
      with caller).
    y_tr_no_events_label: encoded group labels for those devices.
    training: True to fit and save, False to load and predict.

  Returns:
    Tuple (predicted classes, predicted probabilities) for the given devices.
  """
  import pickle
  file_name = f"{drive_path}/best_model_group_predictor_without_events_xgb.pkl"

  X_events = get_features_with_no_events(devices_no_events, y_tr_no_events_label, training=training, label='group')
  X_events = hstack([X_events, predicted_age_gender_no_events])
  if training:
    gpu_dict = {
      'objective': 'multi:softproba',
      'tree_method': 'gpu_hist',
      "eval_metric": "mlogloss"
    }

    param_grid = {'max_depth': [2, 3, 4], 'n_estimators': [200, 300, 400, 500]}
    xgb = XGBClassifier(**gpu_dict, random_state=42)

    gs = GridSearchCV(
            estimator=xgb,
            param_grid=param_grid,
            cv=4,
            n_jobs=-1,
            scoring='neg_log_loss',
            verbose=10
        )
    gs.fit(X_events, y_tr_no_events_label)
    print('Best params ', gs.best_params_)
    x_cfl = gs.best_estimator_

    cali_cfl = CalibratedClassifierCV(x_cfl, method="sigmoid")
    cali_cfl.fit(X_events, y_tr_no_events_label)

    predict_y_proba = cali_cfl.predict_proba(X_events)
    predict_y = cali_cfl.predict(X_events)

    # saving the model to disk for later usage; `with` closes the file handle
    with open(file_name, "wb") as model_file:
      pickle.dump(cali_cfl, model_file)
    return predict_y, predict_y_proba
  else:
    # loading the model
    print('testing part')
    with open(file_name, "rb") as model_file:
      cali_cfl = pickle.load(model_file)
    predict_y_proba = cali_cfl.predict_proba(X_events)
    predict_y = cali_cfl.predict(X_events)
    return predict_y, predict_y_proba

## Predicting user groups on devices with events

In [None]:
def train_user_group_with_events(devices_events, gender_pred, y_tr_events, training=False):
  """Train (or load) the user-group classifier for devices with events.

  The features from get_features_with_events are horizontally stacked with
  ``gender_pred``. In training mode an XGBClassifier is tuned with a 4-fold
  GridSearchCV (scored by neg_log_loss), wrapped in a sigmoid
  CalibratedClassifierCV, fit on the same data and pickled to
  f"{drive_path}/best_model_group_with_events.pkl". Otherwise the pickled
  model is loaded and applied.

  Args:
    devices_events: devices to featurise.
    gender_pred: gender-model output appended as extra feature columns.
    y_tr_events: encoded group labels for those devices.
    training: True to fit and save, False to load and predict.

  Returns:
    Tuple (predicted classes, predicted probabilities) for the given devices.
  """
  import pickle
  file_name = f"{drive_path}/best_model_group_with_events.pkl"

  X_events = get_features_with_events(devices_events, y_tr_events, training=training, label='group')
  X_events = hstack([X_events, gender_pred])

  if training:
    gpu_dict = {
      'objective': 'multi:softproba',
      'tree_method': 'gpu_hist',
      "eval_metric": "mlogloss"
    }

    param_grid = {'max_depth': [2, 3, 4], 'n_estimators': [200, 300, 400, 500]}
    xgb = XGBClassifier(**gpu_dict, random_state=42)

    gs = GridSearchCV(
            estimator=xgb,
            param_grid=param_grid,
            cv=4,
            n_jobs=-1,
            scoring='neg_log_loss',
            verbose=10
        )
    gs.fit(X_events, y_tr_events)
    print('Best params ', gs.best_params_)
    x_cfl = gs.best_estimator_

    cali_cfl = CalibratedClassifierCV(x_cfl, method="sigmoid")
    cali_cfl.fit(X_events, y_tr_events)

    predict_y_proba = cali_cfl.predict_proba(X_events)
    predict_y = cali_cfl.predict(X_events)

    # saving the model to disk for later usage; `with` closes the file handle
    with open(file_name, "wb") as model_file:
      pickle.dump(cali_cfl, model_file)
    return predict_y, predict_y_proba
  else:
    # loading the model
    print('testing part')
    with open(file_name, "rb") as model_file:
      cali_cfl = pickle.load(model_file)
    predict_y_proba = cali_cfl.predict_proba(X_events)
    predict_y = cali_cfl.predict(X_events)
    return predict_y, predict_y_proba

## Predicting user group on devices with no events

In [None]:
def train_user_group_with_no_events(devices_events, gender_pred, y_tr_events, training=False):
  """Train (or load) the user-group classifier for devices without events.

  The features from get_features_with_no_events are horizontally stacked with
  ``gender_pred``. In training mode an XGBClassifier is tuned with a 4-fold
  GridSearchCV (scored by neg_log_loss), wrapped in a sigmoid
  CalibratedClassifierCV, fit on the same data and pickled to
  f"{drive_path}/best_model_group_with_no_events.pkl". Otherwise the pickled
  model is loaded and applied.

  Args:
    devices_events: devices (without events) to featurise.
    gender_pred: gender-model output appended as extra feature columns.
    y_tr_events: encoded group labels for those devices.
    training: True to fit and save, False to load and predict.

  Returns:
    Tuple (predicted classes, predicted probabilities) for the given devices.
  """
  import pickle
  file_name = f"{drive_path}/best_model_group_with_no_events.pkl"

  X_events = get_features_with_no_events(devices_events, y_tr_events, training=training, label='group')
  X_events = hstack([X_events, gender_pred])

  if training:
    gpu_dict = {
      'objective': 'multi:softproba',
      'tree_method': 'gpu_hist',
      "eval_metric": "mlogloss"
    }

    param_grid = {'max_depth': [2, 3, 4], 'n_estimators': [200, 300, 400, 500]}
    xgb = XGBClassifier(**gpu_dict, random_state=42)

    gs = GridSearchCV(
            estimator=xgb,
            param_grid=param_grid,
            cv=4,
            n_jobs=-1,
            scoring='neg_log_loss',
            verbose=10
        )
    gs.fit(X_events, y_tr_events)
    print('Best params ', gs.best_params_)
    x_cfl = gs.best_estimator_

    cali_cfl = CalibratedClassifierCV(x_cfl, method="sigmoid")
    cali_cfl.fit(X_events, y_tr_events)

    predict_y_proba = cali_cfl.predict_proba(X_events)
    predict_y = cali_cfl.predict(X_events)

    # saving the model to disk for later usage; `with` closes the file handle
    with open(file_name, "wb") as model_file:
      pickle.dump(cali_cfl, model_file)
    return predict_y, predict_y_proba
  else:
    # loading the model
    with open(file_name, "rb") as model_file:
      cali_cfl = pickle.load(model_file)
    predict_y_proba = cali_cfl.predict_proba(X_events)
    predict_y = cali_cfl.predict(X_events)
    return predict_y, predict_y_proba


## Method to train and cross validate both models on training data

In [None]:
def train_models(devices, all_events_df):
  """Train and evaluate the gender -> group pipeline on a 90/10 split.

  Devices are split into those with and without events (via has_events).
  For each subset a gender model is trained first; its predicted
  probabilities are then fed as extra features into the user-group model.
  Train/test log losses are printed for every model, followed by the overall
  test log loss over both subsets. Nothing is returned.

  Args:
    devices: frame with at least 'device_id', 'gender' and 'group' columns.
    all_events_df: events frame handed to has_events.
  """
  y = devices['group'].values
  is_events = has_events(devices, all_events_df)

  print('Predicting on devices with  events')
  devices_with_events = devices[is_events == 1]
  y_with_events = y[is_events == 1]

  tr_devices_events, te_devices_events, y_tr_events, y_te_events = train_test_split(
    devices_with_events, y_with_events, test_size=0.1, stratify=y_with_events
  )
  # Each training=True call re-fits and overwrites the pickled encoder for
  # that label; the matching training=False call right after reloads it.
  y_tr_label = get_labels(y_tr_events, training=True, label='group')
  y_te_label = get_labels(y_te_events, training=False, label='group')

  # predicting gender with devices on events data
  y_tr_events_gen = tr_devices_events['gender']
  y_te_events_gen = te_devices_events['gender']

  y_tr_gen_label = get_labels(y_tr_events_gen, training=True, label='gender')
  y_te_gen_label = get_labels(y_te_events_gen, training=False, label='gender')

  y_train_gen_pred, y_train_gen_pred_proba = train_gender_predictor_with_events(
      tr_devices_events[['device_id']], y_tr_gen_label, training=True)
  y_test_gen_pred, y_test_gen_pred_proba = train_gender_predictor_with_events(
      te_devices_events[['device_id']], y_te_gen_label, training=False)

  print ("The train log loss is:", log_loss(y_tr_gen_label, y_train_gen_pred_proba))
  print ("The test log loss is:", log_loss(y_te_gen_label, y_test_gen_pred_proba))

  # predicting group on devices with events
  y_train_pred, y_train_pred_proba = train_user_group_with_events(
      tr_devices_events[['device_id']], y_train_gen_pred_proba, y_tr_label, training=True)
  y_test_pred, y_test_pred_proba = train_user_group_with_events(
      te_devices_events[['device_id']], y_test_gen_pred_proba, y_te_label, training=False)

  print ("The train log loss is:", log_loss(y_tr_label, y_train_pred_proba))
  print ("The test log loss is:", log_loss(y_te_label, y_test_pred_proba))

  print('Predicting based on device without events')
  devices_no_events = devices[is_events == False]
  y_without_events = y[is_events == False]

  tr_devices_no_events, te_devices_no_events, y_tr_no_events, y_te_no_events = train_test_split(
    devices_no_events, y_without_events, test_size=0.1, stratify=y_without_events
  )
  y_tr_no_events_label = get_labels(y_tr_no_events, training=True, label='group')
  y_te_no_events_label = get_labels(y_te_no_events, training=False, label='group')

  # predicting gender on devices without events
  y_tr_no_events_gen_label = get_labels(tr_devices_no_events['gender'], training=True, label='gender')
  y_te_no_events_gen_label = get_labels(te_devices_no_events['gender'], training=False, label='gender')

  y_train_gen_pred_no_events, y_train_gen_pred_proba_no_events = train_gender_predictor_with_no_events(
      tr_devices_no_events[['device_id']], y_tr_no_events_gen_label, training=True)
  y_test_gen_pred_no_events, y_test_gen_pred_proba_no_events = train_gender_predictor_with_no_events(
      te_devices_no_events[['device_id']], y_te_no_events_gen_label, training=False
    )
  print ("The train log loss is:", log_loss(y_tr_no_events_gen_label, y_train_gen_pred_proba_no_events))
  print ("The test log loss is:", log_loss(y_te_no_events_gen_label, y_test_gen_pred_proba_no_events))

  # predicting group on devices without events
  y_train_pred_no_events, y_train_pred_no_events_proba = train_user_group_with_no_events(
      tr_devices_no_events[['device_id']], y_train_gen_pred_proba_no_events, y_tr_no_events_label, training=True)
  y_test_pred_no_events, y_test_pred_no_events_proba = train_user_group_with_no_events(
      te_devices_no_events[['device_id']], y_test_gen_pred_proba_no_events, y_te_no_events_label, training=False)

  print ("The train log loss is:", log_loss(y_tr_no_events_label, y_train_pred_no_events_proba))
  # This is the held-out loss; it was previously printed under the "train" label.
  print ("The test log loss is:", log_loss(y_te_no_events_label, y_test_pred_no_events_proba))

  # Overall test loss across both device subsets.
  y_true = np.concatenate([y_te_label, y_te_no_events_label])
  y_pred = np.concatenate([y_test_pred_proba, y_test_pred_no_events_proba])

  print ("The overall test log loss is:", log_loss(y_true, y_pred))

In [None]:
# if __name__ == '__main__':
#   devices = pd.read_csv('data/gender_age_train.csv')
#   all_events_df = pd.read_csv('data/events.csv')
#   train_models(devices, all_events_df)

## Using a separate regression model for predicting age

In [None]:
# devices = pd.read_csv('data/gender_age_train.csv')
# all_events_df = pd.read_csv('data/events.csv')

In [None]:
def get_age_groups(age_list, training=False):
  """Bucket raw ages into age-group labels and integer-encode the labels.

  Args:
    age_list: iterable of numeric ages.
    training: when True, fit a fresh LabelEncoder on the bucketed ages and
      pickle it to ``{drive_path}/age_group_encoder.pkl``; when False, load
      that previously saved encoder and reuse it.

  Returns:
    The encoded age groups, as produced by ``LabelEncoder.fit_transform``
    (training) or ``LabelEncoder.transform`` (otherwise).
  """
  import pickle
  from bisect import bisect_right
  file_name = f"{drive_path}/age_group_encoder.pkl"

  # Exclusive upper bound of every bucket except the open-ended last one.
  upper_bounds = [23, 27, 29, 32, 39, 44]
  group_names = ['-23', '23-26', '27-28', '29-31', '32-38', '39-43', '43+']

  def get_group(age):
    # Comparing against upper bounds only leaves no gaps between buckets:
    # the previous closed integer ranges sent fractional ages such as 26.5
    # to '43+'. Integer ages map exactly as before.
    return group_names[bisect_right(upper_bounds, age)]

  age_groups = [get_group(age) for age in age_list]

  if training:
    enc = LabelEncoder()
    y = enc.fit_transform(age_groups)
    # saving the encoder to disk for later usage; `with` closes the file
    with open(file_name, "wb") as enc_file:
      pickle.dump(enc, enc_file)
    return y

  # loading the encoder saved by an earlier training=True call
  # NOTE(review): pickle.load can execute arbitrary code; this is only safe
  # as long as the file is the one written by the branch above.
  with open(file_name, "rb") as enc_file:
    enc = pickle.load(enc_file)
  return enc.transform(age_groups)

<b>Trying three models: model 1 predicts gender, model 2 predicts age, and model 3 predicts the user group based on the predictions of model 1 and model 2.</b>

In [None]:
if __name__ == '__main__':
  # Three-stage experiment, run separately for devices with and without events:
  #   1. predict gender, 2. predict age group,
  #   3. predict the user group from the stacked gender/age probabilities.
  devices = pd.read_csv('data/gender_age_train.csv')
  all_events_df = pd.read_csv('data/events.csv')

  # Fixed seed so both train/test splits, and therefore the reported
  # log losses, are reproducible across runs.
  split_seed = 42

  y = devices['group'].values

  # is_events is compared against 1 / 0 below to split the devices.
  is_events = has_events(devices, all_events_df)

  print('Predicting on devices with events')
  devices_with_events = devices[is_events == 1]
  y_with_events = y[is_events == 1]

  tr_devices_events, te_devices_events, y_tr_events, y_te_events = train_test_split(
    devices_with_events, y_with_events, test_size=0.1, stratify=y_with_events,
    random_state=split_seed
  )
  y_tr_label = get_labels(y_tr_events, training=True, label='group')
  y_te_label = get_labels(y_te_events, training=False, label='group')

  # predicting gender on devices with events data
  y_tr_events_gen = tr_devices_events['gender']
  y_te_events_gen = te_devices_events['gender']

  y_tr_gen_label = get_labels(y_tr_events_gen, training=True, label='gender')
  y_te_gen_label = get_labels(y_te_events_gen, training=False, label='gender')

  y_train_gen_pred, y_train_gen_pred_proba = train_gender_predictor_with_events(
      tr_devices_events[['device_id']], y_tr_gen_label, training=True)
  y_test_gen_pred, y_test_gen_pred_proba = train_gender_predictor_with_events(
      te_devices_events[['device_id']], y_te_gen_label, training=False)

  print ("Gender: The train log loss is:", log_loss(y_tr_gen_label, y_train_gen_pred_proba))
  # This loss is computed on the held-out split, so it is labelled "test".
  print ("Gender: The test log loss is:", log_loss(y_te_gen_label, y_test_gen_pred_proba))


  ##########################################
  # predicting age group on devices with events data
  y_tr_events_age = get_age_groups(tr_devices_events['age'], training=True)
  y_te_events_age = get_age_groups(te_devices_events['age'], training=False)

  y_train_age_pred, y_train_age_pred_prob = train_age_predictor_with_events_classif(
      tr_devices_events[['device_id']], y_tr_events_age, training=True)
  y_test_age_pred, y_test_age_pred_prob  = train_age_predictor_with_events_classif(
      te_devices_events[['device_id']], y_te_events_age, training=False)

  print ("Age: The train logloss is:", log_loss(y_tr_events_age, y_train_age_pred_prob))
  print ("Age: The test logloss is:", log_loss(y_te_events_age, y_test_age_pred_prob))

  ####################################
  # devices with no events
  devices_no_events = devices[is_events == 0]
  y_no_events = y[is_events == 0]

  tr_devices_no_events, te_devices_no_events, y_tr_no_events, y_te_no_events = train_test_split(
    devices_no_events, y_no_events, test_size=0.1, stratify=y_no_events,
    random_state=split_seed
  )

  y_tr_no_events_label = get_labels(y_tr_no_events, training=True, label='group')
  y_te_no_events_label = get_labels(y_te_no_events, training=False, label='group')

  # predicting gender on devices without events
  y_tr_no_events_gen_label = get_labels(tr_devices_no_events['gender'], training=True, label='gender')
  y_te_no_events_gen_label = get_labels(te_devices_no_events['gender'], training=False, label='gender')

  y_train_gen_pred_no_events, y_train_gen_pred_proba_no_events = train_gender_predictor_with_no_events(
      tr_devices_no_events[['device_id']], y_tr_no_events_gen_label, training=True)
  y_test_gen_pred_no_events, y_test_gen_pred_proba_no_events = train_gender_predictor_with_no_events(
      te_devices_no_events[['device_id']], y_te_no_events_gen_label, training=False
    )
  print ("The train log loss is:", log_loss(y_tr_no_events_gen_label, y_train_gen_pred_proba_no_events))
  print ("The test log loss is:", log_loss(y_te_no_events_gen_label, y_test_gen_pred_proba_no_events))

  # predicting age group on devices without events
  y_tr_no_events_age = get_age_groups(tr_devices_no_events['age'], training=True)
  y_te_no_events_age = get_age_groups(te_devices_no_events['age'], training=False)

  y_train_no_event_age_pred, y_train_no_event_age_pred_proba = train_age_predictor_without_events_classif(tr_devices_no_events[['device_id']], y_tr_no_events_age, training=True)
  y_test_no_event_age_pred, y_test_no_event_age_pred_proba  = train_age_predictor_without_events_classif(te_devices_no_events[['device_id']], y_te_no_events_age, training=False)

  print ("The train logloss is:", log_loss(y_tr_no_events_age, y_train_no_event_age_pred_proba))
  print ("The test logloss is:", log_loss(y_te_no_events_age, y_test_no_event_age_pred_proba))



  # final model: the gender and age-group probabilities, stacked column-wise,
  # are the extra features passed to the user-group models below
  X_events = np.hstack([y_train_gen_pred_proba, y_train_age_pred_prob])
  X_no_events = np.hstack([y_train_gen_pred_proba_no_events, y_train_no_event_age_pred_proba])
  # x_train / x_test are only used here to report the combined shapes
  x_train = np.vstack([X_events, X_no_events])
  y_train_total = np.hstack([y_tr_label, y_tr_no_events_label])
  print('train data')
  print(x_train.shape)
  print(y_train_total.shape)

  X_events_test = np.hstack([y_test_gen_pred_proba, y_test_age_pred_prob])
  X_no_events_test = np.hstack([y_test_gen_pred_proba_no_events, y_test_no_event_age_pred_proba])
  x_test = np.vstack([X_events_test, X_no_events_test])
  y_test_total = np.hstack([y_te_label, y_te_no_events_label])
  print('test data')
  print(x_test.shape)
  print(y_test_total.shape)

  y_train_events_pred, y_train_pred_events_proba = train_user_group_events_xgb_with_events(
      tr_devices_events[['device_id']], X_events,
      y_tr_label, training=True)

  y_test_events_pred, y_test_pred_events_proba = train_user_group_events_xgb_with_events(
      te_devices_events[['device_id']], X_events_test,
      y_te_label, training=False)

  y_train_no_events_pred, y_train_pred_no_events_proba = train_user_group_events_xgb_without_events(
      tr_devices_no_events[['device_id']], X_no_events,
      y_tr_no_events_label, training=True)

  y_test_no_events_pred, y_test_pred_no_events_proba = train_user_group_events_xgb_without_events(
      te_devices_no_events[['device_id']], X_no_events_test,
      y_te_no_events_label, training=False)

  # Labels and predictions are concatenated in the same order
  # (devices with events first, then devices without events).
  y_train_total = np.concatenate([y_tr_label, y_tr_no_events_label])
  y_train_pred_proba = np.concatenate([y_train_pred_events_proba, y_train_pred_no_events_proba])

  y_test_total = np.concatenate([y_te_label, y_te_no_events_label])
  y_test_pred_proba = np.concatenate([y_test_pred_events_proba, y_test_pred_no_events_proba])

  print ("The overall train log loss is:", log_loss(y_train_total, y_train_pred_proba))
  print ("The overall test log loss is:", log_loss(y_test_total, y_test_pred_proba))

Predicting on devices with events
bag of apps
app events
proportion appp_id
[1, 30, 2, 3, 4, 5, 6, 7, 8]
(20978, 827)
(20978, 2000)
(20978, 131)
(20978, 1596)
(20978, 1)
(20978, 1)
(20978, 9)
(20978, 2)
(20978, 170)
(20978, 1)
(20978, 113212)
(20978, 1)
(20978, 2)
(20978, 4)
(20978, 1)
(20978, 1)
data shape before removing no variance data:  (20978, 4580)
training:  True
data shape (20978, 2833)
train dataset's shape
(20978, 2833)
(20978,)
Fitting 4 folds for each of 24 candidates, totalling 96 fits
Best params  {'max_depth': 4, 'n_estimators': 500, 'scale_pos_weight': 10}
bag of apps
app events
proportion appp_id
[1, 30, 2, 3, 4, 5, 6, 7, 8]
(2331, 827)
(2331, 2000)
(2331, 131)
(2331, 1596)
(2331, 1)
(2331, 1)
(2331, 9)
(2331, 2)
(2331, 170)
(2331, 1)
(2331, 113212)
(2331, 1)
(2331, 2)
(2331, 4)
(2331, 1)
(2331, 1)
data shape before removing no variance data:  (2331, 4580)
training:  False
data shape (2331, 2833)
testing
Gender: The train log loss is: 0.37036301588167103
Gender: The t

In [None]:
# np.concatenate([y_tr_label, y_tr_no_events_label]).shape

(67180,)

In [None]:
# np.concatenate([y_train_pred_events_proba, y_train_pred_no_events_proba]).shape

(67180, 12)

In [None]:
# df = pd.DataFrame({'a': [0,1,4, np.inf, -np.inf]})
# # df = df.replace(np.inf, 0)
# df.head()


# print('inf', np.isfinite(df.values).all())

# df.values  

In [None]:
# if __name__ == '__main__':
# devices = pd.read_csv('data/gender_age_train.csv')
# all_events_df = pd.read_csv('data/events.csv')
  # x_train, x_test = train_models_with_seperate_age_predictor(devices, all_events_df)

# Summary

*   Using a single model to predict the user group, with the same number of features, gave very poor results.
*   So we trained four different models, described below along with their corresponding test log loss.



In [None]:
if __name__ == '__main__':
  from tabulate import tabulate

  # Summary table: one row per model, split by whether the device has events data.
  # NOTE(review): the log-loss values are hard-coded, presumably copied from
  # earlier runs - confirm they are still current before relying on them.
  gender_target = "Gender"
  group_target = "User group (used predicted gender from above model as new feature)"
  headers = ["Is events data available?", "What we are predicting?", "Test log loss"]

  data = [
      ['Yes', gender_target, 0.49164734136922056],
      ["Yes", group_target, 2.133802776693025],
      ["No", gender_target, 0.6511259420027772],
      ["No", group_target, 2.420144409974926],
  ]

  summary_table = tabulate(data, headers=headers)
  print(summary_table)

Is events data available?    What we are predicting?                                               Test log loss
---------------------------  ------------------------------------------------------------------  ---------------
Yes                          Gender                                                                     0.491647
Yes                          User group (used predicted gender from above model as new feature)         2.1338
No                           Gender                                                                     0.651126
No                           User group (used predicted gender from above model as new feature)         2.42014
