# <center>Upgrad Capstone</center>
#### <center>`Project facilitator: Prashant Bhide`&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;`Project partner: Anilkumar Narayanam`</center>

## Subtask 5 - Model Building - Data Segmentation

In [1]:
# Importing the required libraries

import warnings
import numpy as np
import pandas as pd

In [2]:
# Notebook-wide settings

# Suppress every warning for the remainder of the session
warnings.simplefilter('ignore')

# Show floats with two decimals and allow up to 100 characters per text cell
pd.options.display.float_format = '{:.2f}'.format
pd.set_option('display.max_colwidth', 100)

### Encoding the Data

In [3]:
%%time
# Load the master dataset written by the earlier stage.
# The two id columns are read as text and age as an 8-bit integer.
master_dtypes = {
    'device_id': str,
    'age': np.int8,
    'event_id': str,
}
df_master = pd.read_csv('df_master.csv', dtype=master_dtypes)
del master_dtypes

display(df_master)
df_master.info()

Unnamed: 0,device_id,gender,age,event_id,hour,dayofweek,avg_events,cluster_id,is_active,gaming,...,family,children,education,lifestyle,social,services,others,phone_brand,device_model,train_test_flag
0,-1000369272589010000,F,26,,,,0,,,,...,,,,,,,,vivo,Y17T,test
1,-1000572055892390000,F,27,,,,0,,,,...,,,,,,,,OPPO,R819T,train
2,-1000643208750510000,M,29,,,,0,,,,...,,,,,,,,Gionee,GN137,train
3,-1001337759327040000,M,30,2774404,9.00,6-Sat,109,0.00,21.00,0.00,...,1.00,0.00,1.00,1.00,1.00,1.00,1.00,OPPO,A31,train
4,-1001337759327040000,M,30,3065018,10.00,3-Wed,109,0.00,,,...,,,,,,,,OPPO,A31,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1266928,99976251796408100,M,43,839154,8.00,6-Sat,3,,13.00,0.00,...,1.00,0.00,1.00,1.00,1.00,1.00,1.00,others,Touch 2,test
1266929,99976251796408100,M,43,2642482,8.00,6-Sat,3,,11.00,0.00,...,1.00,0.00,1.00,1.00,1.00,1.00,1.00,others,Touch 2,test
1266930,99976251796408100,M,43,2642573,8.00,6-Sat,3,,13.00,0.00,...,1.00,0.00,1.00,1.00,1.00,1.00,1.00,others,Touch 2,test
1266931,999861742187156000,M,27,,,,0,,,,...,,,,,,,,Xiaomi,MI 2S,train


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1266933 entries, 0 to 1266932
Data columns (total 23 columns):
 #   Column           Non-Null Count    Dtype  
---  ------           --------------    -----  
 0   device_id        1266933 non-null  object 
 1   gender           1266933 non-null  object 
 2   age              1266933 non-null  int8   
 3   event_id         1215598 non-null  object 
 4   hour             1215598 non-null  float64
 5   dayofweek        1215598 non-null  object 
 6   avg_events       1266933 non-null  int64  
 7   cluster_id       829077 non-null   float64
 8   is_active        556378 non-null   float64
 9   gaming           556378 non-null   float64
 10  financial        556378 non-null   float64
 11  travel           556378 non-null   float64
 12  technology       556378 non-null   float64
 13  family           556378 non-null   float64
 14  children         556378 non-null   float64
 15  education        556378 non-null   float64
 16  lifestyle        5

___We will use the following encoding for the gender and age class predictions:<br>
Gender : 'F'=0 and 'M'=1<br>
Age Group : [0-24]=0, [25-32]=1, [33+]=2___

In [4]:
%%time
# Encode the target class columns and the train/test flag as categoricals

# Gender: 'F' -> 0, 'M' -> 1
df_master['gender'] = df_master['gender'].map({'F': 0, 'M': 1}).astype('category')

# Age group: [0-24] -> 0, [25-32] -> 1, 33 and above -> 2.
# include_lowest=True keeps age 0 inside the first group, and the open-ended
# last edge keeps ages above 100 inside the last group; with the plain
# (0, 24], (24, 32], (32, 100] bins such rows would silently become NaN.
df_master['age'] = pd.cut(
    df_master['age'],
    bins=[0, 24, 32, np.inf],
    labels=[0, 1, 2],
    include_lowest=True,
)

df_master['train_test_flag'] = df_master['train_test_flag'].astype('category')

df_master.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1266933 entries, 0 to 1266932
Data columns (total 23 columns):
 #   Column           Non-Null Count    Dtype   
---  ------           --------------    -----   
 0   device_id        1266933 non-null  object  
 1   gender           1266933 non-null  category
 2   age              1266933 non-null  category
 3   event_id         1215598 non-null  object  
 4   hour             1215598 non-null  float64 
 5   dayofweek        1215598 non-null  object  
 6   avg_events       1266933 non-null  int64   
 7   cluster_id       829077 non-null   float64 
 8   is_active        556378 non-null   float64 
 9   gaming           556378 non-null   float64 
 10  financial        556378 non-null   float64 
 11  travel           556378 non-null   float64 
 12  technology       556378 non-null   float64 
 13  family           556378 non-null   float64 
 14  children         556378 non-null   float64 
 15  education        556378 non-null   float64 
 16  

In [5]:
%%time
# One-hot encode the categorical variables shared by Scenario1 & Scenario2

common_categorical_cols = ['phone_brand', 'device_model']
brand_model_dummies = pd.get_dummies(
    df_master[common_categorical_cols].astype('category'),
    drop_first=False,
)

# Replace the raw columns in the master dataframe with their indicator columns
df_master = pd.concat(
    [df_master.drop(columns=common_categorical_cols), brand_model_dummies],
    axis=1,
)
display(df_master)
df_master.info()
del common_categorical_cols, brand_model_dummies

Unnamed: 0,device_id,gender,age,event_id,hour,dayofweek,avg_events,cluster_id,is_active,gaming,...,device_model_é­…è“2,device_model_é­…è“metal,device_model_éº¦èŠ’3,device_model_éº¦èŠ’3S,device_model_éº¦èŠ’4,device_model_é»„é‡‘æ–—å£«A8,device_model_é»„é‡‘æ–—å£«Note8,device_model_é”‹å°š,device_model_é”‹å°šPro,device_model_é‡‘é’¢
0,-1000369272589010000,0,1,,,,0,,,,...,0,0,0,0,0,0,0,0,0,0
1,-1000572055892390000,0,1,,,,0,,,,...,0,0,0,0,0,0,0,0,0,0
2,-1000643208750510000,1,1,,,,0,,,,...,0,0,0,0,0,0,0,0,0,0
3,-1001337759327040000,1,1,2774404,9.00,6-Sat,109,0.00,21.00,0.00,...,0,0,0,0,0,0,0,0,0,0
4,-1001337759327040000,1,1,3065018,10.00,3-Wed,109,0.00,,,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1266928,99976251796408100,1,2,839154,8.00,6-Sat,3,,13.00,0.00,...,0,0,0,0,0,0,0,0,0,0
1266929,99976251796408100,1,2,2642482,8.00,6-Sat,3,,11.00,0.00,...,0,0,0,0,0,0,0,0,0,0
1266930,99976251796408100,1,2,2642573,8.00,6-Sat,3,,13.00,0.00,...,0,0,0,0,0,0,0,0,0,0
1266931,999861742187156000,1,1,,,,0,,,,...,0,0,0,0,0,0,0,0,0,0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1266933 entries, 0 to 1266932
Columns: 543 entries, device_id to device_model_é‡‘é’¢
dtypes: category(3), float64(14), int64(1), object(3), uint8(522)
memory usage: 808.3+ MB
Wall time: 10.1 s


### Segmenting the Data

#### Scenario 1 - All the data present (i.e. latitude-longitude data, application id data, event data and devices data)

In [6]:
%%time
# Separate the Scenario 1 data from the master dataset: only the rows that
# have no missing value in any column are kept.
# The rows are filtered first and the (much smaller) result is copied
# afterwards; copying the whole master dataframe before dropping rows would
# duplicate it in memory only to throw most of it away.
df_scn1 = df_master.dropna().copy()
display(df_scn1)
df_scn1.info()

Unnamed: 0,device_id,gender,age,event_id,hour,dayofweek,avg_events,cluster_id,is_active,gaming,...,device_model_é­…è“2,device_model_é­…è“metal,device_model_éº¦èŠ’3,device_model_éº¦èŠ’3S,device_model_éº¦èŠ’4,device_model_é»„é‡‘æ–—å£«A8,device_model_é»„é‡‘æ–—å£«Note8,device_model_é”‹å°š,device_model_é”‹å°šPro,device_model_é‡‘é’¢
3,-1001337759327040000,1,1,2774404,9.00,6-Sat,109,0.00,21.00,0.00,...,0,0,0,0,0,0,0,0,0,0
6,-1001337759327040000,1,1,2906128,10.00,6-Sat,109,0.00,48.00,0.00,...,0,0,0,0,0,0,0,0,0,0
7,-1001337759327040000,1,1,2876843,10.00,6-Sat,109,0.00,22.00,0.00,...,0,0,0,0,0,0,0,0,0,0
10,-1001337759327040000,1,1,3141167,9.00,6-Sat,109,0.00,40.00,0.00,...,0,0,0,0,0,0,0,0,0,0
12,-1001337759327040000,1,1,2647428,9.00,6-Sat,109,0.00,40.00,0.00,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1266912,999208698621622000,1,1,2892952,22.00,6-Sat,37,39.00,5.00,0.00,...,0,0,0,0,0,0,0,0,0,0
1266915,999208698621622000,1,1,2892302,22.00,3-Wed,37,39.00,16.00,0.00,...,0,0,0,0,0,0,0,0,0,0
1266919,999208698621622000,1,1,27998,23.00,1-Mon,37,39.00,8.00,0.00,...,0,0,0,0,0,0,0,0,0,0
1266920,999208698621622000,1,1,603552,12.00,2-Tue,37,39.00,8.00,0.00,...,0,0,0,0,0,0,0,0,0,0


<class 'pandas.core.frame.DataFrame'>
Int64Index: 193199 entries, 3 to 1266923
Columns: 543 entries, device_id to device_model_é‡‘é’¢
dtypes: category(3), float64(14), int64(1), object(3), uint8(522)
memory usage: 124.7+ MB
Wall time: 3.23 s


In [7]:
%%time
# One-hot encode the categorical variables that only Scenario1 uses

scn1_categorical_cols = ['hour', 'dayofweek', 'cluster_id']
scn1_dummies = pd.get_dummies(
    df_scn1[scn1_categorical_cols].astype('category'),
    drop_first=False,
)

# Replace the raw columns in the Scenario1 dataframe with their indicator columns
df_scn1 = pd.concat(
    [df_scn1.drop(columns=scn1_categorical_cols), scn1_dummies],
    axis=1,
)
display(df_scn1)
df_scn1.info()
del scn1_categorical_cols, scn1_dummies

Unnamed: 0,device_id,gender,age,event_id,avg_events,is_active,gaming,financial,travel,technology,...,cluster_id_116.0,cluster_id_117.0,cluster_id_118.0,cluster_id_119.0,cluster_id_120.0,cluster_id_121.0,cluster_id_122.0,cluster_id_123.0,cluster_id_124.0,cluster_id_125.0
3,-1001337759327040000,1,1,2774404,109,21.00,0.00,1.00,1.00,1.00,...,0,0,0,0,0,0,0,0,0,0
6,-1001337759327040000,1,1,2906128,109,48.00,0.00,1.00,1.00,1.00,...,0,0,0,0,0,0,0,0,0,0
7,-1001337759327040000,1,1,2876843,109,22.00,0.00,1.00,1.00,1.00,...,0,0,0,0,0,0,0,0,0,0
10,-1001337759327040000,1,1,3141167,109,40.00,0.00,1.00,1.00,1.00,...,0,0,0,0,0,0,0,0,0,0
12,-1001337759327040000,1,1,2647428,109,40.00,0.00,1.00,1.00,1.00,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1266912,999208698621622000,1,1,2892952,37,5.00,0.00,1.00,1.00,1.00,...,0,0,0,0,0,0,0,0,0,0
1266915,999208698621622000,1,1,2892302,37,16.00,0.00,1.00,1.00,1.00,...,0,0,0,0,0,0,0,0,0,0
1266919,999208698621622000,1,1,27998,37,8.00,0.00,1.00,1.00,1.00,...,0,0,0,0,0,0,0,0,0,0
1266920,999208698621622000,1,1,603552,37,8.00,0.00,1.00,0.00,0.00,...,0,0,0,0,0,0,0,0,0,0


<class 'pandas.core.frame.DataFrame'>
Int64Index: 193199 entries, 3 to 1266923
Columns: 698 entries, device_id to cluster_id_125.0
dtypes: category(3), float64(12), int64(1), object(2), uint8(680)
memory usage: 149.4+ MB
Wall time: 798 ms


In [8]:
# Decide how each column is aggregated when grouping by the 'device_id' column

first_value_prefixes = ('phone_brand', 'device_model')
summed_prefixes = ('hour', 'dayofweek', 'cluster_id')

# Columns for which the first value of each group is kept
cols_to_keep_first = ['gender', 'age', 'avg_events', 'train_test_flag']
cols_to_keep_first += [
    name for name in df_scn1.columns if name.startswith(first_value_prefixes)
]

# Columns that are summed within each group
cols_to_sum_up = ['is_active', 'gaming', 'financial', 'travel', 'technology', 'family',
                  'children', 'education', 'lifestyle', 'social', 'services', 'others']
cols_to_sum_up += [
    name for name in df_scn1.columns if name.startswith(summed_prefixes)
]

print('cols_to_keep_first =', cols_to_keep_first)
print('cols_to_sum_up =', cols_to_sum_up)
del first_value_prefixes, summed_prefixes

cols_to_keep_first = ['gender', 'age', 'avg_events', 'train_test_flag', 'phone_brand_Coolpad', 'phone_brand_Gionee', 'phone_brand_HTC', 'phone_brand_Huawei', 'phone_brand_LG', 'phone_brand_Meizu', 'phone_brand_OPPO', 'phone_brand_Sony', 'phone_brand_TCL', 'phone_brand_Xiaomi', 'phone_brand_ZTE', 'phone_brand_ccmc', 'phone_brand_dowe', 'phone_brand_hisense', 'phone_brand_lenovo', 'phone_brand_lshi', 'phone_brand_nubia', 'phone_brand_others', 'phone_brand_samsung', 'phone_brand_vivo', 'phone_brand_youmi', 'device_model_1105', 'device_model_1107', 'device_model_2', 'device_model_2016GalAxyA7', 'device_model_2016GalAxyA9', 'device_model_3', 'device_model_3007', 'device_model_45rggt2', 'device_model_5263', 'device_model_5890', 'device_model_5891', 'device_model_5891Q', 'device_model_5892', 'device_model_5950', 'device_model_5951', 'device_model_6607', 'device_model_7269', 'device_model_7270', 'device_model_7295', 'device_model_7295+', 'device_model_7295A123', 'device_model_7295C', 'device_m

In [9]:
%%time
# Build the aggregation dictionary: column name -> aggregation to apply

# Keep-first columns take the first value of each group
agg_dict = dict.fromkeys(cols_to_keep_first, 'first')

# Sum-up columns are totalled per group. The string alias 'sum' is used instead
# of the np.sum callable: recent pandas versions deprecate passing NumPy
# callables to groupby aggregations, and the alias selects the same groupby sum.
agg_dict.update(dict.fromkeys(cols_to_sum_up, 'sum'))

print(agg_dict)
del cols_to_keep_first, cols_to_sum_up

{'gender': 'first', 'age': 'first', 'avg_events': 'first', 'train_test_flag': 'first', 'phone_brand_Coolpad': 'first', 'phone_brand_Gionee': 'first', 'phone_brand_HTC': 'first', 'phone_brand_Huawei': 'first', 'phone_brand_LG': 'first', 'phone_brand_Meizu': 'first', 'phone_brand_OPPO': 'first', 'phone_brand_Sony': 'first', 'phone_brand_TCL': 'first', 'phone_brand_Xiaomi': 'first', 'phone_brand_ZTE': 'first', 'phone_brand_ccmc': 'first', 'phone_brand_dowe': 'first', 'phone_brand_hisense': 'first', 'phone_brand_lenovo': 'first', 'phone_brand_lshi': 'first', 'phone_brand_nubia': 'first', 'phone_brand_others': 'first', 'phone_brand_samsung': 'first', 'phone_brand_vivo': 'first', 'phone_brand_youmi': 'first', 'device_model_1105': 'first', 'device_model_1107': 'first', 'device_model_2': 'first', 'device_model_2016GalAxyA7': 'first', 'device_model_2016GalAxyA9': 'first', 'device_model_3': 'first', 'device_model_3007': 'first', 'device_model_45rggt2': 'first', 'device_model_5263': 'first', 'dev

In [10]:
%%time
# Collapse the rows into one row per 'device_id' by applying the prepared
# keep_first / sum_up aggregations to every group.
# The 'event_id' column is removed before grouping.
rows_by_device = df_scn1.drop(columns=['event_id']).groupby('device_id')
df_scn1 = rows_by_device.agg(agg_dict).reset_index()
del rows_by_device

display(df_scn1)
df_scn1.info()
del agg_dict

Unnamed: 0,device_id,gender,age,avg_events,train_test_flag,phone_brand_Coolpad,phone_brand_Gionee,phone_brand_HTC,phone_brand_Huawei,phone_brand_LG,...,cluster_id_116.0,cluster_id_117.0,cluster_id_118.0,cluster_id_119.0,cluster_id_120.0,cluster_id_121.0,cluster_id_122.0,cluster_id_123.0,cluster_id_124.0,cluster_id_125.0
0,-1001337759327040000,1,1,109,train,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,-1002733576670970000,1,2,55,train,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,-1005411102947240000,1,2,44,train,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,-1006357788560220000,1,2,51,train,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,-1010331399860270000,1,1,4,train,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11203,990879148135067000,0,1,4,train,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11204,992109739203134000,0,2,251,train,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11205,998208026013018000,0,1,71,train,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11206,998402647311351000,1,0,4,train,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11208 entries, 0 to 11207
Columns: 697 entries, device_id to cluster_id_125.0
dtypes: float64(36), int64(3), object(2), uint8(656)
memory usage: 10.5+ MB
Wall time: 10.3 s


In [11]:
%%time
# Write the Scenario1 dataset to local storage (header row, no index column)
# so that subsequent tasks can load it, then release the dataframe.

df_scn1.to_csv(path_or_buf='df_scn1.csv', index=False, header=True)
del df_scn1

Wall time: 2.42 s


#### Scenario 2 - Only mobile phone, brand and device data available

In [12]:
%%time
# Separate the Scenario 2 data from the master dataset: the rows without an event id

no_event_rows = df_master[df_master['event_id'].isna()].copy()

# Keep only the columns with no missing value in these rows, leave out
# 'avg_events' and renumber the rows from zero
df_scn2 = (
    no_event_rows
    .dropna(axis=1)
    .drop(columns=['avg_events'])
    .reset_index(drop=True)
)
del no_event_rows

display(df_scn2)
df_scn2.info()

Unnamed: 0,device_id,gender,age,train_test_flag,phone_brand_Coolpad,phone_brand_Gionee,phone_brand_HTC,phone_brand_Huawei,phone_brand_LG,phone_brand_Meizu,...,device_model_é­…è“2,device_model_é­…è“metal,device_model_éº¦èŠ’3,device_model_éº¦èŠ’3S,device_model_éº¦èŠ’4,device_model_é»„é‡‘æ–—å£«A8,device_model_é»„é‡‘æ–—å£«Note8,device_model_é”‹å°š,device_model_é”‹å°šPro,device_model_é‡‘é’¢
0,-1000369272589010000,0,1,test,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,-1000572055892390000,0,1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,-1000643208750510000,1,1,train,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,-1001949518704260000,1,0,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,-1002079624347530000,0,2,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51330,998997036709813000,0,1,train,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
51331,999356919477646000,1,1,test,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
51332,999529955917823000,0,0,train,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
51333,999861742187156000,1,1,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51335 entries, 0 to 51334
Columns: 526 entries, device_id to device_model_é‡‘é’¢
dtypes: category(3), object(1), uint8(522)
memory usage: 26.1+ MB
Wall time: 465 ms


In [13]:
%%time
# Write the Scenario2 dataset to local storage (header row, no index column)
# so that subsequent tasks can load it, then release both dataframes.

df_scn2.to_csv(path_or_buf='df_scn2.csv', index=False, header=True)
del df_master, df_scn2

Wall time: 4.44 s


___With this we have reached the end of the Scenario1 & Scenario2 Data Segmentation stage. Model Building for Scenario1 & Scenario2 will be taken up in the subsequent stages.___