In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
import datetime
import time
import swifter

# Data Loading

In [2]:
# Read the four event CSV files from ./data_modded, one DataFrame per file.
# The comma delimiter is pandas' default, so it is not spelled out.
AppLaunchedMod = pd.read_csv('./data_modded/AppLaunchedMod.csv')
UTMVisitedMod = pd.read_csv('./data_modded/UTMVisitedMod.csv')
RegistrationMod = pd.read_csv('./data_modded/RegistrationMod.csv')
AppUninstalledMod = pd.read_csv('./data_modded/AppUninstalledMod.csv')

# Check the shape of every dataset before combining them.

In [4]:
# Report (rows, columns) for each freshly loaded frame.
for label, frame in (
    ("AppLaunchedMod:", AppLaunchedMod),
    ("UTMVisitedMod:", UTMVisitedMod),
    ("RegistrationMod:", RegistrationMod),
    ("AppUninstalledMod:", AppUninstalledMod),
):
    print(label, frame.shape)

AppLaunchedMod: (2716093, 6)
UTMVisitedMod: (495850, 6)
RegistrationMod: (329579, 7)
AppUninstalledMod: (326684, 6)


# Feature Engineering App Launch

In [5]:
# Tag every row with its event type and attach, per row, the number of
# launch rows recorded for that row's user.
# NOTE(review): the UserId_x / UserId_y column names seen in the preview rely
# on value_counts() returning a Series named 'UserId'; pandas 2.0 names that
# Series 'count' instead -- confirm the pandas version before re-running.
user_frequency = AppLaunchedMod['UserId'].value_counts()
AppLaunchedMod = pd.merge(
    AppLaunchedMod.assign(Event="App Launched"),
    user_frequency,
    left_on='UserId',
    right_index=True,
)
AppLaunchedMod.head()

Unnamed: 0,UserId,UserId_x,State,Country,Device,OS,DateTime,Event,UserId_y
0,a27134c322d4f56dabc106a6847cb96b,a27134c322d4f56dabc106a6847cb96b,50,1,1,2,2017-01-06 00:00:04,App Launched,342
13,a27134c322d4f56dabc106a6847cb96b,a27134c322d4f56dabc106a6847cb96b,50,1,1,2,2017-01-06 00:01:04,App Launched,342
25,a27134c322d4f56dabc106a6847cb96b,a27134c322d4f56dabc106a6847cb96b,50,1,1,2,2017-01-06 00:02:08,App Launched,342
41,a27134c322d4f56dabc106a6847cb96b,a27134c322d4f56dabc106a6847cb96b,50,1,1,2,2017-01-06 00:03:12,App Launched,342
52,a27134c322d4f56dabc106a6847cb96b,a27134c322d4f56dabc106a6847cb96b,50,1,1,2,2017-01-06 00:04:12,App Launched,342


In [6]:
# Remove the duplicated key column produced by the merge and give the
# per-user count column a descriptive name.
AppLaunchedMod = (
    AppLaunchedMod
    .drop(columns=['UserId_x'])
    .rename(columns={'UserId_y': 'EventFrequency'})
)

In [7]:
# Preview the first rows to check the resulting column set.
AppLaunchedMod.head()

Unnamed: 0,UserId,State,Country,Device,OS,DateTime,Event,EventFrequency
0,a27134c322d4f56dabc106a6847cb96b,50,1,1,2,2017-01-06 00:00:04,App Launched,342
13,a27134c322d4f56dabc106a6847cb96b,50,1,1,2,2017-01-06 00:01:04,App Launched,342
25,a27134c322d4f56dabc106a6847cb96b,50,1,1,2,2017-01-06 00:02:08,App Launched,342
41,a27134c322d4f56dabc106a6847cb96b,50,1,1,2,2017-01-06 00:03:12,App Launched,342
52,a27134c322d4f56dabc106a6847cb96b,50,1,1,2,2017-01-06 00:04:12,App Launched,342


In [8]:
# Count missing values in each column of the launch events.
AppLaunchedMod.isna().sum()

UserId            0
State             0
Country           0
Device            0
OS                0
DateTime          0
Event             0
EventFrequency    0
dtype: int64

# Feature Engineering Registration

In [9]:
# Tag every row with its event type and attach, per row, the number of
# registration rows recorded for that row's user.
# NOTE(review): the UserId_x / UserId_y column names seen in the preview rely
# on value_counts() returning a Series named 'UserId'; pandas 2.0 names that
# Series 'count' instead -- confirm the pandas version before re-running.
user_frequency = RegistrationMod['UserId'].value_counts()
RegistrationMod = pd.merge(
    RegistrationMod.assign(Event="Registration"),
    user_frequency,
    left_on='UserId',
    right_index=True,
)
RegistrationMod.head()

Unnamed: 0,UserId,UserId_x,State,Country,Device,OS,Status,DateTime,Event,UserId_y
0,d168e2b924124feaad13eae44a68fce9,d168e2b924124feaad13eae44a68fce9,35,1,1,2,Complete,2017-01-06 00:00:08,Registration,1
1,deed7490952a6746610ee2ee9fd219b7,deed7490952a6746610ee2ee9fd219b7,7,40,1,1,Completed,2017-01-06 00:00:08,Registration,1
2,13d9f55e297d7d9c219fbe9085e1006c,13d9f55e297d7d9c219fbe9085e1006c,35,1,1,2,Complete,2017-01-06 00:00:28,Registration,1
3,fccc2c0abf5afabf242a70df7b4c9e69,fccc2c0abf5afabf242a70df7b4c9e69,39,1,1,1,Completed,2017-01-06 00:00:52,Registration,1
4,9ff982b29d78cefc8eb5a3ffb7fd2a5d,9ff982b29d78cefc8eb5a3ffb7fd2a5d,38,1,1,1,Not Completed,2017-01-06 00:00:52,Registration,2


In [10]:
# Remove the duplicated key column produced by the merge and give the
# per-user count column a descriptive name.
RegistrationMod = (
    RegistrationMod
    .drop(columns=['UserId_x'])
    .rename(columns={'UserId_y': 'EventFrequency'})
)

In [11]:
# Preview the first rows to check the resulting column set.
RegistrationMod.head()

Unnamed: 0,UserId,State,Country,Device,OS,Status,DateTime,Event,EventFrequency
0,d168e2b924124feaad13eae44a68fce9,35,1,1,2,Complete,2017-01-06 00:00:08,Registration,1
1,deed7490952a6746610ee2ee9fd219b7,7,40,1,1,Completed,2017-01-06 00:00:08,Registration,1
2,13d9f55e297d7d9c219fbe9085e1006c,35,1,1,2,Complete,2017-01-06 00:00:28,Registration,1
3,fccc2c0abf5afabf242a70df7b4c9e69,39,1,1,1,Completed,2017-01-06 00:00:52,Registration,1
4,9ff982b29d78cefc8eb5a3ffb7fd2a5d,38,1,1,1,Not Completed,2017-01-06 00:00:52,Registration,2


In [12]:
# Status exists only in the registration data; removing it leaves the same
# eight columns that the other event frames have.
RegistrationMod = RegistrationMod.drop(columns=['Status'])
RegistrationMod.head()

Unnamed: 0,UserId,State,Country,Device,OS,DateTime,Event,EventFrequency
0,d168e2b924124feaad13eae44a68fce9,35,1,1,2,2017-01-06 00:00:08,Registration,1
1,deed7490952a6746610ee2ee9fd219b7,7,40,1,1,2017-01-06 00:00:08,Registration,1
2,13d9f55e297d7d9c219fbe9085e1006c,35,1,1,2,2017-01-06 00:00:28,Registration,1
3,fccc2c0abf5afabf242a70df7b4c9e69,39,1,1,1,2017-01-06 00:00:52,Registration,1
4,9ff982b29d78cefc8eb5a3ffb7fd2a5d,38,1,1,1,2017-01-06 00:00:52,Registration,2


# Feature Engineering UTM Visited

In [13]:
# Tag every row with its event type and attach, per row, the number of
# UTM-visit rows recorded for that row's user.
# NOTE(review): the UserId_x / UserId_y column names seen in the preview rely
# on value_counts() returning a Series named 'UserId'; pandas 2.0 names that
# Series 'count' instead -- confirm the pandas version before re-running.
user_frequency = UTMVisitedMod['UserId'].value_counts()
UTMVisitedMod = pd.merge(
    UTMVisitedMod.assign(Event="UTM Visited"),
    user_frequency,
    left_on='UserId',
    right_index=True,
)
UTMVisitedMod.head()

Unnamed: 0,UserId,UserId_x,State,Country,Device,OS,DateTime,Event,UserId_y
0,66d4abf4f3b733c27591b3f47c85eff7,66d4abf4f3b733c27591b3f47c85eff7,37,1,1,1,2017-01-06 00:00:52,UTM Visited,2
25017,66d4abf4f3b733c27591b3f47c85eff7,66d4abf4f3b733c27591b3f47c85eff7,37,1,1,1,2017-01-15 17:06:16,UTM Visited,2
1,1a40d7e5b34131b097fac80cf982241b,1a40d7e5b34131b097fac80cf982241b,40,1,1,1,2017-01-06 00:00:56,UTM Visited,10
26766,1a40d7e5b34131b097fac80cf982241b,1a40d7e5b34131b097fac80cf982241b,40,1,1,1,2017-01-16 10:13:56,UTM Visited,10
27206,1a40d7e5b34131b097fac80cf982241b,1a40d7e5b34131b097fac80cf982241b,40,1,1,1,2017-01-16 12:33:28,UTM Visited,10


In [14]:
# Remove the duplicated key column produced by the merge and give the
# per-user count column a descriptive name.
UTMVisitedMod = (
    UTMVisitedMod
    .drop(columns=['UserId_x'])
    .rename(columns={'UserId_y': 'EventFrequency'})
)

In [15]:
# Preview the first rows to check the resulting column set.
UTMVisitedMod.head()

Unnamed: 0,UserId,State,Country,Device,OS,DateTime,Event,EventFrequency
0,66d4abf4f3b733c27591b3f47c85eff7,37,1,1,1,2017-01-06 00:00:52,UTM Visited,2
25017,66d4abf4f3b733c27591b3f47c85eff7,37,1,1,1,2017-01-15 17:06:16,UTM Visited,2
1,1a40d7e5b34131b097fac80cf982241b,40,1,1,1,2017-01-06 00:00:56,UTM Visited,10
26766,1a40d7e5b34131b097fac80cf982241b,40,1,1,1,2017-01-16 10:13:56,UTM Visited,10
27206,1a40d7e5b34131b097fac80cf982241b,40,1,1,1,2017-01-16 12:33:28,UTM Visited,10


In [16]:
# Count missing values in each column of the UTM-visit events.
UTMVisitedMod.isna().sum()

UserId            0
State             0
Country           0
Device            0
OS                0
DateTime          0
Event             0
EventFrequency    0
dtype: int64

# Feature Engineering App Uninstall

In [17]:
# Tag every row with its event type and attach, per row, the number of
# uninstall rows recorded for that row's user.
# NOTE(review): the UserId_x / UserId_y column names seen in the preview rely
# on value_counts() returning a Series named 'UserId'; pandas 2.0 names that
# Series 'count' instead -- confirm the pandas version before re-running.
user_frequency = AppUninstalledMod['UserId'].value_counts()
AppUninstalledMod = pd.merge(
    AppUninstalledMod.assign(Event="App Uninstalled"),
    user_frequency,
    left_on='UserId',
    right_index=True,
)
AppUninstalledMod.head()

Unnamed: 0,UserId,UserId_x,State,Country,Device,OS,DateTime,Event,UserId_y
0,0d5f4cc176a4e1c648a9b9b5dbe21d69,0d5f4cc176a4e1c648a9b9b5dbe21d69,47,1,1,1,2017-01-06 00:00:00,App Uninstalled,5
549,0d5f4cc176a4e1c648a9b9b5dbe21d69,0d5f4cc176a4e1c648a9b9b5dbe21d69,47,1,1,1,2017-01-06 02:58:00,App Uninstalled,5
199441,0d5f4cc176a4e1c648a9b9b5dbe21d69,0d5f4cc176a4e1c648a9b9b5dbe21d69,47,1,1,1,2017-04-22 02:04:00,App Uninstalled,5
262026,0d5f4cc176a4e1c648a9b9b5dbe21d69,0d5f4cc176a4e1c648a9b9b5dbe21d69,47,1,1,1,2017-05-17 02:34:00,App Uninstalled,5
291982,0d5f4cc176a4e1c648a9b9b5dbe21d69,0d5f4cc176a4e1c648a9b9b5dbe21d69,47,1,1,1,2017-05-30 02:28:00,App Uninstalled,5


In [18]:
# Remove the duplicated key column produced by the merge and give the
# per-user count column a descriptive name.
AppUninstalledMod = (
    AppUninstalledMod
    .drop(columns=['UserId_x'])
    .rename(columns={'UserId_y': 'EventFrequency'})
)

In [19]:
# Preview the first rows to check the resulting column set.
AppUninstalledMod.head()

Unnamed: 0,UserId,State,Country,Device,OS,DateTime,Event,EventFrequency
0,0d5f4cc176a4e1c648a9b9b5dbe21d69,47,1,1,1,2017-01-06 00:00:00,App Uninstalled,5
549,0d5f4cc176a4e1c648a9b9b5dbe21d69,47,1,1,1,2017-01-06 02:58:00,App Uninstalled,5
199441,0d5f4cc176a4e1c648a9b9b5dbe21d69,47,1,1,1,2017-04-22 02:04:00,App Uninstalled,5
262026,0d5f4cc176a4e1c648a9b9b5dbe21d69,47,1,1,1,2017-05-17 02:34:00,App Uninstalled,5
291982,0d5f4cc176a4e1c648a9b9b5dbe21d69,47,1,1,1,2017-05-30 02:28:00,App Uninstalled,5


In [20]:
# Report (rows, columns) for each frame after feature engineering, so the
# column counts can be compared before the frames are stacked.
for label, frame in (
    ("AppLaunchedMod:", AppLaunchedMod),
    ("UTMVisitedMod:", UTMVisitedMod),
    ("RegistrationMod:", RegistrationMod),
    ("AppUninstalledMod:", AppUninstalledMod),
):
    print(label, frame.shape)

AppLaunchedMod: (2716093, 8)
UTMVisitedMod: (495850, 8)
RegistrationMod: (329579, 8)
AppUninstalledMod: (326684, 8)


# Merging the above Events

In [21]:
# Start from an empty frame; the per-event frames are stacked onto it one at a
# time in the following cells.
final_data_1 = pd.DataFrame()

In [22]:
# Stack the registration events onto the combined frame.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and
# removed in pandas 2.0; for two frames it produces the same row-wise stack.
final_data_1 = pd.concat([final_data_1, RegistrationMod])
final_data_1.shape

(329579, 8)

In [23]:
# Stack the app-launch events onto the combined frame.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and
# removed in pandas 2.0; for two frames it produces the same row-wise stack.
final_data_1 = pd.concat([final_data_1, AppLaunchedMod])
final_data_1.shape

(3045672, 8)

In [24]:
# Stack the UTM-visit events onto the combined frame.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and
# removed in pandas 2.0; for two frames it produces the same row-wise stack.
final_data_1 = pd.concat([final_data_1, UTMVisitedMod])
final_data_1.shape

(3541522, 8)

In [25]:
# Stack the uninstall events onto the combined frame.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and
# removed in pandas 2.0; for two frames it produces the same row-wise stack.
final_data_1 = pd.concat([final_data_1, AppUninstalledMod])
# Fixed typo: the attribute is `shape` (`shapeb` raises AttributeError).
final_data_1.shape

(3868206, 8)

In [26]:
# Preview the first rows of the combined event frame.
final_data_1.head()

Unnamed: 0,UserId,State,Country,Device,OS,DateTime,Event,EventFrequency
0,d168e2b924124feaad13eae44a68fce9,35,1,1,2,2017-01-06 00:00:08,Registration,1
1,deed7490952a6746610ee2ee9fd219b7,7,40,1,1,2017-01-06 00:00:08,Registration,1
2,13d9f55e297d7d9c219fbe9085e1006c,35,1,1,2,2017-01-06 00:00:28,Registration,1
3,fccc2c0abf5afabf242a70df7b4c9e69,39,1,1,1,2017-01-06 00:00:52,Registration,1
4,9ff982b29d78cefc8eb5a3ffb7fd2a5d,38,1,1,1,2017-01-06 00:00:52,Registration,2


In [27]:
# Order the combined events by user, then by timestamp within each user.
# NOTE(review): DateTime was loaded without parse_dates, so this presumably
# compares the timestamp text; that matches chronological order for
# zero-padded "YYYY-MM-DD HH:MM:SS" values like those in the previews -- confirm.
final_data_1 = final_data_1.sort_values(by=['UserId', 'DateTime'])

In [28]:
# Preview the first rows after sorting by UserId and DateTime.
final_data_1.head()

Unnamed: 0,UserId,State,Country,Device,OS,DateTime,Event,EventFrequency
68491,0000146e97c32d369268e5ba5f4b907c,36,1,1,1,2017-02-21 10:36:36,Registration,1
658199,0000146e97c32d369268e5ba5f4b907c,36,1,1,1,2017-02-21 10:36:36,App Launched,14
658809,0000146e97c32d369268e5ba5f4b907c,36,1,1,1,2017-02-21 12:16:20,App Launched,14
661623,0000146e97c32d369268e5ba5f4b907c,36,1,1,1,2017-02-21 18:16:28,App Launched,14
145813,0000146e97c32d369268e5ba5f4b907c,36,1,1,1,2017-02-21 18:16:28,UTM Visited,4


In [29]:
# Write the combined, sorted events to a comma-separated file (the default
# delimiter). The index labels are left out of the output.
final_data_1.to_csv('final_data_1.csv', index=False)