In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
import datetime
import time
import swifter

# Data Loading

In [2]:
# Load the six raw event logs from ./Data, one DataFrame per event type.
# The files are ordinary comma-separated CSVs, so pandas' default delimiter
# is used. The reads are independent and listed in the order the notebook
# inspects them.
Registration = pd.read_csv('./Data/Registration.csv')
AppLaunched = pd.read_csv('./Data/AppLaunched.csv')
UTMVisited = pd.read_csv('./Data/UTMVisited.csv')
VideoStarted = pd.read_csv('./Data/VideoStarted.csv')
VideoDetails = pd.read_csv('./Data/VideoDetails.csv')
AppUninstalled = pd.read_csv('./Data/AppUninstalled.csv')

# About the datasets:
Every event contains the following fields:
1. UserId: Every app user is assigned a unique identity which is stored in this field. 
2. Date: The date on which this event was raised 
3. Minute_Of_Day: The minute of that day on which the event was raised 
4. Second: The second of the minute at which the event was raised. Date, Minute_Of_Day and Second together provide the exact timestamp of the event.
5. Country: The Id of the country the user was in when the event was raised. Note that 255 means an "Unknown" country.
6. State: The state Id of that country. For example, India will have state Ids in the range [1,29]. 
7. OS: The OS of the device from which event was raised. They are coded as : 
    0. : Others, 
    1. : Android,
    2. : iOS,
    3. : Windows,
    4. : Mac,
    5. : BlackBerry,
    6. : Linux 
8. Device: The type of device. They are coded as : 
    0. : Desktop,
    1. : Mobile,
    2. : Tablet,
    3. : TV
The events might also have some custom properties : 
1. VideoStarted and VideoDetails : 
    1. Genre : The genre of the video 
    2. ProgramType : Records the type - TVShow or Movie etc. 
    3. Category : Records the category - Video on demand("vod") or not 
    4. VideoId : The video name 
2. Registration : 
    1. Status: The status of the registration 


# Registration

In [5]:
# Preview the first rows of the registration events.
# NOTE(review): the preview shows both 'Complete' and 'Completed' in Status —
# presumably the same state; confirm and normalise before encoding.
Registration.head()

Unnamed: 0,UserId,Date,Minute_Of_Day,Second,State,Country,Device,OS,Status
0,d168e2b924124feaad13eae44a68fce9,20170106,0,8,35,1,1,2,Complete
1,deed7490952a6746610ee2ee9fd219b7,20170106,0,8,7,40,1,1,Completed
2,13d9f55e297d7d9c219fbe9085e1006c,20170106,0,28,35,1,1,2,Complete
3,fccc2c0abf5afabf242a70df7b4c9e69,20170106,0,52,39,1,1,1,Completed
4,9ff982b29d78cefc8eb5a3ffb7fd2a5d,20170106,0,52,38,1,1,1,Not Completed


In [6]:
# Size of the registration log as (rows, columns).
Registration.shape

(329579, 9)

# App Launched

In [3]:
# Preview the first rows of the app-launch events.
AppLaunched.head()

Unnamed: 0,UserId,Date,Minute_Of_Day,Second,State,Country,Device,OS
0,a27134c322d4f56dabc106a6847cb96b,20170106,0,4,50,1,1,2
1,aab39ad0874c59cc388525d511667f9d,20170106,0,8,40,1,1,2
2,f717dc3f757dc18cc0c3f27ed3e94ef8,20170106,0,12,35,1,1,2
3,04c1e14fd54a06c0d4fe98cf0cb8b04c,20170106,0,20,40,1,2,2
4,71f05822bb699a04b694dd30265fdf73,20170106,0,20,35,1,1,2


In [8]:
# Size of the app-launch log as (rows, columns).
AppLaunched.shape

(2716093, 8)

# UTMVisited

In [7]:
# Preview the first rows of the UTM-visit events.
UTMVisited.head()

Unnamed: 0,UserId,Date,Minute_Of_Day,Second,State,Country,Device,OS
0,66d4abf4f3b733c27591b3f47c85eff7,20170106,0,52,37,1,1,1
1,1a40d7e5b34131b097fac80cf982241b,20170106,0,56,40,1,1,1
2,b7b3c1ce773a13b52d39429e305ea615,20170106,1,16,35,1,1,1
3,3f486a89433a02b96ab13b16c82e39fd,20170106,1,32,35,1,1,1
4,9ff982b29d78cefc8eb5a3ffb7fd2a5d,20170106,1,32,38,1,1,1


In [9]:
# Size of the UTM-visit log as (rows, columns).
UTMVisited.shape

(495850, 8)

# VideoStarted

In [10]:
# Preview the first rows of the video-start events; besides the common event
# fields these carry Genre, Category, ProgramType and VideoId.
VideoStarted.head()

Unnamed: 0,UserId,Date,Minute_Of_Day,Second,State,Genre,Category,ProgramType,Country,Device,OS,VideoId
0,0280dfdd112732a3ac12b12dc770b7af,20170106,0,8,35,Romance,vod,Movies,1,1,2,36a27b379622f342ec87f9aafadb8f94
1,435d41ae019cb8db785483793859c9a8,20170106,0,28,35,Anime,vod,TV Shows,1,1,1,a4d9b88c7ed63d723c70b358a857719c
2,1faf0ce0b98e02e1568702f516f01a78,20170106,0,36,35,Drama,vod,TV Shows,1,1,1,806660cb47633263a24bbc53238a9a53
3,3ec691b9d2b5d53ef965fe59b1900b30,20170106,0,48,35,Drama,vod,TV Shows,1,2,2,385114825a85d6878e7a0978f9ba5546
4,ff230d487a7139b65f33b54a4cbd2d9e,20170106,0,56,35,Comedy,vod,Movies,1,1,1,368a480ec0ae105aee8320dd93483e39


In [11]:
# Size of the video-start log as (rows, columns).
VideoStarted.shape

(1796525, 12)

# VideoDetails

In [13]:
# Preview the first rows of the video-details events (same columns as VideoStarted).
# NOTE(review): the preview includes a row with Country 255 ("Unknown") and
# State 0 — presumably State 0 also means unknown; confirm before using State.
VideoDetails.head()

Unnamed: 0,UserId,Date,Minute_Of_Day,Second,State,Genre,Category,ProgramType,Country,Device,OS,VideoId
0,dc93d200565e5bf6e6fc55adb1c5ba06,20170106,0,4,35,Drama,vod,TV Shows,1,2,1,1f2dc23e0d2415353982cfa38cbaf983
1,12060efd8b68ea562b265d0129a8af1f,20170106,0,16,35,Horror,vod,Movies,1,1,1,6c95441f239cebf9930ac5932cc84a7f
2,9151ab2a901458f27e36548d4a8dc011,20170106,0,20,0,Drama,vod,TV Shows,255,1,1,1af000794c63fa2722e22af609ff572c
3,1faf0ce0b98e02e1568702f516f01a78,20170106,0,20,35,Drama,vod,TV Shows,1,1,1,f053c42e98d50e06532ce8d65afb3ab3
4,435d41ae019cb8db785483793859c9a8,20170106,0,20,35,Anime,vod,TV Shows,1,1,1,2a9d984659afc14f801548dd71ddad0e


In [12]:
# Size of the video-details log as (rows, columns).
VideoDetails.shape

(3695451, 12)

# AppUninstalled

In [14]:
# Preview the first rows of the app-uninstall events.
AppUninstalled.head()

Unnamed: 0,UserId,Date,Minute_Of_Day,Second,State,Country,Device,OS
0,0d5f4cc176a4e1c648a9b9b5dbe21d69,20170106,0,0,47,1,1,1
1,807018f66aba113be576eb8afa4ea4b6,20170106,0,0,6,1,1,1
2,763e66e04918d8b8a7e69a778f5a742a,20170106,1,0,41,1,1,1
3,4ee1ed456fa04fc42168a9787b54b549,20170106,1,0,37,1,1,1
4,253e758a5557f9de890e814d901f6906,20170106,6,0,46,40,1,1


In [15]:
# Size of the app-uninstall log as (rows, columns).
AppUninstalled.shape

(326684, 8)

# Observations:
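1. The datasets differ in shape, and not all of them share the same features: AppLaunched, UTMVisited and AppUninstalled have the same eight common columns, VideoStarted and VideoDetails add Genre, Category, ProgramType and VideoId, and Registration adds Status.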
2. The data is raw and we need to engineer features out of this raw data.
3. Combining the time-related features (Date, Minute_Of_Day and Second) into a single timestamp could reduce the dimensionality.
4. The target variable has to be engineered from features that will be derived from the raw data.
 