In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

In [None]:
# Phone-usage logs: train.csv feeds the analysis and the model, test.csv receives the predictions
train = pd.read_csv("../input/daily-phone-usage/train.csv")
test = pd.read_csv("../input/daily-phone-usage/test.csv")
# Google Play Store app metadata, merged with the usage log further down
playstore = pd.read_csv("../input/google-play-store-apps/googleplaystore.csv")

In [None]:
# Drop the rows whose app name is one of the known non-ASCII entries
non_ascii_values = ['WhatsApp\xa0Business','வரைபடம்']
has_non_ascii_name = train['App'].isin(non_ascii_values)
train = train[~has_non_ascii_name]

In [None]:
# Lower-cased names of the distinct apps in the usage log, used to match Play Store rows
app_list = [x.lower() for x in train['App'].value_counts().index.tolist()]

Analyzing only the apps that are also available in the Play Store dataset

In [None]:
# Keep only the Play Store rows whose lower-cased name also occurs in the usage
# log, one row per app name. Chaining the calls and taking an explicit copy
# yields an independent frame, so neither the de-duplication nor later column
# assignments operate on a slice of `playstore` (SettingWithCopyWarning).
in_usage_log = playstore['App'].str.lower().isin(app_list)
ava_in_dataset = (
    playstore[in_usage_log]
    .drop_duplicates(subset="App", keep='first')
    .copy()
)

In [None]:
# Discard every usage row that has a missing value in any column
train = train.dropna()

In [None]:
# Build a full timestamp from the separate date and time strings first,
# then convert the date column itself to datetime.
date_and_time = train['Date'] + " " + train['Time']
train['DateTime'] = pd.to_datetime(date_and_time, format='%m/%d/%Y %H:%M:%S')
train['Date'] = pd.to_datetime(train['Date'], format='%m/%d/%Y')

In [None]:
# Parse the test dates with the same month/day/year format used for train
test['Date']= pd.to_datetime(test['Date'],format='%m/%d/%Y')

In [None]:
# Order the usage log chronologically
train = train.sort_values(by='DateTime')

In [None]:
# Preview the first rows after cleaning and chronological sorting
train.head()

Time span (in days) covered by the data

In [None]:
# Difference between the latest and the earliest logged timestamp
train['DateTime'].max() - train['DateTime'].min()

Removing unwanted activities

In [None]:
# Entries that describe device/system events rather than real app usage
system_tracker = [
    'Screen on (unlocked)', 'Screen off (locked)', 'Screen on (locked)',
    'Screen off', 'Permission controller', 'System UI', 'Package installer',
    'Device shutdown', 'Call Management',
]
# Keep the system rows aside for inspection, then remove them from both sets
is_system_row = train['App'].isin(system_tracker)
service_app = train[is_system_row]
train = train[~is_system_row]
test = test[~test['App'].isin(system_tracker)]

In [None]:
# Number of log rows for each system/tracker entry that was removed
service_app['App'].value_counts()

Converting Duration into Seconds

In [None]:
# 'Duration' is an H:M:S string; split it and fold the three parts into seconds
duration_parts = train['Duration'].str.split(':')
train['TotalSeconds'] = duration_parts.apply(
    lambda hms: int(hms[0]) * 3600 + int(hms[1]) * 60 + int(hms[2])
)

In [None]:
# Total usage per app, top 20 by seconds. numeric_only=True restricts the sum
# to numeric columns; without it pandas >= 2.0 also tries to sum the datetime
# and string columns of `train` and raises a TypeError.
train.groupby('App').sum(numeric_only=True).nlargest(20,'TotalSeconds').reset_index()

In [None]:
# Bar chart of the 20 apps with the most total usage time.
plt.figure(figsize=(15,6))
# numeric_only=True keeps the aggregation off the datetime/string columns,
# which pandas >= 2.0 would otherwise try to sum (TypeError).
data = (
    train.groupby('App')
    .sum(numeric_only=True)
    .nlargest(20, 'TotalSeconds')
    .reset_index()
)
sns.barplot(x='App',y='TotalSeconds',data=data)
plt.title('Top 20 apps used')
plt.xticks(rotation=90)
plt.show()

Let's analyse the apps that are available in the Play Store dataset

In [None]:
# Lower-cased app name used as the join key between the usage log and the
# Play Store metadata.
train['appl'] = train['App'].str.lower()
# `assign` returns a new frame instead of writing into `ava_in_dataset`, which
# was derived from a slice of `playstore`; this avoids SettingWithCopyWarning.
ava_in_dataset = ava_in_dataset.assign(appl=ava_in_dataset['App'].str.lower())

In [None]:
# Inner-join the usage log with the Play Store metadata on the lower-cased
# name, then keep one representative row per app.
new = pd.merge(train, ava_in_dataset, how='inner', left_on='appl', right_on='appl')
new2 = new.drop_duplicates(subset=['appl'], keep='first')

In [None]:
# Per-app sums of the numeric columns. numeric_only=True is required on
# pandas >= 2.0, where a bare .sum() also attempts the datetime/string columns
# and fails.
app_totals = new.groupby(['appl']).sum(numeric_only=True).reset_index()
# Columns present in both frames get the default _x (single row) and
# _y (per-app sum) suffixes, e.g. TotalSeconds_y.
new2 = new2.merge(app_totals, left_on='appl', right_on='appl')

In [None]:
# One row per matched app, so the row count is the number of matched apps
matched_app_count = new2.shape[0]
print(f"No. of apps that are available in playstore dataset {matched_app_count}")

In [None]:
# Whole hours of usage per app: floor division of the summed seconds
# (TotalSeconds_y is the per-app sum brought in by the preceding merge)
new2['TotalHours'] = new2['TotalSeconds_y'] // 3600

In [None]:
# Hours of usage for every app that was matched in the Play Store data
fig, ax = plt.subplots(figsize=(15,6))
sns.barplot(data=new2, x='App_x', y='TotalHours', ax=ax)
ax.set_title('Total no. of hours each app was used')
plt.xticks(rotation=90)
plt.show()

In [None]:
# Number of matched apps per Play Store genre
fig, ax = plt.subplots(figsize=(15,6))
category = new2['Genres'].value_counts()
sns.barplot(x=category.index, y=category.values, ax=ax)
ax.set_title('Genres of the app')
plt.xticks(rotation=90)
plt.show()

In [None]:
# Same per-app hours as before, but ordered from most to least used so the
# dominant app is immediately visible. The previous title was a leftover from
# an unrelated notebook and did not describe this chart.
plt.figure(figsize=(15,6))
sns.barplot(x='App_x',y='TotalHours',data=new2.sort_values('TotalHours', ascending=False))
plt.title('Total hours per app, most used first')
plt.xticks(rotation=90)
plt.show()

The user spends most of their time on Instagram

In [None]:
# Count of matched apps per Play Store 'Type' value
app_type = new2['Type'].value_counts()
ax = sns.barplot(x=app_type.index, y=app_type.values)
ax.set_title('Type of apps')
plt.xticks(rotation=90)
plt.show()

User likes free apps

In [None]:
# Rating_x is the per-app Play Store rating (the _x side of the earlier merge).
# Missing ratings are dropped so they cannot distort either statistic.
ratings = new2['Rating_x'].dropna()
# The original took the mode of the single value `new2['Rating_x'][0]` because
# of a misplaced bracket; compute it over the whole column instead.
rating_modes = ratings.mode()
most_common_rating = rating_modes.iloc[0] if not rating_modes.empty else float('nan')
print("mean of ratings {}".format(round(ratings.mean(),2)))
print("mode of ratings {}".format(most_common_rating))
print("It seems User is interested in installing the apps which has rating greater than 4.2")

In [None]:
# Collapse the log to one row per (date, app) with summed usage.
# numeric_only=True keeps the sum to numeric columns; on pandas >= 2.0 a bare
# .sum() also tries the datetime and string columns of `train` and raises.
train = train.groupby(['Date','App']).sum(numeric_only=True).reset_index()
# Whole minutes of usage per (date, app)
train['TotalMinutes'] = train['TotalSeconds'] // 60

In [None]:
def dateFeatures(df):
    """Add calendar feature columns derived from ``df['Date']``, in place.

    Adds the columns day, week, dayofweek, month, quarter, year, dayofyear and
    weekofyear (in that order). ``week`` and ``weekofyear`` both hold the ISO
    week number.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime64 ``Date`` column; it is modified in place.

    Returns
    -------
    None
    """
    dates = df['Date'].dt
    features = ['day','week','dayofweek','month','quarter','year','dayofyear','weekofyear']
    for col in features:
        if col in ('week', 'weekofyear'):
            # Series.dt.week / .dt.weekofyear were removed in pandas 2.0;
            # isocalendar().week is the supported replacement.
            df[col] = _iso_week(dates)
        else:
            df[col] = getattr(dates, col)


def _iso_week(dates):
    """Return the ISO week number for a ``.dt`` accessor as a plain numeric Series."""
    if hasattr(dates, 'isocalendar'):
        week = dates.isocalendar().week
        # isocalendar() yields a nullable UInt32; convert to an ordinary dtype
        # (float only when missing dates force NaN).
        return week.astype('float64') if week.isna().any() else week.astype('int64')
    # Older pandas without isocalendar(): fall back to the legacy attribute.
    return dates.week

In [None]:
# Add the calendar feature columns to both frames (each is modified in place)
for frame in (train, test):
    dateFeatures(frame)

In [None]:
# Preview the aggregated frame with the new calendar feature columns
train.head()

In [None]:
# The six (month, app) pairs with the most total minutes.
plt.figure(figsize=(15,6))
# Select TotalMinutes before summing: a bare .sum() would also try to add up
# the datetime 'Date' column, which raises on pandas >= 2.0.
train.groupby(['month','App'])['TotalMinutes'].sum().nlargest(6).plot(kind='bar')
plt.title('User spends more minutes on the app for each month')
plt.xticks(rotation=90)
plt.show()

In [None]:
# Line plot of total usage minutes per calendar month
train.groupby(['month'])['TotalMinutes'].sum().plot()

The 5th month has less data

In [None]:
# Line plot of total usage minutes per calendar date
train.groupby(['Date'])['TotalMinutes'].sum().plot()

In [None]:
# Date with the smallest total of usage minutes
minutes_per_day = train.groupby(['Date'])['TotalMinutes'].sum()
print("The day when User spends least amount of time {}".format(minutes_per_day.nsmallest(1)))

In [None]:
# Date with the largest total of usage minutes
minutes_per_day = train.groupby(['Date'])['TotalMinutes'].sum()
print("The day when User spends highest amount of time {}".format(minutes_per_day.nlargest(1)))


In [None]:
# Another look at the first rows of the aggregated training frame
train.head()

In [None]:
# Line plot of total usage minutes per week number
train.groupby(['week'])['TotalMinutes'].sum().plot()

In [None]:
# Line plot of total usage minutes per day of the week (pandas numbers Monday as 0)
train.groupby(['dayofweek'])['TotalMinutes'].sum().plot()

In [None]:
# The calendar features replace the raw date, and TotalMinutes replaces the
# seconds column, so both are removed from the training frame.
# `test` is left untouched here; its 'Date' column is dropped after encoding.
train = train.drop(columns=['Date','TotalSeconds'])


In [None]:
# Number of rows in the test frame
test_len = test.shape[0]

In [None]:
# One-hot encode train and test together (train rows first, then test rows)
# so both end up with the same dummy columns
x_train = pd.get_dummies(pd.concat([train,test]),drop_first=True)

In [None]:
# Split the encoded frame back into its train and test parts. `train` still
# refers to the pre-encoding training frame at this point, so its length is the
# number of leading rows it contributed to the concatenation. Slicing by the
# test length instead truncates the training data and fills `test` with
# training rows.
n_train_rows = train.shape[0]
# .copy() makes both parts independent frames, so later column drops and
# assignments do not act on slices of `x_train`.
train = x_train.iloc[:n_train_rows].copy()
test = x_train.iloc[n_train_rows:].copy()

In [None]:
# 'Date' comes back through the concatenation with `test`, and `test` carries a
# 'TotalMinutes' column from the training side; neither is a model input.
# Rebinding to the result of drop() (instead of inplace=True) avoids mutating
# slices of `x_train`, which raises SettingWithCopyWarning.
train = train.drop(columns=['Date'])
test = test.drop(columns=['Date','TotalMinutes'])

In [None]:
# Separate the regression target from the feature columns
# (pop returns the column and removes it from `train` in one step)
y = train.pop('TotalMinutes')

In [None]:
# Independent copy of the training features, used for model fitting below
train_new_df = train.copy()

In [None]:

import lightgbm as lgb

In [None]:
# (rows, columns) of the training feature matrix
train_new_df.shape

In [None]:
from sklearn.model_selection import KFold, StratifiedKFold

# LightGBM hyper-parameters: gradient-boosted trees, regression objective,
# RMSE as the evaluation metric.
params = {
            'boosting_type': 'gbdt',
            'objective': 'regression',
            'metric': {'rmse'},
            'subsample': 0.25,
            'subsample_freq': 1,
            'learning_rate': 0.3,
            'num_leaves': 20,
            'feature_fraction': 0.9,
            'lambda_l1': 1,
            'lambda_l2': 1
            }

folds = 4
seed = 555

# TotalMinutes is a regression target, so plain KFold is the right splitter.
# StratifiedKFold treats every distinct target value as a class, which is
# meaningless here and is rejected or warned about by scikit-learn.
kf = KFold(n_splits=folds, shuffle=True, random_state=seed)

# `early_stopping_rounds` / `verbose_eval` were removed from lgb.train in
# LightGBM 4.0; the callback API works on older and newer releases alike.
# `log_evaluation` was called `print_evaluation` before LightGBM 3.3.
log_callback = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation

# One model per fold; their predictions are averaged afterwards.
models = []

for train_index, val_index in kf.split(train_new_df, y):
    train_X = train_new_df.iloc[train_index]
    val_X = train_new_df.iloc[val_index]
    train_y = y.iloc[train_index]
    val_y = y.iloc[val_index]
    lgb_train = lgb.Dataset(train_X, train_y)
    # reference= makes the validation set reuse the training set's binning
    lgb_eval = lgb.Dataset(val_X, val_y, reference=lgb_train)
    gbm = lgb.train(params,
                lgb_train,
                num_boost_round=1000,
                valid_sets=(lgb_train, lgb_eval),
                # fresh callbacks per fold so no early-stopping state is shared
                callbacks=[lgb.early_stopping(200), log_callback(100)])
    models.append(gbm)

In [None]:
# Predict from the feature columns only: once 'TotalMinutes' has been written
# to `test`, re-running this cell would otherwise pass the previous
# predictions to the models as an extra feature.
test_features = test.drop(columns=['TotalMinutes'], errors='ignore')
# One prediction array per fold model, averaged into the final estimate.
predictions = [model.predict(test_features) for model in models]
test['TotalMinutes'] = sum(predictions) / len(models)

In [None]:
# Write the test frame, including the predicted TotalMinutes, to test.csv
# (the index is written as well, since index=False is not passed)
test.to_csv("test.csv")