## Instructions 
- Run **Data_prepare_for_lmer_R.ipynb** to create `train_processed_lmer.zip` and `valid_processed_lmer.zip` for this notebook
- Install required packages from below if necessary
- This notebook runs in one of the two cases below, depending on which grouping columns the data contains:
    - `Categorical variables` within the data:
        - State, Income_class, Density_class, Climate
    - `Kmeans categories` created:
        - 2 classes, 4 classes

In [228]:
# install.packages('plm')
# install.packages('nlme')
# install.packages('tidyverse')
# install.packages('lme4') # required: provides lmer(), which this notebook is built around
# install.packages('repr')
# install.packages('Metrics')
# install.packages('purrr')

In [1]:
suppressPackageStartupMessages(library(plm))
suppressPackageStartupMessages(library(nlme))
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(lme4))
suppressPackageStartupMessages(library(repr))
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(Metrics))
suppressPackageStartupMessages(library(purrr))


"package 'plm' was built under R version 3.6.2"
"package 'Metrics' was built under R version 3.6.3"


### Run the cell below only if `train_processed_lmer.zip` and `valid_processed_lmer.zip` exist (they are created by the `Data_prepare_for_lmer_R.ipynb` notebook)

In [2]:
# Load the pre-processed training and validation sets produced by
# Data_prepare_for_lmer_R.ipynb (paths are relative to this notebook).
data_train <- read_csv('../data/train_processed_lmer.zip')
data_valid <- read_csv('../data/valid_processed_lmer.zip')

Parsed with column specification:
cols(
  .default = col_double(),
  state = col_character()
)

See spec(...) for full column specifications.

Parsed with column specification:
cols(
  .default = col_double(),
  state = col_character()
)

See spec(...) for full column specifications.



In [3]:
# Preview the first rows of the training data to check the loaded columns.
head(data_train)

month,year,monthly_number_of_sessions,monthly_unique_sessions,monthly_repeated_sessions,monthly_avg_length_of_session,monthly_avg_light_activity,monthly_avg_moderate_activity,monthly_avg_vigorous_activity,monthly_count_ramp,...,HI,LI,MI,HD,LD,MD,A,C,D,unacast_session_count
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,...,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
9,2019,17,2,15,662117.6,109941.2,223000,0,0,...,0,1,0,0,1,0,0,1,0,82
5,2018,0,0,0,0.0,0.0,0,0,0,...,0,1,0,0,0,1,0,1,0,69
5,2019,0,0,0,0.0,0.0,0,0,0,...,0,0,1,0,0,1,0,1,0,333
4,2019,0,0,0,0.0,0.0,0,0,0,...,0,0,1,1,0,0,0,0,1,159
11,2018,0,0,0,0.0,0.0,0,0,0,...,1,0,0,1,0,0,0,1,0,151
6,2018,0,0,0,0.0,0.0,0,0,0,...,1,0,0,1,0,0,1,0,0,94


## Modelling LMER with State as random effect

In [186]:
fit_lmer <- function(data, class = 'state'){
    # Fit a random-intercept linear mixed model for `unacast_session_count`.
    #
    # Args:
    #   data:  data frame holding the response, `month`, `year`, the grouping
    #          columns (`state`, `income_class`, `climate`, `density_class`)
    #          and the numeric predictors.
    #   class: name of the column used as the grouping factor of the random
    #          intercept, i.e. the model contains the term `(1 | <class>)`.
    #
    # Returns:
    #   The fitted model object returned by `lmer()`.
    #
    # Fixed effects are `month`, `year` (unscaled) and every remaining numeric
    # column after standardisation (mean 0, sd 1 computed on `data` itself).
    response <- 'unacast_session_count'

    # removing non-compatible cols
    data <- select(data, -monthly_rain, -monthly_avg_rain_length, -B14002e1, -not_enrolled)

    # standardising the remaining numeric predictors
    data_train_numeric <- select(data, -income_class, -climate, -density_class, -state, -month, -year, -unacast_session_count)
    scaled_data <- scale(data_train_numeric, center = TRUE, scale = TRUE)

    # A constant column has sd 0, so scaling yields NaN for every cell of it.
    # Base indexing replaces cell by cell; tidyr::replace_na() is avoided here
    # because its handling of matrices differs between tidyr versions.
    scaled_data[is.na(scaled_data)] <- 0

    # combining required cols
    data_train <- cbind(data[c('month', 'year', class, response)], scaled_data)

    # Every column except the grouping factor and the response is a fixed effect.
    predictors <- setdiff(colnames(data_train), c(class, response))
    model_formula <- as.formula(
        paste0(response, " ~ ", paste(predictors, collapse = " + "), " + (1 | ", class, ")")
    )

    lmer(model_formula, data = data_train)
}

In [187]:
# Fit the random-intercept model with the default grouping column ('state').
model_fit_intercept <- fit_lmer(data_train)

fixed-effect model matrix is rank deficient so dropping 106 columns / coefficients



In [214]:

get_error_lmer <- function(model, data, class = 'state', train_data = NULL){
    # Print the RMSE and MAE of `model` on `data`.
    #
    # Args:
    #   model:      fitted model, as returned by `fit_lmer()`.
    #   data:       data frame to score; must contain `unacast_session_count`
    #               and the same columns that were passed to `fit_lmer()`.
    #   class:      grouping column the model was fitted with.
    #   train_data: optional data frame the model was fitted on. When given,
    #               `data` is standardised with the training mean/sd, i.e. with
    #               the same transformation the model coefficients refer to.
    #               When NULL (default) `data` is standardised with its own
    #               mean/sd, which is the historical behaviour.
    #
    # Returns:
    #   The printed message string, invisibly.
    stopifnot('unacast_session_count' %in% names(data))
    y <- as.numeric(data[['unacast_session_count']])

    # Same column filtering as used when fitting: drop the non-compatible
    # columns, then everything that is not a numeric predictor to be scaled.
    numeric_predictors <- function(df){
        df <- select(df, -monthly_rain, -monthly_avg_rain_length, -B14002e1, -not_enrolled)
        select(df, -income_class, -climate, -density_class, -state, -month, -year, -unacast_session_count)
    }
    data_numeric <- numeric_predictors(data)

    if (is.null(train_data)){
        scaled_data <- scale(data_numeric, center = TRUE, scale = TRUE)
    } else {
        # Re-derive the centre/scale that scale() produced on the training set
        # and apply them, matched by column name, to the data being scored.
        train_scaled <- scale(numeric_predictors(train_data), center = TRUE, scale = TRUE)
        centers <- attr(train_scaled, 'scaled:center')
        sds <- attr(train_scaled, 'scaled:scale')
        cols <- colnames(data_numeric)
        stopifnot(all(cols %in% names(centers)))
        scaled_data <- scale(data_numeric, center = centers[cols], scale = sds[cols])
    }

    # A zero sd gives NaN (0/0) or +/-Inf (non-zero/0); such columns carry no
    # information for the model, so every non-finite cell is set to 0.
    scaled_data[!is.finite(scaled_data)] <- 0
    data_test <- cbind(data[c('month', 'year', class)], scaled_data)

    # predicting for new data; unseen grouping levels fall back to the
    # population-level (fixed-effect) prediction
    y_pred <- predict(model, newdata = data_test, allow.new.levels = TRUE)

    rmse_val <- rmse(y, y_pred)
    mae_val <- mae(y, y_pred)

    msg <- paste("RMSE is = ", rmse_val, " and MAE is =",mae_val)
    print(msg)
    invisible(msg)
}

In [215]:
### Validation score: RMSE / MAE of the state model on the validation set
get_error_lmer(model_fit_intercept, data_valid)

[1] "RMSE is =  199.760860939626  and MAE is = 107.942378024457"


In [216]:
### Train score: RMSE / MAE of the state model on the data it was fitted on
get_error_lmer(model_fit_intercept, data_train)

[1] "RMSE is =  176.310058729571  and MAE is = 92.8561876374478"


## Modelling LMER with Density Class as random effect

In [217]:
# Refit with `density_class` as the random-intercept grouping column
# (overwrites the previous model in `model_fit_intercept`).
model_fit_intercept <- fit_lmer(data_train, class = 'density_class')

fixed-effect model matrix is rank deficient so dropping 106 columns / coefficients



In [218]:
### Validation score: RMSE / MAE of the density_class model on the validation set
get_error_lmer(model_fit_intercept, data_valid, class = 'density_class')

[1] "RMSE is =  199.701580321795  and MAE is = 108.631652494108"


In [219]:
### Train score: RMSE / MAE of the density_class model on the data it was fitted on
get_error_lmer(model_fit_intercept, data_train, class = 'density_class')

[1] "RMSE is =  183.515241368643  and MAE is = 96.8058353455616"


## Modelling LMER with Income Class as random effect

In [220]:
# Refit with `income_class` as the random-intercept grouping column
# (overwrites the previous model in `model_fit_intercept`).
model_fit_intercept <- fit_lmer(data_train, class = 'income_class')

fixed-effect model matrix is rank deficient so dropping 106 columns / coefficients



In [222]:
### Validation score: RMSE / MAE of the income_class model on the validation set
get_error_lmer(model_fit_intercept, data_valid, class = 'income_class')

[1] "RMSE is =  199.756094008783  and MAE is = 108.698888327188"


In [224]:
### Train score: RMSE / MAE of the income_class model on the data it was fitted on
get_error_lmer(model_fit_intercept, data_train, class = 'income_class')

[1] "RMSE is =  183.207196978883  and MAE is = 96.7288325160248"


## Modelling LMER with Climate as random effect

In [225]:
# Refit with `climate` as the random-intercept grouping column
# (overwrites the previous model in `model_fit_intercept`).
model_fit_intercept <- fit_lmer(data_train, class = 'climate')

fixed-effect model matrix is rank deficient so dropping 106 columns / coefficients



In [226]:
### Validation score: RMSE / MAE of the climate model on the validation set
get_error_lmer(model_fit_intercept, data_valid, class = 'climate')

[1] "RMSE is =  200.43671720903  and MAE is = 109.572947782972"


In [227]:
### Train score: RMSE / MAE of the climate model on the data it was fitted on
get_error_lmer(model_fit_intercept, data_train, class = 'climate')

[1] "RMSE is =  183.444116283311  and MAE is = 96.7522254940579"


## Modelling with Kmeans classes (requires data files that contain the `Kmeans_2_label` and `Kmeans_4_label` columns; the cells below are commented out until those files exist)

In [13]:
# fit_lmer <- function(data, class = 'Kmeans_4_label'){
    
#     # removing non-compatible cols
#     data <- select(data,  -monthly_rain, -monthly_avg_rain_length, -B14002e1, -not_enrolled)
    
#     # scaling the other numeric cols
#     data_train_numeric <- select(data, -Kmeans_2_label, -Kmeans_4_label, -month, -year, -unacast_session_count)
#     scaled_data <- scale(data_train_numeric, scale= TRUE)
    
#     #scaling might cause NaN if all the values are 0
#     scaled_data <- replace_na(scaled_data, 0)

#     # combining required cols
#     data_train <- cbind(data[c('month','year',class, 'unacast_session_count')], scaled_data)
    
#     col_names <- colnames(data_train) 
    
#     # getting part of the function equation as string
#     x <- col_names[1]
#     for (i in seq(2,length(col_names))){

#         if (col_names[i] == class | col_names[i] == 'unacast_session_count'){
#             next
#         }
#         else{
#             x <- paste(x, " + ", col_names[i])   
#         }
#     }
    
# #     print(x)
#     model_intercept <- lmer(paste("unacast_session_count ~ ", x, " + (1 | ", class, ")"), data = data_train)
    
#     return(model_intercept)
# }

In [14]:
# model_fit_intercept <- fit_lmer(data_train)

fixed-effect model matrix is rank deficient so dropping 109 columns / coefficients



In [15]:
# get_error_lmer <- function(model, data, class = 'Kmeans_4_label'){
    
#     y <- flatten_dbl(data['unacast_session_count'])
#     # removing non-compatible cols
#     data <- select(data,  -monthly_rain, -monthly_avg_rain_length, -B14002e1, -not_enrolled)
#     # scaling the other numeric cols
#     data_train_numeric <- select(data, -Kmeans_2_label, -Kmeans_4_label, -month, -year, -unacast_session_count)
    
#     scaled_data <- scale(data_train_numeric, scale = TRUE)
    
#     #scaling might cause NaN if all the values are 0 
#     scaled_data <- replace_na(scaled_data, 0)
#     data_test <- cbind(data[c('month','year',class)], scaled_data)
    
#     # predicting for new data
#     y_pred <- predict(model, newdata = data_test, allow.new.levels = TRUE)
    
# #     print(sum(is.na(y_pred)))
#     rmse_val <- rmse(y, y_pred)
#     mae_val <- mae(y, y_pred)
    
#     print(paste("RMSE is = ", rmse_val, " and MAE is =",mae_val))
# }

In [17]:
### Validation Score
# get_error_lmer(model_fit_intercept, data_valid, class = 'Kmeans_4_label')

[1] "RMSE is =  199.736639534294  and MAE is = 109.028203268506"


In [18]:
### Train Score
# get_error_lmer(model_fit_intercept, data_train, class = 'Kmeans_4_label')

[1] "RMSE is =  182.946270176817  and MAE is = 96.9325286639546"


In [20]:
# model_fit_intercept <- fit_lmer(data_train, class='Kmeans_2_label')

fixed-effect model matrix is rank deficient so dropping 109 columns / coefficients



In [21]:
### Validation Score
# get_error_lmer(model_fit_intercept, data_valid, class = 'Kmeans_2_label')

[1] "RMSE is =  199.805096744125  and MAE is = 108.926599650035"


In [23]:
### Train Score
# get_error_lmer(model_fit_intercept, data_train, class = 'Kmeans_2_label')

[1] "RMSE is =  182.940689852884  and MAE is = 96.8829414020046"
