In [1]:
library(data.table)
library(ggplot2)
library(dplyr)
library(lme4)
library(repr)

# Notebook plot defaults: 4 x 3 figure size, point size 11
options(
    repr.plot.width = 4,
    repr.plot.height = 3,
    repr.plot.pointsize = 11
)

# Folder (relative to the working directory) that the data files are read from;
# the trailing slash is required because paths are built with paste0()
DATA_DIR <- "../data/"


Attaching package: ‘dplyr’

The following objects are masked from ‘package:data.table’:

    between, first, last

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union

Loading required package: Matrix


In [2]:
# Read the radon survey file "radon/srrs2.dat" located under DATA_DIR.
#
# Returns:
#   A data.frame parsed from the comma-separated file; the first line of the
#   file is used as the column names.
get_radon_data <- function() {
    data_path <- paste0(DATA_DIR, "radon/srrs2.dat")
    # Spell out TRUE: `T` is an ordinary variable that can be reassigned
    read.table(data_path, header = TRUE, sep = ",")
}

In [3]:
# Load the radon data and derive the per-county summary columns for one state.
#
# Args:
#   state: value compared against the `state` column of the radon data
#     (main() passes "MN").
#
# Returns:
#   An ungrouped tibble holding the rows of the requested state plus these
#   columns, in this order:
#     log_radon    log of `activity`; zero or missing activity is replaced by
#                  0.1 before taking the log
#     county_idx   integer code of `county` (factor level index)
#     county_mean  mean of log_radon within the county
#     nobs_county  number of rows in the county
#     total_mean   mean of log_radon over all rows of the state
#     county_var   variance of log_radon within the county (NA for a county
#                  with a single row, because var() needs two values)
#     total_var    variance of the county means across counties
prepare_data_for_state <- function(state) {
    radon_data <- get_radon_data()

    # Keep only the rows of the requested state
    state_data <- radon_data[radon_data[["state"]] == state, ]

    # Log radon level; log(0) and missing readings are avoided by using 0.1
    activity <- state_data$activity
    state_data$log_radon <-
        log(ifelse(activity == 0 | is.na(activity), 0.1, activity))

    # Code counties as indices
    state_data$county_idx <- as.integer(factor(state_data$county))

    # Mean over every row of the state, computed before grouping
    overall_mean <- mean(state_data$log_radon, na.rm = TRUE)

    # All per-county statistics in a single grouped pass. `.env$` makes sure
    # the local scalar is used even if the data had a column of that name.
    # The grouping is dropped afterwards so callers get a plain tibble.
    state_data <- state_data %>%
        dplyr::group_by(county_idx) %>%
        dplyr::mutate(
            county_mean = mean(log_radon, na.rm = TRUE),
            nobs_county = dplyr::n(),
            total_mean = .env$overall_mean,
            county_var = var(log_radon, na.rm = TRUE)
        ) %>%
        dplyr::ungroup()

    # Variance between counties: take one county mean per county (the first
    # row of each county_idx) and compute the variance of those means
    first_row_of_county <- !duplicated(state_data$county_idx)
    state_data$total_var <-
        var(state_data$county_mean[first_row_of_county], na.rm = TRUE)

    state_data
}

# Add the column `multlevel_mean`: a weighted average of each row's county
# mean and the overall mean.
#
# The county mean is weighted by nobs_county / county_var and the overall mean
# by 1 / total_var; the weighted sum is divided by the sum of both weights.
#
# Args:
#   state_data: data frame with the columns nobs_county, county_var,
#     county_mean, total_var and total_mean.
#
# Returns:
#   `state_data` with the extra column `multlevel_mean`.
multlevel_radon <- function(state_data) {
    # Weight given to the county's own mean
    county_weight <- state_data$nobs_county / state_data$county_var
    # Weight given to the overall mean
    total_weight <- 1 / state_data$total_var

    weighted_sum <- county_weight * state_data$county_mean +
        total_weight * state_data$total_mean

    state_data$multlevel_mean <- weighted_sum / (county_weight + total_weight)

    return(state_data)
}

# Scatter plot of county mean against county sample size.
#
# Args:
#   data_table: data frame with the columns nobs_county and county_mean.
#   nobs_cutoff: only rows with nobs_county strictly below this value are
#     plotted.
#
# Returns:
#   A ggplot object with blue points (x = nobs_county, y = county_mean).
sample_mean_vs_sample_sz <- function(data_table, nobs_cutoff) {
    df <- data_table[data_table$nobs_county < nobs_cutoff, ]

    # Refer to columns by name inside aes(); extracting them with `df$`
    # bypasses the layer data and puts "df$..." on the axis labels
    ggplot() +
        geom_point(
            data = df,
            mapping = aes(x = nobs_county, y = county_mean),
            color = "blue"
        )
}

# Scatter plot of log radon level against floor for the selected county.
#
# Args:
#   data_table: data frame with the columns county_idx, floor and log_radon.
#   county_idx: county index to plot; rows whose county_idx column matches are
#     kept (a vector of indices selects several counties).
#
# Returns:
#   A ggplot object with blue points (x = floor, y = log_radon).
radon_vs_floor <- function(data_table, county_idx) {
    # Build the row filter from a plain vector: indexing rows with the
    # one-column matrix produced by `data_table["county_idx"] == ...` is
    # deprecated for tibbles. %in% never yields NA, so no all-NA rows appear.
    keep <- data_table[["county_idx"]] %in% county_idx
    df <- data_table[keep, ]

    # Refer to columns by name inside aes() instead of extracting with `df$`
    ggplot() +
        geom_point(
            data = df,
            mapping = aes(x = floor, y = log_radon),
            color = "blue"
        )
}
 
# Driver: build the "MN" data set and attach the multilevel county estimate.
#
# Returns:
#   The result of multlevel_radon() applied to prepare_data_for_state("MN").
main <- function() {
    mn_prepared <- prepare_data_for_state("MN")
    mn_with_estimate <- multlevel_radon(mn_prepared)

    return(mn_with_estimate)
}