## Julia in Data Modeling and Exploration

In [None]:
using CSVFiles # loading/saving csv
using DataFrames
using DataFramesMeta
using Dates
using RCall
using PyCall
using VegaDatasets
using Statistics
using DataVoyager
using VegaDatasets
# Report a very wide display so DataFrame output is not cut off by column width.
# ENV stores values as strings, so the width is written as one directly.
ENV["COLUMNS"] = "1000";

In [None]:
"""
    top5(x::DataFrame)

Return the first five rows of `x`.
"""
function top5(x::DataFrame)
    return first(x, 5)
end

"""
    last5(x::DataFrame)

Return the last five rows of `x`.
"""
function last5(x::DataFrame)
    return last(x, 5)
end

Let's use the `cars` dataset from VegaDatasets for modeling 

In [None]:
# Materialise the `cars` dataset as a DataFrame and preview its first rows.
cars = DataFrame(dataset("cars"))
top5(cars)

## Data Voyager

Let's feed the `cars` dataset to the Voyager visualization tool and perform some exploratory visualization.

Note: Voyager opens in a separate desktop window, so this won't work in remote notebooks; the call below is commented out for that reason.

In [None]:
# cars |> Voyager()

## Julia SQL-like data processing workflow

## High-level queries to manipulate data

### Filter rows

In [None]:
# Keep only the rows whose Origin equals "USA" (element-wise comparison),
# then pass the result to `top5` to show the first five of them.
@linq cars |> 
   where(:Origin .== "USA") |> 
   top5

In [None]:
# Same row filter written with `@subset`; show the last five matching rows.
last5(@subset(cars, :Origin .== "USA"))

### Filter rows and columns

In [None]:
# Row filter followed by a column projection, chained inside a single @linq
# pipeline; the final stage hands the result to `top5`.
@linq cars |> 
      where(:Origin .== "Europe") |> # filter rows
      select(:Name,:Year) |>         # subset of columns
      top5

### Grouping, Aggregation, and Sorting

In [None]:
"""
    mymean(x)

Return the arithmetic mean of `x`, ignoring any `missing` entries.
"""
function mymean(x)
    present = skipmissing(x)
    return mean(present)
end

In [None]:
# Per-Origin averages of acceleration, displacement and fuel economy;
# `mymean` drops missing values before averaging.
# Output columns are named with quoted symbols (`:name = expr`), the form
# DataFramesMeta documents; the bare `name = expr` form is deprecated.
@linq cars |>
      by(:Origin,
          :meanAcc = mymean(:Acceleration),
          :meanDisp = mymean(:Displacement),
          :meanMPG = mymean(:Miles_per_Gallon)
      )

In [None]:
# Mean acceleration and horsepower for every (Origin, Cylinders) pair, ordered
# by the grouping keys; `mymean` drops missing values before averaging.
# Output columns are named with quoted symbols (`:name = expr`), the form
# DataFramesMeta documents; the bare `name = expr` form is deprecated.
region_stats = @linq cars |>
                     by([:Origin, :Cylinders],
                        :MeanAcceleration = mymean(:Acceleration),
                        :MeanHorsepower = mymean(:Horsepower)) |>
                     orderby(:Origin, :Cylinders)

In [None]:
# Mean acceleration and horsepower per (Origin, Cylinders) group, expressed
# with DataFrames' `source => function => target` pairs.
res = combine(
    groupby(cars, [:Origin, :Cylinders]),
    :Acceleration => mymean => :MeanAcceleration,
    :Horsepower => mymean => :MeanHorsepower,
)

In [None]:
# Order the grouped summary in place by its two grouping keys.
sort!(res, [:Origin, :Cylinders])

## R SQL-like data processing workflow

In [None]:
# Run R code through RCall's R""" string macro: attach the tidyverse and
# ggplot2 packages in the R session used by the cells that follow.
R"""
library(tidyverse)
library(ggplot2)
"""

In [None]:
# Plain assignment: `rcars` is a second name for the same DataFrame as `cars`
# on the Julia side. `@rput` then sends it to R under the name `rcars`.
rcars = cars
@rput rcars; # copy rcars to R environment

In [None]:
# rcars is visible in R
# Keep the rows whose Origin is "USA", project three columns, and show the
# head of the result.
R"""
rcars %>%
  filter(Origin == "USA") %>%
  select(Origin,Cylinders,Horsepower) %>%
  head
"""

In [None]:
# Per-(Origin, Cylinders) summary computed on the R side and stored in the R
# variable `results`.
# `na.rm=TRUE` is passed because R's mean/min return NA as soon as a group
# contains a single NA; the Julia summaries skip missing values via `mymean`,
# so without it the two sides would disagree for any column with gaps.
R"""
results = rcars %>%
          group_by(Origin,Cylinders) %>%
          summarise(
               MeanAcceleration=mean(Acceleration, na.rm=TRUE),
               MinHorsepower=min(Horsepower, na.rm=TRUE)
          )
"""

In [None]:
# `results` is the R variable assigned by the group_by/summarise cell.
@rget results # copy results to Julia environment

## Data Visualization

In [None]:
# Scatter plot of Acceleration against Horsepower, points coloured by the
# number of cylinders (as a factor), with a linear-model smoother and one
# panel per Origin.
R"""
library(ggplot2)

ggplot(rcars,aes(Horsepower,Acceleration)) +
   geom_point(aes(color=as.factor(Cylinders))) + 
   stat_smooth(method='lm')+
   facet_wrap( ~ Origin) 
"""

In [None]:
# List the column names of `rcars` as seen from R.
R"names(rcars)"

In [None]:
# Scatter plot of Miles_per_Gallon against Horsepower, points coloured by the
# number of cylinders, with a smoother whose method is left to ggplot2
# (method='auto').
R"""
library(ggplot2)

ggplot(rcars,aes(Horsepower,Miles_per_Gallon)) +
   geom_point(aes(color=as.factor(Cylinders))) +
   stat_smooth(method='auto')
"""

In [None]:
# Scatter plot of Miles_per_Gallon against Displacement, points coloured by
# the number of cylinders, with stat_smooth() at its default settings.
R"""
library(ggplot2)

ggplot(rcars,aes(Displacement,Miles_per_Gallon)) +
   geom_point(aes(color=as.factor(Cylinders))) + 
   stat_smooth()
"""

In [None]:
# Scatter plot of Miles_per_Gallon against Weight_in_lbs, points coloured by
# the number of cylinders, with stat_smooth() at its default settings.
R"""
library(ggplot2)

ggplot(rcars,aes(Weight_in_lbs,Miles_per_Gallon)) +
   geom_point(aes(color=as.factor(Cylinders))) +
   stat_smooth()
"""

In [None]:
# Sort a copy of `cars` by Acceleration and show its first five rows.
top5(sort(cars, :Acceleration))

In [None]:
# Fit a random forest predicting Miles_per_Gallon from five numeric columns,
# using only the rows of `rcars` without any missing value.
# The seed is fixed because the forest is built from random draws; without it
# the model (and any plot derived from it) changes on every run.
R"""
library(randomForest)

set.seed(2024)

ccars=rcars[complete.cases(rcars),]

rf_model = randomForest(Miles_per_Gallon ~ Cylinders + Displacement + 
                     Horsepower + Acceleration + Weight_in_lbs, data=ccars)
"""

In [None]:
# Plot the variable importance of the fitted `rf_model`.
R"varImpPlot(rf_model)"

In [None]:
# Fit a linear model (R's lm) of Miles_per_Gallon on the same five predictors,
# using only the rows of `rcars` without any missing value.
R"""

ccars=rcars[complete.cases(rcars),]

lm_model = lm(Miles_per_Gallon ~ Cylinders + Displacement + 
              Horsepower + Acceleration + Weight_in_lbs, data=ccars)
"""

In [None]:
# Print R's summary of the fitted linear model.
R"summary(lm_model)"

In [None]:
# Train the same regression through caret with method='rf', using only the
# rows of `rcars` without any missing value.
# The seed is fixed because both the resampling and the forest rely on random
# draws; without it the selected model changes on every run.
R"""
library(caret)

set.seed(2024)

ccars=rcars[complete.cases(rcars),]

crt_model = train(Miles_per_Gallon ~ Cylinders + Displacement + 
                     Horsepower + Acceleration + Weight_in_lbs, data=ccars,method='rf')
"""

In [None]:
# Show the `finalModel` component of the caret fit. The `$` here follows an R
# symbol, so it is R's element access rather than Julia interpolation.
R"crt_model$finalModel"

In [None]:
# Plot the variable importance of the final model held by the caret fit.
R"varImpPlot(crt_model$finalModel)"