# Section 6: Perform a Simple EDA in R

### All code below is referenced from Lecture_1_2.ipynb provided by Gittu George for DSCI 525

In [None]:
import pyarrow.parquet as pq
import pyarrow.dataset as ds
import pyarrow.feather as feather
import rpy2_arrow.pyarrow_rarrow as pyra

In [14]:
dat.to_csv("figshareairline/final.csv")

### Method 1: Pandas Exchange

In [16]:
%%time
%%memit
# Only reading in a subset of the data to save time
pd_exchange_df = pd.read_csv("figshareairline/final.csv",nrows = 5_000_000)

peak memory: 3483.84 MiB, increment: 879.80 MiB
Wall time: 36.6 s


In [19]:
%%R
library(dplyr)
library(arrow)
library(lubridate)

R[write to console]: 
Attaching package: 'dplyr'


R[write to console]: The following objects are masked from 'package:stats':

    filter, lag


R[write to console]: The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union


R[write to console]: 
Attaching package: 'lubridate'


R[write to console]: The following objects are masked from 'package:base':

    date, intersect, setdiff, union




In [21]:
%%time
%%R -i pd_exchange_df

start_time <- Sys.time()
result <- pd_exchange_df %>% count(model)
print(result)
end_time <- Sys.time()

print(end_time - start_time)

            model       n
1      ACCESS-CM2 1932840
2  AWI-ESM-1-1-LR  966420
3     FGOALS-f3-L  214520
4 MPI-ESM-1-2-HAM  966420
5      NorESM2-LM  919800
Time difference of 9.318514 secs
Wall time: 5min 7s


### Method 2: Arrow Exchange

In [22]:
%%time
%%memit

dataset = ds.dataset("figshareairline/final.csv", format="csv")
table = dataset.to_table()
r_table = pyra.converter.py2rpy(table)

6203
rarrow.ChunkedArray: 6.423601388931274
6203
rarrow.ChunkedArray: 0.14059185981750488
6203
rarrow.ChunkedArray: 0.04688262939453125
6203
rarrow.ChunkedArray: 0.03124713897705078
6203
rarrow.ChunkedArray: 0.04688453674316406
6203
rarrow.ChunkedArray: 0.06245994567871094
6203
rarrow.ChunkedArray: 0.031260013580322266
6203
rarrow.ChunkedArray: 0.04686927795410156
peak memory: 4584.58 MiB, increment: 2465.46 MiB
Wall time: 5min 27s
Compiler : 132 ms


In [24]:
%%time
%%R -i r_table

start_time <- Sys.time()
result <- r_table %>% collect() %>% count(model)
end_time <- Sys.time()
print(result)

print(end_time - start_time)

[38;5;246m# A tibble: 27 x 2[39m
   model                  n
   [3m[38;5;246m<chr>[39m[23m              [3m[38;5;246m<int>[39m[23m
[38;5;250m 1[39m ACCESS-CM2       1[4m9[24m[4m3[24m[4m2[24m840
[38;5;250m 2[39m ACCESS-ESM1-5    1[4m6[24m[4m1[24m[4m0[24m700
[38;5;250m 3[39m AWI-ESM-1-1-LR    [4m9[24m[4m6[24m[4m6[24m420
[38;5;250m 4[39m BCC-CSM2-MR      3[4m0[24m[4m3[24m[4m5[24m340
[38;5;250m 5[39m BCC-ESM1          [4m5[24m[4m5[24m[4m1[24m880
[38;5;250m 6[39m CanESM5           [4m5[24m[4m5[24m[4m1[24m880
[38;5;250m 7[39m CMCC-CM2-HR4     3[4m5[24m[4m4[24m[4m1[24m230
[38;5;250m 8[39m CMCC-CM2-SR5     3[4m5[24m[4m4[24m[4m1[24m230
[38;5;250m 9[39m CMCC-ESM2        3[4m5[24m[4m4[24m[4m1[24m230
[38;5;250m10[39m EC-Earth3-Veg-LR 3[4m0[24m[4m3[24m[4m7[24m320
[38;5;246m# ... with 17 more rows[39m
Time difference of 23.19594 mins
Wall time: 23min 45s


### Method 3: Parquet File

##### Write Time

In [25]:
%%time
%%memit
## writing as a single parquet 
pq.write_table(table, 'figshareairline/final.parquet')

peak memory: 6862.77 MiB, increment: 895.64 MiB
Wall time: 11min 56s


##### Read Time

In [None]:
%%time
%%R
start_time <- Sys.time()
pq_df <- arrow::read_parquet("figshareairline/final.parquet", col_select = c("time", "rain (mm/day)", "model"))
result <- pq_df %>% count(model)
end_time <- Sys.time()
print(result)

print(end_time - start_time)

### Method 4: Feather File

##### Write Time

In [None]:
%%time
%%memit
# experiment in writing in feather format 
feather.write_feather(table, 'figshareairline/final.feather')

##### Read Time

In [None]:
%%time
%%R

start_time <- Sys.time()
ft_df <- arrow::read_feather("figshareairline/final.feather", col_select = c("time", "rain (mm/day)", "model"))
result <- ft_df %>% count(model)
end_time <- Sys.time()

print(result)

print(end_time - start_time)

### Method Choice and Reasoning

### EDA Visualizations

##### Average Rainfall By Year

In [None]:
%%R
pq_df_year <- pq_df %>%
    group_by(year=floor_date(time, "year")) %>%
    summarise(year_rain = mean(`rain (mm/day)`))

In [None]:
%%R
pq_df_year %>%
    ggplot(aes(x=year, y=year_rain)) +
    geom_bar(stat='identity') +
    labs(title="Average Yearly Rainfall (mm/day)",
         x="Year", y="Rainfall (mm/day)")