# Variance

In [1]:
using CSV, DataFrames, Query

In [2]:
# Read only a small sample (`rows=5`) to inspect the column layout before loading the whole file.
df = CSV.read("C:/Data/CRSP/20180325_CRSP_daily_19630601_20171231.csv"; rows=5)

Unnamed: 0,PERMNO,date,SHRCD,EXCHCD,RET
1,10006,19630603,10,1,0.015464
2,10006,19630604,10,1,0.005076
3,10006,19630605,10,1,0.0
4,10006,19630606,10,1,0.002525


In [3]:
# Path of the CRSP daily CSV extract; the trailing `;` suppresses the cell output.
fname = "C:/Data/CRSP/20180325_CRSP_daily_19630601_20171231.csv";

In [32]:
# `nullable` is documented as `true` by default, but it is passed explicitly here:
# the file has missing values further down than the rows CSV.read checks, and a
# MissingException is raised without `nullable=true`.
# All five columns are read as String.
@time df = CSV.read(fname; types=fill(String, 5), nullable=true);

 71.608935 seconds (194.13 M allocations: 5.064 GiB, 60.85% gc time)


In [33]:
# Report the container type together with its (rows, columns) dimensions.
(typeof(df), size(df))

(DataFrames.DataFrame, (19406009, 5))

In [34]:
# Preview the first rows of the fully loaded table.
head(df)

Unnamed: 0,PERMNO,date,SHRCD,EXCHCD,RET
1,10006,19630603,10,1,0.015464
2,10006,19630604,10,1,0.005076
3,10006,19630605,10,1,0.0
4,10006,19630606,10,1,0.002525
5,10006,19630607,10,1,-0.012594
6,10006,19630610,10,1,0.0


In [35]:
# Per-column summary: element type, number of unique values, and missing counts.
describe(df)

PERMNO
Summary Stats:
Length:         19406009
Type:           Union{Missings.Missing, String}
Number Unique:  5363
Number Missing: 0
% Missing:      0.000000

date
Summary Stats:
Length:         19406009
Type:           Union{Missings.Missing, String}
Number Unique:  13744
Number Missing: 0
% Missing:      0.000000

SHRCD
Summary Stats:
Length:         19406009
Type:           Union{Missings.Missing, String}
Number Unique:  2
Number Missing: 0
% Missing:      0.000000

EXCHCD
Summary Stats:
Length:         19406009
Type:           Union{Missings.Missing, String}
Number Unique:  1
Number Missing: 0
% Missing:      0.000000

RET
Summary Stats:
Length:         19406009
Type:           Union{Missings.Missing, String}
Number Unique:  218005
Number Missing: 8730
% Missing:      0.044986



In [38]:
# Subtracting two parsed dates yields the elapsed time as a number of days.
let fmt = "yyyymmdd"
    Date("20140101", fmt) - Date("20130101", fmt)
end

365 days

In [42]:
# Lowercase accessors (`year`, `month`) return plain integers, whereas the
# capitalized `Month` returns a period.
let d = Date("20130101", "yyyymmdd")
    (Dates.year(d), Dates.month(d), Dates.Month(d))
end

(2013, 1, 1 month)

In [46]:
# Move a mid-month date to the last day of the same month.
Date("20130116", "yyyymmdd") |> Dates.lastdayofmonth

2013-01-31

In [47]:
# A date is a month end exactly when it equals the last day of its own month.
let d = Date("20130116", "yyyymmdd")
    d == Dates.lastdayofmonth(d)
end

false

In [54]:
# Round a date down and up to the nearest month boundary.
let d = Date("20130116", "yyyymmdd")
    (floor(d, Dates.Month), ceil(d, Dates.Month))
end

(2013-01-01, 2013-02-01)

In [49]:
# Step forward from 2018-03-30 until the predicate holds, i.e. until the
# 4th Thursday of November (Thanksgiving) is reached.
Dates.tonext(Date(2018, 3, 30)) do day
    Dates.month(day) == Dates.November &&
        Dates.dayofweek(day) == Dates.Thursday &&
        Dates.dayofweekofmonth(day) == 4
end

2018-11-22

In [51]:
# Pittsburgh street cleaning: the 2nd Tuesday of every month, April through November.
# Candidate days run from January 1st, 2014 to January 1st, 2015, one day at a time.
dr = Dates.Date(2014):Dates.Day(1):Dates.Date(2015);

filter(dr) do day
    Dates.April <= Dates.month(day) <= Dates.November &&
        Dates.dayofweek(day) == Dates.Tuesday &&
        Dates.dayofweekofmonth(day) == 2
end

8-element Array{Date,1}:
 2014-04-08
 2014-05-13
 2014-06-10
 2014-07-08
 2014-08-12
 2014-09-09
 2014-10-14
 2014-11-11

In [44]:
# Materialize a daily date range (both endpoints included) as a vector.
let fmt = "yyyymmdd"
    first_day = Date("20130101", fmt)
    last_day = Date("20130115", fmt)
    collect(first_day:Dates.Day(1):last_day)
end

15-element Array{Date,1}:
 2013-01-01
 2013-01-02
 2013-01-03
 2013-01-04
 2013-01-05
 2013-01-06
 2013-01-07
 2013-01-08
 2013-01-09
 2013-01-10
 2013-01-11
 2013-01-12
 2013-01-13
 2013-01-14
 2013-01-15

In [45]:
# Materialize a date range that advances one month per step (both endpoints included).
let fmt = "yyyymmdd"
    first_day = Date("20130101", fmt)
    last_day = Date("20140101", fmt)
    collect(first_day:Dates.Month(1):last_day)
end

13-element Array{Date,1}:
 2013-01-01
 2013-02-01
 2013-03-01
 2013-04-01
 2013-05-01
 2013-06-01
 2013-07-01
 2013-08-01
 2013-09-01
 2013-10-01
 2013-11-01
 2013-12-01
 2014-01-01

In [30]:
# Dimensions of the RET column once missing entries are dropped.
df[:RET] |> skipmissing |> collect |> size

(19397279,)

In [25]:
# Count the non-missing RET entries equal to the literal string "C".
# Counting over the lazy `skipmissing` iterator avoids materializing the filtered
# column and a boolean mask just to tally matches.
count(x -> x == "C", skipmissing(df[:RET]))

2320

In [26]:
# Count the non-missing RET entries equal to the literal string "B".
# Counting over the lazy `skipmissing` iterator avoids materializing the filtered
# column and a boolean mask just to tally matches.
count(x -> x == "B", skipmissing(df[:RET]))

0

In [27]:
# Count the non-missing RET entries equal to the literal string "D".
# Counting over the lazy `skipmissing` iterator avoids materializing the filtered
# column and a boolean mask just to tally matches.
count(x -> x == "D", skipmissing(df[:RET]))

0

In [15]:
# `skipmissing` only wraps the data lazily; nothing is filtered until it is iterated.
df[:RET][1:5] |> skipmissing

Missings.EachSkipMissing{Array{Union{Missings.Missing, String},1}}(Union{Missings.Missing, String}["0.015464", "0.005076", "0.000000", "0.002525", "-0.012594"])

In [16]:
# Collecting the wrapper yields a plain vector of the non-missing strings.
df[:RET][1:5] |> skipmissing |> collect

5-element Array{String,1}:
 "0.015464" 
 "0.005076" 
 "0.000000" 
 "0.002525" 
 "-0.012594"

In [18]:
# Convert the first few non-missing RET strings to Float64.
map(s -> parse(Float64, s), collect(skipmissing(df[:RET][1:5])))

5-element Array{Float64,1}:
  0.015464
  0.005076
  0.0     
  0.002525
 -0.012594

In [20]:
?CSV.read

`CSV.read(fullpath::Union{AbstractString,IO}, sink::Type{T}=DataFrame, args...; kwargs...)` => `typeof(sink)`

`CSV.read(fullpath::Union{AbstractString,IO}, sink::Data.Sink; kwargs...)` => `Data.Sink`

parses a delimited file into a Julia structure (a DataFrame by default, but any valid `Data.Sink` may be requested).

Minimal error-reporting happens w/ `CSV.read` for performance reasons; for problematic csv files, try [`CSV.validate`](@ref) which takes exact same arguments as `CSV.read` and provides much more information for why reading the file failed.

Positional arguments:

  * `fullpath`; can be a file name (string) or other `IO` instance
  * `sink::Type{T}`; `DataFrame` by default, but may also be other `Data.Sink` types that support streaming via `Data.Field` interface; note that the method argument can be the *type* of `Data.Sink`, plus any required arguments the sink may need (`args...`), or an already constructed `sink` may be passed (2nd method above)

Keyword Arguments:

  * `delim::Union{Char,UInt8}`: how fields in the file are delimited; default `','`
  * `quotechar::Union{Char,UInt8}`: the character that indicates a quoted field that may contain the `delim` or newlines; default `'"'`
  * `escapechar::Union{Char,UInt8}`: the character that escapes a `quotechar` in a quoted field; default `'\\'`
  * `null::String`: indicates how NULL values are represented in the dataset; default `""`
  * `dateformat::Union{AbstractString,Dates.DateFormat}`: how dates/datetimes are represented in the dataset; default `Base.Dates.ISODateTimeFormat`
  * `decimal::Union{Char,UInt8}`: character to recognize as the decimal point in a float number, e.g. `3.14` or `3,14`; default `'.'`
  * `truestring`: string to represent `true::Bool` values in a csv file; default `"true"`. Note that `truestring` and `falsestring` cannot start with the same character.
  * `falsestring`: string to represent `false::Bool` values in a csv file; default `"false"`
  * `header`: column names can be provided manually as a complete Vector{String}, or as an Int/AbstractRange which indicates the row/rows that contain the column names
  * `datarow::Int`: specifies the row on which the actual data starts in the file; by default, the data is expected on the next row after the header row(s); for a file without column names (header), specify `datarow=1`
  * `types`: column types can be provided manually as a complete Vector{Type}, or in a Dict to reference individual columns by name or number
  * `nullable::Bool`: indicates whether values can be nullable or not; `true` by default. If set to `false` and missing values are encountered, a `Data.NullException` will be thrown
  * `footerskip::Int`: indicates the number of rows to skip at the end of the file
  * `rows_for_type_detect::Int=100`: indicates how many rows should be read to infer the types of columns
  * `rows::Int`: indicates the total number of rows to read from the file; by default the file is pre-parsed to count the # of rows; `-1` can be passed to skip a full-file scan, but the `Data.Sink` must be set up to account for a potentially unknown # of rows
  * `use_mmap::Bool=true`: whether the underlying file will be mmapped or not while parsing; note that on Windows machines, the underlying file will not be "deletable" until Julia GC has run (can be run manually via `gc()`) due to the use of a finalizer when reading the file.
  * `append::Bool=false`: if the `sink` argument provided is an existing table, `append=true` will append the source's data to the existing data instead of doing a full replace
  * `transforms::Dict{Union{String,Int},Function}`: a Dict of transforms to apply to values as they are parsed. Note that a column can be specified by either number or column name.
  * `transpose::Bool=false`: when reading the underlying csv data, rows should be treated as columns and columns as rows, thus the resulting dataset will be the "transpose" of the actual csv data.
  * `categorical::Bool=true`: read string column as a `CategoricalArray` ([ref](https://github.com/JuliaData/CategoricalArrays.jl)), as long as the % of unique values seen during type detection is less than 67%. This will dramatically reduce memory use in cases where the number of unique values is small.
  * `weakrefstrings::Bool=true`: whether to use [`WeakRefStrings`](https://github.com/quinnj/WeakRefStrings.jl) package to speed up file parsing; can only be `=true` for the `Sink` objects that support `WeakRefStringArray` columns. Note that `WeakRefStringArray` still returns regular `String` elements.

Example usage:

```
julia> dt = CSV.read("bids.csv")
7656334×9 DataFrames.DataFrame
│ Row     │ bid_id  │ bidder_id                               │ auction │ merchandise      │ device      │
├─────────┼─────────┼─────────────────────────────────────────┼─────────┼──────────────────┼─────────────┤
│ 1       │ 0       │ "8dac2b259fd1c6d1120e519fb1ac14fbqvax8" │ "ewmzr" │ "jewelry"        │ "phone0"    │
│ 2       │ 1       │ "668d393e858e8126275433046bbd35c6tywop" │ "aeqok" │ "furniture"      │ "phone1"    │
│ 3       │ 2       │ "aa5f360084278b35d746fa6af3a7a1a5ra3xe" │ "wa00e" │ "home goods"     │ "phone2"    │
...
```

Other example invocations may include:

```julia
# read in a tab-delimited file
CSV.read(file; delim='\t')

# read in a comma-delimited file with null values represented as '\N', such as a MySQL export
CSV.read(file; null="\\N")

# read a csv file that happens to have column names in the first column, and grouped data in rows instead of columns
CSV.read(file; transpose=true)

# manually provided column names; must match # of columns of data in file
# this assumes there is no header row in the file itself, so data parsing will start at the very beginning of the file
CSV.read(file; header=["col1", "col2", "col3"])

# manually provided column names, even though the file itself has column names on the first row
# `datarow` is specified to ensure data parsing occurs at correct location
CSV.read(file; header=["col1", "col2", "col3"], datarow=2)

# types provided manually; as a vector, must match length of columns in actual data
CSV.read(file; types=[Int, Int, Float64])

# types provided manually; as a Dict, can specify columns by # or column name
CSV.read(file; types=Dict(3=>Float64, 6=>String))
CSV.read(file; types=Dict("col3"=>Float64, "col6"=>String))

# manually provided # of rows; if known beforehand, this will improve parsing speed
# this is also a way to limit the # of rows to be read in a file if only a sample is needed
CSV.read(file; rows=10000)

# for data files, `file` and `file2`, with the same structure, read both into a single DataFrame
# note that `df` is used as a 2nd argument in the 2nd call to `CSV.read` and the keyword argument
# `append=true` is passed
df = CSV.read(file)
df = CSV.read(file2, df; append=true)

# manually construct a `CSV.Source` once, then stream its data to both a DataFrame
# and SQLite table `sqlite_table` in the SQLite database `db`
# note the use of `CSV.reset!` to ensure the `source` can be streamed from again
source = CSV.Source(file)
df1 = CSV.read(source, DataFrame)
CSV.reset!(source)
db = SQLite.DB()
sq1 = CSV.read(source, SQLite.Sink, db, "sqlite_table")
```
