In [None]:
using CSV
using DataFrames
using Statistics

In [None]:
# Building DataFrames. Supported constructor forms:
#   DataFrame(colName=dataArray, ...), DataFrame(matrixData, arrayColName)
#   or DataFrame(matrixData, :auto) for auto-generated column names.
# Column names may be given as an array of strings or of symbols.
foods = ["apple", "cucumber", "tomato", "banana"]
calories = [105, 47, 22, 105]
prices = [0.85, 1.6, 0.8, 0.6]

# 11x8 matrix of floats with two `missing` entries
aq = [10.0   8.04  10.0  9.14  10.0   7.46   8.0   6.58
       8.0   6.95   8.0  8.14   8.0   6.77   8.0   5.76
      13.0   7.58  13.0  8.74  13.0  12.74   8.0   7.71
       9.0   8.81   9.0  8.77   9.0   7.11   8.0   8.84
      11.0   8.33  11.0  9.26  11.0   7.81   8.0   8.47
      14.0   9.96  14.0  8.1   14.0   8.84   8.0   7.04
       6.0   7.24   6.0  6.13   6.0   6.08   8.0   5.25
       4.0   4.26   4.0  3.1    4.0   5.39  19.0  missing
      12.0  10.84  12.0  9.13  12.0  missing 8.0   5.56
       7.0   4.82   7.0  7.26   7.0   6.42   8.0   7.91
       5.0   5.68   5.0  4.74   5.0   5.73   8.0   6.89]

# From keyword arguments: one table per attribute, both sharing the `item` column
dataframe_calories = DataFrame(; item=foods, calories=calories)
dataframe_prices = DataFrame(; item=foods, price=prices)

# From a CSV file, read directly into a DataFrame (columns: year and language)
df = CSV.read("data/programming_languages.csv", DataFrame)
language = "Julia"

# From a matrix, letting DataFrames pick the column names
df2 = DataFrame(aq, :auto)
colNames = [string(prefix, i) for i in 1:4 for prefix in ("x", "y")]

# From a CSV.File object handed to the DataFrame constructor
df3 = DataFrame(CSV.File("data/auto2.csv"));

In [None]:
# Column names of df
names(df)

# Rename columns: df2 gets x1, y1, x2, y2, ...; df gets explicit names
newnames = [string(prefix, i) for i in 1:4 for prefix in ("x", "y")]
rename!(df2, newnames)
rename!(df, ["year", "language"])

# Dimensions
nrow(df) # number of rows
ncol(df) # number of columns

# Summarize the contents, one line per column
describe(df)

Unnamed: 0_level_0,variable,mean,min,median,max,nmissing,eltype
Unnamed: 0_level_1,Symbol,Union…,Any,Union…,Any,Int64,DataType
1,year,1982.99,1951,1986.0,2014,0,Int64
2,language,,ALGOL 58,,dBase III,0,String


In [None]:
# Join the two food tables on their shared `item` column.
#   Other join kinds: outerjoin, leftjoin, rightjoin, semijoin, antijoin, crossjoin
DF = innerjoin(dataframe_calories, dataframe_prices; on=:item);

In [None]:
# Show the container type, then take rows 1 to 10 with every column
#   (a single column can be sliced the same way: df.year[1:10])
@show typeof(df)
df[1:10, :]

# Add a new column from a range: a running row id
df.id = 1:nrow(df)

# Element-wise comparison of column 2 against `language`
#   yields one Bool per row
df[:, 2] .== language

# Use that mask as the row index to slice out the matching rows
df[df.language .== language, :];

In [None]:
# Reorder columns - use a regex: first every column matching r"x", then the rest (:)
select!(df2, r"x", :)

# Data transformation - create new columns
#   source_column => ByRow(transformation) => destination_columns
#   the function returns a 2-element vector, one element per destination column
new_df = select(df, :year => ByRow(x -> [x, Float64(x) + 1]) => ["year01", "year02"])

# Count how many missing values there are, walking the columns or the rows
#   eachcol(df) returns an iterator over the columns of the DataFrame
#   eachrow(df) returns an iterator over the rows of the DataFrame
missing_sum_cols = sum(count(ismissing, col) for col in eachcol(df2))
missing_sum_rows = sum(count(ismissing, row) for row in eachrow(df2))

# Sum of the first column
rdm_sum = sum(df2[:, 1])

# Map columns - the function receives one whole column at a time
missing_map = mapcols(x -> count(ismissing, x), df2)

# Transformation - add a column computed row by row
#   x1 stores whole numbers as Float64, but Base only defines factorial for
#   integers, so convert with Int(x) before calling it.
#   renamecols only affects auto-generated names; here the destination is explicit.
transform!(df2,
           :x1 => ByRow(x -> (x - 1) / factorial(Int(x))) => :theory;
           renamecols=true)

# Filter - the predicate receives one row at a time and must return a Bool
filt1 = filter(row -> any(ismissing, row), df2) # look across all columns
filt2 = filter(row -> row.language == language, df) # look only at the language column
filt3 = filter(:language => ==(language), df) # same as above, `column => predicate` form
    # 'row -> row.language == language'  ===  ':language => ==(language)'

# Group by
gb = groupby(df3, :brand, sort=true)
group = gb[("ford",)] # get one group - all rows whose brand is "ford"
agg_gb = combine(gb, :mpg => mean => :avg_mpg) # one aggregate per group (mean: Statistics)
                # source_columns => aggregate or anonymous func [=> destination_columns]

# Chain of operations on df3 (the table that has the brand/origin columns)
#   NOTE: the @pipe macro comes from Pipe.jl, which must be loaded (`using Pipe`)
brand_origin = @pipe df3 |>  # _ stands for the value passed from the previous step
                    groupby(_, :brand) |>
                    combine(_, :origin => x -> length(unique(x)))
count_brand = @pipe df3 |>
                    groupby(_, [:brand, :origin]) |>
                    combine(_, nrow)
origin_brand = @pipe df3 |>
                    groupby(_, :origin) |>  # with Ref, the vector is not spread over rows
                    combine(_, :brand => x -> Ref(unique(x))) # each cell holds a vector

# Flatten - expand the vectors stored in the column into one row per element;
#   the result is like the combine above written without the Ref()
flatten(origin_brand, :brand_function)

# Sort by the aggregate column created above, largest first
sort(agg_gb, :avg_mpg, rev=true)

# Frequency table - freqtable comes from FreqTables.jl (`using FreqTables`)
freqtable(df3, :brand, :origin);

In [None]:
# Allow missing data in a column
allowmissing!(df, :year)

# Disallow missing data in a column
disallowmissing!(df, :year)

# Replace missing values with 0 (broadcast over every cell; df2 itself is not modified)
coalesce.(df2, 0)

# Drop the rows that contain missing values (returns a new DataFrame)
dropmissing(df2);

In [None]:
# x1 is a column of df2 (df only has year, language and id), so index df2 here
c = df2[:, :x1] # `:` copies the data, so it is stored in a different place
n = df2[!, :x1] # `!` does not copy the data; it points to the same memory
c === df2.x1 # false
n === df2.x1 # true

true

In [None]:
# Two ways to build a DataFrame of numeric columns from a raw text file

df_raw = DataFrame(CSV.File("data/auto.txt"; header=[:metrics, :name]));

# Names of the numeric columns packed into each `metrics` string
metric_cols = [:mpg, :cylinders, :displacement, :horsepower, :weight, :acceleration, :year, :origin]

# Option 1: start from an empty, typed DataFrame and push the parsed rows into it
str_metrics = split.(df_raw.metrics) # split every metrics string on whitespace
df1_2 = DataFrame([name => Float64[] for name in metric_cols]) # empty Float64 columns
allowmissing!(df1_2, [:mpg, :horsepower]) # only these two columns may hold missing
for tokens in str_metrics
    # "NA" becomes missing; every other token is parsed as a Float64
    parsed = map(tok -> tok == "NA" ? missing : parse(Float64, tok), tokens)
    push!(df1_2, parsed)
end
df1_2.name = df_raw.name # shares the column: both are the same object (=== is true)
df1_2.name = copy(df_raw.name) # stores an independent copy instead (=== is false)

# Option 2: one select call - parse each metrics string by row and spread the
#   resulting vector over the destination columns; tokens that fail to parse
#   become missing
df1_3 = select(
    df_raw,
    :metrics =>
        ByRow(line -> something.(tryparse.(Float64, split(line)), missing)) =>
        metric_cols,
    :name,
)