# Initialisation
Select the mode: one of `pandas`, `cudf`, `dask` or `dask-cudf`.

In [None]:
#%reload_ext autoreload
#%autoreload 2
import os
os.environ["VDF_MODE"] = "cudf"  # Reset the kernel if you change this

#import cardif_dask as vdf  # Import Virtual Dataframe
import virtual_dataframe as vdf
import pandas as pd
import cupy as cp

print(f"Use {vdf.VDF_MODE}")

# Object Creation
Creating a `VSeries`

In [None]:
s = vdf.VSeries([1,2,3,None,4],npartitions=2)
s.compute()

Creating a `VDataFrame` by specifying values for each column.

In [None]:
# Three integer columns of 20 rows each, split over two partitions:
# `a` and `c` count up from 0 to 19 while `b` counts down from 19 to 0.
df = vdf.VDataFrame(
    {
        'a': list(range(20)),
        'b': list(range(19, -1, -1)),
        'c': list(range(20)),
    },
    npartitions=2,
)
df.compute()

Creating a `VDataFrame` from a pandas `Dataframe`.

> Note that best practice for `VDataFrame` is to read data directly into a `VDataFrame` with something like `read_csv()` (discussed below).

In [None]:
pdf = pd.DataFrame({'a': [0, 1, 2, 3],'b': [0.1, 0.2, None, 0.3]})
# Bind the converted frame to its own name: reusing `df` here would replace the
# 20-row, three-column frame built above, which the later selection and
# grouping cells rely on (`df.b > 15`, `df.loc[2:5, ...]`, column `c`).
df_from_pandas = vdf.from_pandas(pdf, npartitions=2)
df_from_pandas.compute()

In [None]:
ps = pd.Series([1,2,3,None,4])
s = vdf.from_pandas(ps,npartitions=2)
s.compute()

# Viewing Data
Viewing the top rows of a `VDataFrame`.

In [None]:
df.head(2).compute()

In [None]:
df.sort_values(by='b').compute()

# Selection
## Getting
Selecting a single column, which initially yields a `VSeries`.

In [None]:
df['a'].compute()

## Selection by Label
Selecting rows from index 2 to index 5 from columns `a` and `b`.

In [None]:
df.loc[2:5, ['a', 'b']].compute()

## Selection by Position
Selecting via integers and integer slices, like numpy/pandas.
> Note that this functionality is not available for `dask-cudf`.

In [None]:
df.iloc[0].compute() if vdf.VDF_MODE != vdf.Mode.dask_cudf else None

You can also select elements of a `VDataFrame` or `VSeries` with direct index access.

In [None]:
df[3:5].compute()

In [None]:
s[3:5].compute()

## Boolean Indexing
Selecting rows in a `VDataFrame` or `VSeries` by direct `Boolean` indexing.

In [None]:
df[df.b > 15].compute()

Selecting values from a `VDataFrame` where a `Boolean` condition is met, via the query API.

In [None]:
df.query("b == 3").compute()

You can also pass local variables to Dask-cuDF queries via the `local_dict` keyword.
With standard cuDF, you may either use the `local_dict` keyword or reference the variable
directly with the `@` prefix. Supported comparison operators include `>`, `<`, `>=`, `<=`, `==`, and `!=`.

In [None]:
value = 3
df.query("b == @value").compute()

In [None]:
value = 3
df.query("b == @val", local_dict={'val':value}).compute()

Using the isin method for filtering.

In [None]:
df[df.a.isin([0, 5])].compute()

# MultiIndex
Virtual Dataframe supports hierarchical indexing of DataFrames using MultiIndex.
Grouping hierarchically (see Grouping below) automatically produces a DataFrame with a MultiIndex.

In [None]:
# FIXME
arrays = [['a', 'a', 'b', 'b'], [1, 2, 3, 4]]
tuples = list(zip(*arrays))
idx = vdf.MultiIndex.from_tuples(tuples)
idx

In [None]:
import array
import random

# Two independent vectors of four random floats each, stored as C doubles
# (typecode 'd'). `rand4_1` is filled first, then `rand4_2`.
rand4_1 = array.array('d', (random.random() for _ in range(4)))
rand4_2 = array.array('d', (random.random() for _ in range(4)))
# FIXME rand4= cp.random.rand(4)

In [None]:
df1 = vdf.VDataFrame({'first': rand4_1, 'second': rand4_2})
df1.index = idx
df1

In [None]:
df2 = vdf.VDataFrame({'first': rand4_1, 'second': rand4_2}).T
df2.columns = idx
df2

In [None]:
df1.loc[('b', 3)]

# Missing Data
Missing data can be replaced by using the fillna method.

In [None]:
s.fillna(999).compute()

# Operations
## Stats
Calculating descriptive statistics for a Series.

In [None]:
vdf.compute(s.mean())[0], vdf.compute(s.var())[0]

# Applymap
Applying functions to a Series.
Note that applying user defined functions directly with Dask-cuDF is not yet implemented.
For now, you can use map_partitions to apply a function to each partition of the distributed dataframe.

In [None]:
def add_ten(num):
    """Return ``num`` increased by ten."""
    offset = 10
    return num + offset

df['a'].apply(add_ten).compute()

In [None]:
# Not with pandas
if vdf.VDF_MODE in (vdf.Mode.dask, vdf.Mode.dask_cudf):
    df['a'].map_partitions(add_ten).compute()

# Histogramming
Counting the number of occurrences of each unique value of a variable.

In [None]:
df.a.value_counts().compute()

# String Methods
Virtual Dataframe provides string processing methods in the str attribute of Series.

In [None]:
s = vdf.VSeries(['A', 'B', 'C', 'Aaba', 'Baca', None, 'CABA', 'dog', 'cat'], npartitions=2)
s.str.lower().compute()

# Concat
Concatenating VSeries and VDataFrames row-wise.

In [None]:
s = vdf.VSeries([1, 2, 3, None, 5],npartitions=2)
vdf.concat([s, s]).compute()

# Join
Performing SQL style merges.
Note that the dataframe order is not maintained, but may be restored post-merge by sorting by the index.

In [None]:
# Build both sides of the join from a dict of columns, as the other cells of
# this notebook do, instead of creating an empty frame and assigning Python
# lists column by column (list assignment is not accepted by Dask DataFrames).
df_a = vdf.VDataFrame({
    'key': ['a', 'b', 'c', 'd', 'e'],
    'vals_a': [float(i + 10) for i in range(5)],
})

df_b = vdf.VDataFrame({
    'key': ['a', 'c', 'e'],
    'vals_b': [float(i + 100) for i in range(3)],
})

# Left join: every row of `df_a` is kept; keys missing from `df_b` get a
# missing value in `vals_b`.
merged = df_a.merge(df_b, on=['key'], how='left').compute()
merged

# FIXME Grouping
Virtual Dataframe supports the Split-Apply-Combine groupby paradigm.

In [None]:
# Derive the two grouping keys with column arithmetic rather than assigning
# Python lists sized with `len(df)`: list assignment is not accepted by Dask
# DataFrames, and `len(df)` forces a computation on lazy backends.
# Column `a` holds 0..n-1 in the frames built earlier in this notebook, so it
# stands in for the row position — TODO confirm if `df` is rebuilt differently.
df['agg_col1'] = (df['a'] % 2 == 0).astype('int64')
df['agg_col2'] = (df['a'] % 3 == 0).astype('int64')

# No separate `ddf` is created: the following cells group `df` directly, and
# the frame is already in the representation selected by VDF_MODE.


Grouping and then applying the sum function to the grouped data.

In [None]:
df.groupby('agg_col1').sum().compute()

Grouping hierarchically then applying the sum function to grouped data.

In [None]:
df.groupby(['agg_col1', 'agg_col2']).sum().compute()

Grouping and applying statistical functions to specific columns, using agg.

In [None]:
# FIXME
df.groupby('agg_col1').agg({'a':'max', 'b':'mean', 'c':'sum'}).compute()

# Transpose
Transposing a dataframe, using either the transpose method or `T` property.
Currently, all columns must have the same type.

> Transposing is not currently implemented in `dask-cudf`.

In [None]:
df = vdf.VDataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})
df

In [None]:
df.transpose().compute()

# FIXME Time Series
`VDataFrame` supports datetime typed columns, which allow users to interact with and filter data based on specific timestamps.

In [None]:
# FIXME
# One random value per row of the date frame built in the next cell.
# The size is spelled out because `date_df` does not exist yet at this point;
# it must match `periods=72` in that cell.
rand_len = cp.random.sample(72)

In [None]:
#FIXME
import datetime as dt

date_df = vdf.VDataFrame()
date_df['date'] = pd.date_range('11/20/2018', periods=72, freq='D')
date_df['value'] = rand_len

search_date = dt.datetime.strptime('2018-11-23', '%Y-%m-%d')
date_df.query('date <= @search_date')

In [None]:
# FIXME
# Query the virtual frame directly: `dask_cudf` is never imported in this
# notebook, and `date_df` is already in the representation selected by
# VDF_MODE. `local_dict` supplies the value bound to `@search_date`.
date_df.query('date <= @search_date', local_dict={'search_date': search_date}).compute()

# FIXME Categoricals
VDataFrames support categorical columns.

In [None]:
# FIXME
cdf = vdf.VDataFrame({"id": [1, 2, 3, 4, 5, 6], "grade":['a', 'b', 'b', 'a', 'a', 'e']}, npartitions=2)
# Convert the column of `cdf` itself; the unrelated `df` has no `grade` column.
cdf['grade'] = cdf['grade'].astype('category')
cdf.compute()

Accessing the categories of a column.

> Note that this is currently not supported in `dask-cudf`.

In [None]:
# FIXME
cdf.grade.cat.categories

Accessing the underlying code values of each categorical observation.

In [None]:
cdf.grade.cat.codes.compute()

# Converting Data Representation
## Pandas
Converting a `VDataFrame` to a pandas DataFrame.

In [None]:
df.head().to_pandas()

## Numpy
Converting a `VDataFrame` to a numpy ndarray.

In [None]:
df.to_numpy()

Converting a `VSeries` to a numpy ndarray.

In [None]:
df['a'].to_numpy()

## Arrow
Converting a `VDataFrame` to a PyArrow Table.

> Note that this functionality is not available for `pandas` or `dask`.

In [None]:
if vdf.VDF_MODE in (vdf.Mode.cudf, vdf.Mode.dask_cudf):
    df.to_arrow()

# Getting Data In/Out
## CSV
Writing to a CSV file.

In [None]:
# Create the output directory if needed. `exist_ok=True` makes the call a
# no-op when the directory is already there, with no separate existence check.
os.makedirs('example_output', exist_ok=True)

df.to_csv('example_output/foo.csv', index=False)

Reading from a csv file.

In [None]:
df = vdf.read_csv('example_output/foo.csv')
df.compute()

Reading all CSV files in a directory into a single dask_cudf.DataFrame, using the star wildcard.
> Not implemented in pandas or cudf ?

In [None]:
# FIXME: extends pandas
if vdf.VDF_MODE in (vdf.Mode.dask, vdf.Mode.dask_cudf):
    df = vdf.read_csv('example_output/*.csv')
    df.compute()

## Parquet
Writing to parquet files, using the CPU via PyArrow.

In [None]:
df.to_parquet('example_output/temp_parquet')

## ORC
Reading ORC files.
> FIXME

In [None]:
# Build the path of the ORC sample file. This only runs in the dask-based
# modes, so `file_path` stays undefined in the other modes.
if vdf.VDF_MODE in (vdf.Mode.dask, vdf.Mode.dask_cudf):
    from pathlib import Path
    # Fourth ancestor of the current working directory.
    # NOTE(review): presumably the root of a cuDF source checkout — confirm;
    # indexing `parents[3]` fails if the working directory is shallower.
    cudf_root = Path(".").absolute().parents[3]
    # Location of the sample file relative to that root.
    orc_file = Path("python/cudf/cudf/tests/data/orc/TestOrcFile.test1.orc")
    file_path = cudf_root / orc_file

In [None]:
# FIXME
# `file_path` is only defined by the previous cell in the dask-based modes, so
# apply the same guard here instead of failing with a NameError elsewhere.
df2 = (
    vdf.read_orc(file_path)
    if vdf.VDF_MODE in (vdf.Mode.dask, vdf.Mode.dask_cudf)
    else None
)
df2
df2