# Data ingest: CSV
CSV (comma-separated values) files [(Wikipedia link)](https://en.wikipedia.org/wiki/Comma-separated_values) have a simple, human-readable text format for storing tabular data. Many proprietary software packages for data handling and manipulation (e.g., process data historians) provide functionality for exporting data in CSV format.

**Datasets used**
- iof_data.csv
- iof_data.parq

In [None]:
# Import libraries
import pandas as pd                 # pandas for data ingest and data manipulation

In [None]:
# Data links: download URLs (Google Drive) keyed by a short dataset name
data_url = {
    "iof_data_1min_csv": "https://drive.google.com/uc?id=1_jYVXj7mt8Zzpjn8WGI111G-kWRTbfjU",
    "iof_data_1min_parq": "https://drive.google.com/uc?id=1j5SS136UzbSPu8TqG9RRUMi6-wWF9dzq",
    "mixingTank": "https://drive.google.com/uc?id=1b5Qn5LIa6KAE03Tq4yRVdhTyUmZLxRjt",
    "moons": "https://drive.google.com/uc?id=1a9zTkPEpuHGj6LzGzuLe-JSLg_4GJef4",
    "open_iof_20min": "https://drive.google.com/uc?id=15lkhdBfWnjlpgpEx4T2XcRApKr-dmBb0",
    "open_iof_cleaned": "https://drive.google.com/uc?id=1WVbJvYsGy-iKlsW4WaDZrKy_NhK2tJLW",
}

In [None]:
# Reading data (no additional specification)
# pd.read_csv accepts a URL as well as a local path. With no extra arguments
# a default integer index is created and no column is parsed as a date.
fileName = data_url['iof_data_1min_csv']
df = pd.read_csv(fileName)

In [None]:
# Look at DataFrame contents
# (the value of the last expression in a notebook cell is displayed)
df

In [None]:
# Look at one column of DataFrame (a Series), selected by column name with []
df["plant.feed.iron.comp"]

In [None]:
# Look at one column of DataFrame (a Series), this time using attribute access
# (only possible when the column name is a valid Python identifier, such as 'date')
df.date

In [None]:
# Reading data again, now with the 'date' column used as the index
# and its values parsed as dates (pandas default date parsing)
dateColumn = 'date'
df = pd.read_csv(
    fileName,
    parse_dates=[dateColumn],
    index_col=dateColumn,
)

In [None]:
# Look at DataFrame contents
# (compare the index shown here with the earlier display of df)
df

In [None]:
# ATTRIBUTE: Data types in columns
# (a Series giving the dtype of each column, indexed by column name)
df.dtypes

In [None]:
# ATTRIBUTE: Column names (an Index object holding the column labels)
df.columns

In [None]:
# ATTRIBUTE: Shape (of a DataFrame): tuple of (number of rows, number of columns)
df.shape

In [None]:
# ATTRIBUTE: Shape (of a Series): one-element tuple of (number of rows,)
df["plant.flotation.sump01.starch.flow"].shape

In [None]:
# METHOD: Inspect first few rows (5 rows by default; pass n to change this)
df.head()

In [None]:
# METHOD: Get basic information
# (prints index type, column names with non-null counts and dtypes, memory usage)
df.info()

In [None]:
# METHOD: Get summary statistics (default is for continuous variables)
# .T transposes the result so that each variable becomes a row
df.describe().T

In [None]:
# Inspect specific entries using .loc (named row and named column)
# The row label is a timestamp string; this assumes df is indexed by the
# parsed 'date' column (see the read_csv call with index_col above)
rowOfInterest = '2017-03-11 01:02:00'
columnOfInterest = 'plant.flotation.sump01.amina.flow'
df.loc[rowOfInterest,columnOfInterest]

In [None]:
# Inspect specific entries using .iloc (row and column position number)
# Remember, start counting at 0:
# position 13 is the 14th row, position 3 is the 4th column
rowOfInterest = 13
columnOfInterest = 3
df.iloc[rowOfInterest,columnOfInterest]

In [None]:
# Multiple rows and columns: .loc
# Label-based slices include both the start label and the end label
rowsOfInterestStart = '2017/03/11 01:29:00'
rowsOfInterestEnd = '2017/03/11 01:31:00'
columnsOfInterest = [
    'plant.flotation.sump01.amina.flow',
    'plant.flotation.sump01.discharge.flow',
]
df.loc[rowsOfInterestStart:rowsOfInterestEnd, columnsOfInterest]

In [None]:
# Multiple rows and columns: .iloc
# Remember, last item in range is not included:
# this selects row positions 81 to 87 and column positions 3 and 4
df.iloc[81:88,3:5]

In [None]:
# Multiple rows and columns: .loc with conditional rows
# The comparison yields a boolean Series; rows where it is True are kept,
# and ':' keeps all columns
df.loc[df["plant.flotation.sump01.starch.flow"]>3800,:]

In [None]:
# Multiple rows and columns: .loc with conditional rows, specified columns
# First entry: boolean row condition; second entry: list of columns to keep
df.loc[
    df["plant.flotation.sump01.starch.flow"] > 3800,
    ["plant.flotation.sump01.starch.flow", "plant.flotation.sump01.amina.flow"],
]

In [None]:
# Create new data frame with selection and slicing:
# rows where the starch flow exceeds 3800, restricted to two flow columns
df_subset = df.loc[
    df["plant.flotation.sump01.starch.flow"] > 3800,
    ["plant.flotation.sump01.starch.flow", "plant.flotation.sump01.amina.flow"],
]
# Summary statistics of the subset, transposed so each variable is a row
df_subset.describe().transpose()

In [None]:
# Reading data (specify number of rows, specify specific columns)
# usecols must include the 'date' column because it is used as the index
dateColumn = 'date'
numberOfRows = 10
myColumns = [
    'date',
    'plant.feed.iron.comp',
    'plant.flotation.sump01.discharge.ph',
]
df = pd.read_csv(
    fileName,
    usecols=myColumns,
    nrows=numberOfRows,
    index_col=dateColumn,
    parse_dates=[dateColumn],
)

In [None]:
# Inspect data frame (displayed as the last expression of the cell)
df

In [None]:
# Reading data (larger subset)
# Only the first numberOfRows data rows are read (all columns are kept);
# the 'date' column is parsed as dates and used as the index.
# numberOfRows is passed as nrows; without it the whole file would be read
# and the row limit set here would have no effect.
dateColumn = 'date'
numberOfRows = 10000
df = pd.read_csv(fileName, index_col=dateColumn, parse_dates=[dateColumn], nrows=numberOfRows)

In [None]:
# Simple plotting: One variable, against index
# (Series.plot() draws a line plot by default)
df['plant.flotation.sump01.amina.flow'].plot()

In [None]:
# Simple plotting: One variable, box plot
# (summarises the distribution of the values rather than plotting them against the index)
df['plant.flotation.sump01.amina.flow'].plot.box()

# Data ingest: Parquet files
Parquet files [(Wikipedia link)](https://en.wikipedia.org/wiki/Apache_Parquet) use a data storage format that provides efficient data compression.

In [None]:
# Reading data
# pd.read_parquet needs a Parquet engine (pyarrow or fastparquet) to be installed
fileName = data_url['iof_data_1min_parq']
df = pd.read_parquet(fileName)

In [None]:
# Look at DataFrame contents (now loaded from the Parquet file)
df