In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# then print them one per line in the order os.walk visits them.
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 1.Loading Datasets

We'll be using the Kaggle Heart Disease UCI dataset as an example. You can find it here: https://www.kaggle.com/ronitf/heart-disease-uci

* Manual loading (last resort)
* `np.loadtxt`
* `np.genfromtxt`
* `pd.read_csv`
* `pd.read*`
* `pickle`

In [None]:
import numpy as np
import pandas as pd
import pickle

filename = "/kaggle/input/heart-disease-uci/heart.csv"

## The best method - pandas' `read_csv`
It handles the most edge cases, and copes with datetimes and file issues best.

In [None]:
df = pd.read_csv(filename)
df.head()

## Using numpy's loadtxt and genfromtxt

If you must. Notice that these loaders fail without extra arguments - they're not as smart, so we have to tell them what to do. They were designed for loading data saved with `np.savetxt`, and are not meant to be robust loaders.

In [None]:
# loadtxt needs to be told the separator, and skiprows=1 makes it skip the
# header line, which it could not parse as numbers.
data = np.loadtxt(filename, delimiter=",", skiprows=1)
print(data)

In [None]:
# names=True takes the field names from the first line, dtype=None lets numpy
# infer a type for each column, and "utf-8-sig" drops a leading byte-order mark
# if the file has one. The result is a structured array, so printing its dtype
# shows one named field per column.
data = np.genfromtxt(filename, delimiter=",", dtype=None, names=True, encoding="utf-8-sig")
print(data)
print(data.dtype)

## Manual Loading
For completely weird file structures


In [None]:
def load_file(filename):
    """Load a purely numeric, comma-separated text file into a DataFrame by hand.

    The first line is taken as the comma-separated column names; every other
    non-blank line is split on commas and each field converted to ``float``.
    Blank lines (for example a trailing empty line at the end of the file) are
    skipped instead of aborting the load.

    Parameters
    ----------
    filename : str or path-like
        File to read. It is decoded as UTF-8, and a leading byte-order mark is
        dropped if present.

    Returns
    -------
    pandas.DataFrame
        One row per data line, with the header fields as column names. An
        empty file gives an empty frame.

    Raises
    ------
    ValueError
        If a field on a data line cannot be converted to ``float``.
    """
    with open(filename, encoding="utf-8-sig") as f:
        # Pull the header off first; None means the file had no lines at all.
        header = next(f, None)
        cols = [] if header is None else header.rstrip("\r\n").split(",")
        # Iterate the file lazily; float() ignores the trailing newline on the last field.
        data = [
            [float(x) for x in line.split(",")]
            for line in f
            if line.strip()
        ]

    return pd.DataFrame(data, columns=cols)
load_file(filename).head()

## Pickles!
There is some danger in using pickles, as the encoding can change. Use an industry standard like HDF5 instead if you can. Note that if you're working with dataframes, don't use Python's `pickle` directly; pandas has its own implementation - `df.to_pickle` and `pd.read_pickle`. The underlying algorithm is the same, but there is less code for you to type, and it supports compression.

In [None]:
# Write the pickle into the working directory first, then read it back, so the
# cell does not depend on a pre-made .pkl file being attached to the notebook.
pd.read_csv(filename).to_pickle("heart.pkl")
df = pd.read_pickle("heart.pkl")
df.head()

### Recap

* Use pd.read_csv 99% of the time
* Use pd.read_* for other cases (pd.read_excel, pd.read_pickle, etc)
* If pd can't handle it, I doubt numpy can
* If you use a manual function, save your data to a sensible format

# 2.Numpy vs Pandas
1. Pandas has a numpy core.
2. Extra structure and tools, but sometimes you have to strip it away

In [None]:
import numpy as np
import pandas as pd

df = pd.read_csv("/kaggle/input/heart-disease-uci/heart.csv")

In [None]:
data = df.to_numpy()
# df.values # same
data

See this link for details on why not to use `.values`: https://pandas-docs.github.io/pandas-docs-travis/whatsnew/v0.24.0.html#accessing-the-values-in-a-series-or-index (https://bit.ly/2RrecCR for convenience)

In [None]:
df.head()

In [None]:
print(data.dtype, data)

In [None]:
data[0, 0] = 100
df.head()
# Notice it doesn't modify the original data frame here.
# NOTE(review): presumably the columns have mixed dtypes, so to_numpy() had to
# build a new combined array instead of returning a view - confirm with df.dtypes.

In [None]:
# But we can still explicitly call copy if we wanted to be certain:
# data2 is then an independent array, so writing to it leaves df2 untouched,
# which is what displaying df2 at the end shows.
df2 = df[["age", "sex", "cp"]]
data2 = df2.to_numpy().copy()
data2[0, 0] = 100
df2

In [None]:
df.head()

In [None]:
# Here it DOES modify the original, because to_numpy hasn't needed to make a new array,
# so our reference is to the original, underlying data.
# NOTE(review): with pandas Copy-on-Write enabled, to_numpy() hands back a read-only
# view and this assignment raises instead - confirm against the pandas version in use.
df["age"].to_numpy()[0] = 100
df.head()  # row 0 is the one we wrote to, so the head is enough to see the change

In [None]:
# Many functions you can do in both pandas and numpy
print(df['age'].mean(), df['age'].to_numpy().mean())

In [None]:
# Some are pandas only
print(df['age'].quantile(0.5))
# The numpy array has no .quantile method, so the line below raises
# AttributeError if you uncomment it:
# print(df['age'].to_numpy().quantile(0.5))

In [None]:
# And some are numpy only
# A Series has no .reshape method, so the line below raises AttributeError
# if you uncomment it:
# print(df['age'].reshape((3, -1)))
# -1 lets numpy work out the second dimension from the number of rows
print(df['age'].to_numpy().reshape((3, -1)))

Most of the time, better to keep things in DataFrame format, as you can do more. For some cases, you might need to swap to numpy format, and that's fine.

### Recap:
* Work with pandas as much as you can, more functionality
* Sometimes you need to get the actual array, and use to_numpy()

# 3.Creating DataFrames

Many ways to do it!

In [None]:
import pandas as pd
import numpy as np

# Use a seeded generator so the example shows the same numbers on every run
rng = np.random.default_rng(42)
data = rng.random(size=(5, 3))
print(data)

# Common 2D array and columns method
df = pd.DataFrame(data=data, columns=["A", "B", "C"])
df

In [None]:
# A dictionary of columns
df = pd.DataFrame(data={"A": [1, 2, 3], "B": ["Sam", "Alex", "John"]})
df

In [None]:
# Or a list of rows (ie tuples) with a dtype
# Use the builtin int and a fixed-width unicode field ("U20" = up to 20 characters);
# the old np.int / np.str aliases no longer exist in current NumPy releases.
dtype = [("A", int), ("B", "U20")]
data = np.array([(1, "Sam"), (2, "Alex"), (3, "John")], dtype=dtype)
# A structured array carries its own field names, which become the column names
df = pd.DataFrame(data)
df

In [None]:
# Or the dictionary based version of list of rows
data = [{"A": 1, "B": "Sam"}, {"A": 2, "B": "Alex"}, {"A": 3, "B": "John"}]
df = pd.DataFrame(data)
df

# 4.Saving and Serialising a dataframe


In [None]:
import numpy as np
import pandas as pd

# Let's make a new dataframe and save it out using various formats.
# The generator is seeded so every run writes exactly the same data to disk.
rng = np.random.default_rng(42)
df = pd.DataFrame(rng.random(size=(100000, 4)), columns=["A", "B", "C", "D"])
df.head()

In [None]:
df.to_csv("save.csv", index=False, float_format="%0.4f")

In [None]:
df.to_pickle("save.pkl")

In [None]:
# pip install tables
df.to_hdf("save.hdf", key="data", format="table")

In [None]:
# pip install pyarrow
# (pandas' feather support is built on pyarrow; feather-format is a wrapper around it)
df.to_feather("save.fth")

In [None]:
# If you want to get the timings you can see in the video, you'll need this extension:
# https://jupyter-contrib-nbextensions.readthedocs.io/en/latest/nbextensions/execute_time/readme.html

Now this is a very easy test - it's only numeric data. If we add strings and categorical data, things can slow down a lot! Let's try this on mixed Astronaut data from Kaggle: https://www.kaggle.com/nasa/astronaut-yearbook

In [None]:
df = pd.read_csv("../input/astronaut-yearbook/astronauts.csv")
df.head()

In [None]:
df.to_csv("save.csv", index=False, float_format="%0.4f")

In [None]:
pd.read_csv("./save.csv");

In [None]:
df.to_pickle("save.pkl")

In [None]:
pd.read_pickle("./save.pkl");

In [None]:
df.to_hdf("save.hdf", key="data", format="table")

In [None]:
pd.read_hdf("./save.hdf");

In [None]:
df.to_feather("save.fth")

In [None]:
pd.read_feather("./save.fth");

In [None]:
%ls

### Recap

In terms of file size, HDF5 is the largest for this example. Everything else is approximately equal. For small data sizes, csv is often the easiest as it's human readable. HDF5 is great for *loading* in huge amounts of data quickly. Pickle is faster than CSV, but not human readable.

Lots of options, don't get hung up on any of them. csv and pickle are easy and for most cases work fine.

# 5.Inspecting Data

Astronaut data from Kaggle: https://www.kaggle.com/nasa/astronaut-yearbook

In [None]:
import pandas as pd

df = pd.read_csv("../input/astronaut-yearbook/astronauts.csv")

In [None]:
# First two rows of the dataframe
df.head(2)

In [None]:
# Last row of the dataframe
df.tail(1)

In [None]:
# Three random but different rows in the dataframe
# (set replace=True to allow them to potentially double up)
df.sample(3)

In [None]:
# The type and number of non-null values for each column
df.info()

In [None]:
# Basic stats on all numeric columns
df.describe()

In [None]:
# Shape of the dataframe (nrows, ncols)
df.shape

In [None]:
# Correlation between all numeric columns
# numeric_only=True is spelled out because this frame also holds text columns,
# which recent pandas versions no longer drop silently before correlating.
df.corr(numeric_only=True)

In [None]:
# The number of occurrences of each distinct value in a series
df["Year"].value_counts()

In [None]:
# And a whole host of math functions can be invoked on the dataframe as whole, like so
# (numeric_only=True restricts the reduction to number columns, so text columns
# with missing values cannot break the comparison)
df.max(numeric_only=True)

In [None]:
# Column-wise minimum, restricted to the numeric columns so that text columns
# with missing values cannot break the comparison
df.min(numeric_only=True)

### Recap
* head
* tail
* sample
* info
* describe