# Basics of pandas.DataFrame

In [9]:
import numpy as np
import pandas as pd

## Create DataFrame

### From list

In [10]:
# from one dimensional list
df = pd.DataFrame([10,20,30],columns=['col1'])
df

Unnamed: 0,col1
0,10
1,20
2,30


In [11]:
# from multi dimensional list
df = pd.DataFrame([
    [10,20,30],
    [4,7,9]
], columns=['col1', 'col2', 'col3'])

df

Unnamed: 0,col1,col2,col3
0,10,20,30
1,4,7,9


### from  numpy arrays

In [12]:
# from multi dimensional nddarray
ndarr = np.arange(1,7).reshape(3,2)
print(ndarr)

df = pd.DataFrame(ndarr, index=['row1','row2','row3'],columns=['col1', 'col2'])
df

[[1 2]
 [3 4]
 [5 6]]


Unnamed: 0,col1,col2
row1,1,2
row2,3,4
row3,5,6


### from Python Dictionary

In [13]:
# from a dictionary of equally shaped arrays
# keys will be the columns names
# array values will be the data.

prices_dict = {
    "fruits": ["apples", "oranges", "bananas", "strawberries"],
    "prices": [1.5, 2, 2.5, 3],
    "suppliers": ["supplier1", "supplier2", "supplier4", "supplier3"],    
}

prices_df = pd.DataFrame(prices_dict, index = [1,2,3,4])
prices_df

Unnamed: 0,fruits,prices,suppliers
1,apples,1.5,supplier1
2,oranges,2.0,supplier2
3,bananas,2.5,supplier4
4,strawberries,3.0,supplier3


## Index DataFrame

### Selecting Columns

#### Select Single Column (Series Obj)

In [14]:
## select single column - square bracket notation:
prices_df['prices']


1    1.5
2    2.0
3    2.5
4    3.0
Name: prices, dtype: float64

In [15]:
## select single column
# attribute (dot) notation:
prices_df.prices

1    1.5
2    2.0
3    2.5
4    3.0
Name: prices, dtype: float64

#### square bracket vs dot notation
Note that square bracket notation is more canonical (can be used for 1 or multiple columns selection) and allows for any string to be used as selector. I.e you can't use the dot notation, if the column name contains spaces, or is a reserverd word (like max, min, etc.)


In [16]:
demo_df = pd.DataFrame([[1,2,3],[4,5,6]], columns=['col 1', 'col 2', 'col 3'])
demo_df.'col 1'

SyntaxError: invalid syntax (<ipython-input-16-08e0c6b4908f>, line 2)

In [None]:
# When we select a single column, the returned data is a Series object!
type(prices_df['prices'])

#### Select List of Columns

Note, that the columns will be selected in the order specified in the list

In [29]:
prices_df[['prices', 'fruits']]

Unnamed: 0,prices,fruits
1,1.5,apples
2,2.0,oranges
3,2.5,bananas
4,3.0,strawberries


The returned slice is a DataFrame object!

In [33]:
type(prices_df[['prices', 'fruits']])
prices_df

Unnamed: 0,fruits,prices,suppliers
1,apples,1.5,supplier1
2,oranges,2.0,supplier2
3,bananas,2.5,supplier4
4,strawberries,3.0,supplier3


*Note, that if you want to swap columns using the list indexing will swap only the columns name, but not the coresponding values:*

The way to swap columns with their respetive vlaues is discuused futher

In [34]:
prices_df[['fruits', 'prices']] = prices_df[['prices', 'fruits']]
prices_df

Unnamed: 0,fruits,prices,suppliers
1,1.5,apples,supplier1
2,2.0,oranges,supplier2
3,2.5,bananas,supplier4
4,3.0,strawberries,supplier3


### select columns with the loc method

**Syntax**: df.loc[row_indexer,column_indexer]

In [35]:
prices_df.loc[:, ['fruits', 'prices']]

Unnamed: 0,fruits,prices
1,1.5,apples
2,2.0,oranges
3,2.5,bananas
4,3.0,strawberries


In [None]:
# transform a subset of columns
# prices_df[['prices','fruits']] = prices_df[['fruits','prices']]

prices_df.loc[:,['prices','fruits']] = prices_df.loc[:,['fruits', 'prices']]

prices_df

### can load data from multiple file formats

http://pandas.pydata.org/pandas-docs/stable/io.html

In [None]:
data = pd.read_csv("../../datasets/drinks.csv")

data.head(5)

## usefull properties and methods

In [None]:
data.index

In [None]:
data.columns

In [None]:
data.shape

### get info and stats on the DataFrame

In [None]:
data.info()

In [None]:
data.describe()

### get rows

In [None]:
# by specifying integer location:
data.iloc[0]

In [17]:
data.iloc[0:5]

NameError: name 'data' is not defined

### get values

In [None]:
data.wine_servings.max()

In [None]:
data[data.wine_servings == data.wine_servings.max()]

## set_index()

In [None]:
new_data = data.set_index("country")

In [None]:
data.head(3)

In [None]:
new_data.head(3)

In [None]:
# remove the row index name
new_data.index.name = ""

### reset index (and return column back)

In [None]:
# we use loc[] to select by name location
new_data.loc['Albania']

In [None]:
# the row above was returned as Series object, but
# the slice is returned as DataFrame object
new_data.loc["Algeria":"Bahamas"]

### reset the index (and return the column back)

In [None]:
new_data.index.name = "Country"

In [None]:
new_data.reset_index(inplace=True)
new_data.head(3)

## Clean the values

In [None]:
dirty = pd.DataFrame({"col1": [1, 2, None, 4], "col2":[None, 4, 6, 9]})
dirty.head(3)

### Find all NaN vlaues

In [None]:
dirty.isnull().sum() 

### Fill all NaN vlaues with 0

In [None]:
clean = dirty.fillna(value=0).astype("int32")
clean.head(3)

In [None]:
# verify it is clear
clean.isnull().sum() 