<h1>Pandas</h1>
<h3>What is a dataframe?</h3>
<p>A DataFrame is a two-dimensional labelled data structure with columns of potentially different types. It is made up of three components:</p>
<ul>
    <li>Data</li>
    <li>Index</li>
    <li>Columns</li>
</ul>

In [2]:
import numpy as np
import pandas as pd

<h3>Structured arrays</h3>

In [25]:
# Structured array: every element is a record with a named int field ("foo")
# and a named float field ("bar")
record_dtype = np.dtype([("foo", int), ("bar", float)])
my_array = np.ones(3, dtype=record_dtype)

# Show the whole array first, then each field selected by its name
print(f"Structured array: {my_array}")
print(f"Structured array int: {my_array['foo']}")
print(f"Structured array float: {my_array['bar']}")

# Record array: a view of the same array whose fields are reachable as attributes
my_array2 = my_array.view(np.recarray)

# Same fields as above, this time through attribute access
print(f"Record array int: {my_array2.foo}")
print(f"Record array float: {my_array2.bar}")

Structured array: [(1, 1.) (1, 1.) (1, 1.)]
Structured array int: [1 1 1]
Structured array float: [1. 1. 1.]
Record array int: [1 1 1]
Record array float: [1. 1. 1.]


<h3>Pandas series</h3>

<h5>One-dimensional labeled array</h5>

<grid-container>
  <grid-item class='label'>Label</grid-item>
  <grid-item class='label'>Label</grid-item>
  <grid-item class='label'>Label</grid-item>
  <grid-item>Data</grid-item>
  <grid-item>Data</grid-item>
  <grid-item>Data</grid-item>
</grid-container>

<style>
  grid-container {
    display: grid;
    grid-template-columns: 1fr 1fr 1fr;
    grid-auto-rows: 75px;
    grid-gap: 5px;
    justify-content: center;
    align-content: center;
    text-align: center;
}

  grid-container {
    /* background-color: deepskyblue; */
    border: 1px solid #bbb;
    padding: 10px;
  }
  grid-item {
    background-color: deepskyblue;
    border: 1px solid #ccc;
  }
  .label{
    padding-bottom: 10px;
    background-color: grey;
  }
</style>

In [31]:
# Values for the series, held in a NumPy array
data = np.array(['a', 'b', 'c', 'd', 'e'])

# No index given: pandas labels the values 0, 1, 2, ...
s = pd.Series(data)
print(s)

# Same values again, now labelled with consecutive integers starting at 1000
print('\nPandas series with modified index')
custom_labels = [1000 + position for position in range(len(data))]
s = pd.Series(data, index=custom_labels)
print(s)

0    a
1    b
2    c
3    d
4    e
dtype: object

Pandas series with modified index
1000    a
1001    b
1002    c
1003    d
1004    e
dtype: object


<h3>Pandas Dataframe</h3>

<h5>Two-dimensional labeled array</h5>

<grid-container>
  <grid-item class='label' style="opacity: 0"></grid-item>
  <grid-item class='label'>Label</grid-item>
  <grid-item class='label'>Label</grid-item>
  <grid-item class='label'>Label</grid-item>
  <grid-item>Data</grid-item>
  <grid-item>Data</grid-item>
  <grid-item class='label'>Label</grid-item>
  <grid-item>Data</grid-item>
  <grid-item>Data</grid-item>
</grid-container>

<style>
  grid-container {
    display: grid;
    grid-template-columns: 1fr 1fr 1fr;
    grid-auto-rows: 75px;
    grid-gap: 5px;
    justify-content: center;
    align-content: center;
    text-align: center;
}

  grid-container {
    /* background-color: deepskyblue; */
    border: 1px solid #bbb;
    padding: 10px;
  }
  grid-item {
    background-color: deepskyblue;
    border: 1px solid #ccc;
  }
  .label{
    padding-bottom: 10px;
    background-color: grey;
  }
  .grid-hiderows .item:nth-child(2) {
    visibility: hidden;
}
</style>

In [41]:
# A NumPy array holds a single dtype, so mixing labels and numbers in one
# array turns every element (including 1, 2, 3, 4) into a string.
data = np.array([['', 'Col1', 'Col2'],
                 ['Row1', 1, 2],
                 ['Row2', 3, 4]])

# Rows from the second onward, columns from the second onward -> the values
print(f'data: {data[1:,1:]} \n')
# Rows from the second onward, first column only -> the row labels
print(f'index: {data[1:,0]} \n')
# First row, columns from the second onward -> the column labels
print(f'Columns: {data[0,1:]} \n')

# Build the frame once and reuse it. The values are cast back to integers so
# that arithmetic on the frame is numeric instead of operating on strings.
df_from_array = pd.DataFrame(data=data[1:, 1:].astype(int),
                             index=data[1:, 0],
                             columns=data[0, 1:])

# Plain-text rendering
print(df_from_array)

# Rich rendering as the cell's output
df_from_array

data: [['1' '2']
 ['3' '4']] 

index: ['Row1' 'Row2'] 

Columns: ['Col1' 'Col2'] 

     Col1 Col2
Row1    1    2
Row2    3    4


Unnamed: 0,Col1,Col2
Row1,1,2
Row2,3,4


In [45]:
# From a 2-D NumPy array: row and column labels are generated automatically
my_2darray = np.array([[1, 2, 3], [4, 5, 6]])
print(pd.DataFrame(my_2darray), end="\n\n")

# From a dictionary: each key labels a column, each list supplies its values
my_dict = {1: ["1", "3"], 2: ["1", "2"], 3: ["2", "4"]}
print(pd.DataFrame(my_dict), end="\n\n")

# From another DataFrame
my_df = pd.DataFrame(data=[4, 5, 6, 7], index=range(4), columns=["A"])
print(pd.DataFrame(my_df), end="\n\n")

# From a Series: the Series labels become the row labels of a one-column frame
capital_by_country = {
    "Belgium": "Brussels",
    "India": "New Delhi",
    "United Kingdom": "London",
    "United States": "Washington",
}
my_series = pd.Series(capital_by_country)
print(pd.DataFrame(my_series))

   0  1  2
0  1  2  3
1  4  5  6

   1  2  3
0  1  1  2
1  3  2  4

   A
0  4
1  5
2  6
3  7

                         0
Belgium           Brussels
India            New Delhi
United Kingdom      London
United States   Washington


<h3>Dataframe Operations</h3>

In [57]:
# Country -> capital mapping; the countries end up as the row labels
capitals = pd.Series(
    {
        "Belgium": "Brussels",
        "India": "New Delhi",
        "United Kingdom": "London",
        "United States": "Washington",
    }
)

# Wrapping the Series gives a frame with a single column
df = pd.DataFrame(capitals)
df

Unnamed: 0,0
Belgium,Brussels
India,New Delhi
United Kingdom,London
United States,Washington


In [53]:
# Dimensions of the frame as a (rows, columns) tuple
df.shape

(4, 1)

In [54]:
# Number of rows in the frame
len(df)

4

In [55]:
# Row labels of the frame
df.index

Index(['Belgium', 'India', 'United Kingdom', 'United States'], dtype='object')

In [56]:
# Column labels of the frame
df.columns

RangeIndex(start=0, stop=1, step=1)

In [58]:
# Transpose: rows become columns and columns become rows
df.T

Unnamed: 0,Belgium,India,United Kingdom,United States
0,Brussels,New Delhi,London,Washington


In [60]:
# Row labels of the transposed frame (the column labels of `df`)
df.T.index

RangeIndex(start=0, stop=1, step=1)

In [63]:
# Column labels of the transposed frame (the row labels of `df`)
df.T.columns

Index(['Belgium', 'India', 'United Kingdom', 'United States'], dtype='object')

In [64]:
# 3x3 integer frame, written row by row so the code mirrors the displayed table
df = pd.DataFrame(
    [
        [1, 2, 3],
        [4, 5, 6],
        [7, 8, 9],
    ],
    columns=["A", "B", "C"],
)
df

Unnamed: 0,A,B,C
0,1,2,3
1,4,5,6
2,7,8,9


<h3>Indexing a dataframe</h3>

In [66]:
# All four accessors below read the same cell: first row, column 'A'.
# Row and column are passed together in a single indexing call. Chaining two
# lookups (e.g. df.iloc[0][0]) first extracts a Series and then applies an
# integer key to a Series labelled 'A', 'B', 'C', which relies on a positional
# fallback that pandas has deprecated.

# Using `iloc[]`: integer position for both row and column
print(f"Using iloc: {df.iloc[0, 0]}")

# Using `loc[]`: label for both row and column
print(f"Using loc: {df.loc[0, 'A']}")

# Using `at[]`: label-based access to a single value
print(f"Using at: {df.at[0,'A']}")

# Using `iat[]`: position-based access to a single value
print(f"Using iat: {df.iat[0,0]}")

Using iloc: 1
Using loc: 1
Using at: 1
Using iat: 1


In [68]:
# Positional lookup: `iloc[0]` returns row `0` as a Series
first_row = df.iloc[0]
print(first_row)

# Label lookup: `loc[:, 'A']` keeps every row and picks column `'A'`
column_a = df.loc[:, 'A']
print(column_a)

A    1
B    2
C    3
Name: 0, dtype: int64
0    1
1    4
2    7
Name: A, dtype: int64


<h3>Changing index</h3>

In [69]:
# Show the frame in its current form
print(df)

# Re-index by column 'C'. set_index returns a new frame, which becomes this
# cell's output; `df` itself stays as it was because the result is not assigned.
df.set_index(keys='C')

   A  B  C
0  1  2  3
1  4  5  6
2  7  8  9


Unnamed: 0_level_0,A,B
C,Unnamed: 1_level_1,Unnamed: 2_level_1
3,1,2
6,4,5
9,7,8
