# Pandas

In [3]:
from IPython.display import display
import pandas as pd
import numpy as np

from sklearn.datasets import load_iris

# Table of contents

 - Pandas Series
     - Initialize a Pandas Series
     - Create a Pandas Series from a dictionary
     - Masking in Pandas Series
     - Memory concerns in Pandas Series
 - Pandas DataFrames
     - Initialize a Pandas DataFrame
     - Create a Pandas DataFrame from a dictionary
     - Double Index with Pandas DataFrames
     - Memory concerns in Pandas DataFrames

# Pandas Series

## Initialize a Pandas Series

In [20]:
# Build a labelled Series: `data` supplies the values and `index` supplies
# the label attached to each value, position by position.
pd.Series(
    data=["Larry", "Bill", "Mark", "Steve"],
    index=["Google", "Microsoft", "Facebook", "Apple"],
)

Google       Larry
Microsoft     Bill
Facebook      Mark
Apple        Steve
dtype: object

## Create a Pandas Series from a dictionary

In [21]:
# Map each company to its founder's first name.
my_dict = dict(
    Google="Larry",
    Microsoft="Bill",
    Facebook="Mark",
    Apple="Steve",
)

# Passing a dictionary to pd.Series uses the keys as the index labels and
# the values as the data.
my_series = pd.Series(data=my_dict)
my_series

Google       Larry
Microsoft     Bill
Facebook      Mark
Apple        Steve
dtype: object

## Masking in Pandas Series

We can mask the series based on its values.

Here's a normal series

In [25]:
# Five consecutive integers (0 through 4) with the default positional index.
s = pd.Series(data=range(0, 5))
s

0    0
1    1
2    2
3    3
4    4
dtype: int64

Masked with **where** (keeps the values where the condition is true and replaces the rest with `NaN`)

In [35]:
# Keep the entries that satisfy the condition; the others become NaN.
s.where(cond=s > 0)

0    NaN
1    1.0
2    2.0
3    3.0
4    4.0
dtype: float64

*Notice how the series was converted to float: `NaN` is a floating-point value, so an integer series cannot hold it.*

Now with **mask**, which does the opposite of **where**: it replaces the values where the condition is true with `NaN` and keeps the rest.

In [37]:
# Replace the entries that satisfy the condition with NaN; keep the others.
s.mask(cond=s > 0)

0    0.0
1    NaN
2    NaN
3    NaN
4    NaN
dtype: float64

## Memory Concerns in Pandas Series

`Int8` stores each value in 8 bits, which covers the signed integers in the range [-128, 127].

In [71]:
# Store the same 254 integers (-127 .. 126; range() excludes its stop value)
# twice: once with the 8-bit "Int8" dtype and once with the default integer
# dtype that pandas infers.
# NOTE(review): the capitalised "Int8" is pandas' nullable extension dtype,
# which appears to keep a validity mask alongside the values; NumPy's plain
# "int8" would likely be smaller still - confirm which one this demo intends.
s_small = pd.Series(data=range(-127, 127), dtype="Int8")
s_big = pd.Series(data=range(-127, 127))

# memory_usage() defaults to index=True, deep=False, so the reported byte
# counts include the index.
bytes_of_s_small = s_small.memory_usage()
bytes_of_s_big = s_big.memory_usage()

print(f"Small vs Big (in bytes): {bytes_of_s_small} vs {bytes_of_s_big}.")

Small vs Big (in bytes): 588 vs 2112.


# Pandas DataFrames

## Initialize a Pandas DataFrame

In [86]:
# Lay the integers 0..15 out as a 4x4 grid (filled row by row) and attach
# explicit row and column labels.
df = pd.DataFrame(
    data=np.arange(16, dtype=int).reshape(4, 4),
    index=["row_1", "row_2", "row_3", "row_4"],
    columns=["col_1", "col_2", "col_3", "col_4"],
)
df

Unnamed: 0,col_1,col_2,col_3,col_4
row_1,0,1,2,3
row_2,4,5,6,7
row_3,8,9,10,11
row_4,12,13,14,15


A more organized way to do it

In [94]:
# One list per attribute; position i in every list describes the same company.
company = ["Google", "Microsoft", "Facebook", "Apple"]
founder_name = ["Larry", "Bill", "Mark", "Steve"]
founder_surname = ["Page", "Gates", "Zuckerberg", "Jobs"]

# Each list becomes one labelled row, so the columns are just the positions 0..3.
df = pd.DataFrame(
    data=[company, founder_name, founder_surname],
    index=["company", "founder_name", "founder_surname"],
)
display(df)

Unnamed: 0,0,1,2,3
company,Google,Microsoft,Facebook,Apple
founder_name,Larry,Bill,Mark,Steve
founder_surname,Page,Gates,Zuckerberg,Jobs


Following the tidy data concept (one column per variable, one row per observation), we transpose the frame:

In [95]:
# Swap rows and columns so that each company becomes a row.
df.transpose()

Unnamed: 0,company,founder_name,founder_surname
0,Google,Larry,Page
1,Microsoft,Bill,Gates
2,Facebook,Mark,Zuckerberg
3,Apple,Steve,Jobs


## Create a Pandas DataFrame from a dictionary

The dictionary keys become the column names.

In [96]:
# Column-oriented description of the data: each key names a column and each
# list holds that column's values in row order.
tech_companies_dictionary = dict(
    company=["Google", "Microsoft", "Facebook", "Apple"],
    founder_name=["Larry", "Bill", "Mark", "Steve"],
    founder_surname=["Page", "Gates", "Zuckerberg", "Jobs"],
)

# The resulting frame gets the default positional row index (0..3).
df = pd.DataFrame(data=tech_companies_dictionary)
df

Unnamed: 0,company,founder_name,founder_surname
0,Google,Larry,Page
1,Microsoft,Bill,Gates
2,Facebook,Mark,Zuckerberg
3,Apple,Steve,Jobs
