# Intro to Pandas
by Ryan Orsinger

## Module 2: Introducing DataFrames

### Pandas DataFrames Basics - Part 1
- How to make pandas DataFrames from other Python collections
- Inspecting a DataFrame's properties and summary information
- Selecting a single column
- Adding new columns to a DataFrame
- Selecting multiple columns

In [1]:
import pandas as pd

In [2]:
# Creating a dataframe from a list of dictionaries:
# every dictionary describes one row, keyed by column name
basket = [
    {"item": name, "quantity": count, "price": unit_price}
    for name, count, unit_price in (
        ("mango", 4, 2.99),
        ("bread", 2, 3.25),
        ("juice", 1, 5.90),
        ("orange", 3, 2.99),
        ("lime", 3, 0.3),
    )
]
basket

[{'item': 'mango', 'quantity': 4, 'price': 2.99},
 {'item': 'bread', 'quantity': 2, 'price': 3.25},
 {'item': 'juice', 'quantity': 1, 'price': 5.9},
 {'item': 'orange', 'quantity': 3, 'price': 2.99},
 {'item': 'lime', 'quantity': 3, 'price': 0.3}]

In [3]:
# Build a dataframe from the list of dictionaries: the dictionary keys become
# the column names and each dictionary becomes one row.
# With dataframes, our columns are our variables or features.
# Each row represents a unique observation (usually).
df = pd.DataFrame(basket)
df

Unnamed: 0,item,quantity,price
0,mango,4,2.99
1,bread,2,3.25
2,juice,1,5.9
3,orange,3,2.99
4,lime,3,0.3


In [4]:
# Creating a dataframe from a dictionary of lists:
# every key names a column and its list holds that column's values
basket = {}
basket["item"] = ["mango", "bread", "juice", "orange", "lime"]
basket["quantity"] = [4, 2, 1, 3, 3]
basket["price"] = [2.99, 3.25, 5.90, 2.99, 0.30]
basket

{'item': ['mango', 'bread', 'juice', 'orange', 'lime'],
 'quantity': [4, 2, 1, 3, 3],
 'price': [2.99, 3.25, 5.9, 2.99, 0.3]}

In [5]:
# The dictionary keys become the column names; each list holds that column's values
pd.DataFrame(basket)

Unnamed: 0,item,quantity,price
0,mango,4,2.99
1,bread,2,3.25
2,juice,1,5.9
3,orange,3,2.99
4,lime,3,0.3


In [6]:
# Creating a dataframe from a list of lists: each inner list is one row
column_names = ["variable_a", "variable_b", "variable_c"]
row_names = ["observation_1", "observation_2", "observation_3"]
example = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]

# Plain lists carry no labels, so the column and row names are passed explicitly
pd.DataFrame(data=example, index=row_names, columns=column_names)

Unnamed: 0,variable_a,variable_b,variable_c
observation_1,1,2,3
observation_2,4,5,6
observation_3,7,8,9


In [33]:
# Creating an empty dataframe
df = pd.DataFrame()

# Adding columns to a dataframe one at a time with bracket syntax.
# Each list supplies one value per row, so all lists have the same length.
# The item labels match the lowercase names used throughout this notebook.
df["item"] = ["mango", "bread", "juice", "orange", "lime"]
df["quantity"] = [2, 2, 1, 3, 3]
df["price"] = [2.99, 3.25, 5.90, 2.99, 0.30]
df

Unnamed: 0,item,quantity,price
0,mango,2,2.99
1,bread,2,3.25
2,juice,1,5.9
3,orange,3,2.99
4,lime,3,0.3


In [40]:
# .shape is a tuple of (number of rows, number of columns)
df.shape

(5, 3)

In [41]:
# The first element of .shape is the number of rows
df.shape[0]

5

In [42]:
# The second element of .shape is the number of columns
df.shape[1]

3

In [43]:
# len() of a dataframe returns the number of rows
len(df)

5

In [35]:
# .size is the total number of cells in the dataframe: rows * columns
df.size

15

In [8]:
# Adding a new column computed from existing columns.
# Multiplying two columns works element-wise, row by row.
df["subtotal"] = df["quantity"] * df["price"]
df

Unnamed: 0,item,quantity,price,subtotal
0,mango,2,2.99,5.98
1,bread,2,3.25,6.5
2,juice,1,5.9,5.9
3,orange,3,2.99,8.97
4,lime,3,0.3,0.9


In [44]:
# set_index replaces the default integer index with the values of a column.
# The result is assigned back to df rather than using inplace=True, which
# keeps the step explicit and lets it be chained with other methods.
df = df.set_index("item")
df

Unnamed: 0_level_0,quantity,price
item,Unnamed: 1_level_1,Unnamed: 2_level_1
mango,2,2.99
bread,2,3.25
juice,1,5.9
orange,3,2.99
lime,3,0.3


In [48]:
# Accessing the index values (the row labels)
df.index

Index(['mango', 'bread', 'juice', 'orange', 'lime'], dtype='object', name='item')

In [52]:
# We can also overwrite the index by assigning a new set of labels of equal length.
# Here the .str accessor lower-cases every label of the existing index.
df.index = df.index.str.lower()
df

Unnamed: 0_level_0,quantity,price
item,Unnamed: 1_level_1,Unnamed: 2_level_1
mango,2,2.99
bread,2,3.25
juice,1,5.9
orange,3,2.99
lime,3,0.3


In [11]:
# Accessing all the column names
df.columns

Index(['quantity', 'price', 'subtotal'], dtype='object')

In [12]:
# Another example of creating a new column:
# assigning a single value fills every row with that value
df["tax"] = 0.07
df

Unnamed: 0_level_0,quantity,price,subtotal,tax
item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
mango,2,2.99,5.98,0.07
bread,2,3.25,6.5,0.07
juice,1,5.9,5.9,0.07
orange,3,2.99,8.97,0.07
lime,3,0.3,0.9,0.07


In [13]:
# The "total" column does not exist yet, but this bracket syntax creates it.
# total = subtotal plus the tax owed on that subtotal
df["total"] = df["subtotal"] + (df["subtotal"] * df["tax"])
df

Unnamed: 0_level_0,quantity,price,subtotal,tax,total
item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
mango,2,2.99,5.98,0.07,6.3986
bread,2,3.25,6.5,0.07,6.955
juice,1,5.9,5.9,0.07,6.313
orange,3,2.99,8.97,0.07,9.5979
lime,3,0.3,0.9,0.07,0.963


In [14]:
# Dot syntax also allows for accessing an existing column
df.total

item
mango     6.3986
bread     6.9550
juice     6.3130
orange    9.5979
lime      0.9630
Name: total, dtype: float64

In [15]:
# Dot syntax can also overwrite an existing column.
# Here each total is rounded to 2 decimal places.
df.total = df.total.round(2)
df

Unnamed: 0_level_0,quantity,price,subtotal,tax,total
item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
mango,2,2.99,5.98,0.07,6.4
bread,2,3.25,6.5,0.07,6.96
juice,1,5.9,5.9,0.07,6.31
orange,3,2.99,8.97,0.07,9.6
lime,3,0.3,0.9,0.07,0.96


In [16]:
# .dtypes outputs the datatype of each column in the dataframe
df.dtypes

quantity      int64
price       float64
subtotal    float64
tax         float64
total       float64
dtype: object

In [17]:
# .info() prints a summary of the dataframe: the index, each column's
# non-null count and datatype, and the memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, mango to lime
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   quantity  5 non-null      int64  
 1   price     5 non-null      float64
 2   subtotal  5 non-null      float64
 3   tax       5 non-null      float64
 4   total     5 non-null      float64
dtypes: float64(4), int64(1)
memory usage: 412.0+ bytes


In [19]:
# .describe() on a single column shows descriptive stats for that column only
df.price.describe()

count    5.000000
mean     3.086000
std      1.982783
min      0.300000
25%      2.990000
50%      2.990000
75%      3.250000
max      5.900000
Name: price, dtype: float64

In [20]:
# Show descriptive stats (count, mean, std, min, quartiles, max) for numeric columns
df.describe()

Unnamed: 0,quantity,price,subtotal,tax,total
count,5.0,5.0,5.0,5.0,5.0
mean,2.2,3.086,5.65,0.07,6.046
std,0.83666,1.982783,2.935933,0.0,3.143593
min,1.0,0.3,0.9,0.07,0.96
25%,2.0,2.99,5.9,0.07,6.31
50%,2.0,2.99,5.98,0.07,6.4
75%,3.0,3.25,6.5,0.07,6.96
max,3.0,5.9,8.97,0.07,9.6


In [21]:
# A single column of a dataframe is a pandas Series
type(df.quantity)

pandas.core.series.Series

In [22]:
# .value_counts returns a series counting how many times each distinct value occurs
df.quantity.value_counts()

2    2
3    2
1    1
Name: quantity, dtype: int64

In [23]:
# Aggregate functions can run on all the numeric columns in the dataframe at once.
# The result is a series with one value per column.
df.mean()

quantity    2.200
price       3.086
subtotal    5.650
tax         0.070
total       6.046
dtype: float64

In [24]:
# Obtain the median of each numeric column
df.median()

quantity    2.00
price       2.99
subtotal    5.98
tax         0.07
total       6.40
dtype: float64

In [30]:
# Standard deviation of each numeric column
df.std()

quantity    0.836660
price       1.982783
subtotal    2.935933
tax         0.000000
total       3.143593
dtype: float64