In [1]:
import numpy as np

In [2]:
import pandas as pd

## DataFrame

    - Is a 2-dimensional labeled data structure with columns of potentially different types.
    - Think of it like a spreadsheet or SQL table, or a dictionary of Series objects
    - The most commonly used Pandas object

In [78]:
# To demonstrate this, let us construct a DataFrame of random values with
# explicit row and column labels (no seed is set here, so the numbers change on every run)

row_labels = ['R1','R2','R3','R4','R5']
column_labels = ['C1','C2','C3','C4','C5']
df = pd.DataFrame(np.random.rand(5,5), index=row_labels, columns=column_labels)
df

Unnamed: 0,C1,C2,C3,C4,C5
R1,0.366603,0.503851,0.318404,0.499654,0.777916
R2,0.625001,0.470069,0.367101,0.023238,0.360522
R3,0.480321,0.634994,0.926654,0.232327,0.093469
R4,0.558536,0.793869,0.945158,0.899596,0.823827
R5,0.79419,0.112965,0.685917,0.828156,0.338256


### DataFrame as a specialized dictionary

In [4]:
# Area of each state, keyed by state name; the dictionary keys become the Series index
area_dict = {
    'California': 423967,
    'Texas': 695662,
    'New York': 141297,
    'Florida': 170312,
    'Illinois': 149995,
}
area = pd.Series(data=area_dict)
area

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
dtype: int64

In [5]:
# Population of each state, keyed by the same state names as the area Series
population_dict = {
    'California': 38332521,
    'Texas': 26448193,
    'New York': 19651127,
    'Florida': 19552860,
    'Illinois': 12882135,
}
population = pd.Series(data=population_dict)
population

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64

In [6]:
# Two-letter code of each state; string values give the Series dtype object
code_dict = {
    'California': 'CA',
    'Texas': 'TX',
    'New York': 'NY',
    'Florida': 'FL',
    'Illinois': 'IL',
}
code = pd.Series(data=code_dict)
code

California    CA
Texas         TX
New York      NY
Florida       FL
Illinois      IL
dtype: object

In [7]:
# For checking purposes

type(population)

pandas.core.series.Series

In [8]:
# Now that we have all three Series, we can use a dictionary to construct a single two-dimensional object containing this information

In [9]:
# Each dictionary key becomes a column name; the state labels shared by the
# three Series become the row index
columns_by_name = {'area': area, 'population': population, 'code': code}
states = pd.DataFrame(columns_by_name)
states

Unnamed: 0,area,population,code
California,423967,38332521,CA
Texas,695662,26448193,TX
New York,141297,19651127,NY
Florida,170312,19552860,FL
Illinois,149995,12882135,IL


In [10]:
# We can use type() to check that it is indeed a DataFrame

type(states)

pandas.core.frame.DataFrame

In [11]:
# Like a Series object, a DataFrame also has an index attribute, which gives access to the index labels

states.index

Index(['California', 'Texas', 'New York', 'Florida', 'Illinois'], dtype='object')

In [12]:
# It also has a columns attribute, which holds the column labels

states.columns

Index(['area', 'population', 'code'], dtype='object')

In [13]:
# You can think of a DataFrame as a generalization of a 2-dimensional NumPy array, where both rows and columns
# have indices for accessing data

In [14]:
# Indexing the DataFrame with the 'area' column name returns the Series object containing the areas

states['area']

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

## Constructing DataFrame objects

In [15]:
# A Pandas DataFrame can be constructed in various ways

### From a single Series object

In [16]:
# Remember that a DataFrame is a collection of Series objects
# A single-column DataFrame can be built from a single Series object

pd.DataFrame(population, columns=['population'])

Unnamed: 0,population
California,38332521
Texas,26448193
New York,19651127
Florida,19552860
Illinois,12882135


### From a list of dictionaries

In [17]:
# Any list of dictionaries can serve as the basis of a DataFrame:
# each dictionary is one row, and its keys are the column names

data = [dict(a=n, b=n ** 2) for n in range(3)]
data

[{'a': 0, 'b': 0}, {'a': 1, 'b': 1}, {'a': 2, 'b': 4}]

In [18]:
pd.DataFrame(data)

Unnamed: 0,a,b
0,0,0
1,1,1
2,2,4


### From a dictionary of Series Objects

In [19]:
# You already saw this one earlier

type(area)

pandas.core.series.Series

In [20]:
type(population)

pandas.core.series.Series

In [21]:
# Column names are taken from the dictionary keys, in insertion order
pd.DataFrame({'population': population, 'area': area, 'code': code})

Unnamed: 0,populaton,area,code
California,38332521,423967,CA
Texas,26448193,695662,TX
New York,19651127,141297,NY
Florida,19552860,170312,FL
Illinois,12882135,149995,IL


### From a two-dimensional NumPy array

In [22]:
# Given a 2-dimensional array of data we can also create a DataFrame
# Let us pass the columns and index parameters here to label the columns and rows

pd.DataFrame(np.random.randn(3,2), columns=['column 1','column 2'], index=['row 1', 'row 2', 'row 3'])

Unnamed: 0,column 1,column 2
row 1,0.899663,-1.000264
row 2,0.503857,-1.175122
row 3,0.632132,0.883688


## Data Selection in DataFrame

In [23]:
# If you are familiar with accessing values of a NumPy array or Series this will look similar

In [24]:
states

Unnamed: 0,area,population,code
California,423967,38332521,CA
Texas,695662,26448193,TX
New York,141297,19651127,NY
Florida,170312,19552860,FL
Illinois,149995,12882135,IL


In [25]:
# The Series that corresponds to the columns of the DataFrame can be accessed via dictionary-style indexing of the column names

states['area']

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

In [26]:
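# As an alternative, attribute-style access also works (not recommended: it breaks for
# column names that are not valid Python identifiers or that clash with DataFrame attributes)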

states.area

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

In [27]:
# You can also use .iloc (implicit) - integer-based

states

Unnamed: 0,area,population,code
California,423967,38332521,CA
Texas,695662,26448193,TX
New York,141297,19651127,NY
Florida,170312,19552860,FL
Illinois,149995,12882135,IL


In [28]:
states.iloc[:3,:2]

Unnamed: 0,area,population
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127


In [29]:
states.iloc[2:3,:]

Unnamed: 0,area,population,code
New York,141297,19651127,NY


In [30]:
states.iloc[2,:]

area            141297
population    19651127
code                NY
Name: New York, dtype: object

In [31]:
# You can also use the .loc (explicit)

states.loc[:'New York','population':]

Unnamed: 0,population,code
California,38332521,CA
Texas,26448193,TX
New York,19651127,NY


In [32]:
# Keep in mind that a DataFrame is an enhanced two-dimensional array thus we can also transpose the full DataFrame

np.transpose(states)

Unnamed: 0,California,Texas,New York,Florida,Illinois
area,423967,695662,141297,170312,149995
population,38332521,26448193,19651127,19552860,12882135
code,CA,TX,NY,FL,IL


In [33]:
states.T

Unnamed: 0,California,Texas,New York,Florida,Illinois
area,423967,695662,141297,170312,149995
population,38332521,26448193,19651127,19552860,12882135
code,CA,TX,NY,FL,IL


In [34]:
states

Unnamed: 0,area,population,code
California,423967,38332521,CA
Texas,695662,26448193,TX
New York,141297,19651127,NY
Florida,170312,19552860,FL
Illinois,149995,12882135,IL


In [35]:
# You can also do conditional selections

states[(states['area'] < 200000)]

Unnamed: 0,area,population,code
New York,141297,19651127,NY
Florida,170312,19552860,FL
Illinois,149995,12882135,IL


In [36]:
# You can also combine several conditions with & (each condition needs its own parentheses)

states[(states['area'] < 200000) & (states['population'] < 200000000)]

Unnamed: 0,area,population,code
New York,141297,19651127,NY
Florida,170312,19552860,FL
Illinois,149995,12882135,IL


In [37]:
# You can also select specific columns to display: a single .loc call takes
# the row condition and the list of columns together, instead of chaining two [] lookups

row_mask = (states['area'] < 200000) & (states['population'] < 200000000)
states.loc[row_mask, ['area','code']]

Unnamed: 0,area,code
New York,141297,NY
Florida,170312,FL
Illinois,149995,IL


### Creating new columns in a DataFrame

In [38]:
states

Unnamed: 0,area,population,code
California,423967,38332521,CA
Texas,695662,26448193,TX
New York,141297,19651127,NY
Florida,170312,19552860,FL
Illinois,149995,12882135,IL


In [39]:
# You can add a new column to an existing DataFrame by assigning to a new column name

states['density'] = states['population'] / states['area']
states

Unnamed: 0,area,population,code,density
California,423967,38332521,CA,90.413926
Texas,695662,26448193,TX,38.01874
New York,141297,19651127,NY,139.076746
Florida,170312,19552860,FL,114.806121
Illinois,149995,12882135,IL,85.883763


In [40]:
# You can also delete (drop) a column with .drop(); by default this does not modify the DataFrame in place

states.drop('density', axis = 1) # axis 1 is column

Unnamed: 0,area,population,code
California,423967,38332521,CA
Texas,695662,26448193,TX
New York,141297,19651127,NY
Florida,170312,19552860,FL
Illinois,149995,12882135,IL


In [41]:
# You can also delete (drop) a row with .drop(); by default this does not modify the DataFrame in place

states.drop('Texas', axis = 0) # axis 0 is row

Unnamed: 0,area,population,code,density
California,423967,38332521,CA,90.413926
New York,141297,19651127,NY,139.076746
Florida,170312,19552860,FL,114.806121
Illinois,149995,12882135,IL,85.883763


In [42]:
# The drops above were not in place, so states still has the 'density' column and the 'Texas' row

states

Unnamed: 0,area,population,code,density
California,423967,38332521,CA,90.413926
Texas,695662,26448193,TX,38.01874
New York,141297,19651127,NY,139.076746
Florida,170312,19552860,FL,114.806121
Illinois,149995,12882135,IL,85.883763


In [43]:
# Adding new columns to a DataFrame

states['new'] = states['area'] + states['population']
states

Unnamed: 0,area,population,code,density,new
California,423967,38332521,CA,90.413926,38756488
Texas,695662,26448193,TX,38.01874,27143855
New York,141297,19651127,NY,139.076746,19792424
Florida,170312,19552860,FL,114.806121,19723172
Illinois,149995,12882135,IL,85.883763,13032130


In [44]:
# Provide the inplace parameter to drop columns or rows permanently

states.drop('new', axis = 1, inplace = True)

In [45]:
states

Unnamed: 0,area,population,code,density
California,423967,38332521,CA,90.413926
Texas,695662,26448193,TX,38.01874
New York,141297,19651127,NY,139.076746
Florida,170312,19552860,FL,114.806121
Illinois,149995,12882135,IL,85.883763


In [46]:
states

Unnamed: 0,area,population,code,density
California,423967,38332521,CA,90.413926
Texas,695662,26448193,TX,38.01874
New York,141297,19651127,NY,139.076746
Florida,170312,19552860,FL,114.806121
Illinois,149995,12882135,IL,85.883763


In [47]:
states.drop('Florida', axis = 0)

Unnamed: 0,area,population,code,density
California,423967,38332521,CA,90.413926
Texas,695662,26448193,TX,38.01874
New York,141297,19651127,NY,139.076746
Illinois,149995,12882135,IL,85.883763


In [48]:
states

Unnamed: 0,area,population,code,density
California,423967,38332521,CA,90.413926
Texas,695662,26448193,TX,38.01874
New York,141297,19651127,NY,139.076746
Florida,170312,19552860,FL,114.806121
Illinois,149995,12882135,IL,85.883763


In [49]:
states.drop('Florida', axis = 0, inplace = True) # drop 'Florida' key permanently (inplace)

In [50]:
states

Unnamed: 0,area,population,code,density
California,423967,38332521,CA,90.413926
Texas,695662,26448193,TX,38.01874
New York,141297,19651127,NY,139.076746
Illinois,149995,12882135,IL,85.883763


In [52]:
# return the shape of a DataFrame

states.shape

(4, 4)

Unnamed: 0,C1,C2,C3,C4,C5
R1,0.0489,0.328443,0.986177,0.223198,0.11517
R2,0.936125,0.35604,0.251232,0.20257,0.037655
R3,0.568009,0.747464,0.923219,0.486151,0.607783
R4,0.889029,0.557924,0.156068,0.880931,0.873786
R5,0.289699,0.687095,0.542514,0.237057,0.479473


In [79]:
df

Unnamed: 0,C1,C2,C3,C4,C5
R1,0.366603,0.503851,0.318404,0.499654,0.777916
R2,0.625001,0.470069,0.367101,0.023238,0.360522
R3,0.480321,0.634994,0.926654,0.232327,0.093469
R4,0.558536,0.793869,0.945158,0.899596,0.823827
R5,0.79419,0.112965,0.685917,0.828156,0.338256


In [75]:
# Remember that selecting a single column of a DataFrame returns a Series

type(df['C1'])

pandas.core.series.Series

In [77]:
# Likewise, selecting a single row also returns a Series

type(df.loc['R3'])

pandas.core.series.Series