Data frames are the heart of the pandas library. Conceptually, a data frame is a two-dimensional data structure with rows and columns, where each row is a series and each column is also a series. Data frames are very similar to a spreadsheet or a SQL table.

# Example

In [2]:
import pandas as pd

# Each student record is a Series; its index labels (Name, Class, Score)
# identify the fields of the record.
record1 = pd.Series({'Name': 'Alice', 'Class': 'Physics', 'Score': 85})
record2 = pd.Series({'Name': 'Jack', 'Class': 'Chemistry', 'Score': 82})
record3 = pd.Series({'Name': 'Helen', 'Class': 'Biology', 'Score': 90})

In [3]:
# Build the frame from the three Series, one per row. The row labels are
# passed by keyword so it is clear they form the index rather than the
# column names. 'school1' is repeated: index labels need not be unique.
df = pd.DataFrame([record1, record2, record3],
                  index=['school1', 'school2', 'school1'])

df.head()

Unnamed: 0,Name,Class,Score
school1,Alice,Physics,85
school2,Jack,Chemistry,82
school1,Helen,Biology,90


An alternative method is using a list of dictionaries. Each dictionary represents a row in the data frame. The keys of the dictionary represent the column names and the values represent the data in the row.

In [4]:
# One dictionary per row: the keys are the column names, the values are the cell data.
students = [
    {'Name': 'Alice', 'Class': 'Physics', 'Score': 85},
    {'Name': 'Jack', 'Class': 'Chemistry', 'Score': 82},
    {'Name': 'Helen', 'Class': 'Biology', 'Score': 90},
]

# The row labels are supplied separately through the index argument.
df = pd.DataFrame(data=students, index=['school1', 'school2', 'school1'])

df.head()

Unnamed: 0,Name,Class,Score
school1,Alice,Physics,85
school2,Jack,Chemistry,82
school1,Helen,Biology,90


In [5]:
# .loc selects rows by index label; 'school2' appears only once in the index
df.loc['school2']

Name          Jack
Class    Chemistry
Score           82
Name: school2, dtype: object

In [6]:
type(df.loc['school2'])  # a label that matches a single row is returned as a Series

pandas.core.series.Series

In [7]:
# 'school1' appears twice in the index, so .loc returns every matching row
df.loc['school1']

Unnamed: 0,Name,Class,Score
school1,Alice,Physics,85
school1,Helen,Biology,90


In [8]:
type(df.loc['school1'])  # a label that matches several rows is returned as a DataFrame

pandas.core.frame.DataFrame

# We can select data along multiple axes at once (row label and column name)

In [9]:
# First argument selects rows by index label, second argument selects the column
df.loc['school1', "Name"]

school1    Alice
school1    Helen
Name: Name, dtype: object

In [10]:
# Same result via chaining: select the rows first, then index the 'Name' column of that result
df.loc['school1']['Name']

school1    Alice
school1    Helen
Name: Name, dtype: object

In [11]:
# Transpose: rows and columns are swapped, so the column names become the index labels
df.T

Unnamed: 0,school1,school2,school1.1
Name,Alice,Jack,Helen
Class,Physics,Chemistry,Biology
Score,85,82,90


In [12]:
# After transposing, 'Name' is an index label, so .loc can select it
df.T.loc['Name']

school1    Alice
school2     Jack
school1    Helen
Name: Name, dtype: object

# In pandas, the index operator [] is used to access columns of a data frame. Given a label, the index operator looks it up among the columns, not the rows; use .loc to select rows by label.

In [13]:
# Display the frame again for reference
df

Unnamed: 0,Name,Class,Score
school1,Alice,Physics,85
school2,Jack,Chemistry,82
school1,Helen,Biology,90


In [14]:
# df.loc['Name'] cannot be used here: 'Name' is a column name, not a row (index) label
df['Name']

school1    Alice
school2     Jack
school1    Helen
Name: Name, dtype: object

In [15]:
# Selecting a single column with [] returns a Series
type(df['Name'])

pandas.core.series.Series

## Since the result of an indexing operation is itself a pandas object (a series or a data frame), we can use the index operator again to access data in that result. We can basically chain operations together.

In [16]:
# Chain two lookups: the rows labelled 'school1', then the 'Name' column of those rows
df.loc['school1']['Name']

school1    Alice
school1    Helen
Name: Name, dtype: object

In [17]:
df.loc['school1']  # first step of the chain: a DataFrame, because two rows share the label 'school1'

Unnamed: 0,Name,Class,Score
school1,Alice,Physics,85
school1,Helen,Biology,90


In [18]:
print(type(df.loc['school1']))  # the intermediate result is a DataFrame

<class 'pandas.core.frame.DataFrame'>


In [19]:
df.loc['school1']['Name']  # second step: indexing that DataFrame by column name gives a Series

school1    Alice
school1    Helen
Name: Name, dtype: object

In [20]:
print(type(df.loc['school1']['Name']))  # the final result of the chain is a Series

<class 'pandas.core.series.Series'>


# Chaining, by indexing on the return type of another index, can come with some costs. 

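When chaining operations, pandas tends to return a copy of the data instead of a view on the data frame. For large data frames this can become very slow and can cause performance issues. In addition, a value assigned through a chain may be written to the temporary copy rather than to the original data frame.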

In [21]:
# Display the frame again before selecting from it
df

Unnamed: 0,Name,Class,Score
school1,Alice,Physics,85
school2,Jack,Chemistry,82
school1,Helen,Biology,90


In [22]:
# To select all the rows, use a slice (:) as the row selector and then list the columns we are interested in

df.loc[:, ['Name', 'Score']]

Unnamed: 0,Name,Score
school1,Alice,85
school2,Jack,82
school1,Helen,90


# MORAL OF THE STORY: TRY TO AVOID CHAINING OPERATIONS IN PANDAS

# Deleting data by using the drop method

In [23]:
# drop() removes rows by index label here; both rows labelled 'school1' are absent from the returned frame
df.drop('school1')

Unnamed: 0,Name,Class,Score
school2,Jack,Chemistry,82


In [24]:
df  # drop() returned a new frame; the original df still has all three rows

Unnamed: 0,Name,Class,Score
school1,Alice,Physics,85
school2,Jack,Chemistry,82
school1,Helen,Biology,90


# To change the data frame itself, we can pass the optional parameter inplace=True to the drop method (or assign the returned frame back to a variable)

In [36]:
# Work on a copy so that the in-place deletions below leave df untouched
copy_df = df.copy()
copy_df

Unnamed: 0,Name,Class,Score
school1,Alice,Physics,85
school2,Jack,Chemistry,82
school1,Helen,Biology,90


In [37]:
# axis=1 drops a column (axis=0 would drop a row); inplace=True modifies copy_df directly instead of returning a new frame
copy_df.drop('Name', inplace=True, axis=1)
copy_df

Unnamed: 0,Class,Score
school1,Physics,85
school2,Chemistry,82
school1,Biology,90


In [38]:
# A second way to drop a column is the del statement. Note that it modifies the data frame immediately, in place.

del copy_df['Class']
copy_df

Unnamed: 0,Score
school1,85
school2,82
school1,90


# Adding a column to a data frame

In [41]:
# Assigning a scalar to a new column name creates the column and fills every row with that value (here None)
df['ClassRanking'] = None
df

Unnamed: 0,Name,Class,Score,ClassRanking
school1,Alice,Physics,85,
school2,Jack,Chemistry,82,
school1,Helen,Biology,90,
