# Dataframes

In [3]:
import pandas as pd

In [7]:
# Sample catalogue: one record per book.
# Keyword order is kept per record, so the last two records list author before price.
books = [
    dict(
        title="Genetic Algorithms and Machine Learning for Programmers",
        price=36.99,
        author="Frances Buontempo",
    ),
    dict(
        title="The Visual Display of Quantitative Information",
        price=38.00,
        author="Edward Tufte",
    ),
    dict(
        title="Practical Object-Oriented Design",
        author="Sandi Metz",
        price=30.47,
    ),
    dict(
        title="Weapons of Math Destruction",
        author="Cathy O'Neil",
        price=17.44,
    ),
]

In [8]:
# inspect the raw list of dictionaries
books

[{'title': 'Genetic Algorithms and Machine Learning for Programmers',
  'price': 36.99,
  'author': 'Frances Buontempo'},
 {'title': 'The Visual Display of Quantitative Information',
  'price': 38.0,
  'author': 'Edward Tufte'},
 {'title': 'Practical Object-Oriented Design',
  'author': 'Sandi Metz',
  'price': 30.47},
 {'title': 'Weapons of Math Destruction',
  'author': "Cathy O'Neil",
  'price': 17.44}]

In [9]:
# convert the list of dictionaries to a dataframe: one row per dict, one column per key
# note: this rebinds the name `books` from the list to the dataframe
books = pd.DataFrame(books)
books

Unnamed: 0,title,price,author
0,Genetic Algorithms and Machine Learning for Pr...,36.99,Frances Buontempo
1,The Visual Display of Quantitative Information,38.0,Edward Tufte
2,Practical Object-Oriented Design,30.47,Sandi Metz
3,Weapons of Math Destruction,17.44,Cathy O'Neil


In [10]:
# number of rows: len() of a dataframe counts rows, not cells
len(books)

4

In [11]:
# .shape is a (rows, columns) tuple
books.shape

(4, 3)

In [12]:
# .size is the total number of cells (rows * columns)
books.size

12

In [13]:
# total of all prices
# dot notation, df.column_name, selects a single column of the dataframe
books.price.sum()

122.9

In [15]:
# the price column on its own, labelled by the dataframe's index
books.price

0    36.99
1    38.00
2    30.47
3    17.44
Name: price, dtype: float64

In [16]:
# a single column of a dataframe is a pandas Series
type(books.price)

pandas.core.series.Series

In [17]:
# bracket notation: df['column_name']
# needed when the column name contains a space, where dot notation cannot be used
books['price'].sum()

122.9

In [18]:
# bracket notation returns the same column as books.price
books['price']

0    36.99
1    38.00
2    30.47
3    17.44
Name: price, dtype: float64

In [19]:
# the column is a Series regardless of which notation selected it
type(books['price'])

pandas.core.series.Series

In [20]:
# average (mean) price
books.price.mean()

30.725

In [21]:
# highest price in the column
books.price.max()

38.0

In [22]:
# first 2 rows
books.head(2)

Unnamed: 0,title,price,author
0,Genetic Algorithms and Machine Learning for Pr...,36.99,Frances Buontempo
1,The Visual Display of Quantitative Information,38.0,Edward Tufte


In [23]:
# last 2 rows
books.tail(2)

Unnamed: 0,title,price,author
2,Practical Object-Oriented Design,30.47,Sandi Metz
3,Weapons of Math Destruction,17.44,Cathy O'Neil


In [24]:
# access the dataframe's row index (an Index object; here a RangeIndex)
books.index

RangeIndex(start=0, stop=4, step=1)

In [25]:
# boolean indexing on the index: keeps the rows whose index label equals 0
books[books.index == 0]

Unnamed: 0,title,price,author
0,Genetic Algorithms and Machine Learning for Pr...,36.99,Frances Buontempo


In [27]:
# can also pull a single column for the matching rows;
# .loc[row_mask, column] does the row filter and the column pick in one
# indexing step instead of chaining two [] lookups
books.loc[books.index == 0, 'price']

0    36.99
Name: price, dtype: float64

In [26]:
# more direct way to fetch one row: .iloc[0] selects the row at position 0
# and returns it as a Series
books.iloc[0]

title     Genetic Algorithms and Machine Learning for Pr...
price                                                 36.99
author                                    Frances Buontempo
Name: 0, dtype: object

In [34]:
# df.column_name.idxmax() returns the index label of the row holding the column's highest value
highest_price_index = books.price.idxmax()
highest_price_index

1

In [33]:
# get the highest-priced book: filter to the row whose index label matches
books[books.index == highest_price_index]

Unnamed: 0,title,price,author
1,The Visual Display of Quantitative Information,38.0,Edward Tufte


In [35]:
# idxmax() returns an index *label*, so look the row up by label with .loc;
# .iloc is positional and only agrees while the index is the default 0..n-1 range
books.loc[highest_price_index]

title     The Visual Display of Quantitative Information
price                                               38.0
author                                      Edward Tufte
Name: 1, dtype: object

In [36]:
# get the lowest-priced book; idxmin() gives the index label of the minimum price
books[books.index == books.price.idxmin()]

Unnamed: 0,title,price,author
3,Weapons of Math Destruction,17.44,Cathy O'Neil


In [37]:
# comparing the index to a scalar yields a boolean array, one entry per row
books.index >= 2

array([False, False,  True,  True])

In [38]:
# show all the books whose index label is 2 or greater
books[books.index >= 2]

Unnamed: 0,title,price,author
2,Practical Object-Oriented Design,30.47,Sandi Metz
3,Weapons of Math Destruction,17.44,Cathy O'Neil


In [39]:
# filter on partial text: keeps rows whose author contains 'San'
books[books.author.str.contains('San')]

Unnamed: 0,title,price,author
2,Practical Object-Oriented Design,30.47,Sandi Metz


In [41]:
# Series holding the integers 1 through 10
x = pd.Series(range(1, 11))
# element-wise "greater than or equal" comparison: a boolean Series, True where the value is at least 5
x.ge(5)

0    False
1    False
2    False
3    False
4     True
5     True
6     True
7     True
8     True
9     True
dtype: bool

In [42]:
# use the boolean Series as a mask: keeps only the values that are >= 5
x[x>=5]

4     5
5     6
6     7
7     8
8     9
9    10
dtype: int64

In [43]:
import numpy as np

# fix the global NumPy seed so the generated grades are reproducible
np.random.seed(123)

students = [
    'Sally', 'Jane', 'Suzie', 'Billy', 'Ada', 'John',
    'Thomas', 'Marie', 'Albert', 'Richard', 'Isaac', 'Alan',
]
n_students = len(students)

# one random integer score per student and per subject (60 up to, not including, 100)
# every column needs exactly one value per student
# the draws happen in math, english, reading order
math_grades = np.random.randint(60, 100, size=n_students)
english_grades = np.random.randint(60, 100, size=n_students)
reading_grades = np.random.randint(60, 100, size=n_students)

grade_columns = {
    'name': students,
    'math': math_grades,
    'english': english_grades,
    'reading': reading_grades,
}
df = pd.DataFrame(grade_columns)

type(df)

pandas.core.frame.DataFrame

In [44]:
# plain-text representation from print()
# essentially how the dataframe would look in a terminal
print(df)

       name  math  english  reading
0     Sally    62       85       80
1      Jane    88       79       67
2     Suzie    94       74       95
3     Billy    98       96       88
4       Ada    77       92       98
5      John    79       76       93
6    Thomas    82       64       81
7     Marie    93       63       90
8    Albert    92       62       87
9   Richard    69       80       94
10    Isaac    92       99       93
11     Alan    92       62       72


In [45]:
# as the last expression of a cell, the dataframe renders as a formatted HTML table in Jupyter
df

Unnamed: 0,name,math,english,reading
0,Sally,62,85,80
1,Jane,88,79,67
2,Suzie,94,74,95
3,Billy,98,96,88
4,Ada,77,92,98
5,John,79,76,93
6,Thomas,82,64,81
7,Marie,93,63,90
8,Albert,92,62,87
9,Richard,69,80,94


In [46]:
# retrieve specific rows by value: keeps rows whose name column equals 'Jane'
df[df['name'] == 'Jane']

Unnamed: 0,name,math,english,reading
1,Jane,88,79,67


In [47]:
# Albert's math score: filter to Albert's row, then take the math column
df[df.name == 'Albert'].math

8    92
Name: math, dtype: int64

In [50]:
# step-by-step:

# build a boolean mask that is True only on Albert's row
albert_mask = df.name == 'Albert'
albert_mask

0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8      True
9     False
10    False
11    False
Name: name, dtype: bool

In [52]:
# apply the mask to keep only the rows where it is True (Albert's record)
albert_records = df[albert_mask]
albert_records

Unnamed: 0,name,math,english,reading
8,Albert,92,62,87


In [53]:
# take the math column from Albert's slice of df
albert_records.math

8    92
Name: math, dtype: int64

In [54]:
# summarizing dataframes
# start from the full dataframe for reference
df

Unnamed: 0,name,math,english,reading
0,Sally,62,85,80
1,Jane,88,79,67
2,Suzie,94,74,95
3,Billy,98,96,88
4,Ada,77,92,98
5,John,79,76,93
6,Thomas,82,64,81
7,Marie,93,63,90
8,Albert,92,62,87
9,Richard,69,80,94


In [55]:
# overview: index range, column names, non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   name     12 non-null     object
 1   math     12 non-null     int64 
 2   english  12 non-null     int64 
 3   reading  12 non-null     int64 
dtypes: int64(3), object(1)
memory usage: 512.0+ bytes


In [56]:
# summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

Unnamed: 0,math,english,reading
count,12.0,12.0,12.0
mean,84.833333,77.666667,86.5
std,11.134168,13.371158,9.643651
min,62.0,62.0,67.0
25%,78.5,63.75,80.75
50%,90.0,77.5,89.0
75%,92.25,86.75,93.25
max,98.0,99.0,98.0


In [57]:
# describe() returns a dataframe whose index holds the statistic names
df.describe().index

Index(['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max'], dtype='object')

In [58]:
# data type of each column, returned as a Series keyed by column name
df.dtypes

name       object
math        int64
english     int64
reading     int64
dtype: object

In [59]:
# .shape is a tuple of (rows, columns)
df.shape

(12, 4)

In [60]:
# .shape is a plain Python tuple, so it can be indexed like one
type(df.shape)

tuple

In [61]:
# first element of the shape tuple = number of rows
df.shape[0]

12

In [62]:
# second element of the shape tuple = number of columns
df.shape[1]

4

In [63]:
# programmatic way to access the column names (returned as an Index)
df.columns

Index(['name', 'math', 'english', 'reading'], dtype='object')

In [64]:
# to see the row index
df.index

RangeIndex(start=0, stop=12, step=1)

In [66]:
# column labels can be replaced wholesale by assigning a new list to df.columns;
# here every existing label is upper-cased
df.columns = list(map(str.upper, df.columns))
df.columns

Index(['NAME', 'MATH', 'ENGLISH', 'READING'], dtype='object')

In [67]:
# column names are now upper-case
df

Unnamed: 0,NAME,MATH,ENGLISH,READING
0,Sally,62,85,80
1,Jane,88,79,67
2,Suzie,94,74,95
3,Billy,98,96,88
4,Ada,77,92,98
5,John,79,76,93
6,Thomas,82,64,81
7,Marie,93,63,90
8,Albert,92,62,87
9,Richard,69,80,94


In [69]:
# switch the column labels back to lower-case
df.columns = list(map(str.lower, df.columns))

In [70]:
# subsetting dataframes

# to access multiple columns, pass a list of column names inside the brackets:
df[['name','math']]

Unnamed: 0,name,math
0,Sally,62
1,Jane,88
2,Suzie,94
3,Billy,98
4,Ada,77
5,John,79
6,Thomas,82
7,Marie,93
8,Albert,92
9,Richard,69


In [71]:
# the list of column names can live in a variable
# df[list_var] returns the same subset
name_math_columns = ['name','math']
df[name_math_columns]

Unnamed: 0,name,math
0,Sally,62
1,Jane,88
2,Suzie,94
3,Billy,98
4,Ada,77
5,John,79
6,Thomas,82
7,Marie,93
8,Albert,92
9,Richard,69


In [72]:
# keep the subset under its own name and preview the first 3 rows
math_df = df[name_math_columns]
math_df.head(3)

Unnamed: 0,name,math
0,Sally,62
1,Jane,88
2,Suzie,94


In [73]:
# single column via dot notation
df.math

0     62
1     88
2     94
3     98
4     77
5     79
6     82
7     93
8     92
9     69
10    92
11    92
Name: math, dtype: int64

In [74]:
# same column via bracket notation
df['math']

0     62
1     88
2     94
3     98
4     77
5     79
6     82
7     93
8     92
9     69
10    92
11    92
Name: math, dtype: int64

In [75]:
# head() with no argument returns the first 5 rows
df.head()

Unnamed: 0,name,math,english,reading
0,Sally,62,85,80
1,Jane,88,79,67
2,Suzie,94,74,95
3,Billy,98,96,88
4,Ada,77,92,98


In [76]:
# last 4 rows
df.tail(4)

Unnamed: 0,name,math,english,reading
8,Albert,92,62,87
9,Richard,69,80,94
10,Isaac,92,99,93
11,Alan,92,62,72


In [77]:
df.tail(10000)
# asking for more rows than exist is not an error: it returns as many as it can

Unnamed: 0,name,math,english,reading
0,Sally,62,85,80
1,Jane,88,79,67
2,Suzie,94,74,95
3,Billy,98,96,88
4,Ada,77,92,98
5,John,79,76,93
6,Thomas,82,64,81
7,Marie,93,63,90
8,Albert,92,62,87
9,Richard,69,80,94


In [79]:
# pulls a random sample (a single row when no size is given)
df.sample()

Unnamed: 0,name,math,english,reading
7,Marie,93,63,90


In [80]:
# boolean Series: True where the math score is below 80
df.math < 80

0      True
1     False
2     False
3     False
4      True
5      True
6     False
7     False
8     False
9      True
10    False
11    False
Name: math, dtype: bool

In [81]:
# use the boolean Series as a mask: only returns rows where the math score is below 80
df[df.math < 80]

Unnamed: 0,name,math,english,reading
0,Sally,62,85,80
4,Ada,77,92,98
5,John,79,76,93
9,Richard,69,80,94


In [82]:
# for an OR condition use '|'
# the parentheses are required, not optional: '|' binds more tightly than '<' and '>',
# so without them the expression would be grouped incorrectly
df[(df.math < 80) | (df.reading > 90)]

Unnamed: 0,name,math,english,reading
0,Sally,62,85,80
2,Suzie,94,74,95
4,Ada,77,92,98
5,John,79,76,93
9,Richard,69,80,94
10,Isaac,92,99,93


In [83]:
# for an AND condition use '&'
# the parentheses are required, because '&' binds more tightly than the comparison operators
df[(df.math < 80) & (df.reading > 90)]

Unnamed: 0,name,math,english,reading
4,Ada,77,92,98
5,John,79,76,93
9,Richard,69,80,94


In [84]:
# the same filter with each condition stored in a variable first
# no parentheses are needed once each comparison is already evaluated into its own variable
math_less_than_80 = df.math < 80
reading_greater_than_90 = df.reading > 90
df[math_less_than_80 & reading_greater_than_90]

Unnamed: 0,name,math,english,reading
4,Ada,77,92,98
5,John,79,76,93
9,Richard,69,80,94


In [85]:
# returns a new dataframe without the 'name' column; df itself is not changed
df.drop(columns=['name'])

# this WILL change df itself: df.drop(columns=['name'], inplace = True)

Unnamed: 0,math,english,reading
0,62,85,80
1,88,79,67
2,94,74,95
3,98,96,88
4,77,92,98
5,79,76,93
6,82,64,81
7,93,63,90
8,92,62,87
9,69,80,94


In [86]:
# df stays the same unless it is reassigned (df = ...) or inplace = True is used
df

Unnamed: 0,name,math,english,reading
0,Sally,62,85,80
1,Jane,88,79,67
2,Suzie,94,74,95
3,Billy,98,96,88
4,Ada,77,92,98
5,John,79,76,93
6,Thomas,82,64,81
7,Marie,93,63,90
8,Albert,92,62,87
9,Richard,69,80,94


In [87]:
# make a copy of the original dataframe
# always use .copy() to get an independent copy; it creates a new object in memory
# plain assignment (copy = df) would make both names point to the same dataframe,
# so a change made through one would show up in the other
copy = df.copy()

In [88]:
# drop() returns a new dataframe; reassigning it to copy is what makes the change stick
copy = copy.drop(columns = ['english','reading'])
copy
# the reading and english columns are gone from copy

Unnamed: 0,name,math
0,Sally,62
1,Jane,88
2,Suzie,94
3,Billy,98
4,Ada,77
5,John,79
6,Thomas,82
7,Marie,93
8,Albert,92
9,Richard,69


In [89]:
# inplace = True modifies copy directly, so no reassignment is needed
copy.drop(columns = ['math'], inplace = True)

In [91]:
# the math column has been removed from copy
copy

Unnamed: 0,name
0,Sally
1,Jane
2,Suzie
3,Billy
4,Ada
5,John
6,Thomas
7,Marie
8,Albert
9,Richard


In [90]:
# the original dataframe is unaffected by the changes made to copy
df

Unnamed: 0,name,math,english,reading
0,Sally,62,85,80
1,Jane,88,79,67
2,Suzie,94,74,95
3,Billy,98,96,88
4,Ada,77,92,98
5,John,79,76,93
6,Thomas,82,64,81
7,Marie,93,63,90
8,Albert,92,62,87
9,Richard,69,80,94


In [92]:
# rename columns with a mapping of {old_name: new_name}
df.rename(columns = {'name': 'student', 'math': 'mathematics'})
# this returns a renamed dataframe for viewing; df itself is not changed
# to keep the new names, reassign the result or use inplace = True

Unnamed: 0,student,mathematics,english,reading
0,Sally,62,85,80
1,Jane,88,79,67
2,Suzie,94,74,95
3,Billy,98,96,88
4,Ada,77,92,98
5,John,79,76,93
6,Thomas,82,64,81
7,Marie,93,63,90
8,Albert,92,62,87
9,Richard,69,80,94


In [93]:
# the rename was not applied to df
df

Unnamed: 0,name,math,english,reading
0,Sally,62,85,80
1,Jane,88,79,67
2,Suzie,94,74,95
3,Billy,98,96,88
4,Ada,77,92,98
5,John,79,76,93
6,Thomas,82,64,81
7,Marie,93,63,90
8,Albert,92,62,87
9,Richard,69,80,94


In [96]:
# methods that return a dataframe can be chained; one call per line, read top to bottom
(
    df
    .drop(columns=['english', 'reading'])
    .rename(columns={'name': 'student'})
)

Unnamed: 0,student,math
0,Sally,62
1,Jane,88
2,Suzie,94
3,Billy,98
4,Ada,77
5,John,79
6,Thomas,82
7,Marie,93
8,Albert,92
9,Richard,69


In [97]:
# the chained drop/rename returned a new dataframe; df keeps all of its columns and names
df

Unnamed: 0,name,math,english,reading
0,Sally,62,85,80
1,Jane,88,79,67
2,Suzie,94,74,95
3,Billy,98,96,88
4,Ada,77,92,98
5,John,79,76,93
6,Thomas,82,64,81
7,Marie,93,63,90
8,Albert,92,62,87
9,Richard,69,80,94


In [98]:
# creating new columns

# did each student pass math? True where the math score is 70 or higher
df.math >= 70

0     False
1      True
2      True
3      True
4      True
5      True
6      True
7      True
8      True
9     False
10     True
11     True
Name: math, dtype: bool

In [104]:
# a Series or array can be assigned as a new column when it has one value per row of df
df['passing_math'] = df.math >= 70

In [107]:
# the new column 'passing_math' now appears at the right of df
df

Unnamed: 0,name,math,english,reading,passing_math
0,Sally,62,85,80,False
1,Jane,88,79,67,True
2,Suzie,94,74,95,True
3,Billy,98,96,88,True
4,Ada,77,92,98,True
5,John,79,76,93,True
6,Thomas,82,64,81,True
7,Marie,93,63,90,True
8,Albert,92,62,87,True
9,Richard,69,80,94,False


In [108]:
# a boolean column can be used directly as a mask: keeps only the students passing math
df[df['passing_math']]

Unnamed: 0,name,math,english,reading,passing_math
1,Jane,88,79,67,True
2,Suzie,94,74,95,True
3,Billy,98,96,88,True
4,Ada,77,92,98,True
5,John,79,76,93,True
6,Thomas,82,64,81,True
7,Marie,93,63,90,True
8,Albert,92,62,87,True
10,Isaac,92,99,93,True
11,Alan,92,62,72,True


In [109]:
# summing booleans counts the True values: number of students passing math
df.passing_math.sum()

10

In [110]:
# the mean of booleans is the fraction of True values: share of students passing math
df.passing_math.mean()

0.8333333333333334

In [111]:
# .assign() returns a new dataframe with the extra column; df itself is not changed
df.assign(passing_english = df.english >= 70)

Unnamed: 0,name,math,english,reading,passing_math,passing_english
0,Sally,62,85,80,False,True
1,Jane,88,79,67,True,True
2,Suzie,94,74,95,True,True
3,Billy,98,96,88,True,True
4,Ada,77,92,98,True,True
5,John,79,76,93,True,True
6,Thomas,82,64,81,True,False
7,Marie,93,63,90,True,False
8,Albert,92,62,87,True,False
9,Richard,69,80,94,False,True


In [112]:
# assigning a single value to a column repeats that value on every row
df['school'] = 'S.A. HS'
df

Unnamed: 0,name,math,english,reading,passing_math,school
0,Sally,62,85,80,False,S.A. HS
1,Jane,88,79,67,True,S.A. HS
2,Suzie,94,74,95,True,S.A. HS
3,Billy,98,96,88,True,S.A. HS
4,Ada,77,92,98,True,S.A. HS
5,John,79,76,93,True,S.A. HS
6,Thomas,82,64,81,True,S.A. HS
7,Marie,93,63,90,True,S.A. HS
8,Albert,92,62,87,True,S.A. HS
9,Richard,69,80,94,False,S.A. HS


In [113]:
# similar to SQL ORDER BY: returns the rows sorted by the english column
# default is ascending order (lowest grade first)
df.sort_values(by = 'english')

Unnamed: 0,name,math,english,reading,passing_math,school
8,Albert,92,62,87,True,S.A. HS
11,Alan,92,62,72,True,S.A. HS
7,Marie,93,63,90,True,S.A. HS
6,Thomas,82,64,81,True,S.A. HS
2,Suzie,94,74,95,True,S.A. HS
5,John,79,76,93,True,S.A. HS
1,Jane,88,79,67,True,S.A. HS
9,Richard,69,80,94,False,S.A. HS
0,Sally,62,85,80,False,S.A. HS
4,Ada,77,92,98,True,S.A. HS


In [114]:
# ascending = False reverses the sort order to descending (highest grade first)
df.sort_values(by = 'english', ascending = False)

Unnamed: 0,name,math,english,reading,passing_math,school
10,Isaac,92,99,93,True,S.A. HS
3,Billy,98,96,88,True,S.A. HS
4,Ada,77,92,98,True,S.A. HS
0,Sally,62,85,80,False,S.A. HS
9,Richard,69,80,94,False,S.A. HS
1,Jane,88,79,67,True,S.A. HS
5,John,79,76,93,True,S.A. HS
2,Suzie,94,74,95,True,S.A. HS
6,Thomas,82,64,81,True,S.A. HS
7,Marie,93,63,90,True,S.A. HS


In [116]:
# methods can be chained

# a chain of string methods as a simple example: each call acts on the previous call's result
'hello'.capitalize().swapcase()

'hELLO'

In [125]:
# chaining works whenever each call returns an object that offers the next method;
# the steps run in the order they are written
# goal: name of the student with the lowest english grade above 90
(
    df[df.english > 90]
    .sort_values(by='english')
    .head(1)
    .name
)

4    Ada
Name: name, dtype: object

In [118]:
# step-by-step:

# step 1: keep only the students whose english grade is above 90
df[df.english > 90]

Unnamed: 0,name,math,english,reading,passing_math,school
3,Billy,98,96,88,True,S.A. HS
4,Ada,77,92,98,True,S.A. HS
10,Isaac,92,99,93,True,S.A. HS


In [120]:
# step 2: sort those rows by english grade, lowest first
df[df.english > 90].sort_values(by = 'english')

Unnamed: 0,name,math,english,reading,passing_math,school
4,Ada,77,92,98,True,S.A. HS
3,Billy,98,96,88,True,S.A. HS
10,Isaac,92,99,93,True,S.A. HS


In [121]:
# step 3: keep only the first row of the sorted result (the lowest grade above 90)
df[df.english > 90].sort_values(by = 'english').head(1)

Unnamed: 0,name,math,english,reading,passing_math,school
4,Ada,77,92,98,True,S.A. HS


In [122]:
# step 4: extract only the name column of that record
df[df.english > 90].sort_values(by = 'english').head(1).name

4    Ada
Name: name, dtype: object