In [1]:
import pandas as pd

In [2]:
import numpy as np

In [3]:
np.random.seed(123)

students = ['Sally', 'Jane', 'Suzie', 'Billy', 'Ada', 'John', 'Thomas',
            'Marie', 'Albert', 'Richard', 'Isaac', 'Alan']

# randomly generate scores for each student for each subject
# note that all the values need to have the same length here
math_grades = np.random.randint(low=60, high=100, size=len(students))
english_grades = np.random.randint(low=60, high=100, size=len(students))
reading_grades = np.random.randint(low=60, high=100, size=len(students))

df = pd.DataFrame({'name': students,
                   'math': math_grades,
                   'english': english_grades,
                   'reading': reading_grades})

type(df)

pandas.core.frame.DataFrame

In [4]:
# a dataframe is composed of series
# each series is a column
# but the df itself is more than a list ofseries
# the df itself has its own functionality on top

df

Unnamed: 0,name,math,english,reading
0,Sally,62,85,80
1,Jane,88,79,67
2,Suzie,94,74,95
3,Billy,98,96,88
4,Ada,77,92,98
5,John,79,76,93
6,Thomas,82,64,81
7,Marie,93,63,90
8,Albert,92,62,87
9,Richard,69,80,94


In [5]:
#df returns that specific column
df.name

0       Sally
1        Jane
2       Suzie
3       Billy
4         Ada
5        John
6      Thomas
7       Marie
8      Albert
9     Richard
10      Isaac
11       Alan
Name: name, dtype: object

In [6]:
#the dataframe itself has its own method and functionality

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   name     12 non-null     object
 1   math     12 non-null     int64 
 2   english  12 non-null     int64 
 3   reading  12 non-null     int64 
dtypes: int64(3), object(1)
memory usage: 512.0+ bytes


In [7]:
df.dtypes

name       object
math        int64
english     int64
reading     int64
dtype: object

In [8]:
df.columns

Index(['name', 'math', 'english', 'reading'], dtype='object')

In [9]:
df.index

RangeIndex(start=0, stop=12, step=1)

In [10]:
#.describe provides summary stats on our numeric columns
df.describe()

Unnamed: 0,math,english,reading
count,12.0,12.0,12.0
mean,84.833333,77.666667,86.5
std,11.134168,13.371158,9.643651
min,62.0,62.0,67.0
25%,78.5,63.75,80.75
50%,90.0,77.5,89.0
75%,92.25,86.75,93.25
max,98.0,99.0,98.0


## Why dataFrames?
- rectangular data consisting of columns and rows
- dataframes enjoy functionality above/beyond series
- dataframes are a container for series

In [11]:
df.columns = [column.upper() for column in df.columns]
df

Unnamed: 0,NAME,MATH,ENGLISH,READING
0,Sally,62,85,80
1,Jane,88,79,67
2,Suzie,94,74,95
3,Billy,98,96,88
4,Ada,77,92,98
5,John,79,76,93
6,Thomas,82,64,81
7,Marie,93,63,90
8,Albert,92,62,87
9,Richard,69,80,94


In [12]:
#it's possible to assign a column to be the index (if we don't have duplicates)
##df.index = df.NAME
##df

In [13]:
##df = pd.DataFrame({'name': students,
                  # 'math': math_grades,
                   #'english': english_grades,
                  # 'reading': reading_grades}, index=students)

##df.head()

In [14]:
##students.append('sally')
##students

In [15]:
##df

In [16]:
# how to access a specific column
# df.column_name
# df["column_name"]
# df.math.head()

In [26]:
df.MATH

0     62
1     88
2     94
3     98
4     77
5     79
6     82
7     93
8     92
9     69
10    92
11    92
Name: MATH, dtype: int64

In [27]:
df["MATH"].head()

0    62
1    88
2    94
3    98
4    77
Name: MATH, dtype: int64

In [18]:
df["cohort"] = 'Jemison'
df

Unnamed: 0,NAME,MATH,ENGLISH,READING,cohort
0,Sally,62,85,80,Jemison
1,Jane,88,79,67,Jemison
2,Suzie,94,74,95,Jemison
3,Billy,98,96,88,Jemison
4,Ada,77,92,98,Jemison
5,John,79,76,93,Jemison
6,Thomas,82,64,81,Jemison
7,Marie,93,63,90,Jemison
8,Albert,92,62,87,Jemison
9,Richard,69,80,94,Jemison


In [19]:
df["campus"] = 'San Antonio'

In [20]:
df

Unnamed: 0,NAME,MATH,ENGLISH,READING,cohort,campus
0,Sally,62,85,80,Jemison,San Antonio
1,Jane,88,79,67,Jemison,San Antonio
2,Suzie,94,74,95,Jemison,San Antonio
3,Billy,98,96,88,Jemison,San Antonio
4,Ada,77,92,98,Jemison,San Antonio
5,John,79,76,93,Jemison,San Antonio
6,Thomas,82,64,81,Jemison,San Antonio
7,Marie,93,63,90,Jemison,San Antonio
8,Albert,92,62,87,Jemison,San Antonio
9,Richard,69,80,94,Jemison,San Antonio


In [28]:
#show students who are making A's in math
df.MATH >= 90


0     False
1     False
2      True
3      True
4     False
5     False
6     False
7      True
8      True
9     False
10     True
11     True
Name: MATH, dtype: bool

In [29]:
#Select * from df where math >= 90 such as in SQL

df[df.MATH >= 90]

Unnamed: 0,NAME,MATH,ENGLISH,READING,cohort,campus
2,Suzie,94,74,95,Jemison,San Antonio
3,Billy,98,96,88,Jemison,San Antonio
7,Marie,93,63,90,Jemison,San Antonio
8,Albert,92,62,87,Jemison,San Antonio
10,Isaac,92,99,93,Jemison,San Antonio
11,Alan,92,62,72,Jemison,San Antonio


In [31]:
# to view only a specific set of columns:
columns = ["NAME","MATH","READING","ENGLISH"]

df[columns].head()

Unnamed: 0,NAME,MATH,READING,ENGLISH
0,Sally,62,80,85
1,Jane,88,67,79
2,Suzie,94,95,74
3,Billy,98,88,96
4,Ada,77,98,92


In [33]:
# how to drop columns
cols_to_drop = ['campus','cohort']

#since drop doesn't change the original dataframe, we'll reassign the df

df = df.drop(columns=cols_to_drop).head()
df.head()

Unnamed: 0,NAME,MATH,ENGLISH,READING
0,Sally,62,85,80
1,Jane,88,79,67
2,Suzie,94,74,95
3,Billy,98,96,88
4,Ada,77,92,98


In [36]:
#if you have a series/array of booleans, you can filter your results
df[(df.NAME == 'Jane') | (df.NAME == 'Ada')]

Unnamed: 0,NAME,MATH,ENGLISH,READING
1,Jane,88,79,67
4,Ada,77,92,98


In [39]:
df.sort_values(by="ENGLISH", ascending=False)

Unnamed: 0,NAME,MATH,ENGLISH,READING
3,Billy,98,96,88
4,Ada,77,92,98
0,Sally,62,85,80
1,Jane,88,79,67
2,Suzie,94,74,95


In [40]:
"bANANA".swapcase().lower().swapcase()

'BANANA'

In [47]:
df.sort_values(by='MATH').sort_values(by='ENGLISH').head(3).head(2)

Unnamed: 0,NAME,MATH,ENGLISH,READING
2,Suzie,94,74,95
1,Jane,88,79,67


In [48]:
#renaming columns using a dictionary

In [49]:
df.head()

Unnamed: 0,NAME,MATH,ENGLISH,READING
0,Sally,62,85,80
1,Jane,88,79,67
2,Suzie,94,74,95
3,Billy,98,96,88
4,Ada,77,92,98


In [52]:
cols_to_rename = {
    "MATH": "math_grade",
    "READING": "reading_grade",
    "ENGLISH": "english_grade",
    "NAME": "Name"

}
cols_to_rename

{'MATH': 'math_grade',
 'READING': 'reading_grade',
 'ENGLISH': 'english_grade',
 'NAME': 'Name'}

In [53]:
df = df.rename(columns=cols_to_rename)
df

Unnamed: 0,Name,math_grade,english_grade,reading_grade
0,Sally,62,85,80
1,Jane,88,79,67
2,Suzie,94,74,95
3,Billy,98,96,88
4,Ada,77,92,98
