In [1]:
import pandas as pd
import numpy as np

In [2]:
# Fix NumPy's global seed so the randomly drawn grades are the same on every run.
np.random.seed(123)

# Roster of student names used to build the example dataframe.
students = [
    'Sally', 'Jane', 'Suzie', 'Billy',
    'Ada', 'John', 'Thomas', 'Marie',
    'Albert', 'Richard', 'Isaac', 'Alan',
]

In [5]:
# Draw one random integer score per student for each subject.
# randint's upper bound is exclusive, so every score falls in 60..99.
# Each column handed to the DataFrame constructor must have the same length,
# so every array is sized to match the student roster.

n_students = len(students)
math_grades = np.random.randint(60, 100, n_students)
english_grades = np.random.randint(60, 100, n_students)
reading_grades = np.random.randint(60, 100, n_students)

# Columns appear in the order given here: name first, then the three subjects.
df = pd.DataFrame(dict(
    name=students,
    math=math_grades,
    english=english_grades,
    reading=reading_grades,
))

# Confirm that the constructor produced a DataFrame.
type(df)

pandas.core.frame.DataFrame

In [6]:
df

Unnamed: 0,name,math,english,reading
0,Sally,63,90,80
1,Jane,65,66,72
2,Suzie,60,86,78
3,Billy,71,76,77
4,Ada,94,66,61
5,John,70,74,87
6,Thomas,82,99,82
7,Marie,73,71,63
8,Albert,78,67,63
9,Richard,96,61,71


In [7]:
# view information about a dataframe

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   name     12 non-null     object
 1   math     12 non-null     int64 
 2   english  12 non-null     int64 
 3   reading  12 non-null     int64 
dtypes: int64(3), object(1)
memory usage: 512.0+ bytes


In [8]:
# summary of various numerical values

df.describe()

Unnamed: 0,math,english,reading
count,12.0,12.0,12.0
mean,76.166667,78.166667,75.0
std,11.68397,12.882923,8.923921
min,60.0,61.0,61.0
25%,68.75,66.75,69.0
50%,74.0,75.0,77.5
75%,83.25,87.0,81.25
max,96.0,99.0,87.0


## Dataframe Attributes 

- dtypes - data types (the data types present in the dataframe)
- shape - number of rows by number of columns
- columns - list of column names 
- index - label for each row (autogenerated as a RangeIndex when none is supplied)

In [10]:
#view our data types 

df.dtypes

name       object
math        int64
english     int64
reading     int64
dtype: object

In [11]:
#returns row count by column count

df.shape

(12, 4)

In [12]:
#return list of column names

df.columns

Index(['name', 'math', 'english', 'reading'], dtype='object')

In [13]:
#return information about row indices

df.index

RangeIndex(start=0, stop=12, step=1)

In [14]:
#rename every column label to uppercase
#assigning to df.columns replaces the labels on df itself

df.columns = df.columns.str.upper()

In [15]:
df

Unnamed: 0,NAME,MATH,ENGLISH,READING
0,Sally,63,90,80
1,Jane,65,66,72
2,Suzie,60,86,78
3,Billy,71,76,77
4,Ada,94,66,61
5,John,70,74,87
6,Thomas,82,99,82
7,Marie,73,71,63
8,Albert,78,67,63
9,Richard,96,61,71


In [16]:
#return the column labels to lowercase
#assigning to df.columns replaces the labels on df itself

df.columns = df.columns.str.lower()

In [17]:
df

Unnamed: 0,name,math,english,reading
0,Sally,63,90,80
1,Jane,65,66,72
2,Suzie,60,86,78
3,Billy,71,76,77
4,Ada,94,66,61
5,John,70,74,87
6,Thomas,82,99,82
7,Marie,73,71,63
8,Albert,78,67,63
9,Richard,96,61,71


In [18]:
#see multiple columns
#use a list that contains the column names we want
#list inside brackets

#columns - name and math
df[['name', 'math']]

Unnamed: 0,name,math
0,Sally,63
1,Jane,65
2,Suzie,60
3,Billy,71
4,Ada,94
5,John,70
6,Thomas,82
7,Marie,73
8,Albert,78
9,Richard,96


In [28]:
#select one column

math_scores = df['math']

In [29]:
type(math_scores)

pandas.core.series.Series

In [30]:
#reading scores as a dataframe
#use double brackets to retain a single column as a dataframe

reading_scores = df[['reading']]

In [31]:
type(reading_scores)

pandas.core.frame.DataFrame

In [32]:
#create a variable that contains column names

columns = ['name', 'math']

In [33]:
#use variable columns to specify the columns you want

df[columns]

Unnamed: 0,name,math
0,Sally,63
1,Jane,65
2,Suzie,60
3,Billy,71
4,Ada,94
5,John,70
6,Thomas,82
7,Marie,73
8,Albert,78
9,Richard,96


In [36]:
#accessing individual columns
#returns a series
#use double brackets for dataframe

df.math

0     63
1     65
2     60
3     71
4     94
5     70
6     82
7     73
8     78
9     96
10    75
11    87
Name: math, dtype: int64

In [37]:
#returns a series
#use double brackets for a dataframe

df['math']

0     63
1     65
2     60
3     71
4     94
5     70
6     82
7     73
8     78
9     96
10    75
11    87
Name: math, dtype: int64

In [38]:
#accessing row subsets
#default is 5 rows, but a different number can be specified inside the ()

df.head()

Unnamed: 0,name,math,english,reading
0,Sally,63,90,80
1,Jane,65,66,72
2,Suzie,60,86,78
3,Billy,71,76,77
4,Ada,94,66,61


In [40]:
#last five rows
#as with head, default is 5 but can be specified within ()

df.tail()

Unnamed: 0,name,math,english,reading
7,Marie,73,71,63
8,Albert,78,67,63
9,Richard,96,61,71
10,Isaac,75,97,81
11,Alan,87,85,85


In [45]:
#random sample of rows
#like head and tail, the number of rows goes inside the (), but the default is 1

df.sample(1)

Unnamed: 0,name,math,english,reading
9,Richard,96,61,71


## Boolean values

In [47]:
#returns only the rows of the dataframe where the boolean condition is True
#ie, finds rows which match a condition

df[df.math < 80]

Unnamed: 0,name,math,english,reading
0,Sally,63,90,80
1,Jane,65,66,72
2,Suzie,60,86,78
3,Billy,71,76,77
5,John,70,74,87
7,Marie,73,71,63
8,Albert,78,67,63
10,Isaac,75,97,81


In [48]:
#returns a boolean series: True where the condition below holds, False otherwise

df.math < 80

0      True
1      True
2      True
3      True
4     False
5      True
6     False
7      True
8      True
9     False
10     True
11    False
Name: math, dtype: bool

In [49]:
#drop a set of columns
#use a list with the column names

df.drop(columns = ['english', 'reading'])

Unnamed: 0,name,math
0,Sally,63
1,Jane,65
2,Suzie,60
3,Billy,71
4,Ada,94
5,John,70
6,Thomas,82
7,Marie,73
8,Albert,78
9,Richard,96


In [51]:
#does drop alter the original dataframe?
#no - drop returned a new dataframe and left df unchanged

#ran df after running the drop method
df

Unnamed: 0,name,math,english,reading
0,Sally,63,90,80
1,Jane,65,66,72
2,Suzie,60,86,78
3,Billy,71,76,77
4,Ada,94,66,61
5,John,70,74,87
6,Thomas,82,99,82
7,Marie,73,71,63
8,Albert,78,67,63
9,Richard,96,61,71


In [52]:
#rename a column using original name and new name
#use dictionary structure
#df is not altered unless you assign the result back to df

df.rename(columns = {'name': 'student'})

Unnamed: 0,student,math,english,reading
0,Sally,63,90,80
1,Jane,65,66,72
2,Suzie,60,86,78
3,Billy,71,76,77
4,Ada,94,66,61
5,John,70,74,87
6,Thomas,82,99,82
7,Marie,73,71,63
8,Albert,78,67,63
9,Richard,96,61,71


In [56]:
#drop columns and rename in a single chained expression
#each method returns a new dataframe, so the calls can be chained

(
    df
    .drop(columns=['english', 'reading'])
    .rename(columns={'name': 'student'})
)

Unnamed: 0,student,math
0,Sally,63
1,Jane,65
2,Suzie,60
3,Billy,71
4,Ada,94
5,John,70
6,Thomas,82
7,Marie,73
8,Albert,78
9,Richard,96


## Creating Columns

In [57]:
df.math >= 70

0     False
1     False
2     False
3      True
4      True
5      True
6      True
7      True
8      True
9      True
10     True
11     True
Name: math, dtype: bool

In [59]:
#store the boolean result of the comparison in a new column named passing_math
#bracket assignment adds the column to df itself

df['passing_math'] = df['math'].ge(70)

In [60]:
df

Unnamed: 0,name,math,english,reading,passing_math
0,Sally,63,90,80,False
1,Jane,65,66,72,False
2,Suzie,60,86,78,False
3,Billy,71,76,77,True
4,Ada,94,66,61,True
5,John,70,74,87,True
6,Thomas,82,99,82,True
7,Marie,73,71,63,True
8,Albert,78,67,63,True
9,Richard,96,61,71,True


In [61]:
#using assign to add a column
#assign returns a new dataframe with the extra column; df itself is not changed
#unless the result is assigned back to it

df.assign(passing_english=df['english'].ge(70))

Unnamed: 0,name,math,english,reading,passing_math,passing_english
0,Sally,63,90,80,False,True
1,Jane,65,66,72,False,False
2,Suzie,60,86,78,False,True
3,Billy,71,76,77,True,True
4,Ada,94,66,61,True,False
5,John,70,74,87,True,True
6,Thomas,82,99,82,True,True
7,Marie,73,71,63,True,True
8,Albert,78,67,63,True,False
9,Richard,96,61,71,True,False


In [63]:
# sort_values defaults to ascending order; pass ascending=False to sort descending

df.sort_values(by='english', ascending=False)

Unnamed: 0,name,math,english,reading,passing_math
6,Thomas,82,99,82,True
10,Isaac,75,97,81,True
0,Sally,63,90,80,False
2,Suzie,60,86,78,False
11,Alan,87,85,85,True
3,Billy,71,76,77,True
5,John,70,74,87,True
7,Marie,73,71,63,True
8,Albert,78,67,63,True
1,Jane,65,66,72,False


In [64]:
#chaining multiple methods

#keep the students who scored above 90 in english, sort them ascending by that
#score, and return the name of the first (lowest-scoring) student in that group
(
    df.loc[df['english'] > 90]
    .sort_values(by='english')
    .head(1)['name']
)

10    Isaac
Name: name, dtype: object