# Introduction to Pandas

## Keywords

Series, querying a series, parallelization, drop function, renaming columns, loading csv files, inplace modification, sort by index and by value, set index, loc and iloc

In [1]:
%reset -f
import pandas as pd
import numpy as np

In [2]:
students = ['Ruslan','Masinjila',None]

In [3]:
pd.Series(students)

0       Ruslan
1    Masinjila
2         None
dtype: object

In [4]:
numbers = [1,2,3,4,5,None]

pd.Series(numbers)

0    1.0
1    2.0
2    3.0
3    4.0
4    5.0
5    NaN
dtype: float64

In [5]:
# For floating points, the None type is NaN which means not a number
# The numbers are now floats. Pandas represents NaN as floating point numbers. 
# If the data of integers changes to floats, it is likely because some of the data is NaN
# NaN is not equivalent to None



In [6]:
np.nan == None

False

In [7]:
# Likewise, the following is false
np.nan == np.nan


False

In [8]:
# Instead, it should be tested as follows
np.isnan(np.nan)

True

In [9]:
# Series from dictionaries

student_scores = {'Alice': 'Physics',
                  'Bob' :   'Chemistry',
                  'Ruslan': 'Mathematics'

                    }

s = pd.Series(student_scores)
s

Alice         Physics
Bob         Chemistry
Ruslan    Mathematics
dtype: object

In [10]:
# Get the indices
s.index

Index(['Alice', 'Bob', 'Ruslan'], dtype='object')

In [11]:
# Series with indices

indices = ['Ruslan', 'Bob', 'Alice']
values = ['Mathematics', ' Chemisty','Physics']

pd.Series(values, index = indices)

Ruslan    Mathematics
Bob          Chemisty
Alice         Physics
dtype: object

In [12]:
# The index can be used to specify which keys from a dictionary to align with
# Example
pd.Series(student_scores, index = ['Ruslan','Jamila'])

Ruslan    Mathematics
Jamila            NaN
dtype: object

# Querying a series

In [13]:
# Querying can be done by the index position  or a label.

# Use iloc attribute for numerical querying starting at 0
# Use loc attribute to query by index label



In [14]:
# Students
students_classes = { 'Alice' : 'Physics',
                      'Jack' : 'Chemistry',
                      'Moly' : 'English',
                      'Sam'  : 'History'

                        }

s = pd.Series(students_classes)
s

Alice      Physics
Jack     Chemistry
Moly       English
Sam        History
dtype: object

In [15]:
# Fourth row using iloc
print(s.iloc[3])


# Pandas takes this as iloc
print(s[3])


# Fourth row using loc
print(s.loc['Sam'])

# Pandas reads this as loc
print(s['Sam'])

History
History
History
History


In [54]:
# If the index is a list of integers, then we need to be specific with iloc and loc
# For example
subjects= ['Physics', 'Chemistry','Maths']
codes = [100,101,102]

codes_subjects = pd.Series(subjects, index = codes)

codes_subjects

'Physics'

In [17]:
# Accessing data

grades = pd.Series([90,80,70,60])

np.sum(grades)

300

In [18]:
numbers = np.random.randint(0,1000,10000)


In [19]:
%%timeit -n 100

np.sum(numbers)/len(numbers)

12.6 µs ± 1.29 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [20]:
# Broadcasting

print(numbers) 

numbers += 2

print(numbers)

[ 62 435 162 ... 446 781 917]
[ 64 437 164 ... 448 783 919]


In [21]:
# .loc can also be used to add new index if the index does not exist.
# For example

s = pd.Series([10,20,30])
s
s.loc['History'] = 40
s

0          10
1          20
2          30
History    40
dtype: int64

In [22]:
# Students
students_classes = { 'Alice' : 'Physics',
                      'Jack' : 'Chemistry',
                      'Moly' : 'English',
                      'Sam'  : 'History'

                        }

s = pd.Series(students_classes)
s

Alice      Physics
Jack     Chemistry
Moly       English
Sam        History
dtype: object

In [23]:
kelly_classes = pd.Series(['Philosophy','Arts','Maths'],index= ['Kelly','Kelly','Kelly'])
kelly_classes

Kelly    Philosophy
Kelly          Arts
Kelly         Maths
dtype: object

In [24]:
all_students_classes = s.append(kelly_classes)
all_students_classes

Alice       Physics
Jack      Chemistry
Moly        English
Sam         History
Kelly    Philosophy
Kelly          Arts
Kelly         Maths
dtype: object

In [25]:
all_students_classes.loc['Kelly']

Kelly    Philosophy
Kelly          Arts
Kelly         Maths
dtype: object

# Pandas Dataframes

In [26]:
record1 = pd.Series({
                    'Name' : 'Alice',
                    'Class': 'Physics',
                    'Score': 85
                    })

record2 = pd.Series({
                    'Name' : 'Jack',
                    'Class': 'Chemistry',
                    'Score': 23
                    })

record3 = pd.Series({
                    'Name' : 'Helen',
                    'Class': 'Biology',
                    'Score': 90
                    })



# Creating a dataframe from a list of dictionaries
df = pd.DataFrame([record1,record2,record3],index= ['school1','school2','school1'])

# Accessing via .loc
df.loc['school1']


# Checking the type of the returned element
type(df.loc['school2'])

# Locking the exact cell
df.loc['school1','Class']

# Getting the names column
df['Name']


# Type of a Pandas Column is also a pandas series
type(df['Name'])

# This is a dataframe
df.loc['school1']

# But this is pandas series
df.loc['school1','Name']

# .loc allows lists of rows and indices
df.loc[:,['Name','Class']]


Unnamed: 0,Name,Class
school1,Alice,Physics
school2,Jack,Chemistry
school1,Helen,Biology


In [27]:
# Dropping rows
df.drop('school1')


# To make changes happen, the result needs to be assigned back to the dataframe
# df = df.drop('school1')

Unnamed: 0,Name,Class,Score
school2,Jack,Chemistry,23


In [28]:
# the drop function can have optional parameters
# inplace does the dropping on the dataframe directly, otherwise it returns a copy
# axis (default = 0 which means rows) indicates whether the rows or the columns are dropped

In [29]:
copy_df = df.copy()

copy_df.drop('Name', inplace = True, axis = 1)

In [30]:
# A columns can also be removed using del operator
# For example
del copy_df['Class']

copy_df

Unnamed: 0,Score
school1,85
school2,23
school1,90


In [31]:
# New columns can be created via broadcasting
df['ClassRanking'] = None
df

Unnamed: 0,Name,Class,Score,ClassRanking
school1,Alice,Physics,85,
school2,Jack,Chemistry,23,
school1,Helen,Biology,90,


# DataFrame Indexing and Loading

In [58]:
#!cat 'datasets/Admission_Predict.csv'


# Read CSV file in a DataFrame
df=pd.read_csv('datasets/Admission_Predict.csv',index_col=0)

df

# Renaming of the columns
# Make sure you check the names of the columns well using the following attribute
df.columns
df = df.rename(columns = {'LOR ': 'Letter of Recommendation', 
                          'SOP':'Statement of Purpose'
                         })

df = df.rename(mapper = str.strip, axis = 'columns')

df



In [33]:
# The method above is not reliable. What if the space was a tab, or double spaces?
# The solution is to clean with a function.

# Note that rename function makes a copy of the dataframe

In [34]:
# Get the list of coumns

cols = list(df.columns)
cols = [x.lower().strip() for x in cols]

df.columns = cols


# Missing Values

In [35]:
df = pd.read_csv('datasets/class_grades.csv')
df.head(10)

Unnamed: 0,Prefix,Assignment,Tutorial,Midterm,TakeHome,Final
0,5,57.14,34.09,64.38,51.48,52.5
1,8,95.05,105.49,67.5,99.07,68.33
2,8,83.7,83.17,,63.15,48.89
3,7,,,49.38,105.93,80.56
4,8,91.32,93.64,95.0,107.41,73.89
5,7,95.0,92.58,93.12,97.78,68.06
6,8,95.05,102.99,56.25,99.07,50.0
7,7,72.85,86.85,60.0,,56.11
8,8,84.26,93.1,47.5,18.52,50.83
9,7,90.1,97.55,51.25,88.89,63.61


In [36]:
mask = df.isnull()

mask.head(10)


# dropna() is used to drop columns with NaNs
df.dropna().head(10)

# Filling NaNs with something else

df.fillna(0, inplace=True)

df.head()


Unnamed: 0,Prefix,Assignment,Tutorial,Midterm,TakeHome,Final
0,5,57.14,34.09,64.38,51.48,52.5
1,8,95.05,105.49,67.5,99.07,68.33
2,8,83.7,83.17,0.0,63.15,48.89
3,7,0.0,0.0,49.38,105.93,80.56
4,8,91.32,93.64,95.0,107.41,73.89


In [37]:
# It is sometimes useful to have missing values as actually having some information

df = pd.read_csv('datasets/log.csv')

df.head()

Unnamed: 0,time,user,video,playback position,paused,volume
0,1469974424,cheryl,intro.html,5,False,10.0
1,1469974454,cheryl,intro.html,6,,
2,1469974544,cheryl,intro.html,9,,
3,1469974574,cheryl,intro.html,10,,
4,1469977514,bob,intro.html,1,,


In [38]:
# filling can be done with ffill (forward fill) or bfill (backward fill)

# In Pandas we can sort by index or by value.

df = df.set_index('time')
df = df.sort_index()




In [39]:
df.head(20)

Unnamed: 0_level_0,user,video,playback position,paused,volume
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1469974424,cheryl,intro.html,5,False,10.0
1469974424,sue,advanced.html,23,False,10.0
1469974454,cheryl,intro.html,6,,
1469974454,sue,advanced.html,24,,
1469974484,cheryl,intro.html,7,,
1469974514,cheryl,intro.html,8,,
1469974524,sue,advanced.html,25,,
1469974544,cheryl,intro.html,9,,
1469974554,sue,advanced.html,26,,
1469974574,cheryl,intro.html,10,,


In [40]:
# If you look at the index, you will note that it is not common.

# MUTLI-LEVEL INDEXING ON TIME AND USER TOGETHER and promote the username to second level index.


# Bring back the original index
df=df.reset_index()

df=df.set_index(['time','user'])




In [41]:
# Multi level indexing
df.loc[(1469974424, 'cheryl')]

video                intro.html
playback position             5
paused                    False
volume                       10
Name: (1469974424, cheryl), dtype: object

In [42]:
# Fill in the missing values


print(df)
df = df.fillna(method = 'ffill')
print(df)

                           video  playback position paused  volume
time       user                                                   
1469974424 cheryl     intro.html                  5  False    10.0
           sue     advanced.html                 23  False    10.0
1469974454 cheryl     intro.html                  6    NaN     NaN
           sue     advanced.html                 24    NaN     NaN
1469974484 cheryl     intro.html                  7    NaN     NaN
1469974514 cheryl     intro.html                  8    NaN     NaN
1469974524 sue     advanced.html                 25    NaN     NaN
1469974544 cheryl     intro.html                  9    NaN     NaN
1469974554 sue     advanced.html                 26    NaN     NaN
1469974574 cheryl     intro.html                 10    NaN     NaN
1469974604 cheryl     intro.html                 11    NaN     NaN
1469974624 sue     advanced.html                 27    NaN     NaN
1469974634 cheryl     intro.html                 12    NaN    

In [43]:
df = pd.DataFrame({
        'A' : [1,1,2,3,4],
        'B' : [3,6,3,8,9],
        'C' : ['a','b','c','d','c']})

# Replaceing values

df.replace(1,100)
df.replace([1,3],[100,300])

Unnamed: 0,A,B,C
0,100,300,a
1,100,6,b
2,2,300,c
3,300,8,d
4,4,9,c


In [44]:
# Replacement using regex

df = pd.read_csv('datasets/log.csv')

df.head()

Unnamed: 0,time,user,video,playback position,paused,volume
0,1469974424,cheryl,intro.html,5,False,10.0
1,1469974454,cheryl,intro.html,6,,
2,1469974544,cheryl,intro.html,9,,
3,1469974574,cheryl,intro.html,10,,
4,1469977514,bob,intro.html,1,,


In [45]:
df.replace(to_replace='.*.html',value='webpage',regex=True)

Unnamed: 0,time,user,video,playback position,paused,volume
0,1469974424,cheryl,webpage,5,False,10.0
1,1469974454,cheryl,webpage,6,,
2,1469974544,cheryl,webpage,9,,
3,1469974574,cheryl,webpage,10,,
4,1469977514,bob,webpage,1,,
5,1469977544,bob,webpage,1,,
6,1469977574,bob,webpage,1,,
7,1469977604,bob,webpage,1,,
8,1469974604,cheryl,webpage,11,,
9,1469974694,cheryl,webpage,14,,


# Basic Dataset cleaning

In [46]:
df = pd.read_csv('datasets/presidents.csv')

df.head()

Unnamed: 0,#,President,Born,Age atstart of presidency,Age atend of presidency,Post-presidencytimespan,Died,Age
0,1,George Washington,"Feb 22, 1732[a]","57 years, 67 daysApr 30, 1789","65 years, 10 daysMar 4, 1797","2 years, 285 days","Dec 14, 1799","67 years, 295 days"
1,2,John Adams,"Oct 30, 1735[a]","61 years, 125 daysMar 4, 1797","65 years, 125 daysMar 4, 1801","25 years, 122 days","Jul 4, 1826","90 years, 247 days"
2,3,Thomas Jefferson,"Apr 13, 1743[a]","57 years, 325 daysMar 4, 1801","65 years, 325 daysMar 4, 1809","17 years, 122 days","Jul 4, 1826","83 years, 82 days"
3,4,James Madison,"Mar 16, 1751[a]","57 years, 353 daysMar 4, 1809","65 years, 353 daysMar 4, 1817","19 years, 116 days","Jun 28, 1836","85 years, 104 days"
4,5,James Monroe,"Apr 28, 1758","58 years, 310 daysMar 4, 1817","66 years, 310 daysMar 4, 1825","6 years, 122 days","Jul 4, 1831","73 years, 67 days"


In [47]:
#df['First'] = df['President'].replace('[\s].*','',regex=True)

#df['First']

# The above method words, but use the apply function

def splitname(row):
    row['First'] = row['President'].split(" ")[0]
    row['Last'] = row['President'].split(" ")[-1]
    
    return row

#df = df.apply(lambda row: splitname(row),axis=1)

# Or call the function without input (because row is the default input)
df = df.apply(splitname,axis=1)


df.head()

Unnamed: 0,#,President,Born,Age atstart of presidency,Age atend of presidency,Post-presidencytimespan,Died,Age,First,Last
0,1,George Washington,"Feb 22, 1732[a]","57 years, 67 daysApr 30, 1789","65 years, 10 daysMar 4, 1797","2 years, 285 days","Dec 14, 1799","67 years, 295 days",George,Washington
1,2,John Adams,"Oct 30, 1735[a]","61 years, 125 daysMar 4, 1797","65 years, 125 daysMar 4, 1801","25 years, 122 days","Jul 4, 1826","90 years, 247 days",John,Adams
2,3,Thomas Jefferson,"Apr 13, 1743[a]","57 years, 325 daysMar 4, 1801","65 years, 325 daysMar 4, 1809","17 years, 122 days","Jul 4, 1826","83 years, 82 days",Thomas,Jefferson
3,4,James Madison,"Mar 16, 1751[a]","57 years, 353 daysMar 4, 1809","65 years, 353 daysMar 4, 1817","19 years, 116 days","Jun 28, 1836","85 years, 104 days",James,Madison
4,5,James Monroe,"Apr 28, 1758","58 years, 310 daysMar 4, 1817","66 years, 310 daysMar 4, 1825","6 years, 122 days","Jul 4, 1831","73 years, 67 days",James,Monroe


In [48]:
# Drop the First and Last names

del df['First']
del df['Last']



In [49]:
pattern  = '(?P<First>^[\w]*)(?:.* )(?P<Last>[\w]*$)'

names = df['President'].str.extract(pattern).head()

df['First']=names['First']
df['Last']=names['Last']

df['Born'] = df['Born'].str.extract('([\w]{3}\s[0-9]{1,2},\s[0-9]{4})')

df

Unnamed: 0,#,President,Born,Age atstart of presidency,Age atend of presidency,Post-presidencytimespan,Died,Age,First,Last
0,1,George Washington,"Feb 22, 1732","57 years, 67 daysApr 30, 1789","65 years, 10 daysMar 4, 1797","2 years, 285 days","Dec 14, 1799","67 years, 295 days",George,Washington
1,2,John Adams,"Oct 30, 1735","61 years, 125 daysMar 4, 1797","65 years, 125 daysMar 4, 1801","25 years, 122 days","Jul 4, 1826","90 years, 247 days",John,Adams
2,3,Thomas Jefferson,"Apr 13, 1743","57 years, 325 daysMar 4, 1801","65 years, 325 daysMar 4, 1809","17 years, 122 days","Jul 4, 1826","83 years, 82 days",Thomas,Jefferson
3,4,James Madison,"Mar 16, 1751","57 years, 353 daysMar 4, 1809","65 years, 353 daysMar 4, 1817","19 years, 116 days","Jun 28, 1836","85 years, 104 days",James,Madison
4,5,James Monroe,"Apr 28, 1758","58 years, 310 daysMar 4, 1817","66 years, 310 daysMar 4, 1825","6 years, 122 days","Jul 4, 1831","73 years, 67 days",James,Monroe
5,6,John Quincy Adams,"Jul 11, 1767","57 years, 236 daysMar 4, 1825","61 years, 236 daysMar 4, 1829","18 years, 356 days","Feb 23, 1848","80 years, 227 days",,
6,7,Andrew Jackson,"Mar 15, 1767","61 years, 354 daysMar 4, 1829","69 years, 354 daysMar 4, 1837","8 years, 96 days","Jun 8, 1845","78 years, 85 days",,
7,8,Martin Van Buren,"Dec 5, 1782","54 years, 89 daysMar 4, 1837","58 years, 89 daysMar 4, 1841","21 years, 142 days","Jul 24, 1862","79 years, 231 days",,
8,9,William H. Harrison,"Feb 9, 1773","68 years, 23 daysMar 4, 1841","68 years, 54 days Apr 4, 1841[b]",,"Apr 4, 1841","68 years, 54 days",,
9,10,John Tyler,"Mar 29, 1790","51 years, 6 daysApr 4, 1841","54 years, 340 daysMar 4, 1845","16 years, 320 days","Jan 18, 1862","71 years, 295 days",,


In [50]:
# Convert the born column to datetime

df['Born'] = pd.to_datetime(df['Born'])

df

Unnamed: 0,#,President,Born,Age atstart of presidency,Age atend of presidency,Post-presidencytimespan,Died,Age,First,Last
0,1,George Washington,1732-02-22,"57 years, 67 daysApr 30, 1789","65 years, 10 daysMar 4, 1797","2 years, 285 days","Dec 14, 1799","67 years, 295 days",George,Washington
1,2,John Adams,1735-10-30,"61 years, 125 daysMar 4, 1797","65 years, 125 daysMar 4, 1801","25 years, 122 days","Jul 4, 1826","90 years, 247 days",John,Adams
2,3,Thomas Jefferson,1743-04-13,"57 years, 325 daysMar 4, 1801","65 years, 325 daysMar 4, 1809","17 years, 122 days","Jul 4, 1826","83 years, 82 days",Thomas,Jefferson
3,4,James Madison,1751-03-16,"57 years, 353 daysMar 4, 1809","65 years, 353 daysMar 4, 1817","19 years, 116 days","Jun 28, 1836","85 years, 104 days",James,Madison
4,5,James Monroe,1758-04-28,"58 years, 310 daysMar 4, 1817","66 years, 310 daysMar 4, 1825","6 years, 122 days","Jul 4, 1831","73 years, 67 days",James,Monroe
5,6,John Quincy Adams,1767-07-11,"57 years, 236 daysMar 4, 1825","61 years, 236 daysMar 4, 1829","18 years, 356 days","Feb 23, 1848","80 years, 227 days",,
6,7,Andrew Jackson,1767-03-15,"61 years, 354 daysMar 4, 1829","69 years, 354 daysMar 4, 1837","8 years, 96 days","Jun 8, 1845","78 years, 85 days",,
7,8,Martin Van Buren,1782-12-05,"54 years, 89 daysMar 4, 1837","58 years, 89 daysMar 4, 1841","21 years, 142 days","Jul 24, 1862","79 years, 231 days",,
8,9,William H. Harrison,1773-02-09,"68 years, 23 daysMar 4, 1841","68 years, 54 days Apr 4, 1841[b]",,"Apr 4, 1841","68 years, 54 days",,
9,10,John Tyler,1790-03-29,"51 years, 6 daysApr 4, 1841","54 years, 340 daysMar 4, 1845","16 years, 320 days","Jan 18, 1862","71 years, 295 days",,
