# Introduction to Pandas

## Keywords

Series, querying a series, parallelization, drop function, renaming columns, loading csv files

In [262]:
%reset -f
import pandas as pd
import numpy as np

In [263]:
# Two names plus a missing entry (None) to show how pandas stores missing strings
students = ["Ruslan", "Masinjila", None]

In [264]:
pd.Series(students)

0       Ruslan
1    Masinjila
2         None
dtype: object

In [265]:
# Integers 1..5 followed by a missing value
numbers = [*range(1, 6), None]

pd.Series(numbers)

0    1.0
1    2.0
2    3.0
3    4.0
4    5.0
5    NaN
dtype: float64

In [266]:
# In a numeric Series, a missing value (None) is stored as NaN, which means "not a number".
# NaN is a floating-point value, so the integers above were converted to floats.
# If integer data unexpectedly becomes float, it is likely because some of the values are missing (NaN).
# NaN is not equivalent to None.



In [267]:
np.nan == None

False

In [268]:
# Likewise, the following is false
np.nan == np.nan


False

In [269]:
# Instead, it should be tested as follows
np.isnan(np.nan)

True

In [270]:
# Build a Series from a dictionary: the keys become the index labels

student_scores = dict(
    Alice='Physics',
    Bob='Chemistry',
    Ruslan='Mathematics',
)

s = pd.Series(data=student_scores)
s

Alice         Physics
Bob         Chemistry
Ruslan    Mathematics
dtype: object

In [271]:
# Get the indices
s.index

Index(['Alice', 'Bob', 'Ruslan'], dtype='object')

In [272]:
# Build a Series from a list of values and a separate list of index labels
# NOTE(review): ' Chemisty' (leading space, misspelled) is kept verbatim so
# that the recorded output of this cell still matches the code

indices = ['Ruslan', 'Bob', 'Alice']
values = ['Mathematics', ' Chemisty', 'Physics']

pd.Series(data=values, index=indices)

Ruslan    Mathematics
Bob          Chemisty
Alice         Physics
dtype: object

In [273]:
# The index can be used to specify which keys from a dictionary to align with
# Example
pd.Series(student_scores, index = ['Ruslan','Jamila'])

Ruslan    Mathematics
Jamila            NaN
dtype: object

# Querying a series

In [274]:
# A Series can be queried either by index position or by index label.

# Use iloc attribute for numerical querying starting at 0
# Use loc attribute to query by index label



In [275]:
# Map each student to the class they take; the names become the index labels
students_classes = dict(
    Alice='Physics',
    Jack='Chemistry',
    Moly='English',
    Sam='History',
)

s = pd.Series(data=students_classes)
s

Alice      Physics
Jack     Chemistry
Moly       English
Sam        History
dtype: object

In [276]:
# Fourth row by position, using iloc
print(s.iloc[3])


# A bare integer inside [] on a label-indexed Series used to fall back to a
# positional lookup, but that fallback is deprecated in recent pandas
# releases; spell the positional access out with iloc instead
print(s.iloc[3])


# Fourth row by label, using loc
print(s.loc['Sam'])

# With a label, [] behaves like loc
print(s['Sam'])

History
History
History
History


In [277]:
# When the index labels are themselves integers, be explicit about the intent:
# use .loc for labels and .iloc for positions
# For example
subjects = ['Physics', 'Chemistry', 'Maths']
codes = list(range(100, 103))

codes_subjects = pd.Series(data=subjects, index=codes)

codes_subjects

100      Physics
101    Chemistry
102        Maths
dtype: object

In [278]:
# Aggregate a whole Series at once with a NumPy function

grades = pd.Series(data=[90, 80, 70, 60])

np.sum(grades)

300

In [279]:
numbers = np.random.randint(0,1000,10000)


In [280]:
%%timeit -n 100

np.sum(numbers)/len(numbers)

The slowest run took 8.41 times longer than the fastest. This could mean that an intermediate result is being cached.
22 µs ± 26.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [281]:
# Broadcasting

print(numbers) 

numbers += 2

print(numbers)

[261 445 892 ... 851 181 881]
[263 447 894 ... 853 183 883]


In [282]:
# Assigning through .loc with a label that does not exist yet appends a new
# entry under that label.
# For example

s = pd.Series(data=[10, 20, 30])
s
s.loc['History'] = 40
s

0          10
1          20
2          30
History    40
dtype: int64

In [283]:
# Rebuild the student -> class Series (s was overwritten in the previous cell)
students_classes = {
    'Alice': 'Physics',
    'Jack': 'Chemistry',
    'Moly': 'English',
    'Sam': 'History',
}

s = pd.Series(data=students_classes)
s

Alice      Physics
Jack     Chemistry
Moly       English
Sam        History
dtype: object

In [284]:
# Index labels do not have to be unique: all three classes belong to 'Kelly'
kelly_classes = pd.Series(
    data=['Philosophy', 'Arts', 'Maths'],
    index=['Kelly'] * 3,
)
kelly_classes

Kelly    Philosophy
Kelly          Arts
Kelly         Maths
dtype: object

In [285]:
# Series.append was removed in pandas 2.0; pd.concat is the supported way to
# join Series. It returns a new Series and leaves both inputs unchanged.
all_students_classes = pd.concat([s, kelly_classes])
all_students_classes

Alice       Physics
Jack      Chemistry
Moly        English
Sam         History
Kelly    Philosophy
Kelly          Arts
Kelly         Maths
dtype: object

In [286]:
all_students_classes.loc['Kelly']

Kelly    Philosophy
Kelly          Arts
Kelly         Maths
dtype: object

# Pandas Dataframes

In [287]:
# Three student records, each stored as a Series with the same labels
record1 = pd.Series(data={'Name': 'Alice', 'Class': 'Physics', 'Score': 85})

record2 = pd.Series(data={'Name': 'Jack', 'Class': 'Chemistry', 'Score': 23})

record3 = pd.Series(data={'Name': 'Helen', 'Class': 'Biology', 'Score': 90})



# Build a DataFrame from a list of Series: each Series becomes one row and the
# index argument supplies the row labels (note that 'school1' is used twice)
df = pd.DataFrame(
    data=[record1, record2, record3],
    index=['school1', 'school2', 'school1'],
)

# Select rows by label with .loc
df.loc['school1']


# 'school2' labels a single row, so the result is a Series
type(df.loc['school2'])

# Row label plus column name; because 'school1' is duplicated this returns a
# Series of two values rather than a single cell
df.loc['school1', 'Class']

# Select the Name column
df['Name']


# A single DataFrame column is a pandas Series
type(df['Name'])

# Two rows share the label 'school1', so this is a DataFrame
df.loc['school1']

# Adding a column name narrows the result to a Series
df.loc['school1', 'Name']

# .loc accepts a row selector together with a list of column names
df.loc[:, ['Name', 'Class']]


Unnamed: 0,Name,Class
school1,Alice,Physics
school2,Jack,Chemistry
school1,Helen,Biology


In [288]:
# Dropping rows
df.drop('school1')


# To make changes happen, the result needs to be assigned back to the dataframe
# df = df.drop('school1')

Unnamed: 0,Name,Class,Score
school2,Jack,Chemistry,23


In [289]:
# the drop function can have optional parameters
# inplace does the dropping on the dataframe directly, otherwise it returns a copy
# axis (default = 0 which means rows) indicates whether the rows or the columns are dropped

In [290]:
# Work on a copy so that df itself keeps all of its columns
copy_df = df.copy()

# axis=1 targets columns; inplace=True modifies copy_df directly
copy_df.drop(labels='Name', axis=1, inplace=True)

In [291]:
# A column can also be removed using the del operator
# For example
del copy_df['Class']

copy_df

Unnamed: 0,Score
school1,85
school2,23
school1,90


In [292]:
# New columns can be created via broadcasting
df['ClassRanking'] = None
df

Unnamed: 0,Name,Class,Score,ClassRanking
school1,Alice,Physics,85,
school2,Jack,Chemistry,23,
school1,Helen,Biology,90,


# DataFrame Indexing and Loading

In [293]:
#!cat 'datasets/Admission_Predict.csv'


# Load the CSV file into a DataFrame, using its first column as the index
df = pd.read_csv('datasets/Admission_Predict.csv', index_col=0)

df

# Renaming columns
# Inspect the exact column names first: the 'LOR ' key below only matches
# because it includes the trailing space
df.columns
df = df.rename(
    columns={
        'LOR ': 'Letter of Recommendation',
        'SOP': 'Statement of Purpose',
    }
)

# Strip leading/trailing whitespace from every column name
df = df.rename(columns=str.strip)

df



Unnamed: 0_level_0,GRE Score,TOEFL Score,University Rating,Statement of Purpose,Letter of Recommendation,CGPA,Research,Chance of Admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,337,118,4,4.5,4.5,9.65,1,0.92
2,324,107,4,4.0,4.5,8.87,1,0.76
3,316,104,3,3.0,3.5,8.00,1,0.72
4,322,110,3,3.5,2.5,8.67,1,0.80
5,314,103,2,2.0,3.0,8.21,0,0.65
...,...,...,...,...,...,...,...,...
396,324,110,3,3.5,3.5,9.04,1,0.82
397,325,107,3,3.0,3.5,9.11,1,0.84
398,330,116,4,5.0,4.5,9.45,1,0.91
399,312,103,3,3.5,4.0,8.78,0,0.67


In [294]:
# The method above is not reliable. What if the space was a tab, or double spaces?
# The solution is to clean with a function.

# Note that rename function makes a copy of the dataframe

In [295]:
# Normalise every column name: lower-case it and strip surrounding whitespace

cols = [name.lower().strip() for name in df.columns]

df.columns = cols
