In [1]:
#a pandas series can be queried either by the index position or the index label.

import pandas as pd



students_classes = {'Alice': 'Physics', 'Jack': 'Chemistry', 'Molly': 'English', 'Sam': 'History'}

s = pd.Series(students_classes)

s

Alice      Physics
Jack     Chemistry
Molly      English
Sam        History
dtype: object

# iloc and loc atributes in pandas

we don't use parentheses when we use iloc and loc atributes in pandas. Instead we use square brackets since they are atributes and not methods.

In [2]:
print(s.iloc[3]) #iloc is used to query by index position
print(s.loc['Jack']) #loc is used to query by index label


History
Chemistry


In [3]:
# pandas can determine whether we're intending to query by label or by position.

print(s[3]) #this code will call s.iloc[3] because it's an integer
print(s['Jack']) #this code will call s.loc['Jack'] because it's a string/object


History
Chemistry


if we have a series of integers pandas cannot know if we are refering to the index or the position of the element. In this case we use iloc to refer to the position of the element and loc to refer to the index of the element.

In [4]:
class_code = {99: 'Physics', 100: 'Chemistry', 101: 'English', 102: 'History'}

s = pd.Series(class_code)

print(s.iloc[0]) 
print(s.loc[99]) 

Physics
Physics


# Working with the data

In [5]:
# getting the average of the grades

grades = pd.Series([90, 80, 70, 60])

total = 0

for grade in grades:
    total += grade

print(total/len(grades))

75.0


# Vectorization

Vectorization is the process of applying operations to a whole array instead of applying them to each element of the array. This is a very important concept in pandas and numpy since it allows us to work with big datasets in a very efficient way.

To put it in simple terms, if we have a pandas series and we want to add 1 to each element of the series we can do it in a single line of code. This is the power of vectorization.

The computer can execute multiple operations at the same time and this is why vectorization is so efficient.

In [6]:
import numpy as np

total = np.sum(grades)

print(total/len(grades))

75.0


In [7]:
# which is faster

numbers = pd.Series(np.random.randint(0, 1000, 10000)) 

#if a want a series of random real 

numbers.head()

0    461
1    177
2    952
3    498
4    209
dtype: int32

In [8]:
len(numbers)

10000

In [9]:
%%timeit -n 100 

total = 0

for number in numbers:
    total += number

total/len(numbers)

673 µs ± 13.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [10]:
%%timeit -n 100

total = np.sum(numbers)
total/len(numbers)

43.8 µs ± 4.88 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


# Broadcasting

In [11]:
numbers.head()

0    461
1    177
2    952
3    498
4    209
dtype: int32

In [12]:
numbers += 2
numbers.head()

0    463
1    179
2    954
3    500
4    211
dtype: int32

In [14]:
# Procedure way of iterating through a series
print(len(numbers))
# We can use the functin iteritems() to iterate through a series. This function returns a label and a value

for label, value in numbers.iteritems():
    numbers.at[label] = value + 2

numbers.head()

10000


0    467
1    183
2    958
3    504
4    215
dtype: int32

## Which of the above broadcasting procedures is faster

In [28]:
%%timeit -n 10 #it can run more than 10 times. 
s = pd.Series(np.random.randint(0, 1000, 1000))

for label, value in s.iteritems():
    s.loc[label] = value + 2  #we use loc instead of iloc because this is a series of integers

print(s.head())

0    433
1     86
2    896
3    432
4    311
dtype: int32
0    628
1     96
2    655
3     75
4    266
dtype: int32
0    953
1    560
2    610
3    484
4    908
dtype: int32
0    958
1    380
2    623
3    864
4    546
dtype: int32
0    929
1    288
2    249
3    467
4     63
dtype: int32
0    164
1    467
2    819
3    699
4    877
dtype: int32
0    976
1    548
2    722
3    975
4    432
dtype: int32
0    723
1    713
2     14
3    675
4    993
dtype: int32
0    916
1    876
2     66
3    470
4    432
dtype: int32
0    862
1    296
2    714
3    608
4    443
dtype: int32
0    987
1    185
2    627
3    255
4    269
dtype: int32
0    406
1    995
2    878
3     11
4     59
dtype: int32
0    158
1    830
2    720
3    425
4    184
dtype: int32
0     40
1    237
2    220
3     88
4    872
dtype: int32
0     749
1     732
2     937
3    1000
4     811
dtype: int32
0    335
1    741
2    766
3    770
4    391
dtype: int32
0     83
1     75
2    777
3    179
4    415
dtype: int32
0    597


In [34]:
%%timeit -n 10 -r 1 #we use -r 1 to run the code only once (10 repetitions)
s = pd.Series(np.random.randint(0, 1000, 1000))

for index, value in s.iteritems():
    s.loc[index] = value + 2  #we use loc instead of iloc because this is a series of integers

print(s.head())

0    117
1    769
2    900
3    445
4    446
dtype: int32
0    748
1    234
2    499
3    299
4     87
dtype: int32
0    178
1    947
2    445
3    229
4    306
dtype: int32
0    154
1    109
2    884
3    646
4    637
dtype: int32
0    719
1    118
2    793
3    352
4    231
dtype: int32
0    782
1    138
2    967
3    993
4    777
dtype: int32
0    915
1    938
2    769
3    336
4    697
dtype: int32
0    995
1    909
2    680
3    795
4    940
dtype: int32
0     22
1    676
2    595
3    895
4     75
dtype: int32
0    105
1    804
2    925
3    567
4    858
dtype: int32
17 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 10 loops each)


In [38]:
%%timeit -n 10 #-r 1 is optional if I want it to run exactly 10 times


s = pd.Series(np.random.randint(0, 1000, 1000))

s += 2

print(s.head())

0    803
1    778
2    533
3    690
4    314
dtype: int32
0     83
1    351
2    213
3    842
4    365
dtype: int32
0    734
1    979
2    792
3    532
4    731
dtype: int32
0    911
1     86
2    824
3    249
4    203
dtype: int32
0    839
1    543
2    910
3    927
4    216
dtype: int32
0    382
1    306
2    412
3    653
4    752
dtype: int32
0    924
1    130
2    426
3    112
4     34
dtype: int32
0    739
1    183
2    836
3    427
4    737
dtype: int32
0    498
1    901
2    680
3    317
4    458
dtype: int32
0    117
1    677
2    439
3    115
4    960
dtype: int32
0    291
1    201
2    642
3    836
4    838
dtype: int32
0    324
1    943
2    260
3    665
4    648
dtype: int32
0     81
1      4
2    376
3     42
4    364
dtype: int32
0    711
1    323
2    187
3    867
4    705
dtype: int32
0    794
1    725
2    465
3    748
4    295
dtype: int32
0    392
1    452
2    695
3    551
4    525
dtype: int32
0     13
1    957
2    622
3    215
4    273
dtype: int32
0    644
1    

# If the index doesn't exist a new entry is added

In [40]:
s = pd.Series([1, 2, 3])

s.loc['History'] = 102

s

0            1
1            2
2            3
History    102
dtype: int64

# When the index is not unique

In [42]:
students_classes = pd.Series({'Alice': 'Physics', 'Jack': 'Chemistry', 'Molly': 'English', 'Sam': 'History'})

students_classes

Alice      Physics
Jack     Chemistry
Molly      English
Sam        History
dtype: object

In [43]:
kelly_classes = pd.Series(['Philosophy', 'Arts', 'Math'], index=['Kelly', 'Kelly', 'Kelly'])
kelly_classes

Kelly    Philosophy
Kelly          Arts
Kelly          Math
dtype: object

In [53]:
all_students_classes = students_classes.append(kelly_classes)
all_students_classes

  all_students_classes = students_classes.append(kelly_classes)


Alice       Physics
Jack      Chemistry
Molly       English
Sam         History
Kelly    Philosophy
Kelly          Arts
Kelly          Math
dtype: object

In [51]:
all_students_classes = pd.concat([students_classes, kelly_classes])
all_students_classes


Alice       Physics
Jack      Chemistry
Molly       English
Sam         History
Kelly    Philosophy
Kelly          Arts
Kelly          Math
dtype: object

In [55]:
# original series is not changed
students_classes

Alice      Physics
Jack     Chemistry
Molly      English
Sam        History
dtype: object

In [56]:
all_students_classes.loc['Kelly']

Kelly    Philosophy
Kelly          Arts
Kelly          Math
dtype: object