# Python dictionaries

In [1]:
d = {'a': 1, 'b': 2}

In [2]:
print(d)

{'a': 1, 'b': 2}


In [3]:
print(list(d))

['a', 'b']


In [4]:
d['a']

1

In [5]:
d.keys()

dict_keys(['a', 'b'])

In [6]:
# Another way to create the dictionary
d = dict(a=1, b=2)

In [7]:
d

{'a': 1, 'b': 2}

In [8]:
# Add a list as the value of a new key 'c'
d.update({'c': [1, 2, 3]})

In [9]:
d

{'a': 1, 'b': 2, 'c': [1, 2, 3]}

In [10]:
import pandas as pd

In [12]:
df = pd.DataFrame(d)

In [13]:
# pandas repeated the scalar values of a and b to match the length of the list c.
# That's how pandas behaves: it does not raise an error.
df

Unnamed: 0,a,b,c
0,1,2,1
1,1,2,2
2,1,2,3


In [14]:
df.index

RangeIndex(start=0, stop=3, step=1)

In [16]:
# stop is not included, so the length is simply stop - start (here 3 - 0 = 3)
len(df.index)


3

In [17]:
df.index[1]

1

In [21]:
df.iloc[1]

a    1
b    2
c    2
Name: 1, dtype: int64

In [22]:
# The output above is the second row, even though pandas displays it vertically like a column
df.iloc[:2]
# Gives the first two rows. No number before the colon means start at 0

Unnamed: 0,a,b,c
0,1,2,1
1,1,2,2


In [23]:
# Default values
df.iloc[:]

Unnamed: 0,a,b,c
0,1,2,1
1,1,2,2
2,1,2,3


In [24]:
# Start, stop, step so this will give every other row
df.iloc[::2]

Unnamed: 0,a,b,c
0,1,2,1
2,1,2,3


In [25]:
# To get the same result
df.iloc[0::2]

Unnamed: 0,a,b,c
0,1,2,1
2,1,2,3


In [33]:
# can use negative indexing
df.iloc[-2:]

Unnamed: 0,a,b,c
1,1,2,2
2,1,2,3


In [30]:
# Starting with (and including) the third to last one returns everything. Remember -1 gives last one
df.iloc[-3:]

Unnamed: 0,a,b,c
0,1,2,1
1,1,2,2
2,1,2,3


In [31]:
# To make it backwards, make a negative step
df.iloc[::-1]

Unnamed: 0,a,b,c
2,1,2,3
1,1,2,2
0,1,2,1


In [32]:
# A negative stop goes up to, but not including, the second-to-last row.
# Negative indices count from the end, so -1 always refers to the last element
df.iloc[:-2]

Unnamed: 0,a,b,c
0,1,2,1


In [36]:
list(range(3))

[0, 1, 2]

In [37]:
list(range(1,3))

[1, 2]

In [44]:
mylist = list(range(9))

In [45]:
mylist

[0, 1, 2, 3, 4, 5, 6, 7, 8]

In [41]:
# The reason for exclusive stop index is shown below. We can use the same reference to get two halves of the list
midpoint = 5

In [46]:
first = mylist[:midpoint] # The first 5 elements, whatever they are
second = mylist[midpoint:] # The rest

In [47]:
first, second

([0, 1, 2, 3, 4], [5, 6, 7, 8])

In [48]:
# If we wanted to do this by the actual number 50
mylist = list(range(45,55))

In [49]:
mylist

[45, 46, 47, 48, 49, 50, 51, 52, 53, 54]

In [51]:
mylist.index(50) # 50 sits at index 5, i.e. it is the 6th element

5

In [55]:
# By the way with slicing, if you are out of range you just get an empty list, no error
mylist[100:]

[]

In [56]:
# But indexing a position that doesn't exist raises an IndexError.
# The error is caught and its message printed so the demonstration is still
# visible while the notebook keeps running under "Restart & Run All".
try:
    mylist[100]
except IndexError as exc:
    print("IndexError:", exc)

IndexError: list index out of range

In [58]:
# import a dataset
from sklearn import datasets

In [59]:
datasets

<module 'sklearn.datasets' from '//anaconda3/lib/python3.7/site-packages/sklearn/datasets/__init__.py'>

In [62]:
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell presumably needs an older scikit-learn — confirm the installed version
b = datasets.load_boston()
b

{'data': array([[6.3200e-03, 1.8000e+01, 2.3100e+00, ..., 1.5300e+01, 3.9690e+02,
         4.9800e+00],
        [2.7310e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9690e+02,
         9.1400e+00],
        [2.7290e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9283e+02,
         4.0300e+00],
        ...,
        [6.0760e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
         5.6400e+00],
        [1.0959e-01, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9345e+02,
         6.4800e+00],
        [4.7410e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
         7.8800e+00]]),
 'target': array([24. , 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9, 15. ,
        18.9, 21.7, 20.4, 18.2, 19.9, 23.1, 17.5, 20.2, 18.2, 13.6, 19.6,
        15.2, 14.5, 15.6, 13.9, 16.6, 14.8, 18.4, 21. , 12.7, 14.5, 13.2,
        13.1, 13.5, 18.9, 20. , 21. , 24.7, 30.8, 34.9, 26.6, 25.3, 24.7,
        21.2, 19.3, 20. , 16.6, 14.4, 19.4, 19.7, 20.5, 25. , 23.4, 18.9,
        35.4, 24.7, 3

In [63]:
type(b)

sklearn.utils.Bunch

In [66]:
# bunch type but behaves a lot like a dictionary
b.keys()

dict_keys(['data', 'target', 'feature_names', 'DESCR', 'filename'])

In [67]:
# These are the keys: the things we can access in a few ways
b.data

array([[6.3200e-03, 1.8000e+01, 2.3100e+00, ..., 1.5300e+01, 3.9690e+02,
        4.9800e+00],
       [2.7310e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9690e+02,
        9.1400e+00],
       [2.7290e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9283e+02,
        4.0300e+00],
       ...,
       [6.0760e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
        5.6400e+00],
       [1.0959e-01, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9345e+02,
        6.4800e+00],
       [4.7410e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
        7.8800e+00]])

In [68]:
b['data'] # another way

array([[6.3200e-03, 1.8000e+01, 2.3100e+00, ..., 1.5300e+01, 3.9690e+02,
        4.9800e+00],
       [2.7310e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9690e+02,
        9.1400e+00],
       [2.7290e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9283e+02,
        4.0300e+00],
       ...,
       [6.0760e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
        5.6400e+00],
       [1.0959e-01, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9345e+02,
        6.4800e+00],
       [4.7410e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
        7.8800e+00]])

In [69]:
# Since these are equivalent
b.data == b['data']

array([[ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       ...,
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True]])

In [71]:
# Create a dataframe with this data
b.df = pd.DataFrame(b.data, columns = b.feature_names)

In [72]:
b.df

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1.0,296.0,15.3,396.90,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.90,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.90,5.33
...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1.0,273.0,21.0,396.90,9.08
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.90,5.64
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48


In [73]:
# b.data is a NumPy array. Passing feature_names as the column names labels what each column means
type(b.data)

numpy.ndarray

In [74]:
b.data.shape

(506, 13)

In [76]:
b.data[:,1]

array([ 18. ,   0. ,   0. ,   0. ,   0. ,   0. ,  12.5,  12.5,  12.5,
        12.5,  12.5,  12.5,  12.5,   0. ,   0. ,   0. ,   0. ,   0. ,
         0. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,
         0. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,
         0. ,   0. ,   0. ,  75. ,  75. ,   0. ,   0. ,   0. ,   0. ,
         0. ,   0. ,   0. ,   0. ,   0. ,  21. ,  21. ,  21. ,  21. ,
        75. ,  90. ,  85. , 100. ,  25. ,  25. ,  25. ,  25. ,  25. ,
        25. ,  17.5,  80. ,  80. ,  12.5,  12.5,  12.5,   0. ,   0. ,
         0. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,  25. ,
        25. ,  25. ,  25. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,
         0. ,   0. ,  28. ,  28. ,  28. ,   0. ,   0. ,   0. ,   0. ,
         0. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,
         0. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,
         0. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,
         0. ,   0. ,

In [77]:
b.df.iloc[:,1] # same thing

0      18.0
1       0.0
2       0.0
3       0.0
4       0.0
       ... 
501     0.0
502     0.0
503     0.0
504     0.0
505     0.0
Name: ZN, Length: 506, dtype: float64

In [78]:
b.df.iloc[:,:2]

Unnamed: 0,CRIM,ZN
0,0.00632,18.0
1,0.02731,0.0
2,0.02729,0.0
3,0.03237,0.0
4,0.06905,0.0
...,...,...
501,0.06263,0.0
502,0.04527,0.0
503,0.06076,0.0
504,0.10959,0.0


A tuple is like an immutable list: once created, it cannot be changed.

In [79]:
t = tuple(mylist)

In [80]:
t

(45, 46, 47, 48, 49, 50, 51, 52, 53, 54)

In [92]:
# Create a new tuple
t = 1, 2, 3 # 1st way to create
t = (1, 2, 3) # 2nd way to create

In [93]:
t

(1, 2, 3)

In [83]:
# Tuples are considered safer than lists: a list can be modified in place by its methods,
# whereas a tuple cannot be changed once created. The variable can only be rebound to a new tuple

In [89]:
abc = tuple("aaaaaaaaaaaabbbbc")

In [90]:
# tuples only have two methods: index and count
abc.index("b")

12

In [91]:
abc.count("b")

4

In [94]:
[x for x in (1, 2, 3)]

[1, 2, 3]

In [95]:
list((1, 2, 3)) # does the same thing

[1, 2, 3]

# List comprehensions
When you want to create lists, use list comprehensions rather than for loops.

In [96]:
list(range(9))

[0, 1, 2, 3, 4, 5, 6, 7, 8]

In [97]:
[x for x in range(9)]

[0, 1, 2, 3, 4, 5, 6, 7, 8]

In [99]:
chr(98)

'b'

In [103]:
[chr(x) for x in range(65,91)] # list comprehension over 26 numbers (65-90), converting each to its character

['A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 'X',
 'Y',
 'Z']

In [105]:
[chr(x).lower() for x in range(65,91)] # same thing but convert them to lower case

['a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z']

In [108]:
alphabet = "".join([chr(x).lower() for x in range(65,91)])

In [109]:
alphabet

'abcdefghijklmnopqrstuvwxyz'

In [110]:
alphabet = "".join(chr(x).lower() for x in range(65,91)) # without square brackets, it's a generator expression
# That means it doesn't have to generate the list along the way, it's just instructions
# No intermediates along the way

In [111]:
alphabet

'abcdefghijklmnopqrstuvwxyz'

In [112]:
# could also do it this way
gen_comp = (chr(x).lower() for x in range(65,91))
"".join(gen_comp)

'abcdefghijklmnopqrstuvwxyz'

In [116]:
# The less efficient route: first build the whole list of letters under a name,
# then join that list into a single string
mylist = [chr(code_point).lower() for code_point in range(ord("A"), ord("Z") + 1)]
"".join(mylist)

'abcdefghijklmnopqrstuvwxyz'

In [None]:
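# Takeaway: if you pass a list comprehension straight into a function, you can remove the brackets to make it a generator expression
# This avoids building an intermediate list in your own code, which saves memory on large inputs
# (in CPython, str.join still collects the items internally, so the gain in this particular example is small)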

# Sets 

Sets are mainly used to remove duplicates

In [121]:
abc = list("aaaaaaaabbbccc")

In [122]:
abc

['a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c']

In [123]:
set(abc)

{'a', 'b', 'c'}

In [127]:
mylist = list(set(abc))

In [128]:
mylist
# set is unordered which is why I lost my order right there

['c', 'a', 'b']

In [129]:
mylist.sort()

In [130]:
mylist

['a', 'b', 'c']

In [131]:
# The equivalent of piping in R's tidyverse is method chaining in Python
"a b c".upper().split()

['A', 'B', 'C']

In [137]:
"a b c".upper().split().index("B") # since the result of split is a list, we can use index

1

# Method chaining in dataframes

In [138]:
df

Unnamed: 0,a,b,c
0,1,2,1
1,1,2,2
2,1,2,3


In [139]:
df.sum()

a    3
b    6
c    6
dtype: int64

In [140]:
df.sum().sum()

15

In [141]:
# .T is transpose
df.T

Unnamed: 0,0,1,2
a,1,1,1
b,2,2,2
c,1,2,3


In [142]:
# .T is a property, not a method: access it without parentheses and it returns the transposed dataframe

In [147]:
df

Unnamed: 0,a,b,c
0,1,2,1
1,1,2,2
2,1,2,3


In [148]:
df.sum(axis = 0) # sum by columns

a    3
b    6
c    6
dtype: int64

In [150]:
df.sum(axis = 1) # row sums

0    4
1    5
2    6
dtype: int64

# Loops

In [6]:
mylist = [] # start from an empty list to append to

In [7]:
for x in "abc":
    mylist.append(x) # Loop to create a list

In [8]:
mylist

['a', 'b', 'c']

In [9]:
# To do the same thing with a list comprehension
[x for x in "abc"]

['a', 'b', 'c']

In [10]:
# The simplest way for this very simple case
list("abc")

['a', 'b', 'c']