## Python Cheatsheet
### Python Lists, Tuples, Sets, and Dictionaries

In [1]:
## List
x = list(range(0, 21, 4))

# insert(i, obj) puts obj at index i (here index 2, i.e. the third slot) and shifts the rest right
x.insert(2, 'in1')
# append() adds one element to the end
x.append('a1')
# extend() adds every element of another iterable to the end
x.extend(['ex', 'ex'])
# index() gives the position of the first occurrence of a value (tuples have index() too)
print('Index of \'ex\' in list is:', x.index('ex'))
# pop() removes and returns the last element. Tuples have no pop(); set.pop() removes an arbitrary element
x.pop()
# remove() deletes the first occurrence of the given value
x.remove('a1')
# del with an index deletes by position; -1 is the last element
del x[-1]
# reverse() flips the list in place
x.reverse()
# copy() makes a shallow copy, so later changes to x do not touch y
y = x.copy()
# Slicing returns a new list with the selected range
x = x[:7]
# Index assignment overwrites an existing element; it cannot grow the list (use append for that)
x[4] = 5.5
# sort() orders the list in place and needs all elements to be mutually comparable
x.sort()
# Tuple unpacking assigns several names at once; min/max/sum also accept tuples and sets
minx, maxx, sumx = min(x), max(x), sum(x)
# sorted() returns a new sorted list and leaves x itself unchanged
print('Reverse sorted copy of list:', sorted(x, reverse=True))
# `in` tests membership and works on lists, sets and tuples alike
print('Is 16 in x?', 16 in x)


## Sets
x = set(range(0, 21, 4))
y = set(range(12, 25, 2))

# Set algebra: each call returns a new set and leaves x and y untouched
x.difference(y)            # Same as x - y: elements of x that are not in y
x.union(y)                 # Same as x | y: elements that are in x, in y, or in both
x.intersection(y)          # Same as x & y: elements that are in both x and y
x.symmetric_difference(y)  # Same as x ^ y: union minus intersection (identical to y.symmetric_difference(x))

# In-place modification of x
x.add('a')            # Add the single element 'a'
x.update(y, [9, 7])   # Add all elements of one or more iterables (sets, lists, ...)
x.remove('a')         # Remove an element; raises KeyError if it is not present
x.discard('a')        # Remove an element if present; silently does nothing otherwise

# Subset / superset tests return True or False
y.issubset(x)         # Is every element of y also in x?
y.issuperset(x)       # Is every element of x also in y?

## Dictionary
# zip() pairs each day name with its number; dict() turns the pairs into key -> value entries
x = dict(zip(
    ('MONDAY', 'TUESDAY', 'WEDNESDAY', 'THURSDAY', 'FRIDAY', 'SATURDAY', 'SUNDAY'),
    range(7),
))
# pop(key, default) removes the key and returns its value. The default is optional,
# but without it a missing key raises KeyError
x.pop('MONDAY', 'some_value_to_return_when_key_missing (optional)')
# get(key, default) never raises: MONDAY was just removed, so the default 'some_val' comes back instead of 0
print('Value for \'MONDAY\' key:', x.get('MONDAY', 'some_val'))
# Dictionary comprehension: swap keys and values (number -> day name)
x = {number: day for day, number in x.items()}
# items() yields (key, value) pairs for iteration
for key, val in x.items():
    print(key, val)

Index of 'ex' in list is: 8
Reverse sorted copy of list: [20, 16, 12, 8, 5.5, 4, 0]
Is 16 in x? True
Value for 'MONDAY' key: some_val
1 TUESDAY
2 WEDNESDAY
3 THURSDAY
4 FRIDAY
5 SATURDAY
6 SUNDAY


### Strings and Regular Expressions

In [2]:
SUFFIXES = ['KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB']
sizekb = 29384723
# NOTE: despite its name, this value is reported with SUFFIXES[2] ('GB') below (KB / 1,000,000)
sizemb = sizekb / 1000000

# str.format: {1} and {2} pick positional arguments, {0[0]} indexes into the list passed as argument 0.
# Everything after ":" is the format spec: ".1" keeps one decimal place and "f" selects
# fixed-point notation (as opposed to "e" for exponential).
s1 = '{1} {0[0]} is {2:.1f} {0[2]}.'.format(SUFFIXES, sizekb, sizemb)
print(s1)
# title() returns a new string with each word capitalised and the rest lower-cased; lower()/upper() work the same way
print(s1.title())
# find() returns the index where the first occurrence of the substring starts
print(s1.find('is'))
# count() returns how many times the substring occurs
s1.count('is')
# replace() returns a new string with every space swapped for '-'
s1 = s1.replace(' ', '-')
# split(sep, 2) splits on the first two delimiters only, giving a list of length 3; splitlines() splits on line breaks
print(s1.split('-', 2))


## Regex
import re
# Compact pattern: up to three Ms followed by one of the alternatives XC, XL, or an optional L with up to three Xs
p1 = '^M{0,3}(XC|XL|L?X{0,3})$'
# re.search scans the string for the pattern and returns a match object (None when nothing matches)
re.search(p1, 'MMLXXX')

# The same pattern as p1 written in verbose form, where whitespace and #-comments inside the pattern are ignored
pattern = '''
    ^                   # match beginning of string
    M{0,3}              # match 0 to 3 Ms
    (XC|XL|L?X{0,3})    # match XC OR XL OR 0 to 1 L followed by 0 to 3 Xs
    $                   # match end of string
    '''
# A verbose pattern needs the re.VERBOSE flag, either passed here or given when compiling (as done below)
re.search(pattern, 'MMLXXX', re.VERBOSE)

# A second verbose example: a phone-number pattern, compiled once together with its flag
phonePattern = re.compile(r'''
                # don't match beginning of string, number can start anywhere
    (\d{3})     # area code is 3 digits (e.g. '800')
    \D*         # optional separator is any number of non-digits
    (\d{3})     # trunk is 3 digits (e.g. '555')
    \D*         # optional separator
    (\d{4})     # rest of number is 4 digits (e.g. '1212')
    \D*         # optional separator
    (\d*)       # extension is optional and can be any number of digits
    $           # end of string
    ''', re.VERBOSE)

# TODO: add notes on re.search, match, fullmatch, split, findall, finditer, sub, subn, escape

29384723 KB is 29.4 GB.
29384723 Kb Is 29.4 Gb.
12
['29384723', 'KB', 'is-29.4-GB.']


### Generators, Classes, and Iterators

In [3]:
# Fibonacci numbers as a generator function
def fib(max):
    '''Yield the Fibonacci numbers 0, 1, 1, 2, ... that are strictly below ``max``.'''
    current, following = 0, 1
    while True:
        # Stop as soon as the next value would no longer be below the bound
        if not current < max:
            return
        yield current
        current, following = following, current + following
        
fib50 = fib(50)  # Build the generator; nothing is computed yet. It will yield the Fibonacci numbers below 50
print(list(fib50))  # Consuming the generator runs it to the end; afterwards fib50 is exhausted


# A generator expression looks like a list comprehension in parentheses and is evaluated lazily.
# This one produces the squares of all odd numbers below 50.
gen1 = (i**2 for i in range(50) if i%2==1)
    
class Fib:
    '''Iterator over the Fibonacci numbers that do not exceed ``max``.

    A class is the blueprint of an object; it can carry user-defined variables
    and methods (the data attributes and method attributes of the class).
    Variables are either class variables (shared) or instance variables (per
    object). A data attribute hides a method attribute of the same name, so a
    naming convention that keeps them apart is valuable.

    An iterator is an object with a ``__next__`` method. This class builds a
    Fibonacci iterator from scratch. The bound is inclusive: a value equal to
    ``max`` is still returned, whereas the generator ``fib`` stops before it.
    '''
    classvar1 = 0  # Example of a class variable (unused here, purely illustrative); reachable as self.classvar1

    def __init__(self, max):
        '''Called when the class is instantiated; ``max`` is the largest value to return.'''
        self.max = max   # Example of an instance variable
        # Start the sequence here as well, so next() works on a fresh instance
        # even if iter() was never called on it.
        self.a = 0
        self.b = 1

    def __iter__(self):
        '''Make the object iterable: restart the sequence and return the iterator (self).'''
        self.a = 0
        self.b = 1
        return self

    def __next__(self):
        '''Return the next Fibonacci number, raising StopIteration once it would exceed ``max``.

        Here self is both the iterable and the iterator, but they can be separate objects.
        '''
        fib = self.a
        if fib > self.max:
            raise StopIteration
        self.a, self.b = self.b, self.a + self.b
        return fib

[0, 1, 1, 2, 3, 5, 8, 13, 21, 34]


### Time and Date

In [4]:
import datetime
today = datetime.date.today()  # Capture "today" once so every line below uses the same date
print(datetime.date(1776,7,4),";  ",today)  # Define a date object. datetime and time objects are built the same way
print(datetime.date(1776,7,4) - today)  # Subtracting two dates gives a timedelta
datetime.timedelta(days=1, seconds=0, microseconds=0, milliseconds=0, minutes=0, hours=0, weeks=0)  # Define a timedelta directly. This is the type you get from the difference of two date/datetime objects
print(today.weekday())  # Weekday as an integer, Monday being 0
today_str = today.strftime('%a, %B %d, %Y OR %m-%d-%y')  # Format the date as a string in any layout you like
print(today_str)
# Parse the first half of the string back into a datetime. Split on the ' OR ' separator
# rather than slicing a fixed number of characters: the month name has a variable length,
# so a fixed slice only works for some months.
date_part = today_str.split(' OR ')[0]
parsed_date = datetime.datetime.strptime(date_part, '%a, %B %d, %Y')
print(parsed_date)

1776-07-04 ;   2018-08-13
-88428 days, 0:00:00
0
Mon, August 13, 2018 OR 08-13-18
2018-08-13 00:00:00


### Map, Filter and Reduce

In [5]:
from functools import reduce

# map() applies a function to every element and returns a lazy iterator; wrap it in list() to materialise it.
# Any iterable and any callable work, as long as the operation is valid for the elements.
print(list(map(lambda x: x**2, (1,4,9))))
number_list = range(-5, 5)
# filter() keeps the elements for which the function returns a true value
negatives = list(filter(lambda x: x < 0, number_list))
print(negatives)
# reduce() folds an iterable into one value: the result of each call becomes "x" for the next call
# and "y" is the next element. Here that is ((1*(2**2)) * (3**2)) * (4**2) = 576.
print(reduce((lambda x, y: x * y**2), [1, 2, 3, 4]))

[1, 16, 81]
[-5, -4, -3, -2, -1]
576


### Pandas
#### Getting Information and Reading Data

In [6]:
import numpy as np, pandas as pd
# Example frame with one column per common dtype. Scalars (the Timestamp) are broadcast to all rows.
# Column D uses a locally seeded RandomState so the numbers are the same on every run
# without touching numpy's global random state.
df1 = pd.DataFrame({ 'A' : range(9,80,10),
                    'B' : pd.Timestamp('20130102'),
                    'C' : pd.Series(1,index=list(range(8)),dtype='float32'),
                    'D' : np.random.RandomState(42).rand(8)*10,
                    'E' : pd.Categorical(["test","train","test","train","test","train","test","train"]),
                    'F' : ['foo','bar','foo','baz','foo','bar','foo','baz'] })

df1.index = ['i'+str(i) for i in range(8)] # Use string labels as the index so label-based vs position-based slicing is easy to tell apart
df1.index.name = 'funindex'  # You can name the index
print('Original Frame:\n',df1)
print('\nShape is:\n', df1.shape)
print('\n\nInformation is:\n')
df1.info()  # info() prints its report itself and returns None, so it is not wrapped in print()

# Reading files
# `header` says which row holds the column names (if any); pass `names=` to supply your own.
# `skiprows` lists the line numbers of the file that should be skipped.
chipo = pd.read_csv(
    'https://raw.githubusercontent.com/justmarkham/DAT8/master/data/chipotle.tsv',
    sep='\t',
    header=0,
    skiprows=[1, 4, 6, 8],
)

Original Frame:
            A          B    C         D      E    F
funindex                                          
i0         9 2013-01-02  1.0  5.269844   test  foo
i1        19 2013-01-02  1.0  7.780302  train  bar
i2        29 2013-01-02  1.0  9.499163   test  foo
i3        39 2013-01-02  1.0  6.248753  train  baz
i4        49 2013-01-02  1.0  5.977056   test  foo
i5        59 2013-01-02  1.0  2.751343  train  bar
i6        69 2013-01-02  1.0  4.701653   test  foo
i7        79 2013-01-02  1.0  7.034834  train  baz

Shape is:
 (8, 6)


Information is:

<class 'pandas.core.frame.DataFrame'>
Index: 8 entries, i0 to i7
Data columns (total 6 columns):
A    8 non-null int64
B    8 non-null datetime64[ns]
C    8 non-null float32
D    8 non-null float64
E    8 non-null category
F    8 non-null object
dtypes: category(1), datetime64[ns](1), float32(1), float64(1), int64(1), object(1)
memory usage: 456.0+ bytes
None


#### General Stats etc..

In [7]:
print('\nSeries Counts:\n', df1['E'].value_counts()) # For a pandas Series, count how often each value/category occurs
# Column means. numeric_only=True restricts the reduction to the numeric columns (A, C, D);
# without it, recent pandas versions raise on the text/categorical/timestamp columns.
# sum, max, min etc. work the same way.
df1.mean(axis=0, numeric_only=True)


Series Counts:
 train    4
test     4
Name: E, dtype: int64


A    44.000000
C     1.000000
D     6.157868
dtype: float64

#### Slicing in Pandas

In [8]:
# Plain [] indexing: a list of names selects columns, a slice selects rows (the two can be applied in either order)
print(
    '\nSliced Frame:\n',
    df1[['A', 'B']][:2],
)
# Boolean mask: keep only the rows where column E equals 'test'
print(
    '\nFiltered Frame:\n',
    df1[df1['E'] == 'test'],
)
# .loc is label based: pass a list of labels or a label slice. A label slice includes BOTH end points
print(
    '\nIndex based slicing:\n',
    df1.loc[['i0', 'i4'], 'A':'C'],
)
# .iloc is position based: pass a list of positions or a slice. As usual in Python the slice end is EXCLUDED (1:3 -> columns 1 and 2)
print(
    '\nLocation based slicing:\n',
    df1.iloc[[0, 4], 1:3],
)
# .at reads (or, on the left of "=", sets) a single cell by label; .iat is the position-based equivalent
print(
    '\nSelecting an setting element using at:\n',
    df1.at['i1', 'C'],
)


Sliced Frame:
            A          B
funindex               
i0         9 2013-01-02
i1        19 2013-01-02

Filtered Frame:
            A          B    C         D     E    F
funindex                                         
i0         9 2013-01-02  1.0  5.269844  test  foo
i2        29 2013-01-02  1.0  9.499163  test  foo
i4        49 2013-01-02  1.0  5.977056  test  foo
i6        69 2013-01-02  1.0  4.701653  test  foo

Index based slicing:
            A          B    C
funindex                    
i0         9 2013-01-02  1.0
i4        49 2013-01-02  1.0

Location based slicing:
                   B    C
funindex                
i0       2013-01-02  1.0
i4       2013-01-02  1.0

Selecting an setting element using at:
 1.0


#### Sorting, Casting etc

In [9]:
# Sort on several columns: rows are ordered by E, and rows sharing the same E are ordered by D.
# ascending=False applies to both keys; inplace=False returns a new frame; NAs are placed last.
print(
    df1.sort_values(
        by=['E', 'D'],
        ascending=False,
        inplace=False,
        na_position='last',
    )
)
# astype() returns the column converted to another dtype (df1 itself is not modified)
df1['C'].astype(int)

           A          B    C         D      E    F
funindex                                          
i1        19 2013-01-02  1.0  7.780302  train  bar
i7        79 2013-01-02  1.0  7.034834  train  baz
i3        39 2013-01-02  1.0  6.248753  train  baz
i5        59 2013-01-02  1.0  2.751343  train  bar
i2        29 2013-01-02  1.0  9.499163   test  foo
i4        49 2013-01-02  1.0  5.977056   test  foo
i0         9 2013-01-02  1.0  5.269844   test  foo
i6        69 2013-01-02  1.0  4.701653   test  foo


funindex
i0    1
i1    1
i2    1
i3    1
i4    1
i5    1
i6    1
i7    1
Name: C, dtype: int32

#### Grouping in Pandas

In [14]:
# Group rows by a computed key: floor-dividing D by 2.5 puts each row into a 2.5-wide band
# (0-2.5 -> 0.0, 2.5-5 -> 1.0, ...). These are fixed-width bins of the 0-10 range, not data quartiles.
# Rows are grouped by default, so no axis argument is needed.
gr = df1.groupby(df1['D'] // (10 / 4.))
print('\nFrame Groups:\n',gr.groups)
print("\nGet group by it's name:\n", df1.groupby('F').get_group('bar'))
# Quick summary statistics per group; numeric_only=True sums just the numeric columns
# instead of failing on (or concatenating) the timestamp/text columns
print(df1.groupby('E').sum(numeric_only=True))


Frame Groups:
 {1.0: Index(['i5', 'i6'], dtype='object', name='funindex'), 2.0: Index(['i0', 'i3', 'i4', 'i7'], dtype='object', name='funindex'), 3.0: Index(['i1', 'i2'], dtype='object', name='funindex')}

Get group by it's name:
            A          B    C         D      E    F
funindex                                          
i1        19 2013-01-02  1.0  7.780302  train  bar
i5        59 2013-01-02  1.0  2.751343  train  bar
         A    C          D
E                         
test   156  4.0  25.447715
train  196  4.0  23.815232


#### Handling missing data in Pandas


In [15]:
import datetime
# reindex() with labels that do not exist ('i9', 'E' as a row, 'Z' as a column) creates all-NA rows/columns
df2 = df1.reindex(index = ['i0', 'i5', 'i9', 'i2', 'i1', 'i3', 'i4', 'i6', 'i7', 'E'], columns = ['A', 'Z', 'B', 'C', 'D', 'E', 'F'])
df2.at['i3','E']=None
df2.at['i4','F']=None
print(df2.isna().sum())   # isna() marks every NA cell (including None in object columns); sum() counts them per column. notna() is the inverse. Both work on Series too.
# Note: you CANNOT detect np.nan by comparing with np.nan, even though np.nan is the usual NA marker
# in pandas, because np.nan != np.nan. In Python None == None, however. Only object columns hold None.
print('np.nan == np.nan :', np.nan == np.nan)
print('None == None :', None == None)
df2['F'].fillna('baz',inplace=False) # Fill with a particular value
df3=df2.fillna(pd.Series([909,'zcol',datetime.date(2018,1,1),1.,4.4,'test','baz'], index=df2.columns),axis=0)  # You can also fill with a Series whose index matches the column names (one fill value per column)
df2.fillna(df2.mean(axis=0, numeric_only=True), axis=0)  # Common use case: fill each numeric column with its own mean (numeric_only skips the text/categorical/timestamp columns)
df2.drop(columns=['Z']).dropna(axis=0)  # Drop every row that contains an NA (axis=1 would drop columns instead)
df2[['A','C','D']].interpolate(axis=0)   # Fill NAs by linear interpolation. This does not work for object/text or categorical columns
print(df2[['C','D','A','B','F']].replace(np.nan,2))  # replace() works for np.nan (None only matches in object columns). A replacement for a categorical column has to be one of its categories

A     2
Z    10
B     2
C     2
D     2
E     3
F     3
dtype: int64
np.nan == np.nan : False
None == None : True
            C         D     A                    B    F
funindex                                               
i0        1.0  5.269844   9.0  2013-01-02 00:00:00  foo
i5        1.0  2.751343  59.0  2013-01-02 00:00:00  bar
i9        2.0  2.000000   2.0                    2    2
i2        1.0  9.499163  29.0  2013-01-02 00:00:00  foo
i1        1.0  7.780302  19.0  2013-01-02 00:00:00  bar
i3        1.0  6.248753  39.0  2013-01-02 00:00:00  baz
i4        1.0  5.977056  49.0  2013-01-02 00:00:00    2
i6        1.0  4.701653  69.0  2013-01-02 00:00:00  foo
i7        1.0  7.034834  79.0  2013-01-02 00:00:00  baz
E         2.0  2.000000   2.0                    2    2


#### Applying functions

In [16]:
# applymap() calls the function on every single cell. str() accepts any value, so this works on a
# mixed-dtype frame; a function that only handles some types would fail on the other columns.
df1.applymap(lambda x: str(x) + '--Converted')

# apply() hands whole columns (axis=0) or whole rows (axis=1) to the function. The shape of the
# result depends on what the function returns.
numeric_cols = ['A', 'C', 'D']

# Function returns a Series of the same length -> result is a DataFrame
print(df1[numeric_cols].apply(lambda x: x**2, axis=0, result_type=None))
# Function returns a scalar per column -> result is a Series indexed by column name
print(df1[numeric_cols].apply(lambda x: sum(x), axis=0, result_type=None))
# Function returns a list whose length differs from the frame -> result is a Series holding the lists
print(df1[numeric_cols].apply(lambda x: [1,2,3], axis=0, result_type=None))
# Function returns a scalar, but result_type='broadcast' stretches it back to the original shape
print(df1[numeric_cols].apply(lambda x: 4, axis=0, result_type='broadcast'))
# Function returns a Series with its own index -> result is a DataFrame using that index as rows
print(df1[numeric_cols].apply(lambda x: pd.Series((1,2,3), index=['i1','i2','i3']), axis=0, result_type=None))
# With axis=1 each row is passed in, so columns can be combined element by element
print(df1[numeric_cols].apply(lambda x: x['A'] - x['C'], axis=1, result_type=None))

             A    C          D
funindex                      
i0          81  1.0  27.771254
i1         361  1.0  60.533104
i2         841  1.0  90.234094
i3        1521  1.0  39.046909
i4        2401  1.0  35.725196
i5        3481  1.0   7.569888
i6        4761  1.0  22.105540
i7        6241  1.0  49.488896
A    352.000000
C      8.000000
D     49.262948
dtype: float64
A    [1, 2, 3]
C    [1, 2, 3]
D    [1, 2, 3]
dtype: object
            A    C    D
funindex               
i0        4.0  4.0  4.0
i1        4.0  4.0  4.0
i2        4.0  4.0  4.0
i3        4.0  4.0  4.0
i4        4.0  4.0  4.0
i5        4.0  4.0  4.0
i6        4.0  4.0  4.0
i7        4.0  4.0  4.0
    A  C  D
i1  1  1  1
i2  2  2  2
i3  3  3  3
funindex
i0     8.0
i1    18.0
i2    28.0
i3    38.0
i4    48.0
i5    58.0
i6    68.0
i7    78.0
dtype: float64


#### Concat, append

In [17]:
# Four small frames with partly overlapping columns and indices.
# Note df1 deliberately has a column 'E' where the others have 'C'.
df1 = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],
                    'B': ['B0', 'B1', 'B2', 'B3'],
                    'E': ['C0', 'C1', 'C2', 'C3'],
                    'D': ['D0', 'D1', 'D2', 'D3']}, index=[0, 1, 2, 3])

df2 = pd.DataFrame({'A': ['A4', 'A5', 'A6', 'A7'],
                    'B': ['B4', 'B5', 'B6', 'B7'],
                    'C': ['C4', 'C5', 'C6', 'C7'],
                    'D': ['D4', 'D5', 'D6', 'D7']}, index=[4, 2, 6, 7])

df3 = pd.DataFrame({'A': ['A8', 'A9', 'A10', 'A11'],
                    'B': ['B8', 'B9', 'B10', 'B11'],
                    'C': ['C8', 'C9', 'C10', 'C11'],
                    'D': ['D8', 'D9', 'D10', 'D11']}, index=[8, 9, 10, 11])

df4 = pd.DataFrame({'B': ['B2', 'B3', 'B6', 'B7'],
                    'D': ['D2', 'D3', 'D6', 'D7'],
                    'F': ['F2', 'F3', 'F6', 'F7']}, index=[2, 3, 6, 7])

frames = [df1, df2, df3]
# Stack frames on top of each other. This replaces the old df1.append([df2, df3], ...) call:
# DataFrame.append was removed in pandas 2.0, and for frames it was the same as concat with join='outer', axis=0.
r0 = pd.concat([df1, df2, df3], axis=0, join='outer', sort=True, ignore_index=True)
# join='outer' keeps the union of all columns; 'inner' would keep only the shared columns A, B, D.
# keys= adds an outer index level that labels which frame each row came from.
r1 = pd.concat(frames, axis=0, sort=False, join='outer', ignore_index=False, keys=['x', 'y', 'z'])
# To get a specific set of columns instead of the outer/inner result, reindex afterwards
# (this replaces the join_axes argument, which was removed in pandas 1.0).
r2 = pd.concat(frames, axis=0, sort=False, join='outer', ignore_index=False, keys=['x', 'y', 'z']).reindex(columns=['A', 'B', 'C', 'D'])

s1 = pd.Series(['X0', 'X1', 'X2', 'X3'], name = 'X', index=['A', 'B', 'C', 'D'])
# With axis=1 a Series is attached as a new column, named after the Series
r3 = pd.concat([df1, pd.Series(['X0', 'X1', 'X2', 'X3'], name = 'X')], axis=1)
# With axis=0 the Series is NOT treated as a row: its values become extra rows in a new column. Compare with r5/r6
r4 = pd.concat([df1, s1], axis=0, sort=False)
# To add the Series as a single ROW (what df1.append(s1) used to do), turn it into a one-row frame first
r5 = pd.concat([df1, s1.to_frame().T], axis=0, sort=False)
# Same result as r5, spelled with the DataFrame constructor
r6 = pd.concat([df1, pd.DataFrame(s1).transpose()], axis=0, sort=False)

pieces = {'x': df1, 'y': df2, 'z': df3}  # A dict of key -> frame can be passed instead of a list
# When a dict is passed, keys= selects which of its entries are concatenated (here 'y' is left out)
r7 = pd.concat(pieces, axis=0, sort=False, join='outer', ignore_index=False, keys=['x','z'])
# levels= and names= control the levels and the level names of the resulting MultiIndex
r8 = pd.concat(pieces, keys=['x', 'y', 'z'], levels=[['z', 'y', 'x', 'w']], names=['group_key'], sort=False)
print(r6,'\n',r8)

    A   B    E   D    C
0  A0  B0   C0  D0  NaN
1  A1  B1   C1  D1  NaN
2  A2  B2   C2  D2  NaN
3  A3  B3   C3  D3  NaN
X  X0  X1  NaN  X3   X2 
                 A    B    E    D    C
group_key                            
x         0    A0   B0   C0   D0  NaN
          1    A1   B1   C1   D1  NaN
          2    A2   B2   C2   D2  NaN
          3    A3   B3   C3   D3  NaN
y         4    A4   B4  NaN   D4   C4
          2    A5   B5  NaN   D5   C5
          6    A6   B6  NaN   D6   C6
          7    A7   B7  NaN   D7   C7
z         8    A8   B8  NaN   D8   C8
          9    A9   B9  NaN   D9   C9
          10  A10  B10  NaN  D10  C10
          11  A11  B11  NaN  D11  C11


  result = result.union(other)


#### Merge, Join

In [56]:
# Two frames sharing the key columns key1 / key2
left = pd.DataFrame({
    'key1': ['K0', 'K0', 'K1', 'K2'],
    'key2': ['K0', 'K1', 'K0', 'K1'],
    'A': ['A0', 'A1', 'A2', 'A3'],
    'B': ['B0', 'B1', 'B2', 'B3'],
})

right = pd.DataFrame({
    'key1': ['K0', 'K1', 'K1', 'K2'],
    'key2': ['K0', 'K0', 'K0', 'K0'],
    'C': ['C0', 'C1', 'C2', 'C3'],
    'D': ['D0', 'D1', 'D2', 'D3'],
    'E': [1, 1, 2, 3],
})
left.index.name = 'lidx'

# When the key columns have the same names in both frames, `on=` joins on them (default how='inner').
# validate='one_to_many' additionally checks that the keys are unique in the left frame.
m1 = left.merge(right, on=['key1', 'key2'], validate='one_to_many', sort=False)
# left_on / right_on pair up differently named keys; a named index level ('lidx') may be used like a column.
# indicator=True adds a `_merge` column telling which side each row came from.
m2 = left.merge(
    right,
    left_on=['lidx', 'key2', 'key1'],
    right_on=['E', 'key1', 'key2'],
    how='outer',
    indicator=True,
)
# Self merge: match the frame's own index against its column E
m3 = right.merge(right, left_index=True, right_on=['E'], how='outer')

In [57]:
print(m1)  # Inner merge result: only the (key1, key2) pairs present in both frames remain

  key1 key2   A   B   C   D  E
0   K0   K0  A0  B0  C0  D0  1
1   K1   K0  A2  B2  C1  D1  1
2   K1   K0  A2  B2  C2  D2  2
