A module is a file ending with _.py_ located in the same directory. Importing them allows you to access the functions and variables in that file.

### Datetime

In [2]:
from datetime import datetime, date, time

dt = datetime(2021, 12, 9, 11, 43, 23)
print("Day ", dt.day)
print("Hour", dt.hour)
print("Date", dt.date())

Day  9
Hour 11
Date 2021-12-09


In [3]:
print(dt)
dt

2021-12-09 11:43:23


datetime.datetime(2021, 12, 9, 11, 43, 23)

In [4]:
#format a datetime as a string
dt.strftime('%m/%d/%Y %H:%M') 

# %Y indicates 4 digit year and %y indicates 2 digit year
# %H indicates 24 hr clock, %I indicates 12 hr clock

'12/09/2021 11:43'

In [5]:
#parse string into datetime object
datetime.strptime('20211010', '%Y%m%d')

datetime.datetime(2021, 10, 10, 0, 0)

In [6]:
dt.replace(minute=33)

#datetime creates an immutable object.A new object is produced by this method

datetime.datetime(2021, 12, 9, 11, 33, 23)

In [7]:
dt # remains unchanged

datetime.datetime(2021, 12, 9, 11, 43, 23)

In [8]:
# Time difference

dt2 = dt.replace(minute=33)
delta = dt - dt2 
delta #the offset, datetime.timedelta type

datetime.timedelta(seconds=600)

### CSV

In [9]:
# We are going to be using a lot of CSV files
# though we will use Pandas mainly, it is handy to learn to use the csv module
import csv 

# magic command to maintain floating point precision up to two decimals
%precision 2 

with open('../insurance.csv') as csvfile:
    my_csv = list(csv.DictReader(csvfile))

my_csv[:2]

[{'age': '19',
  'sex': 'female',
  'bmi': '27.9',
  'children': '0',
  'smoker': 'yes',
  'region': 'southwest',
  'charges': '16884.924'},
 {'age': '18',
  'sex': 'male',
  'bmi': '33.77',
  'children': '1',
  'smoker': 'no',
  'region': 'southeast',
  'charges': '1725.5523'}]

In [10]:
print("Number of rows in my csv file: ", len(my_csv))

print("Keys in the dict read from the csv file:")
my_csv[0].keys()

Number of rows in my csv file:  1338
Keys in the dict read from the csv file:


dict_keys(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'])

In [11]:
# Let's find the unique regions in this dataset 
# A set only contains unique values
region_set = set(reg['region'] for reg in my_csv)
region_set

{'northeast', 'northwest', 'southeast', 'southwest'}

### Numpy
For fast processing of data in the form of arrays (lists/lists of lists)

In [12]:
import numpy as np 

np_array = np.array(([1, 2], [7, 8]))
print("The array\n", np_array)
print("Its dimensions", np_array.ndim)
print("Its shape", np_array.shape)

The array
 [[1 2]
 [7 8]]
Its dimensions 2
Its shape (2, 2)


In [13]:
# some common numpy methods
# notice the argument formats
%precision 2

zeros_arr = np.zeros(4)
print(zeros_arr)
ones_arr = np.ones((2, 3))
print(ones_arr)
rand_arr = np.random.rand(4, 4)
print(rand_arr)

# let's print a boolean array to pass some condition
print(rand_arr > 0.3)

# another common method is for reshaping an array
reshaped_rand_arr = rand_arr.reshape(2, 2, 4) # pass the desired dimensions
print(reshaped_rand_arr)

[0. 0. 0. 0.]
[[1. 1. 1.]
 [1. 1. 1.]]
[[0.05156446 0.29308214 0.62619342 0.2445484 ]
 [0.80775237 0.96589459 0.17878803 0.0407957 ]
 [0.29564673 0.54373987 0.94080266 0.94991135]
 [0.29232954 0.99409294 0.18052086 0.93001848]]
[[False False  True False]
 [ True  True False False]
 [False  True  True  True]
 [False  True False  True]]
[[[0.05156446 0.29308214 0.62619342 0.2445484 ]
  [0.80775237 0.96589459 0.17878803 0.0407957 ]]

 [[0.29564673 0.54373987 0.94080266 0.94991135]
  [0.29232954 0.99409294 0.18052086 0.93001848]]]


In [14]:
# a tiny thing to remember when it comes to array multiplications
array1 = np.ones((2, 2)) * 2
array2 = np.ones((2, 2)) * 3
print("Array 1\n", array1)
print("Array 2\n", array2)

print("Dot multiplication:\n", np.dot(array1, array2))
print("Elementwise multiplication:\n", array1*array2) 
# another way to perform element-wise miltiplication is 
# np.multiply(array1, array2)

Array 1
 [[2. 2.]
 [2. 2.]]
Array 2
 [[3. 3.]
 [3. 3.]]
Dot multiplication:
 [[12. 12.]
 [12. 12.]]
Elementwise multiplication:
 [[6. 6.]
 [6. 6.]]


In [15]:
# slicing an array is much like that of a list
array3 = np.array((([1, 1], [2, 2], [3, 3])))
print("Array 3\n", array3)
array3[:2, :1] # if thise slice is modified the original array will be changed too, as it's a reference

Array 3
 [[1 1]
 [2 2]
 [3 3]]


array([[1],
       [2]])

### Pandas

#### Pivot Table 
([Tutorial](https://www.kdnuggets.com/how-to-use-the-pivot_table-function-for-advanced-data-summarization-in-pandas))

In [16]:
import pandas as pd
import seaborn as sns

titanic = sns.load_dataset('titanic')
titanic

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


In [17]:
pivot = pd.pivot_table(titanic, values='age', index='class',
                       columns='sex', aggfunc='mean', # aggfunc can be a user defined function as well
                       margins=True) 
pivot

  pivot = pd.pivot_table(titanic, values='age', index='class',


sex,female,male,All
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
First,34.611765,41.281386,38.233441
Second,28.722973,30.740707,29.87763
Third,21.75,26.507589,25.14062
All,27.915709,30.726645,29.699118


In [18]:
pivot = pd.pivot_table(titanic, values='fare', index='class', 
                       columns='sex', aggfunc=['mean', 'sum'])
pivot

  pivot = pd.pivot_table(titanic, values='fare', index='class',
  pivot = pd.pivot_table(titanic, values='fare', index='class',


Unnamed: 0_level_0,mean,mean,sum,sum
sex,female,male,female,male
class,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
First,106.125798,67.226127,9975.825,8201.5875
Second,21.970121,19.741782,1669.7292,2132.1125
Third,16.11881,12.661633,2321.1086,4393.5865


#### Conditional Formatting

In [19]:
# highlight max/min and null values
titanic.iloc[629:635].style.highlight_max(color='green', axis=0, subset=['age', 'fare']).highlight_min(
    color='red', axis=0, subset=['age', 'fare']
).highlight_null('orange')

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
629,0,3,male,,0,0,7.7333,Q,Third,man,True,,Queenstown,no,True
630,1,1,male,80.0,0,0,30.0,S,First,man,True,A,Southampton,yes,True
631,0,3,male,51.0,0,0,7.0542,S,Third,man,True,,Southampton,no,True
632,1,1,male,32.0,0,0,30.5,C,First,man,True,B,Cherbourg,yes,True
633,0,1,male,,0,0,0.0,S,First,man,True,,Southampton,no,True
634,0,3,female,9.0,3,2,27.9,S,Third,child,False,,Southampton,no,False


In [20]:
# Applying colormaps
titanic.iloc[629:635].style.background_gradient(cmap='viridis', subset=['age'])

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
629,0,3,male,,0,0,7.7333,Q,Third,man,True,,Queenstown,no,True
630,1,1,male,80.0,0,0,30.0,S,First,man,True,A,Southampton,yes,True
631,0,3,male,51.0,0,0,7.0542,S,Third,man,True,,Southampton,no,True
632,1,1,male,32.0,0,0,30.5,C,First,man,True,B,Cherbourg,yes,True
633,0,1,male,,0,0,0.0,S,First,man,True,,Southampton,no,True
634,0,3,female,9.0,3,2,27.9,S,Third,child,False,,Southampton,no,False


In [27]:
# Highlight correlated columns
# df.style.apply() or df.style.applymap() for custom formatting

data = {
    'age' : [20, 45, 50],
    'salary': [60000, 120000, 118000]
}

df = pd.DataFrame(data)

corr_matrix = df.corr()

def highlight_corr(val):
    if val != 1.0 and abs(val) > 0.5: # not counting self correlation
        return 'background-color: green; text-decoration: underline'
    else:
        return ''
    
corr_matrix.style.applymap(highlight_corr)

  corr_matrix.style.applymap(highlight_corr)


Unnamed: 0,age,salary
age,1.0,0.982839
salary,0.982839,1.0


### Marshmallow

Data serialization & Validation

[Tutorial](https://www.kdnuggets.com/marshmallow-the-sweetest-python-library-for-data-serialization-and-validation)