-
Notifications
You must be signed in to change notification settings - Fork 0
/
util.py
144 lines (116 loc) · 3.88 KB
/
util.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
import numpy as np
from pathlib import Path
import spotipy
import json
import pandas as pd
from typing import Dict, List, Union
from sklearn.preprocessing import MultiLabelBinarizer
from scipy.spatial.distance import yule
from scipy.cluster.hierarchy import (
fcluster, dendrogram, linkage, cut_tree, leaders)
from sklearn.cluster import OPTICS
import matplotlib.pyplot as plt
from IPython.display import display
import re
from itertools import chain
from collections import Counter
from statistics import mode
def json_list2dict(d: Dict) -> Dict:
    """
    Replace lists of containers by index-keyed dicts, at every nesting level.

    A value is converted when it is a non-empty list whose first element is
    itself a list or a dict; the resulting dict uses the stringified list
    position ("0", "1", ...) as key. Dict values (including freshly converted
    ones) are processed recursively. Other values are left untouched.

    The input dict is modified in place and also returned.

    Parameters
    ----------
    d : Dict
        dict to convert (mutated in place)

    Returns
    -------
    Dict
        the same dict object, after conversion
    """
    # Only existing keys are reassigned, so the dict size never changes
    # while we iterate over it.
    for name in d:
        item = d[name]

        # Only the first element is inspected to decide whether the list
        # holds containers.
        holds_containers = (
            isinstance(item, list)
            and bool(item)
            and isinstance(item[0], (list, dict))
        )
        if holds_containers:
            item = dict((str(pos), elem) for pos, elem in enumerate(item))

        # Descend into nested dicts, including the one built just above.
        if isinstance(item, dict):
            item = json_list2dict(item)

        d[name] = item
    return d
def normalize_request(_request) -> pd.DataFrame:
    """
    Transform the output of a request into a flat DataFrame.

    The payload is first unwrapped:

    * a dict with an ``'items'`` key is replaced by the value of that key;
    * a dict with exactly one key whose value is a list is replaced by
      that list;
    * anything else is used as is.

    A list payload is then normalized element by element and the results
    are concatenated (followed by ``reset_index()``); a dict payload is
    normalized directly. Nested lists of containers are converted with
    ``json_list2dict`` before ``pd.json_normalize`` is applied, which
    mutates the nested dicts of the payload in place.

    Parameters
    ----------
    _request : dict or list
        result of a request

    Returns
    -------
    pd.DataFrame
        flattened result of the request; an empty DataFrame when the
        (unwrapped) payload is an empty list

    Raises
    ------
    TypeError
        if the (unwrapped) payload is neither a list nor a dict
    """
    # Some requests give back a wrapper dict: either the results sit under
    # 'items', or the single key is the name of the request and its value
    # is the list of results.
    if isinstance(_request, dict) and 'items' in _request:
        request = _request['items']
    elif isinstance(_request, dict) and len(_request) == 1:
        (only_value,) = _request.values()
        request = only_value if isinstance(only_value, list) else _request
    else:
        request = _request

    # Several results come back as a list, a single result as a dict.
    if isinstance(request, list):
        if not request:
            # pd.concat raises ValueError on an empty sequence; an empty
            # result set is a legitimate answer, so return an empty frame.
            return pd.DataFrame()
        frames = [pd.json_normalize(json_list2dict(r)) for r in request]
        return pd.concat(frames).reset_index()
    if isinstance(request, dict):
        return pd.json_normalize(json_list2dict(request))

    # Previously this path fell through to `return df` with `df` unbound.
    raise TypeError(
        'normalize_request expects a list or a dict, got '
        f'{type(request).__name__}'
    )
def _enrich_by_feature(ser: pd.Series, f, w: int) -> pd.DataFrame:
    """
    Fetch the enriched data for enrich_df_by_feature, batch by batch.

    The Series is cut by position into consecutive batches of ``w``
    elements; ``f`` is called once per batch and its output is flattened
    with ``normalize_request``. The combined result is re-indexed with the
    values of ``ser``.

    Parameters
    ----------
    ser : pd.Series
        Initial Series to use for enrichment
    f : function
        Function to use to enrich the data; receives one batch (a
        sub-Series of ``ser``) per call
    w : int
        Size of a batch (number of rows requested at a time)

    Returns
    -------
    pd.DataFrame
        Enriched DataFrame, indexed by the values of ``ser``
    """
    # Positions 0..w-1 get batch id 0, w..2w-1 get batch id 1, and so on.
    batch_ids = [pos // w for pos in range(len(ser))]

    def _fetch(batch):
        return normalize_request(f(batch))

    enriched = ser.groupby(batch_ids).apply(_fetch)
    # NOTE(review): assumes f yields exactly one row per requested element,
    # in request order, so the rows line up with ser - confirm with callers.
    return enriched.set_index(ser)
def enrich_df_by_feature(df: pd.DataFrame, col: str, f, w: int) -> pd.DataFrame:
    """
    Enrich a DataFrame with information requested from the values of a column.

    The values of ``df[col]`` are sent to ``f`` in batches of ``w`` rows
    (see ``_enrich_by_feature``). The returned columns are prefixed with
    ``'<col>.'`` and joined back onto ``df`` by matching ``df[col]``
    against the index of the enriched data.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame to be enriched
    col : str
        Initial column to use for enrichment
    f : function
        Function to use to enrich the data
    w : int
        Size of a batch (number of rows requested at a time)

    Returns
    -------
    pd.DataFrame
        ``df`` with the prefixed enrichment columns appended; one output
        row per row of ``df``
    """
    df_enriched = _enrich_by_feature(df[col], f=f, w=w)
    df_enriched = df_enriched.add_prefix(f'{col}.')
    # The enriched frame is indexed by the values of df[col]. If a value
    # appears several times, its label is duplicated in that index and the
    # join below would emit one row per (left row, matching label) pair,
    # multiplying rows. Keep a single enriched row per value.
    df_enriched = df_enriched.loc[~df_enriched.index.duplicated(keep='first')]
    return df.join(df_enriched, on=col)