-
Notifications
You must be signed in to change notification settings - Fork 1
/
format_data.py
93 lines (66 loc) · 1.85 KB
/
format_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
# -*- coding: utf-8 -*-
"""
format wikipedia csv, provide functions to use the csv data
"""
import re
import csv
class K:
    """Column headers of the Wikipedia songs CSV, exposed as short attribute names."""

    # Each value is the exact header string used to index a CSV row dict;
    # the attribute names are the short keys used elsewhere in this module.
    title = 'Song'
    album = 'Album debut'
    songwriters = 'Songwriter(s)'
    vocals = 'Lead vocal(s)'
    year = 'Year'
    notes = 'Notes'
    # Header of the references column.
    ref = 'Ref(s)'
# Attribute names on `K`, in the order they are copied into a song dict
# (`ref` is not included).
song_keys = [
    'title',
    'album',
    'songwriters',
    'vocals',
    'year',
    'notes',
]

# Matches leading text (word characters, whitespace, parentheses, commas)
# followed by one or more bracketed numbers such as "[12]". Substituting
# r'\1' keeps the text and drops the bracketed numbers.
SUB_REGEX = re.compile(r'^([\w\s\(\),]+)((\[\d+\])+)')
def to_song_dict(d):
    """Convert a CSV row into a song dict keyed by the names in `song_keys`.

    Args:
        d: Mapping from CSV column header (the values of `K`) to cell text.

    Returns:
        A new dict with one entry per name in `song_keys`. The `vocals`
        entry is rebuilt from its newline-separated lines, joined with
        ', '; the `album` entry is stripped and has its newlines replaced
        by spaces. All other entries are copied unchanged.

    Raises:
        KeyError: If `d` lacks one of the expected column headers.
    """
    song = {key: d[getattr(K, key)] for key in song_keys}

    # Per line: strip whitespace, drop trailing "[n]" markers via SUB_REGEX,
    # strip again, then remove one pair of enclosing parentheses.
    cleaned_vocals = [
        trim_braces(SUB_REGEX.sub(r'\1', line.strip()).strip())
        for line in song['vocals'].split('\n')
    ]
    song['vocals'] = ', '.join(cleaned_vocals)

    song['album'] = song['album'].strip().replace('\n', ' ')
    return song
def trim_quotes(s):
    """Return `s` without one pair of enclosing double quotes, if present.

    Args:
        s: The string to trim.

    Returns:
        `s[1:-1]` when `s` both starts and ends with '"' and is at least two
        characters long; otherwise `s` unchanged.
    """
    # The length guard keeps '' from raising IndexError and stops a lone '"'
    # from being counted as both the opening and the closing quote.
    if len(s) >= 2 and s[0] == '"' and s[-1] == '"':
        return s[1:-1]
    return s
def trim_braces(s):
    """Return `s` without one pair of enclosing parentheses, if present.

    Despite the name, the characters removed are round parentheses.

    Args:
        s: The string to trim.

    Returns:
        `s[1:-1]` when `s` starts with '(' and ends with ')'; otherwise `s`
        unchanged.
    """
    # The length guard keeps '' from raising IndexError; an empty string is a
    # realistic input because ''.split('\n') yields [''].
    if len(s) >= 2 and s[0] == '(' and s[-1] == ')':
        return s[1:-1]
    return s
# Cleaned CSV: written by main() and read by read_songs_wikipedia().
DATA_CSV_PATH = './data/songs.wikipedia.csv'
# Source CSV that main() reads as its input.
DATA_CSV_SRC_PATH = './data/src/songs.wikipedia.csv'
def read_songs_wikipedia(path=None):
    """Yield each row of the songs CSV as a dict keyed by column header.

    Args:
        path: CSV file to read. Defaults to the module-level DATA_CSV_PATH.

    Yields:
        One mapping per data row, as produced by csv.DictReader.

    This is a generator: the "Reading ..." message is printed and the file is
    opened only once iteration starts.
    """
    if path is None:
        path = DATA_CSV_PATH
    print('Reading {}'.format(path))
    # newline='' is what the csv module requires of its input file; without
    # it, line endings embedded in quoted fields are translated before the
    # parser sees them.
    # NOTE(review): assumes the data file is UTF-8 (as this module's coding
    # declaration suggests) rather than the platform default - confirm.
    with open(path, 'r', newline='', encoding='utf-8') as fi:
        yield from csv.DictReader(fi)
def main():
    """Build the cleaned songs CSV at DATA_CSV_PATH from DATA_CSV_SRC_PATH.

    For every source row, one pair of enclosing double quotes is trimmed from
    the `Song` column and the `Ref(s)` column is dropped. The remaining
    columns are written out in a fixed order, preceded by a header row.
    """
    # All rows are collected before the output file is opened, so the
    # destination is not truncated if reading the source fails.
    # newline='' is what the csv module requires of its files; it keeps line
    # endings embedded in quoted fields intact.
    # NOTE(review): assumes the source file is UTF-8 (as this module's coding
    # declaration suggests) rather than the platform default - confirm.
    rows = []
    with open(DATA_CSV_SRC_PATH, 'r', newline='', encoding='utf-8') as fi:
        for row in csv.DictReader(fi):
            # Strip quotes in the `Song` column.
            row[K.title] = trim_quotes(row[K.title])
            # Remove the `Ref(s)` column; tolerate sources that lack it.
            row.pop(K.ref, None)
            rows.append(row)

    fieldnames = [
        K.title, K.album, K.songwriters, K.vocals,
        K.year, K.notes,
    ]
    # On writing, newline='' prevents the writer's own '\r\n' row terminator
    # from being translated again (which yields '\r\r\n' on Windows).
    with open(DATA_CSV_PATH, 'w', newline='', encoding='utf-8') as fo:
        writer = csv.DictWriter(fo, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)
# Run the CSV clean-up only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()