-
Notifications
You must be signed in to change notification settings - Fork 2
/
04_tag_text_contracts.py
212 lines (153 loc) · 6.6 KB
/
04_tag_text_contracts.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
# Matej Badin | UHP | 2019 |
# Marian Tihanyi | IDRP | 2021 (update, improvements, port to Linux) |
# -------------------------------------------------------------------- |
# Packages needed : re, os, pandas, ast |
# -------------------------------------------------------------------- |
# Tag interesting contracts based on keywords |
# -------------------------------------------------------------------- |
import os
import re
import pandas as pd
import ast
# Scan ./contracts_text/ for sub-directories named after a contract ID from the
# clean table and collect the names of the .txt files found in each of them.
find_txt = re.compile(r'\.txt')  # literal ".txt"; an unescaped '.' would match any character
working_dir = os.getcwd()+'/contracts_text/'
extract_ID = re.compile(r'\d+')
contracts = {}
# Import clean table to extract metadata from it
DB_clean = pd.read_csv('CRZ_DB_clean.csv', delimiter='|', dtype=str)
dir_list = os.listdir(working_dir)
# Build the ID lookup once: converting the whole column to a list for every
# directory entry made the scan quadratic in the number of contracts.
known_ids = set(DB_clean['ID'])
total_entries = len(dir_list)
print("Loading filenames...")
for i, f in enumerate(dir_list):
    # Plain files in the working directory are ignored, as are entries whose
    # name is not an ID present in the clean table.
    if not os.path.isfile(os.path.join(working_dir, f)) and f in known_ids:
        contracts[f] = [c for c in os.listdir(working_dir + f) if '.txt' in c]
    # Report progress after the entry has been handled so the final update reads 100%.
    print(f"\r{round((i + 1) / total_entries * 100, 1)}%", end="")
contracts_txt = {}
# Keep only the files whose name contains ".txt", i.e. contracts whose text is actually there:
for cid, fl in contracts.items():
    contracts_txt[cid] = [name for name in fl if find_txt.search(name)]
print("\nFilenames loading and list generation completed.")
# Keywords are stored in keywords.txt, one category per row: values are separated
# by commas, the first value is the category name, the remaining ones are its keywords.
# Keywords are case-folded here because they are matched against case-folded text.
with open('keywords.txt', 'r', encoding='utf-8') as fo:
    lines = fo.readlines()
categories = []
# Each category is stored as a list:
#   [category name, list of keywords, hit count per keyword, total hits of the category]
for line in lines:
    fields = [field.strip() for field in line.split(',')]
    # Skip blank rows (e.g. a trailing newline at the end of the file);
    # they would otherwise become nameless categories without keywords.
    if not any(fields):
        continue
    # Stripped so that a row holding only a name does not keep its newline.
    category_name = fields[0]
    # Drop empty keywords (e.g. produced by a trailing comma): counting the empty
    # string with str.count() yields len(text) + 1 and would inflate the hits.
    keywords = [field.casefold() for field in fields[1:] if field]
    hits = [0] * len(keywords)
    hits_per_category = 0
    categories.append([category_name, keywords, hits, hits_per_category])
# Column labels of the exported table, in output order:
# contract metadata, total number of hits, totals per category, counts per keyword.
header_metadata = [
    'Nazov', 'ID', 'Inner-ID',
    'Objednavatel_ICO', 'Objednavatel', 'Objednavatel_adresa',
    'Dodavatel_ICO', 'Dodavatel', 'Dodavatel_adresa',
    'Datum_zverejnenia', 'Datum_podpisu', 'Datum_platnosti', 'Datum_ucinnosti',
    'Posledna_zmena', 'Cena_konecna', 'Cena_podpisana', 'Rezort', 'Typ',
    'Prilohy_IDs', 'Prilohy_nazvy', 'Prilohy_odkazy', 'Prilohy_velkosti', 'Prilohy_datum',
]
header_sum_cat = ['Výskyty']
# Every category entry holds its name at index 0 and its keyword list at index 1.
header_categories = [entry[0] for entry in categories]
header_keywords = [keyword for entry in categories for keyword in entry[1]]
header = [*header_metadata, *header_sum_cat, *header_categories, *header_keywords]
row_list = []
N = len(contracts_txt)
number_of_characters = []
# Timestamp pattern that identifies the attachment date. Compiled once, and written
# as a raw string so that "\d" is not treated as an (invalid) string escape.
attachment_date = re.compile(r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}")
# Go through all processed text files, case-fold them and count the occurrences of every keyword
for position, (contract, filelist) in enumerate(contracts_txt.items(), start=1):
    print('Analysing contract ID: ', contract, ', ', position, 'out of', N)
    # Collect the text of all files of the contract and join once at the end,
    # instead of growing one string line by line.
    parts = []
    for f in filelist:
        print("...filename:", f)
        # The context manager closes the file even if reading or decoding fails.
        with open(working_dir + contract + '/' + f, 'r', encoding='utf-8') as fo:
            parts.append(fo.read().casefold().replace('\n', ' '))
    text = ''.join(parts)
    # category = [name, keywords, hits per keyword, total hits]; the hit list is
    # overwritten in place for every contract.
    for category in categories:
        category[2][:] = [text.count(keyword.casefold()) for keyword in category[1]]
        category[3] = sum(category[2])
    # Extract metadata and join it with counted hits
    row = DB_clean.loc[DB_clean['ID'] == contract]
    # Columns 1..18 of the clean table are copied as metadata.
    meta_data = row.iloc[0, 1:19].tolist()
    # Column 20 holds the attachment info as a Python literal.
    attachment_data = ast.literal_eval(row.iloc[0, 20])
    IDs = []
    names = []
    links = []
    sizes = []
    date = ""
    # The cell containing info about attachments has variable length depending on
    # the actual number of attachments (there may be multiple IDs, links, etc.).
    # Only one date is kept: the last element matching the timestamp pattern.
    for m, attachment in enumerate(attachment_data):
        as_text = str(attachment)
        # A number saved as a string is an ID; the name follows it as the next element.
        if isinstance(attachment, str) and attachment.isnumeric():
            IDs.append(attachment)
            if m + 1 < len(attachment_data):
                names.append(str(attachment_data[m + 1]))
        # Links must contain https://
        if "https://" in as_text:
            links.append(as_text)
        # Sizes are stored as integers (bool is excluded, as with the exact type check).
        if isinstance(attachment, int) and not isinstance(attachment, bool):
            sizes.append(as_text)
        # Dates are recognised by the timestamp pattern.
        if attachment_date.search(as_text):
            date = as_text
    # Attachment metadata is considered valid only if there are links and a date:
    if links and date != "":
        meta_data = meta_data + [", ".join(IDs), ", ".join(names), ", ".join(links), ", ".join(sizes), date]
    else:
        meta_data = meta_data + ["", "", "", "", ""]
    data_hits = [hit for category in categories for hit in category[2]]
    sum_data = sum(category[3] for category in categories)
    data = meta_data + [str(sum_data)] + [category[3] for category in categories] + data_hits
    # Pair labels with values; zip stops at the shorter of the two sequences.
    row_list.append(dict(zip(header, data)))
    # Count number of characters in contract:
    number_of_characters.append(len(text))
# Save unranked csv table
DB_clean_tagged = pd.DataFrame(row_list, columns = header)
DB_clean_tagged.to_csv('DB_clean_text_tagged.csv', header = header, sep='|')
# Rank contracts by number of keyword hits, then by number of characters in the
# contract, then by final price (all descending).
# Insert new column -- number of characters -- at position 24, i.e. directly after
# the 23 metadata columns and 'Výskyty'.
DB_clean_tagged.insert(24, 'Pocet_znakov', number_of_characters)
# 'Výskyty' is stored as a string and 'Cena_konecna' comes from a table read with
# dtype=str, so sorting the raw columns would order them lexicographically
# ("9" before "10"). Sort on numeric copies and reorder the original rows instead;
# values that cannot be parsed as numbers become NaN and are ranked last.
sort_columns = ['Výskyty', 'Pocet_znakov', 'Cena_konecna']
numeric_keys = DB_clean_tagged[sort_columns].apply(pd.to_numeric, errors='coerce')
ranked_index = numeric_keys.sort_values(by=sort_columns, ascending = False).index
DB_clean_tagged = DB_clean_tagged.loc[ranked_index]
# Filter out contracts without a single keyword hit
hit_totals = pd.to_numeric(DB_clean_tagged['Výskyty'], errors='coerce')
delete_rows = DB_clean_tagged.index[hit_totals == 0].tolist()
print('Sorted : ', N, '| Filtered out : ', len(delete_rows))
DB_clean_tagged = DB_clean_tagged.drop(delete_rows)
DB_clean_tagged.to_csv('CRZ_DB_clean_text_tagged.csv', sep='|')