-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
165 lines (122 loc) · 5 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
# Import pandas
import pandas as pd
import math
import json
import datefinder
from pprint import pprint
from collections import Counter
from numpyencoder import NumpyEncoder
from datetime import datetime
# TODO: occurrences should not be global
def index(element):
    """Translate a Python type object into its report name via the
    module-level ``types`` table; any non-type value is returned as-is."""
    # Reading a global needs no ``global`` declaration.
    if isinstance(element, type):
        return types[element]
    return element
def is_datetime(string):
    """Return True when datefinder spots any date/datetime inside *string*.

    datefinder.find_dates() yields matches lazily and has no length, so we
    just probe the generator for a first element instead of looping.
    (Note: without strict mode even strings like "error" can parse as a
    date; strict requires an explicit year, month and day.)
    """
    return next(datefinder.find_dates(string), None) is not None
# Given a string, what does the string probably contain?
def analyze_string_type(element, occurrences):
    """Increment the counter in *occurrences* for every supported string
    category that *element* matches (currently only "datetime").

    The dict is mutated in place and also returned for chaining.
    """
    supported_strings = {"datetime": is_datetime}
    # Iterate (name, detector) pairs; avoids shadowing the builtin ``type``
    # and replaces the membership-check-then-init dance with dict.get().
    for category, detector in supported_strings.items():
        if detector(element):
            occurrences[category] = occurrences.get(category, 0) + 1
    return occurrences
def analyze_string_row(column_data):
    """Classify every value of *column_data* by string category and return
    each category's relative frequency within the column."""
    category_counts = {}
    for value in column_data:
        # analyze_string_type mutates and returns the same dict, so the
        # reassignment in the original was redundant.
        analyze_string_type(value, category_counts)
    # Normalise the absolute counts in place.
    make_dict_relative(category_counts, column_data.size)
    return category_counts
def analyze_most_common(column_data):
    """Return up to the five most frequent values of *column_data*, each as
    a dict pairing the value with its relative frequency.

    (The output key keeps the original "occurances" spelling because it is
    part of the report format.)
    """
    total = column_data.size
    top_five = Counter(column_data).most_common(5)
    return [
        {"element": value, "occurances": count / total}
        for value, count in top_five
    ]
def predict_seperator(filename):
    """Guess the CSV delimiter by counting candidate separators in the
    first line of *filename*.

    Returns whichever of ';', tab or ',' occurs most often in that line
    (first candidate wins ties), or '' when none of them appears.
    """
    # BUG FIX: the original opened the file and never closed it; ``with``
    # guarantees the handle is released.
    with open(filename, "r") as file:
        first_row = file.readline()
    best_separator = ''
    best_count = 0
    for candidate in (';', '\t', ','):
        count = first_row.count(candidate)
        # Strict > keeps the earlier candidate on a tie, as before.
        if count > best_count:
            best_count = count
            best_separator = candidate
    return best_separator
def make_dict_relative(dictionary, size):
    """Divide every leaf count of *dictionary* by *size*, in place, and
    return the dict. Nested dicts are normalised recursively.

    Keys are routed through index() so that type-object keys resolve to
    their report names before lookup.
    """
    for key in dictionary:
        idx = index(key)
        if isinstance(dictionary[idx], dict):
            dictionary[idx] = make_dict_relative(dictionary[idx], size)
        else:
            # BUG FIX: the original divided by the *global* column_data.size
            # instead of the ``size`` parameter — it only worked because the
            # top-level script happened to leave ``column_data`` in scope.
            dictionary[idx] = dictionary[idx] / size
    return dictionary
def count_type_occurances(column_data):
    """Return the relative frequency of each tracked Python type among the
    values of *column_data*.

    Empty cells (pandas stores them as float NaN) are reported under the
    "empty" key, and the spurious "float" increment they would cause is
    cancelled out. Because bool is a subclass of int, a boolean value
    increments both "bool" and "int" — same as the original behaviour.
    """
    counts = {"empty": 0}
    # Pre-seed every tracked type so absent types still appear as 0.0.
    for tracked in types:
        counts[index(tracked)] = 0
    for value in column_data:
        # float check because math.isnan() only works on real numbers
        if isinstance(value, float) and math.isnan(value):
            counts["empty"] += 1
            # Cancel the "float" increment the isinstance loop adds below.
            counts[index("float")] -= 1
        for tracked in types:
            if isinstance(value, tracked):
                counts[index(tracked)] += 1
    # Turn absolute counts into fractions of the column length.
    return make_dict_relative(counts, column_data.size)
def export_json(filename, data):
    """Serialise *data* to *filename* as pretty-printed JSON.

    NumpyEncoder handles the numpy scalar/array values that pandas
    produces and that the stdlib encoder rejects; ensure_ascii=False keeps
    non-ASCII text readable in the report.
    """
    with open(filename, 'w') as file:
        json.dump(data, file, indent=4, sort_keys=True,
                  separators=(', ', ': '), ensure_ascii=False,
                  cls=NumpyEncoder)
    # (Removed the commented-out duplicate json.dump block.)
# --- Script entry point -------------------------------------------------
# Input table to profile; presumably a Darwin Core Archive event file —
# TODO confirm the path exists relative to the working directory.
input_file = "dwca-est_grey_seals_00-16-v1.1/event.txt"
# reading csv file
data = pd.read_csv(input_file, sep=predict_seperator(input_file))
data_info = dict()  # Contains the info about each column
# Mapping of tracked Python types to their report names.
# NOTE: read as a module global by index() and count_type_occurances().
types = {bool: "bool", int: "int", float: "float", str: "str"}
# finding the type of each column
for column in data:
    occurrences = dict()
    column_data = data[column]
    stats = dict()
    stats["type-occurrences"] = count_type_occurances(column_data)
    # There is nothing useful to say about most common nan's in an empty column
    if not stats["type-occurrences"]["empty"] == 1.0:
        stats["most_common"] = analyze_most_common(column_data)
    # Numeric summary stats whenever any bool/int/float values were seen.
    if stats["type-occurrences"][index(bool)] or \
       stats["type-occurrences"][index(int)] or \
       stats["type-occurrences"][index(float)]:
        stats["avg"] = column_data.mean()
        stats["min"] = column_data.min()
        stats["max"] = column_data.max()
        stats["sd"] = column_data.std()  # Standard deviation
    # String summary stats whenever any str values were seen.
    if stats["type-occurrences"][index(str)]:
        # NOTE(review): len(el) raises TypeError on NaN floats in a mixed
        # column — this only works for columns with no empty cells; verify.
        str_lengths = [len(el) for el in column_data]
        stats["avg-length"] = 0 if len(str_lengths) == 0 else (float(sum(str_lengths)) / len(str_lengths))
        stats["min-length"] = min(str_lengths)
        stats["max-length"] = max(str_lengths)
        stats["str-data"] = analyze_string_row(column_data)
    # Add a timestamp for when the last update was
    stats["timestamp"] = str(datetime.now())
    data_info[column] = stats
pprint(data_info)
export_json("report.json", data_info)