Skip to content

Commit

Permalink
Add parsing of .gen files (fixes #2)
Browse files Browse the repository at this point in the history
  • Loading branch information
samirelanduk committed Apr 30, 2017
1 parent 32176c4 commit d15b158
Show file tree
Hide file tree
Showing 5 changed files with 429 additions and 61 deletions.
107 changes: 76 additions & 31 deletions cdprocessing/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,11 @@ def extract_all_series(django_file):
raw_lines = list(django_file)
file_lines = [line.decode().strip() for line in raw_lines if line.strip()]
float_groups = get_float_groups(file_lines)
correct_float_groups = filter_float_groups(float_groups)
return correct_float_groups
filtered_groups = remove_short_float_groups(float_groups)
filtered_groups = remove_incorrect_wavelengths(filtered_groups)
filtered_groups = remove_short_lines(filtered_groups)
stripped_groups = strip_float_groups(filtered_groups)
return stripped_groups


def get_float_groups(file_lines):
Expand All @@ -33,40 +36,82 @@ def get_float_groups(file_lines):
return float_groups


def filter_float_groups(float_groups):
"""Takes a a bunch of number groups and returns only the longest ones with
matching wavelengths, and with more than three values"""
def remove_short_float_groups(float_groups):
    """Keep only the float groups that are as long as the longest group.

    Args:
        float_groups: a list of groups, each group being a list of lines.

    Returns:
        A new list containing, in their original order, the groups whose
        length equals the maximum group length. An empty list is returned
        when there are no groups at all.
    """

    if not float_groups:
        return []
    # max() finds the target length in a single pass; sorting every group
    # just to read off the last one is unnecessary.
    longest_length = max(len(group) for group in float_groups)
    return [group for group in float_groups if len(group) == longest_length]

# Remove float groups whose first values (wavelengths) don't match the
# first values of the first float group
wavelengths = [
tuple([line[0] for line in group]) for group in correct_length_groups
]

def remove_incorrect_wavelengths(float_groups):
    """Keep only the float groups that share the most common wavelength set.

    The first number on each line is assumed to be the wavelength. The
    sequence of first values of a group forms that group's wavelength set;
    the set that occurs most often across all groups is taken to be the
    correct one.

    Args:
        float_groups: a list of groups, each group being a list of lines
            (sequences of numbers).

    Returns:
        A new list containing, in their original order, the groups whose
        wavelength set equals the most common one. An empty list is returned
        when there are no groups.
    """

    if not float_groups:
        return []
    # Compute each group's wavelength tuple once and keep it next to the
    # group, so it does not have to be rebuilt for the final comparison.
    keyed_groups = [
        (tuple(line[0] for line in group), group) for group in float_groups
    ]
    counts = Counter(wavelengths for wavelengths, _ in keyed_groups)
    correct_wavelengths = counts.most_common(1)[0][0]
    return [
        group for wavelengths, group in keyed_groups
        if wavelengths == correct_wavelengths
    ]


def remove_short_lines(float_groups):
    """Drop every float group that has a line with fewer than three values.

    Args:
        float_groups: a list of groups, each group being a list of lines
            (sequences of numbers).

    Returns:
        A new list containing, in their original order, the groups in which
        every line has at least three values. A group with no lines is kept.
    """

    # all() stops at the first short line instead of building a list of the
    # acceptable lines only to compare its length with the group's length.
    return [
        group for group in float_groups
        if all(len(line) >= 3 for line in group)
    ]


def strip_float_groups(float_groups):
    """Reduce every line of every group to wavelength, cd and cd_error.

    The first two values of each line are kept as they are. The third output
    value is taken from the error column, which is chosen as follows: starting
    with the third column and moving rightwards, a column is rejected if it
    contains any negative number or any number greater than 100, or if it is
    zero on every line. The first column that is not rejected is used. Only
    columns present on every line are considered. If no column qualifies, an
    error of zero is used on every line.

    Every candidate column is examined in turn; a rejection never causes the
    following column to be skipped.

    Args:
        float_groups: a list of groups, each group being a list of lines
            (lists of numbers).

    Returns:
        A new list of groups whose lines each hold exactly three values.
        An empty list is returned when there are no groups.
    """

    if not float_groups:
        return []

    lines = [line for group in float_groups for line in group]
    # A column can only be used if every line has it, so the shortest line
    # bounds the search (and guarantees the search terminates).
    width = min((len(line) for line in lines), default=0)

    error_col = None
    for col in range(2, width):
        column = [line[col] for line in lines]
        if any(value < 0 or value > 100 for value in column):
            continue  # Out-of-range values: not an error column
        if all(value == 0 for value in column):
            continue  # Entirely zero: carries no information
        error_col = col
        break

    return [[
        line[:2] + [line[error_col] if error_col is not None else 0]
        for line in group
    ] for group in float_groups]


def average_series(series):
Expand Down
176 changes: 147 additions & 29 deletions cdprocessing/tests/test_functions.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,26 @@
from unittest.mock import patch
from unittest.mock import patch, Mock
from cdtool.tests import ViewTest
from cdprocessing.functions import extract_all_series, get_float_groups
from cdprocessing.functions import filter_float_groups, average_series
from cdprocessing.functions import get_file_name
from cdprocessing.functions import *

class AllSeriesExtractionFromFileTests(ViewTest):

@patch("cdprocessing.functions.get_float_groups")
def test_extractor_passes_file_lines_to_float_extractor(self, test_get):
test_get.return_value = [[[100, 200, 300]], [[279, 2, 3]]]
@patch("cdprocessing.functions.remove_short_float_groups")
@patch("cdprocessing.functions.remove_incorrect_wavelengths")
@patch("cdprocessing.functions.remove_short_lines")
@patch("cdprocessing.functions.strip_float_groups")
def test_extractor_calls_correct_functions(self, *mocks):
float_groups = Mock()
len_filtered_float_groups = Mock()
wav_filtered_float_groups = Mock()
line_filtered_float_groups = Mock()
stripped_float_groups = Mock()
mock_strip, mock_line_filter, mock_wav_filter, mock_len_filter, mock_get = mocks
mock_get.return_value = float_groups
mock_len_filter.return_value = len_filtered_float_groups
mock_wav_filter.return_value = wav_filtered_float_groups
mock_line_filter.return_value = line_filtered_float_groups
mock_strip.return_value = stripped_float_groups
series = extract_all_series(self.single_scan_file)
stripped_lines = [
"$MDCDATA:1:14:2:3:4:9",
Expand All @@ -18,17 +30,12 @@ def test_extractor_passes_file_lines_to_float_extractor(self, test_get):
"278.000 -4.0 0.4 1.013 0.000 243.2 19.99",
"277.000 12.0 0.3 1.013 0.000 243.5 19.99"
]
test_get.assert_called_with(stripped_lines)


@patch("cdprocessing.functions.filter_float_groups")
@patch("cdprocessing.functions.get_float_groups")
def test_extractor_filters_float_groups(self, test_get, test_filter):
test_get.return_value = [[[100, 200, 300]], [[279, 2, 3]]]
test_filter.return_value = [[[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]]]
series = extract_all_series(self.single_scan_file)
test_filter.assert_called_with(test_get.return_value)
self.assertEqual(series, test_filter.return_value)
mock_get.assert_called_with(stripped_lines)
mock_len_filter.assert_called_with(float_groups)
mock_wav_filter.assert_called_with(len_filtered_float_groups)
mock_line_filter.assert_called_with(wav_filtered_float_groups)
mock_strip.assert_called_with(line_filtered_float_groups)
self.assertIs(series, stripped_float_groups)



Expand Down Expand Up @@ -67,14 +74,14 @@ def test_float_groups_returns_nothing_if_no_float_groups(self):



class FloatGroupFilterTests(ViewTest):
class ShortFloatGroupRemovalTests(ViewTest):

def test_can_filter_zero_float_groups(self):
self.assertEqual(filter_float_groups([]), [])
self.assertEqual(remove_short_float_groups([]), [])


def test_can_filter_out_small_float_groups(self):
filtered_groups = filter_float_groups([
def test_can_remove_short_groups(self):
filtered_groups = remove_short_float_groups([
[[3, 76, 1], [4.5, 4, 1], [76.8, 34, 1]],
[[67.4, 45, 1], [45.6, 4, 1]],
[[3, 74, 1], [4.5, 5, 1], [76.8, 4, 1]],
Expand All @@ -88,29 +95,140 @@ def test_can_filter_out_small_float_groups(self):
])


def test_can_filter_float_groups_whose_wavelengths_dont_match(self):
filtered_groups = filter_float_groups([

class IncorrectWavelengthRemovalTests(ViewTest):

def test_can_filter_zero_float_groups(self):
self.assertEqual(remove_incorrect_wavelengths([]), [])


def test_can_remove_incorrect_wavelengths(self):
filtered_groups = remove_incorrect_wavelengths([
[[3, 76, 1], [4.5, 4, 1], [75.8, 34, 1]],
[[3, 76, 1], [4.5, 4, 1], [76.8, 34, 1]],
[[3, 74, 1], [4.5, 5, 1], [76.8, 4, 1]],
[[3, 76, 1], [4.5, 4, 1], [76.7, 34, 1]]
[[3, 74, 1], [4.5, 5, 1], [76.8, 4, 1]]
])
self.assertEqual(filtered_groups, [
[[3, 76, 1], [4.5, 4, 1], [76.8, 34, 1]],
[[3, 74, 1], [4.5, 5, 1], [76.8, 4, 1]]
])


def test_can_filter_float_groups_with_fewer_than_3_values(self):
filtered_groups = filter_float_groups([

class ShortLineRemovalTests(ViewTest):

def test_can_filter_zero_float_groups(self):
self.assertEqual(remove_short_lines([]), [])


def test_can_remove_short_lines(self):
filtered_groups = remove_short_lines([
[[3, 76, 1], [4.5, 4, 1], [76.8, 34, 1]],
[[3, 74], [4.5, 1], [76.8, 4]]
[[3, 76], [4.5, 4], [76.8, 34]],
[[3, 74, 1], [4.5, 5, 1], [76.8, 4, 1]]
])
self.assertEqual(filtered_groups, [
[[3, 76, 1], [4.5, 4, 1], [76.8, 34, 1]]
[[3, 76, 1], [4.5, 4, 1], [76.8, 34, 1]],
[[3, 74, 1], [4.5, 5, 1], [76.8, 4, 1]]
])



class FloatGroupStrippingTests(ViewTest):
    # Tests for strip_float_groups, which reduces every line to three values
    # (the first two columns plus a chosen error column).

    def test_can_strip_zero_float_groups(self):
        self.assertEqual(strip_float_groups([]), [])


    def test_stripping_len_3_lines_does_nothing(self):
        stripped_groups = strip_float_groups([
         [[3, 76, 1], [4.5, 4, 1], [76.8, 34, 1]],
         [[3, 76, 1], [4.5, 4, 1], [76.8, 34, 1]],
         [[3, 74, 1], [4.5, 5, 1], [76.8, 4, 1]]
        ])
        self.assertEqual(stripped_groups, [
         [[3, 76, 1], [4.5, 4, 1], [76.8, 34, 1]],
         [[3, 76, 1], [4.5, 4, 1], [76.8, 34, 1]],
         [[3, 74, 1], [4.5, 5, 1], [76.8, 4, 1]]
        ])


    def test_stripping_just_uses_3_values_per_line(self):
        stripped_groups = strip_float_groups([
         [[3, 76, 1, 10], [4.5, 4, 1, 11], [76.8, 34, 1, 10]],
         [[3, 76, 1, 9], [4.5, 4, 1, 3], [76.8, 34, 1, 8]],
         [[3, 74, 1, 2], [4.5, 5, 1, 4], [76.8, 4, 1, 5]]
        ])
        self.assertEqual(stripped_groups, [
         [[3, 76, 1], [4.5, 4, 1], [76.8, 34, 1]],
         [[3, 76, 1], [4.5, 4, 1], [76.8, 34, 1]],
         [[3, 74, 1], [4.5, 5, 1], [76.8, 4, 1]]
        ])


    def test_stripping_discards_negative_columns(self):
        # The third and fourth columns each contain a negative value, so the
        # fifth column must be used. It holds 2 rather than 1 so the test
        # fails if either of the rejected columns is picked instead.
        stripped_groups = strip_float_groups([
         [[3, 76, 1, 1, 2], [4.5, 4, 1, 1, 2], [76.8, 34, 1, 1, 2]],
         [[3, 76, 1, 1, 2], [4.5, 4, -1, 1, 2], [76.8, 34, 1, 1, 2]],
         [[3, 74, 1, 1, 2], [4.5, 5, 1, 1, 2], [76.8, 4, 1, -1, 2]]
        ])
        self.assertEqual(stripped_groups, [
         [[3, 76, 2], [4.5, 4, 2], [76.8, 34, 2]],
         [[3, 76, 2], [4.5, 4, 2], [76.8, 34, 2]],
         [[3, 74, 2], [4.5, 5, 2], [76.8, 4, 2]]
        ])


    def test_stripping_discards_100_columns(self):
        # The third and fourth columns each contain a value above 100, so the
        # fifth column must be used. It holds 2 rather than 1 so the test
        # fails if either of the rejected columns is picked instead.
        stripped_groups = strip_float_groups([
         [[3, 76, 1, 1, 2], [4.5, 4, 1, 1, 2], [76.8, 34, 1, 1, 2]],
         [[3, 76, 1, 1, 2], [4.5, 4, 101, 1, 2], [76.8, 34, 1, 1, 2]],
         [[3, 74, 1, 1, 2], [4.5, 5, 1, 1, 2], [76.8, 4, 1, 101, 2]]
        ])
        self.assertEqual(stripped_groups, [
         [[3, 76, 2], [4.5, 4, 2], [76.8, 34, 2]],
         [[3, 76, 2], [4.5, 4, 2], [76.8, 34, 2]],
         [[3, 74, 2], [4.5, 5, 2], [76.8, 4, 2]]
        ])


    def test_stripping_discards_columns_that_are_entirely_zero(self):
        stripped_groups = strip_float_groups([
         [[3, 76, 0, 0, 1], [4.5, 4, 0, 0, 1], [76.8, 34, 0, 0, 1]],
         [[3, 76, 0, 0, 1], [4.5, 4, 0, 0, 1], [76.8, 34, 0, 0, 1]],
         [[3, 74, 0, 0, 1], [4.5, 5, 0, 0, 1], [76.8, 4, 0, 0, 1]]
        ])
        self.assertEqual(stripped_groups, [
         [[3, 76, 1], [4.5, 4, 1], [76.8, 34, 1]],
         [[3, 76, 1], [4.5, 4, 1], [76.8, 34, 1]],
         [[3, 74, 1], [4.5, 5, 1], [76.8, 4, 1]]
        ])


    def test_stripping_handles_running_out_of_columns(self):
        # Every candidate column contains an out-of-range value, so an error
        # of zero is expected.
        stripped_groups = strip_float_groups([
         [[3, 76, 1, 1, 1], [4.5, 4, 1, 1, 1], [76.8, 34, 1, 1, 1]],
         [[3, 76, 1, 1, 1], [4.5, 4, 101, 1, 1], [76.8, 34, 1, 1, 1]],
         [[3, 74, 1, 1, 1], [4.5, 5, 1, 1, 1], [76.8, 4, 1, 101, -1]]
        ])
        self.assertEqual(stripped_groups, [
         [[3, 76, 0], [4.5, 4, 0], [76.8, 34, 0]],
         [[3, 76, 0], [4.5, 4, 0], [76.8, 34, 0]],
         [[3, 74, 0], [4.5, 5, 0], [76.8, 4, 0]]
        ])


    def test_stripping_handles_all_zeroes_columns(self):
        stripped_groups = strip_float_groups([
         [[3, 76, 0, 0, 0], [4.5, 4, 0, 0, 0], [76.8, 34, 0, 0, 0]],
         [[3, 76, 0, 0, 0], [4.5, 4, 0, 0, 0], [76.8, 34, 0, 0, 0]],
         [[3, 74, 0, 0, 0], [4.5, 5, 0, 0, 0], [76.8, 4, 0, 0, 0]]
        ])
        self.assertEqual(stripped_groups, [
         [[3, 76, 0], [4.5, 4, 0], [76.8, 34, 0]],
         [[3, 76, 0], [4.5, 4, 0], [76.8, 34, 0]],
         [[3, 74, 0], [4.5, 5, 0], [76.8, 4, 0]]
        ])



Expand Down
16 changes: 15 additions & 1 deletion ftests/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,16 @@ def get_single_scan_from_file(self, file_name):
return input_data


def get_single_gen_scan_from_file(self, file_name):
    """Read a data file from ftests/test_data/ and return its rows as tuples.

    Only lines whose first three characters are all digits are treated as
    data. Each such line is split on whitespace and its first, second and
    sixth fields are converted to floats.
    NOTE(review): presumably these are wavelength, CD and CD error for a .gen
    file - confirm against the test data.

    Args:
        file_name: name of the file inside ftests/test_data/.

    Returns:
        A list of 3-tuples of floats, one per data line, in file order.
    """
    with open("ftests/test_data/" + file_name) as f:
        # Split each data line once rather than once per extracted field.
        rows = [line.split() for line in f if line[:3].isdigit()]
    return [(float(row[0]), float(row[1]), float(row[5])) for row in rows]


def check_chart_appears(self, chart_div):
self.assertGreater(chart_div.size["width"], 10)
self.assertGreater(chart_div.size["height"], 10)
Expand Down Expand Up @@ -124,4 +134,8 @@ def check_file_has_data(self, filename, data):
output_lines = f.readlines()
output_lines = [l for l in output_lines if l[:3].isdigit()]
output_data = [tuple([float(c) for c in l.split()]) for l in output_lines]
self.assertEqual(output_data, data)
self.assertEqual(len(output_lines), len(data))
for index, line in enumerate(data):
self.assertEqual(len(line), len(output_data[index]))
for vindex, value in enumerate(line):
self.assertAlmostEqual(value, output_data[index][vindex], delta=0.005)
Loading

0 comments on commit d15b158

Please sign in to comment.