Add parsing of .gen files (fixes #2)

samirelanduk · Apr 30, 2017 · d15b158 · d15b158
1 parent 32176c4
commit d15b158
Show file tree

Hide file tree

Showing 5 changed files with 429 additions and 61 deletions.
diff --git a/cdprocessing/functions.py b/cdprocessing/functions.py
@@ -11,8 +11,11 @@ def extract_all_series(django_file):
     raw_lines = list(django_file)
     file_lines = [line.decode().strip() for line in raw_lines if line.strip()]
     float_groups = get_float_groups(file_lines)
-    correct_float_groups = filter_float_groups(float_groups)
-    return correct_float_groups
+    filtered_groups = remove_short_float_groups(float_groups)
+    filtered_groups = remove_incorrect_wavelengths(filtered_groups)
+    filtered_groups = remove_short_lines(filtered_groups)
+    stripped_groups = strip_float_groups(filtered_groups)
+    return stripped_groups
 
 
 def get_float_groups(file_lines):
@@ -33,40 +36,82 @@ def get_float_groups(file_lines):
     return float_groups
 
 
-def filter_float_groups(float_groups):
-    """Takes a a bunch of number groups and returns only the longest ones with
-    matching wavelengths, and with more than three values"""
+def remove_short_float_groups(float_groups):
+    """Takes a list of float groups, identifies the longest one, and removes
+    groups shorter than that.
+
+    Every group whose length equals the longest length is kept, in its original
+    order. An empty list of groups gives an empty list back."""
 
     if float_groups:
-        # Remove float groups shorter than the longest float group
         longest_length = len(sorted(float_groups, key=lambda k: len(k))[-1])
-        correct_length_groups = [
-         group for group in float_groups if len(group) == longest_length
-        ]
+        return [group for group in float_groups if len(group) == longest_length]
+    return []
 
-        # Remove float groups whose first values (wavelengths) don't match the
-        # first values of the first float group
-        wavelengths = [
-         tuple([line[0] for line in group]) for group in correct_length_groups
-        ]
+
+def remove_incorrect_wavelengths(float_groups):
+    """Takes a list of float groups, identifies the wavelength set that is most
+    common (the first number on each line assumed to be wavelength) and removes
+    the groups which don't match.
+
+    A group matches only if its whole sequence of first values is equal to the
+    most common sequence. If several sequences are equally common, the winner is
+    whichever one Counter.most_common reports first. An empty list of groups
+    gives an empty list back."""
+
+    if float_groups:
+        # Each group's wavelengths become a tuple so that Counter can hash them
+        wavelengths = [tuple([line[0] for line in group]) for group in float_groups]
         wavelengths = Counter(wavelengths)
-        most_common_wavelengths = wavelengths.most_common(1)[0][0]
-        correct_wavelength_groups = [group for group in correct_length_groups if tuple(
-         [line[0] for line in group]
-        ) == most_common_wavelengths]
-
-        # Remove float groups which don't have at least three values
-        groups_at_least_three = []
-        for group in correct_wavelength_groups:
-            add_group = True
-            for line in group:
-                if len(line) < 3:
-                    add_group = False
-            if add_group: groups_at_least_three.append(group)
-        final_groups = [[line[:3] for line in group] for group in groups_at_least_three]
-        return final_groups
-    else:
-        return []
+        # Back to a list, so it can compare equal to the list built per group
+        correct_wavelengths = list(wavelengths.most_common(1)[0][0])
+        return [
+         g for g in float_groups if [line[0] for line in g] == correct_wavelengths
+        ]
+    return []
+
+
+def remove_short_lines(float_groups):
+    """Keeps only the float groups in which every line holds at least three
+    values. The surviving groups stay in their original order."""
+
+    kept = []
+    for group in float_groups:
+        # One line with fewer than three values disqualifies the whole group
+        if all(len(line) >= 3 for line in group):
+            kept.append(group)
+    return kept
+
+
+def strip_float_groups(float_groups):
+    """Takes a list of float groups, works out which column is the error, and
+    removes every value on every line apart from wavelength, cd, cd_error.
+
+    The function will start by assuming the third column is an error column. It
+    will decide a column is not an error column if it finds any negative numbers
+    in that column, or any numbers greater than 100. It will also discard a
+    column if it is entirely zero. It will go through each subsequent column,
+    using the same criteria, until it finds one that looks ok. Only columns
+    that are present on every line of every group are considered.
+
+    If no columns match, an error of zero is used. An empty list of groups
+    gives an empty list back, and groups with no lines are returned empty."""
+
+    if not float_groups:
+        return []
+
+    lines = [line for group in float_groups for line in group]
+    # Columns beyond the shortest line cannot be read from every line
+    width = min([len(line) for line in lines]) if lines else 0
+
+    error_col = 0 # 0 is the wavelength column, so it doubles as 'none found'
+    for col in range(2, width):
+        values = [line[col] for line in lines]
+        out_of_range = any(v < 0 or v > 100 for v in values)
+        has_non_zero = any(v != 0 for v in values)
+        # Each column is judged exactly once, so rejecting a column can never
+        # cause the column after it to be skipped without being checked
+        if has_non_zero and not out_of_range:
+            error_col = col
+            break
+
+    return [[line[:2] + [
+     line[error_col] if error_col > 0 else 0
+    ] for line in g] for g in float_groups]
 
 
 def average_series(series):

diff --git a/cdprocessing/tests/test_functions.py b/cdprocessing/tests/test_functions.py
@@ -1,14 +1,26 @@
-from unittest.mock import patch
+from unittest.mock import patch, Mock
 from cdtool.tests import ViewTest
-from cdprocessing.functions import extract_all_series, get_float_groups
-from cdprocessing.functions import filter_float_groups, average_series
-from cdprocessing.functions import get_file_name
+from cdprocessing.functions import *
 
 class AllSeriesExtractionFromFileTests(ViewTest):
 
     @patch("cdprocessing.functions.get_float_groups")
-    def test_extractor_passes_file_lines_to_float_extractor(self, test_get):
-        test_get.return_value = [[[100, 200, 300]], [[279, 2, 3]]]
+    @patch("cdprocessing.functions.remove_short_float_groups")
+    @patch("cdprocessing.functions.remove_incorrect_wavelengths")
+    @patch("cdprocessing.functions.remove_short_lines")
+    @patch("cdprocessing.functions.strip_float_groups")
+    def test_extractor_calls_correct_functions(self, *mocks):
+        float_groups = Mock()
+        len_filtered_float_groups = Mock()
+        wav_filtered_float_groups = Mock()
+        line_filtered_float_groups = Mock()
+        stripped_float_groups = Mock()
+        mock_strip, mock_line_filter, mock_wav_filter, mock_len_filter, mock_get = mocks
+        mock_get.return_value = float_groups
+        mock_len_filter.return_value = len_filtered_float_groups
+        mock_wav_filter.return_value = wav_filtered_float_groups
+        mock_line_filter.return_value = line_filtered_float_groups
+        mock_strip.return_value = stripped_float_groups
         series = extract_all_series(self.single_scan_file)
         stripped_lines = [
          "$MDCDATA:1:14:2:3:4:9",
@@ -18,17 +30,12 @@ def test_extractor_passes_file_lines_to_float_extractor(self, test_get):
          "278.000  -4.0  0.4  1.013  0.000  243.2  19.99",
          "277.000  12.0  0.3  1.013  0.000  243.5  19.99"
         ]
-        test_get.assert_called_with(stripped_lines)
-
-
-    @patch("cdprocessing.functions.filter_float_groups")
-    @patch("cdprocessing.functions.get_float_groups")
-    def test_extractor_filters_float_groups(self, test_get, test_filter):
-        test_get.return_value = [[[100, 200, 300]], [[279, 2, 3]]]
-        test_filter.return_value = [[[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]]]
-        series = extract_all_series(self.single_scan_file)
-        test_filter.assert_called_with(test_get.return_value)
-        self.assertEqual(series, test_filter.return_value)
+        mock_get.assert_called_with(stripped_lines)
+        mock_len_filter.assert_called_with(float_groups)
+        mock_wav_filter.assert_called_with(len_filtered_float_groups)
+        mock_line_filter.assert_called_with(wav_filtered_float_groups)
+        mock_strip.assert_called_with(line_filtered_float_groups)
+        self.assertIs(series, stripped_float_groups)
 
 
 
@@ -67,14 +74,14 @@ def test_float_groups_returns_nothing_if_no_float_groups(self):
 
 
 
-class FloatGroupFilterTests(ViewTest):
+class ShortFloatGroupRemovalTests(ViewTest):
 
     def test_can_filter_zero_float_groups(self):
-        self.assertEqual(filter_float_groups([]), [])
+        self.assertEqual(remove_short_float_groups([]), [])
 
 
-    def test_can_filter_out_small_float_groups(self):
-        filtered_groups = filter_float_groups([
+    def test_can_remove_short_groups(self):
+        filtered_groups = remove_short_float_groups([
          [[3, 76, 1], [4.5, 4, 1], [76.8, 34, 1]],
          [[67.4, 45, 1], [45.6, 4, 1]],
          [[3, 74, 1], [4.5, 5, 1], [76.8, 4, 1]],
@@ -88,29 +95,140 @@ def test_can_filter_out_small_float_groups(self):
         ])
 
 
-    def test_can_filter_float_groups_whose_wavelengths_dont_match(self):
-        filtered_groups = filter_float_groups([
+
+class IncorrectWavelengthRemovalTests(ViewTest):
+
+    def test_can_filter_zero_float_groups(self):
+        self.assertEqual(remove_incorrect_wavelengths([]), [])
+
+
+    def test_can_remove_incorrect_wavelengths(self):
+        filtered_groups = remove_incorrect_wavelengths([
+         [[3, 76, 1], [4.5, 4, 1], [75.8, 34, 1]],
          [[3, 76, 1], [4.5, 4, 1], [76.8, 34, 1]],
-         [[3, 74, 1], [4.5, 5, 1], [76.8, 4, 1]],
-         [[3, 76, 1], [4.5, 4, 1], [76.7, 34, 1]]
+         [[3, 74, 1], [4.5, 5, 1], [76.8, 4, 1]]
         ])
         self.assertEqual(filtered_groups, [
          [[3, 76, 1], [4.5, 4, 1], [76.8, 34, 1]],
          [[3, 74, 1], [4.5, 5, 1], [76.8, 4, 1]]
         ])
 
 
-    def test_can_filter_float_groups_with_fewer_than_3_values(self):
-        filtered_groups = filter_float_groups([
+
+class ShortLineRemovalTests(ViewTest):
+
+    def test_can_filter_zero_float_groups(self):
+        self.assertEqual(remove_short_lines([]), [])
+
+
+    def test_can_remove_short_lines(self):
+        filtered_groups = remove_short_lines([
          [[3, 76, 1], [4.5, 4, 1], [76.8, 34, 1]],
-         [[3, 74], [4.5, 1], [76.8, 4]]
+         [[3, 76], [4.5, 4], [76.8, 34]],
+         [[3, 74, 1], [4.5, 5, 1], [76.8, 4, 1]]
         ])
         self.assertEqual(filtered_groups, [
-         [[3, 76, 1], [4.5, 4, 1], [76.8, 34, 1]]
+         [[3, 76, 1], [4.5, 4, 1], [76.8, 34, 1]],
+         [[3, 74, 1], [4.5, 5, 1], [76.8, 4, 1]]
         ])
 
 
 
+class FloatGroupStrippingTests(ViewTest):
+
+    def test_can_strip_zero_float_groups(self):
+        self.assertEqual(strip_float_groups([]), [])
+
+
+    def test_stripping_len_3_lines_does_nothing(self):
+        stripped_groups = strip_float_groups([
+         [[3, 76, 1], [4.5, 4, 1], [76.8, 34, 1]],
+         [[3, 76, 1], [4.5, 4, 1], [76.8, 34, 1]],
+         [[3, 74, 1], [4.5, 5, 1], [76.8, 4, 1]]
+        ])
+        self.assertEqual(stripped_groups, [
+         [[3, 76, 1], [4.5, 4, 1], [76.8, 34, 1]],
+         [[3, 76, 1], [4.5, 4, 1], [76.8, 34, 1]],
+         [[3, 74, 1], [4.5, 5, 1], [76.8, 4, 1]]
+        ])
+
+
+    def test_stripping_just_uses_3_values_per_line(self):
+        stripped_groups = strip_float_groups([
+         [[3, 76, 1, 10], [4.5, 4, 1, 11], [76.8, 34, 1, 10]],
+         [[3, 76, 1, 9], [4.5, 4, 1, 3], [76.8, 34, 1, 8]],
+         [[3, 74, 1, 2], [4.5, 5, 1, 4], [76.8, 4, 1, 5]]
+        ])
+        self.assertEqual(stripped_groups, [
+         [[3, 76, 1], [4.5, 4, 1], [76.8, 34, 1]],
+         [[3, 76, 1], [4.5, 4, 1], [76.8, 34, 1]],
+         [[3, 74, 1], [4.5, 5, 1], [76.8, 4, 1]]
+        ])
+
+
+    def test_stripping_discards_negative_columns(self):
+        stripped_groups = strip_float_groups([
+         [[3, 76, 1, 1, 1], [4.5, 4, 1, 1, 1], [76.8, 34, 1, 1, 1]],
+         [[3, 76, 1, 1, 1], [4.5, 4, -1, 1, 1], [76.8, 34, 1, 1, 1]],
+         [[3, 74, 1, 1, 1], [4.5, 5, 1, 1, 1], [76.8, 4, 1, -1, 1]]
+        ])
+        self.assertEqual(stripped_groups, [
+         [[3, 76, 1], [4.5, 4, 1], [76.8, 34, 1]],
+         [[3, 76, 1], [4.5, 4, 1], [76.8, 34, 1]],
+         [[3, 74, 1], [4.5, 5, 1], [76.8, 4, 1]]
+        ])
+
+
+    def test_stripping_discards_100_columns(self):
+        stripped_groups = strip_float_groups([
+         [[3, 76, 1, 1, 1], [4.5, 4, 1, 1, 1], [76.8, 34, 1, 1, 1]],
+         [[3, 76, 1, 1, 1], [4.5, 4, 101, 1, 1], [76.8, 34, 1, 1, 1]],
+         [[3, 74, 1, 1, 1], [4.5, 5, 1, 1, 1], [76.8, 4, 1, 101, 1]]
+        ])
+        self.assertEqual(stripped_groups, [
+         [[3, 76, 1], [4.5, 4, 1], [76.8, 34, 1]],
+         [[3, 76, 1], [4.5, 4, 1], [76.8, 34, 1]],
+         [[3, 74, 1], [4.5, 5, 1], [76.8, 4, 1]]
+        ])
+
+
+    def test_stripping_discards_columns_that_are_entirely_zero(self):
+        stripped_groups = strip_float_groups([
+         [[3, 76, 0, 0, 1], [4.5, 4, 0, 0, 1], [76.8, 34, 0, 0, 1]],
+         [[3, 76, 0, 0, 1], [4.5, 4, 0, 0, 1], [76.8, 34, 0, 0, 1]],
+         [[3, 74, 0, 0, 1], [4.5, 5, 0, 0, 1], [76.8, 4, 0, 0, 1]]
+        ])
+        self.assertEqual(stripped_groups, [
+         [[3, 76, 1], [4.5, 4, 1], [76.8, 34, 1]],
+         [[3, 76, 1], [4.5, 4, 1], [76.8, 34, 1]],
+         [[3, 74, 1], [4.5, 5, 1], [76.8, 4, 1]]
+        ])
+
+
+    def test_stripping_handles_running_out_of_columns(self):
+        stripped_groups = strip_float_groups([
+         [[3, 76, 1, 1, 1], [4.5, 4, 1, 1, 1], [76.8, 34, 1, 1, 1]],
+         [[3, 76, 1, 1, 1], [4.5, 4, 101, 1, 1], [76.8, 34, 1, 1, 1]],
+         [[3, 74, 1, 1, 1], [4.5, 5, 1, 1, 1], [76.8, 4, 1, 101, -1]]
+        ])
+        self.assertEqual(stripped_groups, [
+         [[3, 76, 0], [4.5, 4, 0], [76.8, 34, 0]],
+         [[3, 76, 0], [4.5, 4, 0], [76.8, 34, 0]],
+         [[3, 74, 0], [4.5, 5, 0], [76.8, 4, 0]]
+        ])
+
+
+    def test_stripping_handles_all_zeroes_columns(self):
+        stripped_groups = strip_float_groups([
+         [[3, 76, 0, 0, 0], [4.5, 4, 0, 0, 0], [76.8, 34, 0, 0, 0]],
+         [[3, 76, 0, 0, 0], [4.5, 4, 0, 0, 0], [76.8, 34, 0, 0, 0]],
+         [[3, 74, 0, 0, 0], [4.5, 5, 0, 0, 0], [76.8, 4, 0, 0, 0]]
+        ])
+        self.assertEqual(stripped_groups, [
+         [[3, 76, 0], [4.5, 4, 0], [76.8, 34, 0]],
+         [[3, 76, 0], [4.5, 4, 0], [76.8, 34, 0]],
+         [[3, 74, 0], [4.5, 5, 0], [76.8, 4, 0]]
+        ])
 
 
 

diff --git a/ftests/base.py b/ftests/base.py
@@ -38,6 +38,16 @@ def get_single_scan_from_file(self, file_name):
         return input_data
 
 
+    def get_single_gen_scan_from_file(self, file_name):
+        """Reads the named file from ftests/test_data/ and returns one tuple of
+        floats per data line, built from that line's first, second and sixth
+        whitespace-separated values. Only lines whose first three characters
+        are all digits count as data lines."""
+
+        with open("ftests/test_data/" + file_name) as f:
+            data_lines = [text for text in f.readlines() if text[:3].isdigit()]
+        rows = []
+        for text in data_lines:
+            fields = text.split()
+            rows.append((float(fields[0]), float(fields[1]), float(fields[5])))
+        return rows
+
+
     def check_chart_appears(self, chart_div):
         self.assertGreater(chart_div.size["width"], 10)
         self.assertGreater(chart_div.size["height"], 10)
@@ -124,4 +134,8 @@ def check_file_has_data(self, filename, data):
             output_lines = f.readlines()
         output_lines = [l for l in output_lines if l[:3].isdigit()]
         output_data = [tuple([float(c) for c in l.split()]) for l in output_lines]
-        self.assertEqual(output_data, data)
+        self.assertEqual(len(output_lines), len(data))
+        for index, line in enumerate(data):
+            self.assertEqual(len(line), len(output_data[index]))
+            for vindex, value in enumerate(line):
+                self.assertAlmostEqual(value, output_data[index][vindex], delta=0.005)