In [1]:
# Suppress all warnings for this session.
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

# Import the package under test, then reload the already-imported module.
import oxyba as ox
from importlib import reload
reload(ox);

The function `clean_german_number` tries to convert a string (or array of strings) 
with German number formatting into the usual format for decimal numbers.

| German        | Computer  |
|:-------------:|:---------:|
| `'1.234'`     | `1234`    | 
| `'1234'`      | `1234`    |
| `'1.234,56'`  | `1234.56` |
| `'1.234.560'` | `1234560` |
| `'-123'`      | `-123`    |


In German reports decimals are often separated by ',' instead of '.',
which leads to problems when reading TXT files.
Most data processing libraries will read these numbers as 
strings because the number format is unknown. 
    
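As far as I know it is not possible to auto-detect German number
formats because of ambiguity: for example, `'1.234'` could mean
`1234` (German thousands separator) or `1.234` (decimal point).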

### Convert String by String

In [2]:
# Sample input: German-formatted number strings mixed with non-string
# values (NaN, None, inf, a plain float) and an empty string.
x = [
    '-1.000,00',
    np.nan,
    '+472,79',
    None,
    '-340,50',
    np.inf,
    '29,80',
    "",
    120.45,
    '90.000',
]

When applied to a single value, the function `clean_german_number` only accepts strings.
For all other data types (and for the empty string) it returns `None`.

In [3]:
# Print one row per element: its type, its raw value, and the converted result.
row_template = '{0:20s} {1:10s} vs {2:s}'
for s in x:
    type_text = str(type(s))
    raw_text = str(s)
    cleaned_text = str(ox.clean_german_number(s))
    print(row_template.format(type_text, raw_text, cleaned_text))

<class 'str'>        -1.000,00  vs -1000.00
<class 'float'>      nan        vs None
<class 'str'>        +472,79    vs 472.79
<class 'NoneType'>   None       vs None
<class 'str'>        -340,50    vs -340.50
<class 'float'>      inf        vs None
<class 'str'>        29,80      vs 29.80
<class 'str'>                   vs None
<class 'float'>      120.45     vs None
<class 'str'>        90.000     vs 90000


### list/tuple, numpy array, pandas DataFrame
The function `clean_german_number` will loop over arrays and supports

* `list` and `tuple` (will return a `list` in both cases)
* `numpy.ndarray`, and
* `pandas.DataFrame`



In [4]:
# Pass the whole list at once; the elements are converted one by one and
# the result comes back as a list.
y1 = ox.clean_german_number(x)
y1

['-1000.00',
 None,
 '472.79',
 None,
 '-340.50',
 None,
 '29.80',
 None,
 None,
 '90000']

In [5]:
# Same input wrapped in a numpy array; the result is again a numpy array.
y2 = ox.clean_german_number(np.array(x))
y2

array(['-1000.00', None, '472.79', None, '-340.50', None, '29.80', None,
       None, '90000'], dtype=object)

In [6]:
# Same input as a single-column pandas DataFrame.
y3 = ox.clean_german_number(pd.DataFrame(x))
y3 

Unnamed: 0,0
0,-1000.0
1,
2,472.79
3,
4,-340.5
5,
6,29.8
7,
8,
9,90000.0


### Multiple Columns
The function `clean_german_number` also processes matrices, converting each element of the matrix individually.

In [7]:
# Arrange the ten sample values into a matrix with 5 rows and 2 columns.
mat = np.reshape(np.array(x), (5, 2))
mat

array([['-1.000,00', nan],
       ['+472,79', None],
       ['-340,50', inf],
       ['29,80', ''],
       [120.45, '90.000']], dtype=object)

In [8]:
# Convert the 2-D array; every element is handled individually.
out = ox.clean_german_number(mat)
out

array([['-1000.00', None],
       ['472.79', None],
       ['-340.50', None],
       ['29.80', None],
       [None, '90000']], dtype=object)

The same works with a pandas `DataFrame` object.

In [9]:
# Wrap the matrix in a DataFrame (default integer row and column labels).
df = pd.DataFrame(mat)
df

Unnamed: 0,0,1
0,"-1.000,00",
1,"+472,79",
2,"-340,50",inf
3,"29,80",
4,120.45,90.000


In [10]:
# Convert all columns of the DataFrame; the result is bound to a new name.
df2 = ox.clean_german_number(df)
df2

Unnamed: 0,0,1
0,-1000.0,
1,472.79,
2,-340.5,
3,29.8,
4,,90000.0


And with specific columns

In [11]:
# Start from a fresh frame built from `mat` so this cell is idempotent:
# re-running it would otherwise push the already-converted column 0 through
# the cleaner a second time.
df = pd.DataFrame(mat)
# Double brackets select column 0 as a one-column DataFrame; only that column
# is replaced, column 1 keeps its raw values.
df[[0]] = ox.clean_german_number(df[[0]])
df

Unnamed: 0,0,1
0,-1000.0,
1,472.79,
2,-340.5,inf
3,29.8,
4,,90.000
