In [1]:
#genfromtxt
#https://numpy.org/doc/stable/user/basics.io.genfromtxt.html

# NumPy provides several functions to create arrays from tabular data. We focus here on the genfromtxt function.

In a nutshell, genfromtxt runs two main loops. The first loop converts each line of the file in a sequence of strings. 
The second loop converts each string to the appropriate data type. This mechanism is slower than a single loop, 
but gives more flexibility. In particular, genfromtxt is able to take missing data into account, when other
faster and simpler functions like loadtxt cannot

In [7]:
import numpy as np 
from io import StringIO

In [3]:
data = u"1, 2, 3\n4, 5, 6"

In [8]:
np.genfromtxt(StringIO(data),delimiter=",")


array([[1., 2., 3.],
       [4., 5., 6.]])

In [12]:
data = u"  1  2  3\n  4  5 67\n890123  4"
np.genfromtxt(StringIO(data),delimiter=3)

array([[  1.,   2.,   3.],
       [  4.,   5.,  67.],
       [890., 123.,   4.]])

In [39]:
data = u"123456789\n   4  7 9\n   4567 9"
np.genfromtxt(StringIO(data),delimiter=(4,3,2))

array([[1234.,  567.,   89.],
       [   4.,    7.,    9.],
       [   4.,  567.,    9.]])

In [49]:
data = "1,1.3,abcde"
str1 = StringIO(data)
print(str1)
np.genfromtxt(str1, dtype=[('myint','i8'),('myfloat','f8'),('mystring','S5')], delimiter=",")

<_io.StringIO object at 0x000001EABB250280>


array((1, 1.3, b'abcde'),
      dtype=[('myint', '<i8'), ('myfloat', '<f8'), ('mystring', 'S5')])

### The autostrip argument

By default, when a line is decomposed into a series of strings, the individual entries are not stripped of leading nor trailing white spaces. This behavior can be overwritten by setting the optional argument autostrip to a value of True:

In [52]:
data = u"1, abc , 2\n 3, xxx, 4"
np.genfromtxt(StringIO(data),delimiter=",",dtype="|U5")


array([['1', ' abc ', ' 2'],
       ['3', ' xxx', ' 4']], dtype='<U5')

In [53]:
data = u"1, abc , 2\n 3, xxx, 4"
np.genfromtxt(StringIO(data),delimiter=",",dtype="|U5",autostrip=True)

array([['1', 'abc', '2'],
       ['3', 'xxx', '4']], dtype='<U5')

### The comments argument

The optional argument comments is used to define a character string that marks the beginning of a comment. By default, genfromtxt assumes comments='#'. The comment marker may occur anywhere on the line. Any character present after the comment marker(s) is simply ignored:

In [56]:
data = u"""#
# Skip me !
# Skip me too !
1, 2
3, 4
5, 6 #This is the third line of the data
7, 8
# And here comes the last line
9, 0
"""
np.genfromtxt(StringIO(data),comments='#',delimiter=',')

array([[1., 2.],
       [3., 4.],
       [5., 6.],
       [7., 8.],
       [9., 0.]])

In [66]:
data = data = u"1, abc , 2\n 3, xxx, 4,4444,55555,100,xxxxxx"
np.genfromtxt(StringIO(data),skip_footer=5,skip_header=5)

  np.genfromtxt(StringIO(data),skip_footer=5,skip_header=5)


array([], dtype=float64)

In [1]:
#The usecols argument
#https://numpy.org/doc/stable/user/basics.io.genfromtxt.html

In [3]:
dat = u"1 2 3\n4 5 6"
np.genfromtxt(StringIO(dat),usecols=(0,-1))

array([[1., 3.],
       [4., 6.]])

If the columns have names, we can also select which columns to import by giving their name to the usecols argument, either as a sequence of strings or a comma-separated string:

In [6]:
data = u"1 2 3\n4 5 6"
np.genfromtxt(StringIO(data),names="a,b,c",usecols=('a','c'))

array([(1., 3.), (4., 6.)], dtype=[('a', '<f8'), ('c', '<f8')])

In [8]:
np.genfromtxt(StringIO(data),names="x,y,z",usecols=("x,z"))


array([(1., 3.), (4., 6.)], dtype=[('x', '<f8'), ('z', '<f8')])

In [9]:
data =StringIO("1 2 3\n 4 5 6")
np.genfromtxt(data,dtype=[(_,int) for _ in "abc"])

array([(1, 2, 3), (4, 5, 6)],
      dtype=[('a', '<i4'), ('b', '<i4'), ('c', '<i4')])

In [12]:
data = StringIO("1 2 3\n 4 5 6")
np.genfromtxt(data,names="A,B,C",dtype=int)

array([(1, 2, 3), (4, 5, 6)],
      dtype=[('A', '<i4'), ('B', '<i4'), ('C', '<i4')])

We may sometimes need to define the column names from the data itself. In that case, we must use the names keyword with a value of True. The names will then be read from the first line (after the skip_header ones), even if the line is commented out:

In [13]:
data = StringIO("So it goes\n#a b c\n1 2 3\n 4 5 6")
np.genfromtxt(data,skip_header=1,names=True)

array([(1., 2., 3.), (4., 5., 6.)],
      dtype=[('a', '<f8'), ('b', '<f8'), ('c', '<f8')])

The default value of names is None. If we give any other value to the keyword, the new names will overwrite the field names we may have defined with the dtype:

In [14]:
data = StringIO("1 2 3\n 4 5 6")
ndtype = [('a',int),('b',float),('c',int)]
names = ["A","B","C"]
np.genfromtxt(data,names=names,dtype=ndtype)

array([(1, 2., 3), (4, 5., 6)],
      dtype=[('A', '<i4'), ('B', '<f8'), ('C', '<i4')])

In [2]:
#The defaultfmt argument

If names=None but a structured dtype is expected, names are defined with the standard NumPy default of "f%i", yielding names like f0, f1 and so forth:

In [3]:
data = "1 2 3\n 4 5 6"
dat1 = StringIO(data)
np.genfromtxt(dat1,dtype=(int,float,int))

array([(1, 2., 3), (4, 5., 6)],
      dtype=[('f0', '<i4'), ('f1', '<f8'), ('f2', '<i4')])

In [4]:
data = "1 2 3\n 4 5 6"
dat = StringIO(data)
np.genfromtxt(dat,names='a',dtype=(int,float,int))

array([(1, 2., 3), (4, 5., 6)],
      dtype=[('a', '<i4'), ('f0', '<f8'), ('f1', '<i4')])

In [6]:
#We can overwrite this default with the defaultfmt argument, that takes any format string:
np.genfromtxt(dat,dtype=(int,float,int),defaultfmt='var_%0i')

  np.genfromtxt(dat,dtype=(int,float,int),defaultfmt='var_%0i')


array([], dtype=[('var_0', '<i4'), ('var_1', '<f8'), ('var_2', '<i4')])

In [12]:
convertsec = lambda x : float(x.strip(b"%"))/100                          

In [13]:
data3 = u"1, 2.3%, 45.\n6, 78.9%, 0"
names = ("i","p","n")
np.genfromtxt(StringIO(data3),delimiter=',',names=names)

array([(1., nan, 45.), (6., nan,  0.)],
      dtype=[('i', '<f8'), ('p', '<f8'), ('n', '<f8')])

In [14]:
np.genfromtxt(StringIO(data3), names=names,delimiter=',',converters={1:convertsec})

array([(1., 0.023, 45.), (6., 0.789,  0.)],
      dtype=[('i', '<f8'), ('p', '<f8'), ('n', '<f8')])

In [16]:
#The same results can be obtained by using the name of the second column ("p") as key instead of its index (1):
np.genfromtxt(StringIO(data3),delimiter=',',names=names,converters={'p':convertsec})

array([(1., 0.023, 45.), (6., 0.789,  0.)],
      dtype=[('i', '<f8'), ('p', '<f8'), ('n', '<f8')])

Converters can also be used to provide a default for missing entries. In the following example, the converter 
convert transforms a stripped string into the corresponding float or into -999 if the string is empty. We need 
to explicitly strip the string from white spaces as it is not done by default:

In [18]:
convert1 = lambda x:float(x.strip() or -999)
data4 = u"1, , 3\n 4, 5, 6"
np.genfromtxt(StringIO(data4),converters={1:convert1},delimiter=',')

array([[   1., -999.,    3.],
       [   4.,    5.,    6.]])

#### Using missing and filling values
By default, any empty string is marked as missing. We can also consider more complex strings, such as "N/A" or "???" to represent missing or invalid data. The missing_values argument accepts three kinds of values:

a string or a comma-separated string
This string will be used as the marker for missing data for all the columns

a sequence of strings
In that case, each item is associated to a column, in order.

a dictionary
Values of the dictionary are strings or sequence of strings. The corresponding keys can be column indices (integers) or column names (strings). In addition, the special key None can be used to define a default applicable to all columns.

In [21]:
data5 = u"N/A, 2, 3\n4, ,???"
kwags = dict(delimiter=',' ,
             dtype=int,
             names="a,b,c",
             missing_values={0:'N/A','b':" ",2:"???"},
             filling_values={0:0,'b':0,2:-99})
np.genfromtxt(StringIO(data5),**kwags)

array([(0, 2,   3), (4, 0, -99)],
      dtype=[('a', '<i4'), ('b', '<i4'), ('c', '<i4')])