## Importing data with genfromtxt 

### genfromtxt()

- genfromtxt function is used to create arrays from tabular data.
- In a nutshell, genfromtxt runs two main loops. The first loop converts each line of the file in a sequence of strings. The second loop converts each string to the appropriate data type.

### StringIO Module

- In Python, the StringIO module provides an in-memory file-like object called StringIO, which allows you to treat strings as file-like objects.

In [1]:
import numpy as np;
from io import StringIO;

In [2]:
data="1,3,5,7\n 9,11,13,15\n 17,19,21,23";

In [3]:
np.genfromtxt(StringIO(data),delimiter=",")

array([[ 1.,  3.,  5.,  7.],
       [ 9., 11., 13., 15.],
       [17., 19., 21., 23.]])

In [4]:
data_arr=np.genfromtxt(StringIO(data),delimiter=',')

In [5]:
data_arr

array([[ 1.,  3.,  5.,  7.],
       [ 9., 11., 13., 15.],
       [17., 19., 21., 23.]])

In [6]:
# read from string where each column having fexed width

In [7]:
data1="  534  342  456  854 4634\n48545  345  346   54    3";

In [8]:
data_arr1=np.genfromtxt(StringIO(data1),delimiter=5)

In [9]:
data_arr1

array([[5.3400e+02, 3.4200e+02, 4.5600e+02, 8.5400e+02, 4.6340e+03],
       [4.8545e+04, 3.4500e+02, 3.4600e+02, 5.4000e+01, 3.0000e+00]])

In [10]:
data_arr1.dtype

dtype('float64')

In [11]:
data_arr1[0,0]

534.0

In [12]:
# checking that what happen if we use heterogeneous string of data 

In [13]:
data3="abbas  20  99\n akram  88  30\n wahid  21  90";
data_arr3=np.genfromtxt(StringIO(data3),delimiter=(6,3,4));

In [14]:
data_arr3                                # nan stand for "Not a Number";

array([[nan, 20., 99.],
       [nan,  8., nan],
       [nan,  2., nan]])

In [15]:
data_arr3.dtype

dtype('float64')

### Create array from file data

In [16]:
file_name="text_files\local_file.txt";
file_data=np.genfromtxt(file_name,delimiter=',');

In [17]:
file_data

array([[ 434.,  664.,   74., 7845.],
       [ 455., 6453., 3434., 3434.],
       [ 534.,  634.,  634.,  677.],
       [ 534.,  645.,  754.,  845.]])

### The autostrip argument

In [18]:
data=u"1, abc , 3\n 4, xxx, 4";
np.genfromtxt(StringIO(data),delimiter=',',dtype="|U5")

array([['1', ' abc ', ' 3'],
       ['4', ' xxx', ' 4']], dtype='<U5')

In [19]:
np.genfromtxt(StringIO(data),delimiter=',',dtype="|U5",autostrip=True)

array([['1', 'abc', '3'],
       ['4', 'xxx', '4']], dtype='<U5')

In [20]:
data=u"  abbas khan ,  23,  345";                                ## |U15 : U mean unicode and 5 mean number of character
data=StringIO(data);
np.genfromtxt(data,delimiter=',',dtype='|U15')

array(['abbas khan ', '  23', '  345'], dtype='<U15')

In [21]:
data=StringIO(u"   abbas , 43,   345")
np.genfromtxt(data,delimiter=',',dtype='|U15',autostrip=True)

array(['abbas', '43', '345'], dtype='<U15')

### The comment argument

In [22]:
data=u"""
#Starting of data
#here is the fisrt line
234,5455,6456
634,6754,6344   #this is the second line
34,634,63434
346654,34,634
# this line left empty
233,545,6434
#the end
"""

In [23]:
data_io=StringIO(data);
data_arr=np.genfromtxt(data_io,delimiter=',', dtype=np.int32, comments='#')

In [24]:
data_arr

array([[   234,   5455,   6456],
       [   634,   6754,   6344],
       [    34,    634,  63434],
       [346654,     34,    634],
       [   233,    545,   6434]])

In [25]:
data_arr.dtype

dtype('int32')

In [26]:
## Using a different character for comments

In [27]:
data=u"""
@Starting of data
@here is the fisrt line
234,5455,6456
634,6754,6344   @this is the second line
34,634,63434
346654,34,634
@this line left empty
233,545,6434
@the end
"""

In [28]:
data_io=StringIO(data);
data_arr=np.genfromtxt(data_io, delimiter=',', dtype=np.int32, comments='@')

In [29]:
data_arr

array([[   234,   5455,   6456],
       [   634,   6754,   6344],
       [    34,    634,  63434],
       [346654,     34,    634],
       [   233,    545,   6434]])

### the skip_header and skip_footer argument

In [30]:
data=u'\n'.join(str(i) for i in range(10));
data_io=StringIO(data);
data_arr=np.genfromtxt(data_io, skip_header=3, skip_footer=5)
data_arr

array([3., 4.])

### The usecols argument

- to select a few columns from data

In [31]:
data=u"12,34,5,6,45,34\n  545,34,534,645,34,64\n  34,654,234,654,34,54";
data_io=StringIO(data);
data_arr=np.genfromtxt(data_io, delimiter=',', usecols=(1,3,5));
data_arr

array([[ 34.,   6.,  34.],
       [ 34., 645.,  64.],
       [654., 654.,  54.]])

In [32]:
data=u"12,34,5,6,45,34\n  545,34,534,645,34,64\n  34,654,234,654,34,54";
data_io=StringIO(data);
data_arr=np.genfromtxt(data_io, delimiter=',', usecols=(-1,0,4));
data_arr

array([[ 34.,  12.,  45.],
       [ 64., 545.,  34.],
       [ 54.,  34.,  34.]])

In [33]:
data=u"12,34,5,6,45,34\n  545,34,534,645,34,64\n  34,654,234,654,34,54";
data_io=StringIO(data);
data_arr=np.genfromtxt(data_io, delimiter=',', names="a,b,c,d,e", usecols=('c','a','e'));
data_arr

array([(  5.,  12., 45.), (534., 545., 34.), (234.,  34., 34.)],
      dtype=[('c', '<f8'), ('a', '<f8'), ('e', '<f8')])

### Choosing the data type

- dtype=float,   default for genfromtxt, and will generate a 2D array
- dtype=(int,float,float)   1D array
- dtype="i4,f8,|U3"    1D arry
- A dictionary with two keys 'names' and 'formats'
- dtype=[('A',int),('B',float)]  like Field accress topic
- dtype=None      each column is determined iteratively from the data itself

#### Example 1

In [34]:
data=u"232-34535-2454\n543-346-346\n634-246-234";
data_io=StringIO(data);
data_arr=np.genfromtxt(data_io, delimiter='-', dtype=float);
data_arr

array([[  232., 34535.,  2454.],
       [  543.,   346.,   346.],
       [  634.,   246.,   234.]])

#### Example 2

In [35]:
data=u"232-34535-2454\n543-346-346\n634-246-234";
data_io=StringIO(data);
data_arr=np.genfromtxt(data_io, delimiter='-', dtype=(int,float,float));
data_arr

array([(232, 34535., 2454.), (543,   346.,  346.), (634,   246.,  234.)],
      dtype=[('f0', '<i4'), ('f1', '<f8'), ('f2', '<f8')])

In [36]:
data_arr.shape

(3,)

In [37]:
print(data_arr)

[(232, 34535., 2454.) (543,   346.,  346.) (634,   246.,  234.)]


In [38]:
data_arr[0]

(232, 34535., 2454.)

In [39]:
data_arr[0].dtype

dtype([('f0', '<i4'), ('f1', '<f8'), ('f2', '<f8')])

In [40]:
data_arr[0][1]

34535.0

#### Example 3

In [41]:
data=u"abbas khan,23,534\n akram,23,434\n usman,35,634";
data_io=StringIO(data);
data_arr=np.genfromtxt(data_io, delimiter=',', dtype="|U10,i4,f8");
data_arr

array([('abbas khan', 23, 534.), ('akram', 23, 434.), ('usman', 35, 634.)],
      dtype=[('f0', '<U10'), ('f1', '<i4'), ('f2', '<f8')])

#### Example 5      Giving Name to Each column

In [42]:
data=u"abbas khan,23,534\n akram,23,434\n usman,35,634";
data_io=StringIO(data);
data_arr=np.genfromtxt(data_io, delimiter=',', dtype=[("Name",'|U15'),("age",int),("marks",float)]);
data_arr

array([('abbas khan', 23, 534.), ('akram', 23, 434.), ('usman', 35, 634.)],
      dtype=[('Name', '<U15'), ('age', '<i4'), ('marks', '<f8')])

In [43]:
data_arr['Name']

array(['abbas khan', 'akram', 'usman'], dtype='<U15')

In [44]:
data_arr['age']

array([23, 23, 35])

In [45]:
data_arr['marks']

array([534., 434., 634.])

#### Example 6                    None

In [46]:
data=u"53.4,23,534\n344.4,23,434\n453.,35,634";
data_io=StringIO(data);
data_arr=np.genfromtxt(data_io, delimiter=',', dtype=None);
data_arr

array([( 53.4, 23, 534), (344.4, 23, 434), (453. , 35, 634)],
      dtype=[('f0', '<f8'), ('f1', '<i4'), ('f2', '<i4')])

### The names argument

- First approach to allocate name to each column

In [47]:
data=u"1,23,45\n 534,634,634\n 534,634,634";
data_io=StringIO(data);
data_arr=np.genfromtxt(data_io, delimiter=',', dtype=[(x,int) for x in "abc"]);
data_arr

array([(  1,  23,  45), (534, 634, 634), (534, 634, 634)],
      dtype=[('a', '<i4'), ('b', '<i4'), ('c', '<i4')])

In [48]:
data_arr['a']

array([  1, 534, 534])

- Second Possiblity to use the name keyword with a sequence of string or a comma-separated string

In [49]:
data=u"1,23,45\n 534,634,634\n 534,634,634";
data_io=StringIO(data);
data_arr=np.genfromtxt(data_io, delimiter=',', names="A,B,C");
data_arr

array([(  1.,  23.,  45.), (534., 634., 634.), (534., 634., 634.)],
      dtype=[('A', '<f8'), ('B', '<f8'), ('C', '<f8')])

In [50]:
data_arr['B']

array([ 23., 634., 634.])

- if we assign True to names argument the genfromtxt function will automatically assign names to each column from the first row of the file.

In [51]:
# here we will read data from text_files\emp_file.txt file to make clear this point

In [53]:
file_name='text_files\emp_file.txt';
data_type=(int,'|U8','|U10',int,'|U15',float,int,int)
emp_data=np.genfromtxt(file_name, dtype=data_type, comments='\\', names=True);
emp_data

array([(7369, 'SMITH', 'CLERK', 7902, '17-DEC-80',  800.,   -1, 20),
       (7499, 'ALLEN', 'SALESMAN', 7698, '20-FEB-81', 1600.,  300, 30),
       (7521, 'WARD', 'SALESMAN', 7698, '22-FEB-81', 1250.,  500, 30),
       (7566, 'JONES', 'MANAGER', 7839, '02-APR-81', 2975.,   -1, 20),
       (7654, 'MARTIN', 'SALESMAN', 7698, '28-SEP-81', 1250., 1400, 30),
       (7698, 'BLAKE', 'MANAGER', 7839, '01-MAY-81', 2850.,   -1, 30),
       (7782, 'CLARK', 'MANAGER', 7839, '09-JUN-81', 2450.,   -1, 10),
       (7788, 'SCOTT', 'ANALYST', 7566, '19-APR-87', 3000.,   -1, 20),
       (7839, 'KING', 'PRESIDENT', 7844, '17-NOV-81', 5000.,   -1, 10),
       (7844, 'TURNER', 'SALESMAN', 7698, '08-SEP-81', 1500.,    0, 30),
       (7876, 'ADAMS', 'CLERK', 7788, '23-MAY-87', 1100.,   -1, 20),
       (7900, 'JAMES', 'CLERK', 7698, '03-DEC-81',  950.,   -1, 30),
       (7902, 'FORD', 'ANALYST', 7566, '03-DEC-81', 3000.,   -1, 20),
       (7934, 'MILLER', 'CLERK', 7782, '23-JAN-82', 1300.,   -1, 10)],
      d

In [55]:
emp_data['ENAME']

array(['SMITH', 'ALLEN', 'WARD', 'JONES', 'MARTIN', 'BLAKE', 'CLARK',
       'SCOTT', 'KING', 'TURNER', 'ADAMS', 'JAMES', 'FORD', 'MILLER'],
      dtype='<U8')

### The defaultfmt argument

- If names=None but a structured dtype is expected, names are defined with the standard NumPy default of "f%i", yielding names like f0, f1 and so forth:

In [None]:
data="23,5423,634,534\n 34,634,45,645\n 53,34,63,23";
data_io=StringIO(data);
data_arr=np.genfromtxt(data_io, delimiter=',', dtype=(int,float,int,float))
data_arr

- We can overwrite this default with the defaultfmt argument, that takes any format string: 

In [None]:
data="23,5423,634,534\n 34,634,45,645\n 53,34,63,23";
data_io=StringIO(data);
data_arr=np.genfromtxt(data_io, delimiter=',', dtype=(int,float,int,float), defaultfmt="V_%02i")
data_arr

### The converters argument

- The converters argument in genfromtxt() allows you to specify functions that are used to convert specific columns of the input text file to desired data types or values during the loading process.

In [None]:
file_name='text_files\converter.txt';
name="A,B,P1,P2";
data_arr=np.genfromtxt(file_name, delimiter=',', names=name);
data_arr

In [None]:
# because the file contain data with percentage symble and the default dtype is float so in output array the data is missing.

In [None]:
# solution

In [None]:
convert=lambda x: float(x.strip(b'%'))/100.0;
file_name='text_files\converter.txt';
name="A,B,P1,P2";
data_arr=np.genfromtxt(file_name, delimiter=',', names=name, converters={x:convert for x in range(2,4)});
data_arr

In [30]:
convert=lambda x: float(x.strip(b'%'))/100.0;
file_name='text_files\converter.txt';
name="A,B,P1,P2";
data_arr=np.genfromtxt(file_name, delimiter=',', names=name, converters={'P1':convert,'P2':convert});
data_arr

array([(434., 534., 0.5 , 0.64), (653., 634., 0.2 , 0.6 ),
       (447., 854., 0.45, 0.88), (487., 585., 0.34, 0.74)],
      dtype=[('A', '<f8'), ('B', '<f8'), ('P1', '<f8'), ('P2', '<f8')])

### Using Missing an Filling values

In [None]:
# by default messing values are determined from the expected data type,    float:nan,  int:-1,   string:'???'

data=",,";
data_io=StringIO(data);
data_arr=np.genfromtxt(data_io, delimiter=',', dtype=(int,float,'|U5'));
data_arr

#### missing_values

In [None]:
# We can use our own format to racognize missing values

In [None]:
data=u"?, 5,???\n 4,??,89";
data_io=StringIO(data);
data_arr=np.genfromtxt(data_io, dtype=int, delimiter=',', missing_values=('?','??','???'))
data_arr

In [None]:
data=u"?, 5,???\n 4,??,89";
data_io=StringIO(data);
data_arr=np.genfromtxt(data_io, dtype="i4,f8,|U5", delimiter=',', missing_values=('?','??','???'))
data_arr

In [None]:
data=u"?,?,35443,7345,?????   \n34,745,???,????,789   \n345,45,???,????,345";
data_io=StringIO(data);
data_arr=np.genfromtxt(data_io, delimiter=',', dtype="i4,f8,|U7,i4,f8", missing_values=('?','??','???','????','?????'),autostrip=True);
print(data_arr)

#### filling_values

In [None]:
data=u"?, 5,???\n 4,??,89";
data_io=StringIO(data);
data_arr=np.genfromtxt(data_io, dtype=int, delimiter=',', missing_values=('?','??','???'), filling_values=-99)
data_arr

In [None]:
data=u"?, 5,???\n 4,??,89";
data_io=StringIO(data);
data_arr=np.genfromtxt(data_io, dtype=int, delimiter=',', missing_values=('?','??','???'), filling_values=(-9,-99,-999))
data_arr

In [None]:
data=u"?, 5,???\n 4,??,89";
data_io=StringIO(data);
data_arr=np.genfromtxt(data_io, dtype=int, delimiter=',', names=("first",'second','third'), missing_values={0:'?',1:'??',2:'???'}, filling_values={0:-9,1:-99,2:-999})
data_arr

In [None]:
data=u"?, 5,???\n 4,??,89";
data_io=StringIO(data);
data_arr=np.genfromtxt(data_io, dtype=int, delimiter=',', names=("first",'second','third'), missing_values={'second':'??',2:'???','first':'?'}, filling_values={'first':-9,'third':-999,'second':-99})
data_arr

In [None]:
data=u"?,?,35443,7345,?????   \n34,745,???,????,789   \n345,45,???,????,345";
data_io=StringIO(data);
data_arr=np.genfromtxt(data_io, delimiter=',', dtype="i4,f8,|U7,i4,f8", missing_values=('?','??','???','????','?????'), autostrip=True, filling_values={0:0,1:0.0,2:'?',3:0,4:0.0});
print(data_arr)

In [1]:
import numpy as np;

In [24]:
file_name='text_files\local_file.txt';
data=np.genfromtxt(file_name, delimiter=',');
data

array([[ 434.,  664.,   74., 7845.],
       [ 455., 6453., 3434., 3434.],
       [ 534.,  634.,  634.,  677.],
       [ 534.,  645.,  754.,  845.]])

In [6]:
np.genfromtxt(file_name, delimiter=',',  usecols=(0,2))

array([[ 434.,   74.],
       [ 455., 3434.],
       [ 534.,  634.],
       [ 534.,  754.]])

In [12]:
np.genfromtxt(file_name, delimiter=',', skip_footer=1, skip_header=1)

array([[ 455., 6453., 3434., 3434.],
       [ 534.,  634.,  634.,  677.]])

In [15]:
np.genfromtxt(file_name, delimiter=',', autostrip=True, comments='#')

array([[ 434.,  664.,   74., 7845.],
       [ 455., 6453., 3434., 3434.],
       [ 534.,  634.,  634.,  677.],
       [ 534.,  645.,  754.,  845.]])

In [16]:
np.genfromtxt(file_name, delimiter=',', dtype="e,f,d,g")

array([(434.,  664.,   74., 7845.), (455., 6453., 3434., 3434.),
       (534.,  634.,  634.,  677.), (534.,  645.,  754.,  845.)],
      dtype=[('f0', '<f2'), ('f1', '<f4'), ('f2', '<f8'), ('f3', '<f8')])

In [17]:
np.genfromtxt(file_name, delimiter=',', dtype=(np.half,np.single,np.double,np.longdouble))

array([(434.,  664.,   74., 7845.), (455., 6453., 3434., 3434.),
       (534.,  634.,  634.,  677.), (534.,  645.,  754.,  845.)],
      dtype=[('f0', '<f2'), ('f1', '<f4'), ('f2', '<f8'), ('f3', '<f8')])

In [18]:
np.genfromtxt(file_name, delimiter=',', dtype=[('a','b'),('b','h'),('c','i'),('d','q')])

array([(-78,  664,   74, 7845), (-57, 6453, 3434, 3434),
       ( 22,  634,  634,  677), ( 22,  645,  754,  845)],
      dtype=[('a', 'i1'), ('b', '<i2'), ('c', '<i4'), ('d', '<i8')])

In [19]:
name='A,B,C,D';
np.genfromtxt(file_name, delimiter=',', dtype="b,h,i,q", names=name)

array([(-78,  664,   74, 7845), (-57, 6453, 3434, 3434),
       ( 22,  634,  634,  677), ( 22,  645,  754,  845)],
      dtype=[('A', 'i1'), ('B', '<i2'), ('C', '<i4'), ('D', '<i8')])

In [22]:
np.genfromtxt(file_name, delimiter=',', dtype="e,f,d,g", names=True)

array([(455., 6453., 3434., 3434.), (534.,  634.,  634.,  677.),
       (534.,  645.,  754.,  845.)],
      dtype=[('434', '<f2'), ('664', '<f4'), ('74', '<f8'), ('7845', '<f8')])

In [23]:
data[0]

array([ 434.,  664.,   74., 7845.])