# Importing CSV without using Pandas

We can use the csv library's reader function to reads each line in a CSV file, then manually converts the lines into a list of strings, each string split by comma by default.

In [59]:
from csv import reader

def load_csv(filename):
    file = open(filename, 'r')
    lines = reader(file)
    dataset = list(lines)
    
    return dataset

In [60]:
filename = 'data/pima-indians-diabetes.data.csv'
dataset = load_csv(filename)

In [61]:
print('Loaded dataset has {0} rows and {1} columns'.format(len(dataset), len(dataset[0])))

Loaded dataset has 768 rows and 9 columns


In [62]:
dataset[0]

['6', '148', '72', '35', '0', '33.6', '0.627', '50', '1']

### Convert String to Float

Normally, input CSV files for machine learning contain numeric values. We want to convert numeric values from their string representation in the CSV into their respective numeric type before we perform any other data processing. For this, we'll need a helper function to convert strings into float.

We'll also write a better version of load_csv() which uses 'with open()' to open and read the csv file and takes blank intermediate lines into consideration.

In [63]:
def load_csv(filename):
    dataset = list()
    
    with open(filename, 'r') as file:
        csv_reader = reader(file)
        
        for row in csv_reader:
            if not row:
                continue
            
            dataset.append(row)
    
    return dataset

In [64]:
def convert_col_to_float(dataset, col):
    for row in dataset:
        row[col] = float(row[col].strip())

In [65]:
filename = 'data/pima-indians-diabetes.data.csv'
dataset = load_csv(filename)

In [66]:
for col in range(len(dataset[0])):
    convert_col_to_float(dataset, col)

dataset[0]

[6.0, 148.0, 72.0, 35.0, 0.0, 33.6, 0.627, 50.0, 1.0]

### Convert columns to categorical

Categorical data are classifications, as in the classifications of flowers in the Iris Dataset. We can represent categorical strings into numeric categories by assigning a number for each unique category, such as the following:

```
Category          Number
Iris-setosa       0
Iris-virginica    1
...               ...
...               ...
```

In [67]:
filename = 'iris.data'
dataset = load_csv('data/iris.data')
dataset[:5]

[['5.1', '3.5', '1.4', '0.2', 'Iris-setosa'],
 ['4.9', '3.0', '1.4', '0.2', 'Iris-setosa'],
 ['4.7', '3.2', '1.3', '0.2', 'Iris-setosa'],
 ['4.6', '3.1', '1.5', '0.2', 'Iris-setosa'],
 ['5.0', '3.6', '1.4', '0.2', 'Iris-setosa']]

In [68]:
def convert_col_to_categorical(dataset, col):
    raw_cat_data = [row[col] for row in dataset]
    
    # set() removes the duplicates in the column, which leaves only the unique categories
    unique_cats = set(raw_cat_data)
    
    # a dictionary will map the string category to a unique number
    lookup = dict()
    
    for i, unique in enumerate(unique_cats):
        lookup[unique] = i
        
    for row in dataset:
        row[col] = lookup[row[col]]
        
    return lookup

In [69]:
# Convert first 4 columns to float
for col in range(4):
    convert_col_to_float(dataset, col)
    
dataset[:5]

[[5.1, 3.5, 1.4, 0.2, 'Iris-setosa'],
 [4.9, 3.0, 1.4, 0.2, 'Iris-setosa'],
 [4.7, 3.2, 1.3, 0.2, 'Iris-setosa'],
 [4.6, 3.1, 1.5, 0.2, 'Iris-setosa'],
 [5.0, 3.6, 1.4, 0.2, 'Iris-setosa']]

In [72]:
# Convert the 5th column into categorcal
lookup = convert_col_to_categorical(dataset, 4)

print(dataset[:5])

[[5.1, 3.5, 1.4, 0.2, 1], [4.9, 3.0, 1.4, 0.2, 1], [4.7, 3.2, 1.3, 0.2, 1], [4.6, 3.1, 1.5, 0.2, 1], [5.0, 3.6, 1.4, 0.2, 1]]
