In [1]:
from csv import reader

In [2]:
def load_csv(filename):
    """Read a CSV file and return its non-empty rows.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list of rows, each row being a list of the string fields produced
        by ``csv.reader``. Rows that the reader yields as empty lists (blank
        lines) are skipped.

    Raises:
        OSError: If the file cannot be opened.
    """
    # newline='' hands line endings to the csv module untouched, as its
    # documentation requires; otherwise newlines embedded in quoted fields
    # are rewritten by universal-newline translation before parsing.
    with open(filename, 'r', newline='') as csv_file:
        return [row for row in reader(csv_file) if row]

def convert_col_to_float(dataset, col):
    """Replace the string at index ``col`` of every row with its float value.

    The rows of ``dataset`` are modified in place and nothing is returned.
    Surrounding whitespace is stripped from each field before parsing.
    """
    for record in dataset:
        text = record[col]
        record[col] = float(text.strip())

In [3]:
# Relative path to the CSV file that this notebook loads.
filename = 'data/pima-indians-diabetes.data.csv'
# Rows are loaded as lists of strings; they are converted to floats in a later cell.
dataset = load_csv(filename)

### Normalization

In [4]:
dataset[:4]

[['6', '148', '72', '35', '0', '33.6', '0.627', '50', '1'],
 ['1', '85', '66', '29', '0', '26.6', '0.351', '31', '0'],
 ['8', '183', '64', '0', '0', '23.3', '0.672', '32', '1'],
 ['1', '89', '66', '23', '94', '28.1', '0.167', '21', '0']]

In [5]:
# Convert every column to float in place; the first row determines the column count.
for col in range(len(dataset[0])):
    convert_col_to_float(dataset, col)

In [6]:
dataset[:4]

[[6.0, 148.0, 72.0, 35.0, 0.0, 33.6, 0.627, 50.0, 1.0],
 [1.0, 85.0, 66.0, 29.0, 0.0, 26.6, 0.351, 31.0, 0.0],
 [8.0, 183.0, 64.0, 0.0, 0.0, 23.3, 0.672, 32.0, 1.0],
 [1.0, 89.0, 66.0, 23.0, 94.0, 28.1, 0.167, 21.0, 0.0]]

In [7]:
def get_min_max(dataset):
    """Return a ``[minimum, maximum]`` pair for each column of ``dataset``.

    The number of columns is taken from the first row. The result holds one
    two-element list per column, in column order.
    """
    columns = (
        [record[idx] for record in dataset]
        for idx in range(len(dataset[0]))
    )
    return [[min(values), max(values)] for values in columns]

In [8]:
# Per-column [min, max] pairs, computed before the dataset is normalized.
min_max_values = get_min_max(dataset)
# Display the pairs for the first four columns only.
min_max_values[:4]

[[0.0, 17.0], [0.0, 199.0], [0.0, 122.0], [0.0, 99.0]]

In [9]:
def normalize(dataset, min_max_values):
    """Min-max rescale every column of ``dataset`` in place.

    Each value ``x`` in column ``c`` becomes ``(x - min_c) / (max_c - min_c)``,
    which lies in [0, 1] when ``min_max_values`` holds the true column minimum
    and maximum.

    Args:
        dataset: List of rows of numeric values; modified in place. The
            first row determines how many columns are processed.
        min_max_values: One ``[minimum, maximum]`` pair per column, in the
            layout produced by ``get_min_max``.

    Returns:
        None. An empty ``dataset`` is left untouched.
    """
    if not dataset:
        return

    for col in range(len(dataset[0])):
        minimum = min_max_values[col][0]
        span = min_max_values[col][1] - minimum

        for row in dataset:
            # A constant column has zero span; map it to 0.0 rather than
            # dividing by zero.
            row[col] = (row[col] - minimum) / span if span else 0.0

In [10]:
# Rescales `dataset` in place; re-running this cell without reloading the data
# would apply the rescaling again to already-normalized values.
normalize(dataset, min_max_values)

In [11]:
dataset[0]

[0.35294117647058826,
 0.7437185929648241,
 0.5901639344262295,
 0.35353535353535354,
 0.0,
 0.5007451564828614,
 0.23441502988898377,
 0.48333333333333334,
 1.0]

### Standardization

In [12]:
from math import sqrt

In [13]:
# Reload from disk: `dataset` was normalized in place earlier, so the raw
# values have to be read again before standardizing.
filename = 'data/pima-indians-diabetes.data.csv'
dataset = load_csv(filename)

# Convert every column from string to float in place.
for col in range(len(dataset[0])):
    convert_col_to_float(dataset, col)
    
dataset[0]

[6.0, 148.0, 72.0, 35.0, 0.0, 33.6, 0.627, 50.0, 1.0]

In [14]:
def get_col_means(dataset):
    """Return the arithmetic mean of each column of ``dataset``.

    The number of columns is taken from the first row; the result is a list
    with one mean per column, in column order.
    """
    n_rows = len(dataset)
    averages = []

    for idx in range(len(dataset[0])):
        total = sum(record[idx] for record in dataset)
        averages.append(total / n_rows)

    return averages

In [15]:
def get_col_std_devs(dataset, col_means):
    """Return the sample standard deviation of each column of ``dataset``.

    For every column, the squared deviations from the matching entry of
    ``col_means`` are summed, divided by ``len(dataset) - 1`` and square-rooted.
    The number of columns is taken from the first row.
    """
    # Divisor is n - 1 (sample, not population, standard deviation).
    degrees_of_freedom = float(len(dataset) - 1)
    deviations = []

    for idx in range(len(dataset[0])):
        centre = col_means[idx]
        squared_error = sum([(record[idx] - centre) ** 2 for record in dataset])
        deviations.append(sqrt(squared_error / degrees_of_freedom))

    return deviations

In [16]:
# The means are computed first because the standard deviations are measured around them.
col_means = get_col_means(dataset)
std_devs = get_col_std_devs(dataset, col_means)

In [17]:
def standardize(dataset, col_means, std_devs):
    """Standardize every column of ``dataset`` in place.

    Each value ``x`` in column ``c`` becomes ``(x - mean_c) / std_dev_c``.

    Args:
        dataset: List of rows of numeric values; modified in place. The
            first row determines how many columns are processed.
        col_means: One mean per column.
        std_devs: One standard deviation per column.

    Returns:
        None. An empty ``dataset`` is left untouched.
    """
    if not dataset:
        return

    for col in range(len(dataset[0])):
        mean = col_means[col]
        std_dev = std_devs[col]

        for row in dataset:
            # A column with zero spread cannot be scaled; map it to 0.0
            # rather than dividing by zero.
            row[col] = (row[col] - mean) / std_dev if std_dev else 0.0

In [18]:
# Standardizes `dataset` in place; re-running this cell without reloading the
# data would transform the already-standardized values again.
standardize(dataset, col_means, std_devs)

In [19]:
print(dataset[0])

[0.6395304921176576, 0.8477713205896718, 0.14954329852954296, 0.9066790623472505, -0.692439324724129, 0.2038799072674717, 0.468186870229798, 1.4250667195933604, 1.3650063669598067]
