#####Notebook for Data exploration

In [4]:
import numpy as np
import sklearn
import csv
from sklearn.feature_extraction import DictVectorizer

In [3]:
train_file = "WaterPump-training-values.csv"
train_labels = "WaterPump-training-labels.csv"
test_file = "WaterPump-test-values.csv"

def getData(lines=None, step=1, fileName="WaterPump-training-values.csv"):
    #method to import part or all of data from CSV into list
    #lines: how many lines to read; None means read to end of file
    #step: step size, so step=2 would read every 2nd line
    reader = csv.DictReader(open(fileName, 'rU'))
    result = []
    
    i=0
    #read to end of file, skipping lines if necessary
    if lines==None:
        for row in reader:
            if i%step==0:
                result.append(row)
            i += 1
    
    #read until specified number of lines are stored
    for row in reader:
        if i>=lines*step:
            break
        if i%step==0:
            result.append(row)
        i += 1
    
    return result

In [7]:
def intDate(date):
    #convert date string (2011-03-04) to int
    dates = date.split('-')
    #this is approximate formula, ignoring leap years and variance in month lengths
    return int(dates[0])*365 + int(dates[1])*30 + int(dates[2])

def quantData(data):
    #getData() reads everything as string, so this changes appropriate continuous variables to ints/floats
    #mutates given data
    #TODO: how to deal with dates? ('date_recorded')
    for row in data:
        row['longitude'] = float(row['longitude'])
        row['latitude'] = float(row['latitude'])
        row['gps_height'] = int(row['gps_height'])
        #these are categorical data and should not be converted
        #row['region_code'] = int(row['region_code'])
        #row['district_code'] = int(row['district_code'])
        row['amount_tsh'] = float(row['amount_tsh'])
        row['population'] = float(row['population'])
        row['construction_year'] = float(row['construction_year'])
        row['num_private'] = int(row['num_private'])  #what the heck is this??
        row['permit'] = row['permit'] == 'TRUE'
        row['public_meeting'] = row['public_meeting'] == 'TRUE'
        row['date_recorded'] = intDate(row['date_recorded'])
        
        #removing columns
        del row['recorded_by']  #only 1 unique value
        del row['quantity_group'] #redundant with 'quantity'
        #del row['source_type']
        del row['waterpoint_type_group']
        del row['payment_type']
        #need to confirm the following is redundant with 'region'
        #del row['region_code']
    return data

def vectorizeData(data):
    #vectorize data, data should be dictionary
    #http://nbviewer.ipython.org/gist/sarguido/7423289
    vec = DictVectorizer()
    vectData = vec.fit_transform(data).toarray()
    return vectData

In [8]:
rawData = getData(5)
print rawData[:1]

[{'extraction_type': 'gravity', 'region_code': '11', 'gps_height': '1390', 'recorded_by': 'GeoData Consultants Ltd', 'longitude': '34.93809275', 'construction_year': '1999', 'installer': 'Roman', 'id': '69572', 'scheme_management': 'VWC', 'scheme_name': 'Roman', 'management': 'vwc', 'quantity_group': 'enough', 'source_class': 'groundwater', 'source': 'spring', 'subvillage': 'Mnyusi B', 'public_meeting': 'True', 'num_private': '0', 'latitude': '-9.85632177', 'waterpoint_type_group': 'communal standpipe', 'basin': 'Lake Nyasa', 'extraction_type_class': 'gravity', 'waterpoint_type': 'communal standpipe', 'wpt_name': 'none', 'management_group': 'user-group', 'lga': 'Ludewa', 'source_type': 'spring', 'district_code': '5', 'ward': 'Mundindi', 'payment': 'pay annually', 'population': '109', 'date_recorded': '2011-03-14', 'extraction_type_group': 'gravity', 'region': 'Iringa', 'amount_tsh': '6000.0', 'quality_group': 'good', 'payment_type': 'annually', 'water_quality': 'soft', 'permit': 'False

In [9]:
cleanData = quantData(rawData)
print cleanData[:1]

[{'extraction_type': 'gravity', 'region_code': '11', 'gps_height': 1390, 'longitude': 34.93809275, 'construction_year': 1999.0, 'installer': 'Roman', 'id': '69572', 'scheme_management': 'VWC', 'scheme_name': 'Roman', 'management': 'vwc', 'source_class': 'groundwater', 'source': 'spring', 'subvillage': 'Mnyusi B', 'public_meeting': False, 'num_private': 0, 'latitude': -9.85632177, 'basin': 'Lake Nyasa', 'extraction_type_class': 'gravity', 'waterpoint_type': 'communal standpipe', 'wpt_name': 'none', 'management_group': 'user-group', 'lga': 'Ludewa', 'source_type': 'spring', 'district_code': '5', 'ward': 'Mundindi', 'payment': 'pay annually', 'population': 109.0, 'date_recorded': 734119, 'extraction_type_group': 'gravity', 'region': 'Iringa', 'amount_tsh': 6000.0, 'quality_group': 'good', 'water_quality': 'soft', 'permit': False, 'funder': 'Roman', 'quantity': 'enough'}]


In [10]:
vecData = vectorizeData(cleanData)
print vecData[:1]

[[  6.00000000e+03   1.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   1.99900000e+03   7.34119000e+05   0.00000000e+00
    0.00000000e+00   0.00000000e+00   1.00000000e+00   0.00000000e+00
    1.00000000e+00   0.00000000e+00   1.00000000e+00   0.00000000e+00
    1.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   1.00000000e+00   0.00000000e+00   1.39000000e+03
    0.00000000e+00   0.00000000e+00   0.00000000e+00   1.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   1.00000000e+00
    0.00000000e+00   0.00000000e+00  -9.85632177e+00   0.00000000e+00
    1.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    3.49380928e+01   0.00000000e+00   1.00000000e+00   0.00000000e+00
    0.00000000e+00   1.00000000e+00   0.00000000e+00   0.00000000e+00
    1.00000000e+00   0.00000000e+00   0.00000000e+00   1.09000000e+02
    0.00000000e+00   1.00000000e+00   0.00000000e+00   1.00000000e+00
    0.00000000e+00  