In [2]:
# Needed modules
import matplotlib.pyplot as plt
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
import scipy as sp

In [3]:
# Import data and determine the number of samples
train_data = pd.read_csv('train.csv')
train_labels = pd.read_csv('train_label.csv')
N = train_data.shape[0]

In [582]:
plt.close("all")

In [1855]:
# Plot all well locations, NOT colored
locations = np.array([train_data['longitude'],train_data['latitude']])
locations = locations.transpose()
plt.plot(locations[:,0],locations[:,1],'ko')

[<matplotlib.lines.Line2D at 0x7f8a44843990>]

In [2275]:
# Plot all well locations, WITH COLOR
# NB: Note the spatial clustering!
status_groups = ['functional', 'functional needs repair', 'non functional']
for i,c in enumerate(['green','red','green']):
    x=train_data[train_labels['status_group']==status_groups[i]]['longitude']
    y=train_data[train_labels['status_group']==status_groups[i]]['latitude']
    plt.plot(x,y,'o',color=c,alpha=0.3)
plt.gca().set_aspect('equal')
plt.gca().set_xlim((29,41))

(29, 41)

In [5]:
# Obtain locations and statuses for wells with valid GPS locations
# NB: 3.1% of the wells are missing location data
has_gps = train_data['longitude']>1
locations = np.array([train_data['longitude'].values,
                      train_data['latitude'].values],dtype=np.float)
locations = locations.transpose()
statuses = train_labels['status_group'].values
is_functional = (statuses != 'non functional')
Ngps = statuses.shape[0]; print Ngps

59400


In [5]:
statuses[0:20]

array(['functional', 'functional', 'functional', 'non functional',
       'functional', 'functional', 'non functional', 'non functional',
       'non functional', 'functional', 'functional', 'functional',
       'functional', 'functional', 'functional', 'functional',
       'non functional', 'non functional', 'functional needs repair',
       'functional'], dtype=object)

In [6]:
is_functional[0:20]

array([ True,  True,  True, False,  True,  True, False, False, False,
        True,  True,  True,  True,  True,  True,  True, False, False,
        True,  True], dtype=bool)

In [7]:
0.1/2**5

0.003125

In [165]:
scale = 0.004 # characteristic radius of a cluster in GPS coordinates
q = 6.0/12
def gaussian(distances,s=scale):
    return np.exp(-np.power(distances,2.0)/(2*np.power(s,2.0)))

def adaptive_gaussian(distances):
    low = 0.001
    high = 0.1
    t = (low+high)/2.0
    for i in xrange(8):
        if (np.sum(np.exp(-np.power(distances,2)/(2.0*np.power(t,2)))) > k*q):
            high = t
        else:
            low = t
        t = (low+high)/2.0
    return np.exp(-np.power(distances,2)/(2.0*np.power(t,2)))

def linear_distance(distances):
    return np.max(distances)-distances
def inverse_distance(distances):
    return 1.0/distances

In [82]:
# Split the dataset into three chunks: training, testing, and validation
# Fraction t goes into training, (1-t)/2 go to testing and validation
t = 0.9
r = (1-t)/2
random_indices = np.arange(Ngps,dtype=np.int)
np.random.shuffle(random_indices) # in-place
train_ind = random_indices[0:int(t*N)]
test_ind = random_indices[int(t*N):]
#val_ind = random_indices[int((t+r)*Ngps):Ngps]

In [83]:
# Create and "train" the kNN classifier
k = 40
scale = 0.01
q = 6.0/k
weights = 'distance'
algorithm = 'auto'
kNN_classifier = KNeighborsClassifier(leaf_size=30,
   n_neighbors=k,weights=weights,algorithm=algorithm)
kNN_classifier.fit(locations[train_ind,:],statuses[train_ind])

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_neighbors=40, p=2, weights='distance')

In [84]:
predictions = kNN_classifier.predict(locations[test_ind,:])
correct = np.sum(predictions == statuses[test_ind])
print correct*1.0/test_ind.shape[0]
for state in np.unique(statuses):
    print "\t%s: %f" %(state, np.sum((predictions==state)&(statuses[test_ind]==state))*1.0/np.sum(statuses[test_ind]==state))

#for state in ['functional', 'functional needs repair','non functional']:
#    print "\t%s: %f" % (state, np.sum((predictions==state)&(statuses[test_ind]==state))*1.0/np.sum(statuses[test_ind]==state))

0.692255892256
	functional: 0.819048
	functional needs repair: 0.219626
	non functional: 0.608806


In [22]:
statuses

array(['functional', 'functional', 'functional', ..., 'functional',
       'functional', 'functional'], dtype=object)

In [2554]:
s = 0.02
#X,Y=np.meshgrid(np.linspace(np.min(locations[:,0]),np.max(locations[:,0]),1000),
#            np.linspace(np.min(locations[:,1]),np.max(locations[:,1]),1000))
X,Y=np.meshgrid(np.linspace(33.,34.,1000),
            np.linspace(-9.,-8.,1000))
P = np.zeros(X.shape)
C = np.zeros(X.shape)

#xx,yy = np.meshgrid(np.linspace(35.0, 37,   nx),
#                    np.linspace(-7.0, -9,   ny))
#probs = myforest.predict_proba(np.array([xx.ravel(),yy.ravel()]).T)[:,0]
#probs = np.reshape(probs,(nx,ny))

for i in xrange(X.shape[0]):
    P[:,i] = kNN_classifier.predict_proba(np.array([X[:,i],Y[:,i]]).T)[:,1]
    g = kNN_classifier.kneighbors(np.array([X[:,i],Y[:,i]]).T)
    C[:,i] = np.sum(np.exp(-np.power(g[0],2)/(2.0*np.power(s,2)))/k,axis=1)

In [2555]:
plt.figure()
plt.contourf(X,Y,P,[0.0,0.25, 0.5,0.75,1.0],cmap=plt.get_cmap('Greys')); plt.gca().set_aspect("equal")
plt.contour(X,Y,C,np.linspace(0.0,np.max(C),10),linewidths=2.0,cmap=plt.get_cmap('Blues'))
# Plot all well locations, WITH COLOR
# NB: Note the spatial clustering!
status_groups = ['functional', 'functional needs repair', 'non functional']
for i,c in enumerate(['green','green','red']):
    x=train_data[train_labels['status_group']==status_groups[i]]['longitude'][train_ind]
    y=train_data[train_labels['status_group']==status_groups[i]]['latitude'][train_ind]
    plt.plot(x,y,'o',color=c)
plt.gca().set_aspect('equal')
plt.gca().set_xlim((29,41))
plt.colorbar()

<matplotlib.colorbar.Colorbar instance at 0x7f8a6326f248>

In [2418]:
np.sum(gaussian(kNN_classifier.kneighbors([33.4,-9.0])[0]))/k

1.1793319945616944e-07

In [2309]:
kNN_classifier.predict_proba(locations[0,:])[0][1]

1.0

In [2411]:
validated = kNN_classifier.predict(locations[val_ind,:])
correct = np.sum(validated==statuses[val_ind])
print correct*1.0/val_ind.shape[0]

0.0


In [2054]:
vor = sp.spatial.Voronoi(locations[::100])
sp.spatial.voronoi_plot_2d(vor)
plt.show()

array([ True,  True, False, ..., False, False, False], dtype=bool)

In [589]:
plt.gcf().tight_layout()

In [167]:
a = plt.gca()

In [168]:
a.set_aspect('equal')

In [226]:
funders = train_data['funder'].unique() # already unique...
for i,f in enumerate(funders):
    funders[i] = str(f).lower()
funders = funders.astype(str)
# ... using numpy.unique(funders) yields the same shape!

In [232]:
# find number of wells from each funder
funders = train_data['funder'].unique() # unique funders
frequency = np.zeros(funders.shape[0],np.int)
for i,funder in enumerate(funders):
    

In [235]:
funders[0]

'Roman'

In [244]:
tdgbf=train_data.groupby(by='funder')

In [251]:
tdgbfs = tdgbf.size()

In [255]:
tdgbfs.sort(ascending=False)

In [271]:
funder_frequencies = train_data.groupby('funder').size()
funder_frequencies.sort(ascending=False)
funder_frequencies.apply(lambda f: f/(N*1.0))

In [284]:
# Given a category (column in train_data), return a Pandas series
# that has the percentage of each well status for each item
# e.g. frequencies('source_class') returns the percentage of each well type
# for 'groundwater', 'surface', and 'unknown'
def frequencies(category):
    cat_frequencies = train_data.groupby(category).size()
    cat_frequencies.sort(ascending=False)
    cat_percents = cat_frequencies.apply(lambda f: (100.0*f)/(N*1.0))
    return cat_percents

In [285]:
train_data.columns

Index([u'id', u'amount_tsh', u'date_recorded', u'funder', u'gps_height', u'installer', u'longitude', u'latitude', u'wpt_name', u'num_private', u'basin', u'subvillage', u'region', u'region_code', u'district_code', u'lga', u'ward', u'population', u'public_meeting', u'recorded_by', u'scheme_management', u'scheme_name', u'permit', u'construction_year', u'extraction_type', u'extraction_type_group', u'extraction_type_class', u'management', u'management_group', u'payment', u'payment_type', u'water_quality', u'quality_group', u'quantity', u'quantity_group', u'source', u'source_type', u'source_class', u'waterpoint_type', u'waterpoint_type_group'], dtype='object')

In [458]:
asdf = frequencies('waterpoint_type')

In [476]:
frequencies('waterpoint_type_group')

waterpoint_type_group
communal standpipe       58.291246
hand pump                29.441077
other                    10.740741
improved spring           1.319865
cattle trough             0.195286
dam                       0.011785
dtype: float64

In [557]:
category = 'amount_tsh'
status_groups = ['functional', 'functional needs repair', 'non functional']
freq = frequencies(category)
plt.figure()
ratios = np.zeros((freq.shape[0],3),dtype=np.float)
rerrs = np.zeros(ratios.shape,dtype=np.float)
for i,label in enumerate(freq.axes[0]):
    a=train_labels[train_data[category]==label].groupby('status_group').size()
    missing = np.setdiff1d(status_groups,list(a.axes[0]))
    a = a.append(pd.Series(np.zeros(missing.shape),missing))
    ratios[i,:] = np.array([a['functional'],a['functional needs repair'],a['non functional']],dtype=np.float)/np.sum(a)
    rerrs[i,:] = np.sqrt(ratios[i,:]*np.sum(a))/np.sum(a)
for j,c in enumerate(['green','red','yellow']):
    plt.bar(np.arange(0,i+1)+0.3*j,ratios[:,j],width=0.3,color=c,yerr=rerrs[:,j],error_kw=dict(ecolor='black'))
plt.gca().xaxis.set_ticklabels(freq.axes[0])
plt.gca().set_xticks(np.arange(0,i+1)+0.5)
plt.gca().set_title(category).set_fontsize(18)

In [558]:
# Produces a bar chart that visually compares the percentage of each 
# well type for each label appearing in the given category (see frequencies)
# NB: Categories appear in descending order of how many times they occur
def category_bars(category):
    status_groups = ['functional', 'functional needs repair', 'non functional']
    freq = frequencies(category)
    plt.figure()
    ratios = np.zeros((freq.shape[0],3),dtype=np.float)
    rerrs = np.zeros(ratios.shape,dtype=np.float)
    for i,label in enumerate(freq.axes[0]):
        a=train_labels[train_data[category]==label].groupby('status_group').size()
        missing = np.setdiff1d(status_groups,list(a.axes[0]))
        a = a.append(pd.Series(np.zeros(missing.shape),missing))
        ratios[i,:] = np.array([a['functional'],a['functional needs repair'],a['non functional']],dtype=np.float)/np.sum(a)
        rerrs[i,:] = np.sqrt(ratios[i,:]*np.sum(a))/np.sum(a)
    for j,c in enumerate(['green','red','yellow']):
        plt.bar(np.arange(0,i+1)+0.3*j,ratios[:,j],width=0.3,color=c,yerr=rerrs[:,j],error_kw=dict(ecolor='black'))
    plt.gca().xaxis.set_ticklabels(freq.axes[0])
    plt.gca().set_xticks(np.arange(0,i+1)+0.5)
    plt.gca().set_title(category).set_fontsize(18)

In [565]:
# For a Python list of all categories
cats = []
for cat in train_data.columns:
    cats += [cat]

In [566]:
cats

['id',
 'amount_tsh',
 'date_recorded',
 'funder',
 'gps_height',
 'installer',
 'longitude',
 'latitude',
 'wpt_name',
 'num_private',
 'basin',
 'subvillage',
 'region',
 'region_code',
 'district_code',
 'lga',
 'ward',
 'population',
 'public_meeting',
 'recorded_by',
 'scheme_management',
 'scheme_name',
 'permit',
 'construction_year',
 'extraction_type',
 'extraction_type_group',
 'extraction_type_class',
 'management',
 'management_group',
 'payment',
 'payment_type',
 'water_quality',
 'quality_group',
 'quantity',
 'quantity_group',
 'source',
 'source_type',
 'source_class',
 'waterpoint_type',
 'waterpoint_type_group']

In [567]:
# A list of all feature columns that are unique, have more than one
# value, and have more than one well per value
# NB: Many of these are probably unsuitable for direct use
cats = [
 'funder',
 'installer',
 'basin',
 'region',
 'region_code',
 'district_code',
 'population',
 'public_meeting',
 'scheme_management',
 'scheme_name',
 'permit',
 'construction_year',
 'extraction_type',
 'extraction_type_group',
 'extraction_type_class',
 'management',
 'management_group',
 'payment',
 'payment_type',
 'water_quality',
 'quality_group',
 'quantity',
 'quantity_group',
 'source',
 'source_type',
 'source_class',
 'waterpoint_type',
 'waterpoint_type_group']

In [568]:
# Generate bar charts for each category
# Warning: Creates a lot of figures!
# Warning: Takes a minute or two... some charts have thousands of bars!
for cat in cats:
    category_bars(cat)



In [1827]:
train_labels.groupby('status_group').count()/(1.0*N)

Unnamed: 0_level_0,id
status_group,Unnamed: 1_level_1
functional,0.543081
functional needs repair,0.072677
non functional,0.384242


In [2287]:
x = np.linspace(0,1,5)
y = np.linspace(10,11,5)
X,Y = np.meshgrid(x,y)

In [2288]:
X

array([[ 0.  ,  0.25,  0.5 ,  0.75,  1.  ],
       [ 0.  ,  0.25,  0.5 ,  0.75,  1.  ],
       [ 0.  ,  0.25,  0.5 ,  0.75,  1.  ],
       [ 0.  ,  0.25,  0.5 ,  0.75,  1.  ],
       [ 0.  ,  0.25,  0.5 ,  0.75,  1.  ]])

In [2289]:
plt.figure(); plt.contour(X,Y,X+Y)

<matplotlib.contour.QuadContourSet instance at 0x7f8a41e3e680>

In [5]:
plt.figure()
splitby = 'basin'
plt.figure(); plt.gca().set_aspect('equal')
segments = train_data[splitby].unique()
has_gps = train_data['longitude']>0.1
for s in segments:
    mask = ((train_data[splitby]==s) & has_gps)
    x = train_data[mask]['longitude']
    y = train_data[mask]['latitude']
    plt.plot(x,y,'o',markeredgecolor='k',alpha=0.3,markersize=4)
    print "%s" %(s)
plt.legend(segments)
plt.set_cmap(plt.get_cmap('Greys'))

Lake Nyasa
Lake Victoria
Pangani
Ruvuma / Southern Coast
Internal
Lake Tanganyika
Wami / Ruvu
Rufiji
Lake Rukwa


In [2564]:
basins

array(['Lake Nyasa', 'Lake Victoria', 'Pangani', 'Ruvuma / Southern Coast',
       'Internal', 'Lake Tanganyika', 'Wami / Ruvu', 'Rufiji', 'Lake Rukwa'], dtype=object)