# Prueba Clasificación uso de suelo

In [11]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pylab
import matplotlib.pyplot as plt
from matplotlib.pylab import rcParams
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from scipy import stats
from scipy.ndimage import filters
import json
from IPython.display import clear_output
import map2geo2 as m2g
from sklearn.utils import shuffle
from sklearn.neural_network import MLPClassifier

rcParams['figure.figsize'] = 20, 15

### Performance functions:

1. Confusion matrix
2. MCC

In [60]:
def confusionMatrix(X,Y):
    c = np.zeros(4)
    tp = lambda x,y: 1 if y==1. and x==y else 0
    tn = lambda x,y: 1 if y==0. and x==y else 0
    fp = lambda x,y: 1 if y==1. and 0.==x else 0
    fn = lambda x,y: 1 if y==0. and 1.==x else 0
    
    t_tp = (np.sum(list(map(tp,X,Y))))
    t_tn = (np.sum(list(map(tn,X,Y))))
    t_fp = (np.sum(list(map(fp,X,Y))))
    t_fn = (np.sum(list(map(fn,X,Y))))
    
    return [t_tp,t_tn,t_fp,t_fn]


# Mathews Correlation Coefficient 
#https://en.wikipedia.org/wiki/Matthews_correlation_coefficient

mcc = lambda tp,tn,fp,fn: (tp*tn-fp*fn)/((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn*1.))**(0.5)

### Training Experiment

In [13]:
#the "raw" dataset
suelo = pd.read_csv('sampling_leon_full_wclasses.csv')

In [14]:
#filtering dataset, keeping just the complete rows
nonan = suelo[suelo["pendiente_n"].notna() & suelo["municipios"].notna() ]  

In [15]:
#extracting municipality codes
muncode = np.unique(nonan["municipios"])

In [21]:
#one hot encoding for qualitative data of municipalities codes
f = lambda x: (0,1) if x == muncode[0] else (1,0)
M = np.array(list(map(f , nonan["municipios"] )))

array([0, 0, 0, ..., 1, 1, 1])

In [22]:
#adding the municipality encode
nonan["mpio00"]=M[:,0]
nonan["mpio01"]=M[:,1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [24]:
#ordering columns 
nonan01 = nonan[[
    'mpio00',
    'mpio01',
    'presion_0_5_2000c',
    'd_urbano_2000c_n',
    'd_industrias_n',
    'd_aguac_n',
    'd_areas_naturales_n',
    'd_vialidades_n',
    'd_via_ferrea_n',
    'road_dens_perc_n',
    'pendiente_n',
    'tiempo_viaje_ciudad_n_n',
    'urbano_2000_exclusion2b']]

In [25]:
#removing presion_0_5_2000c to test MLNN
nonan01 = nonan[['mpio00','mpio01','d_urbano_2000c_n','d_industrias_n','d_aguac_n','d_areas_naturales_n','d_vialidades_n','d_via_ferrea_n','road_dens_perc_n','pendiente_n','tiempo_viaje_ciudad_n_n','urbano_2000_exclusion2b']]

In [26]:
#extract recent urbanized cells
leon_urbanizado = nonan01[nonan01["urbano_2000_exclusion2b"]==1]

#extract no urbanized yet cells
leon_n_urbanizado = nonan01[nonan01["urbano_2000_exclusion2b"]==0]

In [27]:
#randomize data sets
leon_urbanizado_shuffle = shuffle(leon_urbanizado)
leon_n_urbanizado_shuffle = shuffle(leon_n_urbanizado)

#number of urbanized cells
nu = len(leon_urbanizado_shuffle)

#number of not urbanized cells
nnu = len(leon_n_urbanizado_shuffle)

print("el " + str(100*1.*nu/nnu) + "son celdas urbanizadas")

el 7.525147545126556son celdas urbanizadas


In [28]:
#Balanceo Naive, tomar de entrenamiento mismo numero de elementos celdas urbanizadas y no urbanizadas

# 70% for training
p = 0.7

# 70% from the smalles classs
tp = int(nu*p)

#building the training set
leon_urbanizado_shuffle_training = leon_urbanizado_shuffle[:tp]
leon_n_urbanizado_shuffle_training = leon_n_urbanizado_shuffle[:tp]
train = np.array(leon_urbanizado_shuffle_training.append(leon_n_urbanizado_shuffle_training))

#the remaing data is for the test set
leon_urbanizado_shuffle_prueba = leon_urbanizado_shuffle[tp:]
leon_n_urbanizado_shuffle_prueba = leon_n_urbanizado_shuffle[tp:]
test = np.array(leon_urbanizado_shuffle_prueba.append(leon_n_urbanizado_shuffle_prueba))

In [30]:
#comprobacion
print("Testing if 70% + 30% = 100%")
print(len(leon_urbanizado))
print(len(leon_urbanizado_shuffle_training)+len(leon_urbanizado_shuffle_prueba))

print(len(leon_n_urbanizado))
print(len(leon_n_urbanizado_shuffle_training)+len(leon_n_urbanizado_shuffle_prueba))

Testing if 70% + 30% = 100%
510967
510967
6790126
6790126


### fromating vectors for Machine learning procedures

In [31]:
ln = len(leon_urbanizado_shuffle_training.columns)

In [32]:
train[0,:-1]

array([0.        , 1.        , 0.00492774, 0.49950254, 0.25892357,
       0.34089655, 0.        , 0.27066667, 0.81232494, 0.03719533,
       0.0321856 ])

In [33]:
print(len(leon_urbanizado_shuffle_training))
print(len(leon_n_urbanizado_shuffle_training))
print(len(leon_urbanizado_shuffle_prueba))
print(len(leon_n_urbanizado_shuffle_prueba))

357676
357676
153291
6432450


In [53]:
train[:,:-1]

array([[0.        , 1.        , 0.00492774, ..., 0.81232494, 0.03719533,
        0.0321856 ],
       [0.        , 1.        , 0.09203446, ..., 0.42016807, 0.03506739,
        0.07191313],
       [0.        , 1.        , 0.1844178 , ..., 0.34453782, 0.09007043,
        0.08297508],
       ...,
       [0.        , 1.        , 0.47155372, ..., 0.08963586, 0.51790424,
        0.32159157],
       [0.        , 1.        , 0.29254137, ..., 0.        , 0.46103385,
        0.16718716],
       [0.        , 1.        , 0.33520764, ..., 0.        , 0.03920704,
        0.04906113]])

In [54]:
#inputs
sc = -1


X = train[:,:sc]

#outputs
Y = train[:,sc]



### Training a Small Deep Neural Network

In [55]:
#initializing object
clf = MLPClassifier(activation = "relu", solver='adam', alpha=1e-5,hidden_layer_sizes=(15,15,15), random_state=1)

#fitting model
clf.fit(X, Y)

#prob_prediction
prediction_proba = (clf.predict_proba(test[:,:sc]))  

#class prediction
prediction = (clf.predict(test[:,:sc]))  

#count missmatches
missmatches = sum(((prediction - test[:,sc]) **2)**(0.5))
print("Missmatches in %:" +str(100*missmatches/len(prediction)))

#We order the indexes from major to minor probability
sortedix = np.flip(np.argsort(prediction_proba[:,1]))
      
#We calculate the total number of urbanized cells in the test set      
tot_urb = int(np.sum(test[:,sc]))
      
#Generating a zeros vector, with the total number of cells      
z = np.zeros(np.shape(test)[0]) 
      
#we put 1 just to the limted number of urbanized cells      
z[sortedix[:tot_urb]]=1

#the general missmatches      

missmatches = (np.abs(test[:,sc] - z))
print(100*(np.sum(missmatches) / len(missmatches)))



Missmatches in %:2.9015413755263073
0.881237206261224


In [61]:
V = confusionMatrix(test[:,sc],z)
coef = mcc(V[0],V[1],V[2],1.*V[3])
print("Confusion matrix values: ",V)
print("Mathews Correlation Coefficient: ", coef)
print("accuracy rate ",1.*V[0]/tot_urb)

Confusion matrix values:  [124273, 6403432, 29018, 29018]
Mathews Correlation Coefficient:  0.8061887212673022
accuracy rate  0.8106999106274994


In [507]:
V = confusionMatrix(test[:,sc],z)
coef = mcc(V[0],V[1],V[2],V[3])
print("Confusion matrix values: ",V)
print("Coefficient: ", coef)
print("accuracy rate ",1.*V[0]/tot_urb)

('Confusion matrix values: ', [126269, 6405428, 27022, 27022])
('Coefficient: ', 0.8195200091696547)
('accuracy rate ', 0.823720896856306)


(6585741, 12)

In [508]:
#initializing object
clf = MLPClassifier(activation = "logistic", solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(1), random_state=1)

#fitting model
clf.fit(X, Y)

#prob_prediction
prediction_proba = (clf.predict_proba(test[:,:12]))  

#class prediction
prediction = (clf.predict(test[:,:12]))  

#count missmatches
missmatches = sum(((prediction - test[:,12]) **2)**(0.5))
print("Missmatches in %:" +str(100*missmatches/len(prediction)))

#We order the indexes from major to minor probability
sortedix = np.flip(np.argsort(prediction_proba[:,1]))
      
#We calculate the total number of urbanized cells in the test set      
tot_urb = int(np.sum(test[:,12]))
      
#Generating a zeros vector, with the total number of cells      
z = np.zeros(np.shape(test)[0]) 
      
#we put 1 just to the limted number of urbanized cells      
z[sortedix[:tot_urb]]=1

#the general missmatches      

      
missmatches = (np.abs(test[:,12] - z))
print(100*(np.sum(missmatches) / len(missmatches)))
      
V = confusionMatrix(test[:,12],z)
coef = mcc(V[0],V[1],V[2],V[3])
print("Confusion matrix values: ",V)
print("Coefficient: ", coef)
print("accuracy rate ",1.*V[0]/tot_urb)

Missmatches in %:7.849883559040661
2.106156315591518
('Confusion matrix values: ', [83938, 6363097, 69353, 69353])
('Coefficient: ', 0.5367911774088914)
('accuracy rate ', 0.5475729168705273)


### Training a single neuron with logistic function activation to emulate the Futures suitability function

In [509]:
clf = MLPClassifier(activation = "logistic", solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(1), random_state=1)
clf.fit(X, Y)

prediction_proba = (clf.predict_proba(test[:,:12]))  
prediction = (clf.predict(test[:,:12]))  
missmatches = sum(((prediction - test[:,12]) **2)**(0.5))
100*missmatches/len(prediction)


sortedix = np.flip(np.argsort(prediction_proba[:,1]))
tot_urb = int(np.sum(test[:,12]))
z = np.zeros(np.shape(test)[0]) 
z[sortedix[:tot_urb]]=1
missmatches = np.sum(np.abs(test[:,12] - z)) 
print(100*(missmatches / len(z)))

t_tp = np.sum(map(tp,test[:,12],z))
t_tn = np.sum(map(tn,test[:,12],z))
t_fp = np.sum(map(fp,test[:,12],z))
t_fn = np.sum(map(fn,test[:,12],z))
print(t_tp)
print(t_tn)
print(t_fp)
print(t_fn)

2.106156315591518


TypeError: 'int' object is not callable

### Training two separated logistic models for each Municipality

In [526]:
test[st,2:]

array([[0.        , 0.45894147, 0.23789426, ..., 0.        , 0.06886076,
        1.        ],
       [5.721718  , 0.01740168, 0.70929491, ..., 0.23141739, 0.05612981,
        1.        ],
       [1.550933  , 0.02951808, 0.71721477, ..., 0.28312916, 0.04823277,
        1.        ],
       ...,
       [0.        , 0.42561508, 0.01017903, ..., 0.18250068, 0.0645022 ,
        0.        ],
       [0.        , 0.7652828 , 0.50965664, ..., 0.08662436, 0.46173612,
        0.        ],
       [0.        , 0.34772556, 0.35128638, ..., 0.17713602, 0.21814387,
        0.        ]])

In [528]:
mpio=1

#selecting mun 0 for training set
d = 2

s = X[:,0]==mpio

#selecting mun 0 for testing
st = test[:,0]==mpio
testm1 = test[st,:]

#training neural network
clf = MLPClassifier(activation = "logistic", solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(1), random_state=1)
clf.fit(X[s,2:], Y[s])

prediction_proba = (clf.predict_proba(testm1[:,2:12]))  
prediction = (clf.predict(testm1[:,2:12])) 


missmatches = sum(((prediction - testm1[:,12]) **2)**(0.5))
100*missmatches/len(prediction)

sortedix = np.flip(np.argsort(prediction_proba[:,1]))
tot_urb = int(np.sum(testm1[:,12]))
z = np.zeros(np.shape(testm1)[0]) 
z[sortedix[:tot_urb]]=1
missmatches = np.sum(np.abs(testm1[:,12] - z)) 

V = confusionMatrix(testm1[:,12],z)
coef = mcc(V[0],V[1],V[2],1.*V[3])
print("Confusion matrix values: ",V)
print("Coefficient: ", coef)
print("accuracy rate ",1.*V[0]/tot_urb)

('Confusion matrix values: ', [7861, 2136006, 12169, 12169])
('Coefficient: ', 0.3867964995377045)
('accuracy rate ', 0.3924613080379431)


In [511]:
mpio=1

s = X[:,0] == mpio
st = test[:,0] == mpio

st = test[:,0] == mpio
testm1 = test[st,:]

#training neural network
clf = MLPClassifier(activation = "logistic", solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(1), random_state=1)
clf.fit(X[s,:], Y[s])

prediction_proba = (clf.predict_proba(testm1[:,:12]))  
prediction = (clf.predict(testm1[:,:12])) 

missmatches = sum(((prediction - testm1[:,12]) **2)**(0.5))
100*missmatches/len(prediction)

sortedix = np.flip(np.argsort(prediction_proba[:,1]))
tot_urb = int(np.sum(testm1[:,12]))
z = np.zeros(np.shape(testm1)[0]) 
z[sortedix[:tot_urb]]=1
missmatches = np.sum(np.abs(testm1[:,12] - z)) 

V = confusionMatrix(testm1[:,12],z)
coef = mcc(V[0],V[1],V[2],1.*V[3])
print("Confusion matrix values: ",V)
print("Coefficient: ", coef)
print("accuracy rate over positive values",1.*V[0]/tot_urb)

('Confusion matrix values: ', [7727, 2135872, 12303, 12303])
('Coefficient: ', 0.38004415595466995)
('accuracy rate over positive values', 0.38577134298552174)


APPENDIX

In [469]:
len(test)
np.sum(V)

K  = int(len(z)*1)
print(np.sum(map(fp,test[:K,12],z[:K])))
print(np.sum(map(fn,test[:K,12],z[:K])))




25190
25190


In [529]:
# https://www.pythoncentral.io/measure-time-in-python-time-time-vs-time-clock/
#https://intellipaat.com/community/17925/shuffle-dataframe-rows

In [530]:
import statsmodels.api as sm

In [531]:
import statsmodels.formula.api as smf

In [532]:
data = sm.datasets.get_rdataset("dietox", "geepack").data

In [533]:
data

Unnamed: 0,Weight,Feed,Time,Pig,Evit,Cu,Litter
0,26.50000,,1,4601,1,1,1
1,27.59999,5.200005,2,4601,1,1,1
2,36.50000,17.600000,3,4601,1,1,1
3,40.29999,28.500000,4,4601,1,1,1
4,49.09998,45.200001,5,4601,1,1,1
5,55.39999,56.900002,6,4601,1,1,1
6,59.59998,71.700005,7,4601,1,1,1
7,67.00000,86.800001,8,4601,1,1,1
8,76.59998,104.900002,9,4601,1,1,1
9,86.50000,123.000000,10,4601,1,1,1


In [536]:
data

Unnamed: 0,Weight,Feed,Time,Pig,Evit,Cu,Litter
0,26.50000,,1,4601,1,1,1
1,27.59999,5.200005,2,4601,1,1,1
2,36.50000,17.600000,3,4601,1,1,1
3,40.29999,28.500000,4,4601,1,1,1
4,49.09998,45.200001,5,4601,1,1,1
5,55.39999,56.900002,6,4601,1,1,1
6,59.59998,71.700005,7,4601,1,1,1
7,67.00000,86.800001,8,4601,1,1,1
8,76.59998,104.900002,9,4601,1,1,1
9,86.50000,123.000000,10,4601,1,1,1


In [535]:
groups

0      4601
1      4601
2      4601
3      4601
4      4601
5      4601
6      4601
7      4601
8      4601
9      4601
10     4601
11     4601
12     4643
13     4643
14     4643
15     4643
16     4643
17     4643
18     4643
19     4643
20     4643
21     4643
22     4643
23     4643
24     4756
25     4756
26     4756
27     4756
28     4756
29     4756
       ... 
831    8270
832    8270
833    8270
834    8270
835    8270
836    8270
837    8439
838    8439
839    8439
840    8439
841    8439
842    8439
843    8439
844    8439
845    8439
846    8439
847    8439
848    8439
849    8442
850    8442
851    8442
852    8442
853    8442
854    8442
855    8442
856    8442
857    8442
858    8442
859    8442
860    8442
Name: Pig, Length: 861, dtype: int64