# Create deep features dataset

In [1]:
from keras.applications.vgg16 import VGG16
from keras.preprocessing import image
from keras.applications.vgg16 import preprocess_input
from keras.models import Model
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt

Using TensorFlow backend.


In [2]:
df_rwanda = pd.read_csv('./data/rwanda.csv')

In [3]:
df_rwanda.head()

Unnamed: 0,cluster,latitude,longitude,wealth_index,mean_light,median_light,std_light,min_light,max_light
0,1,-2.532818,29.684726,-0.531405,0.06,0.0,0.596992,0.0,6.0
1,2,-1.833858,30.310689,-0.40983,0.0,0.0,0.0,0.0,0.0
2,3,-1.888155,29.478298,-0.478115,0.0,0.0,0.0,0.0,0.0
3,4,-2.366763,30.521692,-0.43596,0.0,0.0,0.0,0.0,0.0
4,5,-2.171266,30.018541,-0.44948,0.0,0.0,0.0,0.0,0.0


In [6]:
base_model = VGG16(weights='imagenet')
model = Model(inputs=base_model.input, outputs=base_model.get_layer('fc2').output)

In [7]:
folder_path = './data/daytime_images/rwanda/'

In [8]:
images = []
for img in os.listdir(folder_path):
    img = os.path.join(folder_path, img)
    img = image.load_img(img, target_size=(224, 224))
    img = image.img_to_array(img)
    img = np.expand_dims(img, axis=0)
    images.append(img)
images = np.vstack(images)

In [9]:
images = np.vstack(images)

In [10]:
features = model.predict(images)

In [11]:
features.shape

(492, 4096)

In [12]:
features = list(features)

In [13]:
df_features = pd.DataFrame({'id': df_rwanda['cluster'].values,
                            'features': features})

In [14]:
df_features.head()

Unnamed: 0,features,id
0,"[0.0, 0.0, 0.0, 0.0, 1.3173025, 0.0, 1.180438,...",1
1,"[0.0, 0.0, 0.0, 1.4845816, 2.3637195, 0.0, 1.4...",2
2,"[0.0, 1.2253599, 0.0, 4.373325, 2.237329, 0.0,...",3
3,"[0.0, 0.0, 0.0, 0.0, 1.5691879, 0.0, 1.3579657...",4
4,"[0.0, 0.0, 0.0, 2.745697, 2.8115597, 0.0, 0.07...",5


In [15]:
df_merged = pd.merge(df_rwanda, df_features, how='inner', left_on='cluster', right_on='id')[['cluster', 'latitude',
                                                                                            'longitude',
                                                                                            'wealth_index',
                                                                                            'mean_light',
                                                                                            'median_light',
                                                                                            'std_light',
                                                                                            'min_light',
                                                                                            'max_light',
                                                                                            'features']]
df_merged.head()

Unnamed: 0,cluster,latitude,longitude,wealth_index,mean_light,median_light,std_light,min_light,max_light,features
0,1,-2.532818,29.684726,-0.531405,0.06,0.0,0.596992,0.0,6.0,"[0.0, 0.0, 0.0, 0.0, 1.3173025, 0.0, 1.180438,..."
1,2,-1.833858,30.310689,-0.40983,0.0,0.0,0.0,0.0,0.0,"[0.0, 0.0, 0.0, 1.4845816, 2.3637195, 0.0, 1.4..."
2,3,-1.888155,29.478298,-0.478115,0.0,0.0,0.0,0.0,0.0,"[0.0, 1.2253599, 0.0, 4.373325, 2.237329, 0.0,..."
3,4,-2.366763,30.521692,-0.43596,0.0,0.0,0.0,0.0,0.0,"[0.0, 0.0, 0.0, 0.0, 1.5691879, 0.0, 1.3579657..."
4,5,-2.171266,30.018541,-0.44948,0.0,0.0,0.0,0.0,0.0,"[0.0, 0.0, 0.0, 2.745697, 2.8115597, 0.0, 0.07..."


In [16]:
df_merged.to_pickle('./data/rwanda_deep.csv')

# Predict from deep features

In [204]:
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np

In [205]:
df_rwanda = pd.read_pickle('./data/rwanda_deep.csv')

X = np.array([df_rwanda['features'].values[i] for i in range(df_rwanda['features'].values.shape[0])])
X = np.concatenate([X, df_rwanda['mean_light'].values.reshape((-1, 1))], axis=1)
y = df_rwanda['wealth_index'].values.reshape((-1, 1))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.40, random_state=42)

In [206]:
from sklearn.linear_model import LinearRegression

estimator = LinearRegression()
estimator.fit(X_train, y_train)

print('MSE on train data :', round(((y_train - estimator.predict(X_train)) ** 2).mean()))
print('MSE on test data :', round(((y_test - estimator.predict(X_test)) ** 2).mean(), 2))
print('R2-score :', round(estimator.score(X_test, y_test), 2))

MSE on train data : 0.0
MSE on test data : 0.34
R2-score : 0.62


In [207]:
from sklearn.decomposition import PCA

pca = PCA(n_components=4)
pca.fit(X_train[:,:4096])

PCA(copy=True, iterated_power='auto', n_components=4, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [208]:
X_train_sparse = pca.transform(X_train[:, :4096])
X_test_sparse = pca.transform(X_test[:, :4096])

X_train_sparse = np.concatenate([X_train_sparse, X_train[:,4096:]], axis=1)
X_test_sparse = np.concatenate([X_test_sparse, X_test[:,4096:]], axis=1)

print(X_train_sparse.shape, X_train.shape)

(295, 5) (295, 4097)


In [184]:
estimator = LinearRegression()
estimator.fit(X_train_sparse, y_train)

print('MSE on train data :', round(((y_train - estimator.predict(X_train_sparse)) ** 2).mean(), 2))
print('MSE on test data :', round(((y_test - estimator.predict(X_test_sparse)) ** 2).mean(), 2))
print('R2-score :', round(estimator.score(X_test_sparse, y_test), 2))

MSE on train data : 0.12
MSE on test data : 0.27
R2-score : 0.7


In [185]:
from sklearn.linear_model import Ridge, RidgeCV

estimator = RidgeCV(alphas=np.logspace(0, 6, 1000))
estimator.fit(X_train_sparse, y_train)

print('alpha = ', estimator.alpha_)

print('MSE on train data :', round(((y_train - estimator.predict(X_train_sparse)) ** 2).mean(), 2))
print('MSE on test data :', round(((y_test - estimator.predict(X_test_sparse)) ** 2).mean(), 2))
print('R2-score :', round(estimator.score(X_test_sparse, y_test), 2))

alpha =  2840.883690183304
MSE on train data : 0.15
MSE on test data : 0.24
R2-score : 0.73


In [186]:
from sklearn.linear_model import Lasso, LassoCV

estimator = LassoCV(alphas=np.logspace(-6, 7, 100))
estimator.fit(X_train_sparse, y_train)

print('alpha = ', estimator.alpha_)

print('MSE on train data :', round(((y_train - estimator.predict(X_train_sparse)) ** 2).mean(), 2))
print('MSE on test data :', round(((y_test - estimator.predict(X_test_sparse)) ** 2).mean(), 2))
print('R2-score :', round(estimator.score(X_test_sparse, y_test), 2))

alpha =  0.24201282647943834
MSE on train data : 1.02
MSE on test data : 1.37
R2-score : 0.77


  y = column_or_1d(y, warn=True)


In [187]:
from sklearn.feature_selection import SelectFromModel

model = SelectFromModel(estimator, prefit=True)
X_new = model.transform(X_train_sparse)
X_new.shape

(295, 5)

In [227]:
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree=1)
X_train_pol = poly.fit_transform(X_train_sparse)
X_test_pol = poly.fit_transform(X_test_sparse)

In [228]:
from sklearn.linear_model import Ridge, RidgeCV

estimator = RidgeCV(alphas=np.logspace(0, 10, 1000))
estimator.fit(X_train_pol, y_train)

print('alpha = ', estimator.alpha_)

print('MSE on train data :', round(((y_train - estimator.predict(X_train_pol)) ** 2).mean(), 2))
print('MSE on test data :', round(((y_test - estimator.predict(X_test_pol)) ** 2).mean(), 2))
print('R2-score :', round(estimator.score(X_test_pol, y_test), 2))

alpha =  481.5957910192351
MSE on train data : 0.18
MSE on test data : 0.21
R2-score : 0.77
