In [12]:
%matplotlib inline
import numpy as np
import pandas as pd
from collections import Counter
import sklearn.cluster
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.optimize import minimize
import ot
from functools import reduce
from Matcher import Matcher, train_parameters, model_parameters
from sklearn.model_selection import KFold
from utils import computeMAE, computeRMSE
from scipy import sparse
from fastFM import als
from surprise import Dataset
from surprise import Reader
from surprise import SVD, SVDpp, NMF, KNNWithMeans, KNNBasic, KNNWithZScore, NormalPredictor, SlopeOne, accuracy
from surprise.model_selection import cross_validate
import warnings
warnings.filterwarnings("ignore")

In [13]:
def preProcessing(year):
    # clean hhi dataset
    hhi = pd.read_csv("DHS data/csv/hhi" + str(year) + ".csv")
    hhi = hhi[['nohhold', 'nomem', 'geslacht', 'positie', 'oplmet']]
    hhi.columns = ['household_index', 'member_index', 'gender', 'relationship', 'education']
    hhi['gender'] = hhi['gender'].apply(lambda x: 'M' if x == 1 else 'F')
    hhi = hhi[(hhi['relationship'] == 1) | (hhi['relationship'] == 2)]
    hhi = hhi[hhi['education'] != 9]
    def transformEducation(x):
        if x in {'6', '7'}:
            return 'High'
        if x in {'3', '4', '5'}:
            return 'Middle'
        if x in {'1', '2'}:
            return 'Low'
        if x == '8':
            return 'Uneducated'
    hhi['education'] = hhi['education'].apply(transformEducation)
    hhi = hhi.groupby('household_index').filter(lambda x: len(x) > 1)
    hhi['index'] = 100 * hhi['household_index'] + hhi['member_index']
    hhi = hhi.set_index('index')
    hhi.dropna(inplace=True)
    
    # clean inc dataset
    inc = pd.read_csv("DHS data/csv/inc" + str(year) + ".csv")
    inc = inc[['nohhold', 'nomem', 'gez1', 'gez2', 'gez3']]
    inc.columns = ['household_index', 'member_index', 'height', 'weight', 'health']
    inc['index'] = 100 * inc['household_index'] + inc['member_index']
    inc = inc.set_index('index')
    inc.dropna(inplace=True)

    # clean psy dataset
    psy = pd.read_csv("DHS data/csv/psy" + str(year) + ".csv")
    psy = psy[['nohhold', 'nomem', 
               'con04', 'con06', 'con08',
               'con03', 'con10', 'con09',
               'con05'
               ]]
    psy.columns = ['household_index', 'member_index',
                   'irresponsible', 'accurate', 'ever-ready',
                   'disciplined', 'ordered', 'clumsy',
                   'detail-oriented']
    for feature in ['irresponsible', 'accurate', 'ever-ready',
                   'disciplined', 'ordered', 'clumsy',
                   'detail-oriented']:
        psy[feature] = pd.to_numeric(psy[feature], errors='coerce')
    psy['index'] = 100 * psy['household_index'] + psy['member_index']
    psy = psy.set_index('index')
    psy.dropna(inplace=True)
    
    # join three datasets and remove duplicate columns
    df = hhi.join(inc, rsuffix='_inc').join(psy, rsuffix='_psy')
    df.dropna(inplace=True)
    df = df.groupby('household_index').filter(lambda x: len(x) == 2)
    del df['household_index_inc'], df['household_index_psy'], df['member_index_inc'], df['member_index_psy']
    df.reset_index(inplace=True)
    del df['index']
    household_index = reduce(lambda x, y: x + y, [[i, i] for i in range((year - 2005)*1000, (year - 2005)*1000 + len(df) // 2)])
    df['household_index'] = household_index
    
    df.to_csv('DHS data/cleaned/' + str(year) + '.csv')

In [14]:
# clean all datasets (excluding 2008)
for year in range(2005, 2015):
    if year == 2008:
        continue
    preProcessing(year)

In [15]:
# merge dataset of different years into one
dfs = []
for year in range(2005, 2015):
    if year == 2008:
        continue
    df = pd.read_csv('DHS data/cleaned/' + str(year) + '.csv')
    del df['Unnamed: 0']
    dfs.append(df)
df = pd.concat(dfs)
df = df[(df['height'] > 100) & (df['height'] < 200)]
df = df[df['weight'] > 10]
df.to_csv('DHS data/cleaned/cleaned.csv')

In [16]:
df = pd.read_csv('DHS data/cleaned/cleaned.csv')
df = df.groupby('household_index').filter(lambda x: len(x)>1)
del df['Unnamed: 0']
len(df)

4950

In [17]:
def normailizeEducation(edu):
    if edu == 'High':
        return 1.0
    if edu == 'Middle':
        return 2.0 / 3
    if edu == 'Low':
        return 1.0 / 3
    if edu == 'Uneducated':
        return 0.0
df['education'] = df['education'].apply(normailizeEducation)
df['height'] = df['height'] / df['height'].max()
df['weight'] = df['weight'] /df['weight'].max()
df['health'] = (6.0 - df['health']) / 5.0

for feature in ['irresponsible', 'accurate', 'ever-ready',
                   'disciplined', 'ordered', 'clumsy',
                   'detail-oriented']:
    df[feature] = df[feature] / 5.0

df = df.groupby('household_index').filter(lambda group: len(group) == 2)
df = df.groupby('household_index').filter(lambda group: len(group[group.gender == 'M']) == 1)
df.to_csv('DHS data/normalized/normalized.csv')

In [18]:
def cluster(n_clusters):
    df['label'] = np.nan
    M = df[df['gender'] == 'M']
    F = df[df['gender'] == 'F']
    cluster_M = sklearn.cluster.KMeans(n_clusters)
    cluster_F = sklearn.cluster.KMeans(n_clusters)
    cluster_M.fit(M[['education', 'height', 'weight', 'health', 'irresponsible', 'accurate', 'ever-ready', 'disciplined', 'ordered', 'clumsy', 'detail-oriented']])
    cluster_F.fit(F[['education', 'height', 'weight', 'health', 'irresponsible', 'accurate', 'ever-ready', 'disciplined', 'ordered', 'clumsy', 'detail-oriented']])
    df.loc[df.gender == 'M', 'label'] = cluster_M.labels_
    df.loc[df.gender == 'F', 'label'] = cluster_F.labels_
    df['label'] = df['label'].astype(int)
    U0 = cluster_M.cluster_centers_.T
    V0 = cluster_F.cluster_centers_.T
    return U0, V0

n_clusters = 50
U0, V0 = cluster(n_clusters)

In [19]:
dataset = []
for i in range(len(df) // 2):
    if df.iloc[2*i]['gender'] == 'M' and df.iloc[2*i+1]['gender'] == 'F':
        dataset.append(list(df.iloc[2*i][4:]) + list(df.iloc[2*i+1][4:]))
    else:
        dataset.append(list(df.iloc[2*i+1][4:]) + list(df.iloc[2*i][4:]))
dataset = np.asarray(dataset)

In [20]:
def get_data(dataset, n_clusters):
    m = n_clusters
    pi_sample = np.zeros((m, m))

    for k in range(len(dataset)):
        i = int(dataset[k][11])
        j = int(dataset[k][23])
        pi_sample[i][j] += 1

    pi_sample /= len(dataset)
    
    rating = []
    for i in range(m):
        for j in range(m):
            rating.append([i, j, pi_sample[i][j]])
        
    X, Y = [], []
    for i in range(m):
        for j in range(m):
            X.append(np.append(U0[:,i].reshape(11), V0[:,j].reshape(11)))
            Y.append(pi_sample[i][j])
        
    return pi_sample, rating, sparse.csr_matrix(np.asarray(X)), np.asarray(Y)

In [36]:
n_fold = 5
kf = KFold(n_fold, random_state=3)

metric_Random = np.zeros(2)
metric_PMF = np.zeros(2)
metric_SVD = np.zeros(2)
metric_itemKNN = np.zeros(2)
metric_RiOT = np.zeros(2)
metric_FM = np.zeros(2)


for fold, (train_index, test_index) in enumerate(kf.split(dataset)):
    train_dataset, test_dataset = dataset[train_index], dataset[test_index] 
    train_pi_sample, train_rating, train_X, train_Y = get_data(train_dataset, n_clusters)
    test_pi_sample, test_rating, test_X, test_Y = get_data(test_dataset, n_clusters)
    
    labels = ['user', 'item', 'rating']
    reader = Reader(rating_scale=(0, 2))
    df = pd.DataFrame.from_records(train_rating, columns=labels)
    train_surprise = Dataset.load_from_df(df[['user', 'item', 'rating']], reader).build_full_trainset()
    df = pd.DataFrame.from_records(test_rating, columns=labels)
    test_surprise = Dataset.load_from_df(df[['user', 'item', 'rating']], reader).build_full_trainset().build_testset()

    # Random
    algo = NormalPredictor()
    algo.fit(train_surprise)
    predictions = algo.test(test_surprise)

    metric_Random[0] += accuracy.rmse(predictions, verbose=False)
    metric_Random[1] += accuracy.mae(predictions, verbose=False)

    # PMF
    algo = SVD(biased=False)
    algo.fit(train_surprise)
    predictions = algo.test(test_surprise)

    metric_PMF[0] += accuracy.rmse(predictions, verbose=False)
    metric_PMF[1] += accuracy.mae(predictions, verbose=False)

    # SVD
    algo = SVD()
    algo.fit(train_surprise)
    predictions = algo.test(test_surprise)

    metric_SVD[0] += accuracy.rmse(predictions, verbose=False)
    metric_SVD[1] += accuracy.mae(predictions, verbose=False)
    
    # ItemKNN
    algo = KNNWithMeans(k=50, sim_options = {'user_based': False}, verbose=False)
    algo.fit(train_surprise)
    predictions = algo.test(test_surprise)

    metric_itemKNN[0] += accuracy.rmse(predictions, verbose=False)
    metric_itemKNN[1] += accuracy.mae(predictions, verbose=False)   

    # RiOT
    model = Matcher(train_pi_sample, U0, V0, r=5)
    lam = 1
    model_param = model_parameters(A0=np.eye(11, 11),
                                          gamma=0.2,
                                          const=1,
                                          degree=2,
                                          lam=lam,
                                          lambda_mu=1,
                                          lambda_nu=1,
                                          delta=0.005)
    train_param = train_parameters(max_outer_iteration=20,
                                          max_inner_iteration=10,
                                          learning_rate=0.01)
    model.riot(model_param, train_param)
    test_r, test_c = test_pi_sample.sum(axis=1), test_pi_sample.sum(axis=0)
    pred_pi = ot.rot(model.C, test_r, test_c, lam)[0]
    error_OT = (pred_pi - test_pi_sample).reshape(1, n_clusters*n_clusters)
    metric_RiOT += np.array([computeRMSE(error_OT), computeMAE(error_OT)])
    
    # factorization machine
    fm = als.FMRegression(n_iter=1000, init_stdev=0.1, rank=15, l2_reg_w=0.1, l2_reg_V=0.5)
    fm.fit(train_X,train_Y)
    pred_Y = fm.predict(test_X)
    error_FM = pred_Y - test_Y
    metric_FM += np.array([computeRMSE(error_FM), computeMAE(error_FM)])
    
    metric_PMF /= n_fold
    metric_SVD /= n_fold
    metric_itemKNN /= n_fold
    metric_RiOT /= n_fold
    metric_FM /= n_fold
        
print(' model   | RMSE   |  MAE')
print(' Random  | {:.5f} |  {:.5f}'.format(metric_Random[0], metric_Random[1]))
print(' PMF     | {:.5f} |  {:.5f}'.format(metric_PMF[0], metric_PMF[1]))
print(' SVD     | {:.5f} |  {:.5f}'.format(metric_SVD[0], metric_SVD[1]))
print(' itemKNN | {:.5f} |  {:.5f}'.format(metric_itemKNN[0], metric_itemKNN[1]))
print(' RiOT    | {:.5f} |  {:.5f}'.format(metric_RiOT[0], metric_RiOT[1]))
print(' FM      | {:.5f} |  {:.5f}'.format(metric_FM[0], metric_FM[1]))

 model   | RMSE   |  MAE
 Random  | 0.00540 |  0.00362
 PMF     | 0.00715 |  0.00335
 SVD     | 0.01105 |  0.00626
 itemKNN | 0.00024 |  0.00016
 RiOT    | 0.00023 |  0.00015
 FM      | 0.00095 |  0.00076
