In [1]:
import numpy as np
import pandas as pd
import scipy as sp
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

from sklearn.neural_network import MLPClassifier
from sklearn import model_selection
from sklearn import metrics
from sklearn.decomposition import PCA
#from imblearn.over_sampling import SMOTE

pd.set_option('display.max_columns', 100)

In [2]:
def load_files(folder, *files):
    data = []
    for file in files:
        df = pd.read_csv(folder + '/' +file +'.csv')
        data.append(df)
    data = pd.concat(data, axis=0)
    return data

def category2onehot(df, col):
    df = pd.concat([df, pd.get_dummies(df[col], prefix=col)],axis=1)
    df.drop([col],axis=1, inplace=True)
    return df

### 1. Load the Dataset
An ideal machine learning model must able to make prediction from unseen data .i.e. future observation. Based on that principle we decided to use house data 2005, 2007, 2009 and 2011 as the training set and data 2013 as the test set

In [None]:
raw_train = load_files('data', 'thads2005', 'thads2007', 'thads2009', 'thads2011')
raw_test = load_files('data', 'thads2013n')
raw_test.head()

### 2. Attributes Selection
Each intances in the dataset contains 99 of parameters but most of the parameters are not well described. Therefore, we only use parameters described by dataset author. The descriptions of those attibutes are documented at 'Variables.docx' file

In [None]:
attributes = [attr[0] for attr in pd.read_csv('data/variables.csv').values.tolist()]
data_train = raw_train[attributes]
data_test = raw_test[attributes]
data_test.head()

Unnamed: 0,CONTROL,AGE1,METRO3,REGION,LMED,FMR,IPOV,BEDRMS,BUILT,STATUS,TYPE,VALUE,NUNITS,ROOMS,PER,ZINC2,ZADEQ,ZSMHC,STRUCTURETYPE,OWNRENT,UTILITY,OTHERCOST,COST06,COST08,COST12,COSTMED,ASSISTED
0,'100003130103',82,'3','1',73738,956,11067,2,2006,'1',1,40000,1,6,1,18021,'1',533,1,'1',169.0,213.75,648.588189,696.905247,803.050535,615.156712,-9
1,'100006110249',50,'5','3',55846,1100,24218,4,1980,'1',1,130000,1,6,4,122961,'1',487,1,'1',245.333333,58.333333,1167.640781,1324.671218,1669.643405,1058.988479,-9
2,'100006370140',53,'5','3',55846,1100,15470,4,1985,'1',1,150000,1,7,2,27974,'1',1405,1,'1',159.0,37.5,1193.393209,1374.582175,1772.627006,1068.025168,-9
3,'100006520140',67,'5','3',55846,949,13964,3,1985,'1',1,200000,1,6,2,32220,'1',279,1,'1',179.0,70.666667,1578.857612,1820.4429,2351.169341,1411.700224,-9
4,'100007130148',26,'1','3',60991,737,15492,2,1980,'1',1,-6,100,4,2,96874,'1',759,5,'2',146.0,12.5,759.0,759.0,759.0,759.0,0


### [TO DO] Fixing some Data Formating Issue and Cleanning
This should be done during data cleaning

In [None]:
data_train['METRO3'] = data_train['METRO3'].replace(['\'2\'', '\'3\'', '\'4\'', '\'5\'', '\'9\''], 'O')
data_train['METRO3'] = data_train['METRO3'].replace(['\'1\''], 'C')

data_test['METRO3'] = data_test['METRO3'].replace(['\'2\'', '\'3\'', '\'4\'', '\'5\'', '\'9\''], 'O')
data_test['METRO3'] = data_test['METRO3'].replace(['\'1\''], 'C')

for col in data_train:
    print(col)
    x = data_train[col].unique() 
    print(max(x), min(x), x if len(x)<10 else str(x[:10]) + '(Numeric Data)')

### 3. Formating Dataset
We drop 'CONTROL' attribute since it only used as the identifier for each intances in the dataset. 'ASSISTED' attribute then separated from the rest attributes of so we can use it as prediction target or label. Then we separate categorical and numerical atributes.

In [None]:
# Pair of data and label
train_x = data_train.drop('CONTROL', axis=1)
train_y = data_train.pop('ASSISTED')

test_x = data_test.drop('CONTROL', axis=1)
test_y = data_test.pop('ASSISTED')

# Separate categorical and numeric attributes
train_categorical = train_x.select_dtypes(include='object')
train_numeric = train_x.select_dtypes(exclude='object')

test_categorical = test_x.select_dtypes(include='object')
test_numeric = test_x.select_dtypes(exclude='object')

### 4. Label and Catergorical Attributes Preprocessing
We transform label and catergorical attributes into one hot vector

In [None]:
#Trasnform label to onehot vector
train_y = pd.get_dummies(train_y)
test_y = pd.get_dummies(test_y)

#Transform categorical attributes to one hot vector
for col in train_categorical.columns.values:
    train_categorical = category2onehot(train_categorical, col)

for col in test_categorical.columns.values:
    test_categorical = category2onehot(test_categorical, col)

### 5. Numerical Attributes Preprocessing
We simply perform z-score normalization over nummerical attributes

In [None]:
for col in train_numeric:
    mean = train_numeric[col].mean()
    std = train_numeric[col].std(ddof=0)
    train_numeric[col] = (train_numeric[col] - mean)/(std+1E-10)
    test_numeric[col] = (test_numeric[col] - mean)/(std+1E-10)  

### 6. Combine Preprocessed Nummerical and Catergorical Attributes
We combine preprocessed nummerical and catergorical attributes by concatenating them then we convert the dataframe into numpy array

In [None]:
train_x = pd.concat([train_numeric, train_categorical], axis=1)
test_x = pd.concat([test_numeric, test_categorical], axis=1)
test_x.head()

train_x, train_y = np.array(train_x), np.array(train_y)
test_x, test_y = np.array(test_x), np.array(test_y)

### 7. PCA Whitening
Obtain principal components of training dataset using pca.fit then transfrom both training and test set

In [None]:
pca = PCA(n_components=8)
pca.fit(train_x)

train_x = pca.transform(train_x)
test_x = pca.transform(test_x)

### [TO DO] Perform Experiment of Oversampling and Undersaling over Training set

In [None]:
# oversample = SMOTE()
# train_x, train_y = oversample.fit_resample(train_x, train_y)

### 8. Build and Train Classifier
In this case we use MLP with Adam optimization strategy and weight decay of 1E-4

In [None]:
clf = MLPClassifier(solver='adam', alpha=1e-4, hidden_layer_sizes=(32, 8), random_state=1)
clf.fit(train_x, train_y)

MLPClassifier(hidden_layer_sizes=(32, 8), random_state=1)

### 9. Evaluate Model Performance

In [None]:
#Evaluate trained MLP on test set
out = clf.predict(test_x)
result = metrics.classification_report(out, test_y, digits=4)
print(result)

              precision    recall  f1-score   support

           0     0.9995    0.9995    0.9995     40292
           1     0.9341    0.9136    0.9237     18022
           2     0.7675    0.8151    0.7906      6231

   micro avg     0.9578    0.9577    0.9578     64545
   macro avg     0.9003    0.9094    0.9046     64545
weighted avg     0.9588    0.9577    0.9582     64545
 samples avg     0.9578    0.9577    0.9577     64545



In [None]:
#Just print the confussion matrix
metrics.confusion_matrix(np.argmax(out, 1), np.argmax(test_y, 1))

array([[40279,    25,     1],
       [   11, 16464,  1543],
       [    0,  1138,  5074]], dtype=int64)