# Random Forest Model with PCA
In this notebook we one-hot encode the categorical features and then apply PCA (a form of matrix factorisation) to each encoded feature to obtain a compact, meaningful numeric representation of the data.

In [3]:
import sys
sys.path.append('..')
import warnings
import pandas as pd
import numpy as np
from sklearn.feature_selection import RFE
from sklearn.model_selection import KFold
from sklearn import preprocessing
from sklearn.metrics import make_scorer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
%matplotlib notebook
plt.style.use('ggplot')

In [4]:
def eval_model(test_x, test_y, rfc):
    """Print the confusion matrix, accuracy and Cohen's kappa for a model.

    :param test_x: feature matrix handed to ``rfc.predict``.
    :param test_y: true labels the predictions are compared against.
    :param rfc: estimator exposing a ``predict`` method.
    :return: None; the three metrics are written to stdout.
    """
    from sklearn import metrics

    predictions = rfc.predict(test_x)

    # Scalar metrics are printed as "<label> <value>", one per line.
    scalar_metrics = (
        ('Acc: ', metrics.accuracy_score),
        ('Kappa: ', metrics.cohen_kappa_score),
    )

    print(metrics.confusion_matrix(test_y, predictions))
    for label, metric in scalar_metrics:
        print(label, metric(test_y, predictions))
    

In [5]:
def to_categorical(y, num_classes=None):
    """Converts a class vector (integers) to binary class matrix.

    E.g. for use with categorical_crossentropy.

    # Arguments
        y: class vector to be converted into a matrix
            (non-negative integers from 0 to num_classes - 1).
            Any array-like is accepted; it is flattened first.
        num_classes: total number of classes. When omitted (or falsy)
            it is inferred as `max(y) + 1`, or 0 when `y` is empty.

    # Returns
        A binary matrix representation of the input, of shape
        `(len(y), num_classes)`, with a single 1 per row.

    # Raises
        ValueError: if `y` contains a negative label (which would
            otherwise silently wrap around to the last columns).
        IndexError: if a label is >= `num_classes`.
    """
    y = np.array(y, dtype='int').ravel()
    n = y.shape[0]
    if n and y.min() < 0:
        raise ValueError('to_categorical expects non-negative class labels')
    if not num_classes:
        # np.max raises on an empty array, so an empty input maps to zero classes.
        num_classes = int(y.max()) + 1 if n else 0
    categorical = np.zeros((n, num_classes))
    # Row i gets its single 1 in column y[i].
    categorical[np.arange(n), y] = 1
    return categorical

def categorical_factorisation(data, test_data):
    '''
    Use matrix factorisation (PCA on the one-hot encoding) to arrive at a numerical
    representation of each categorical feature that captures most variance.

    For every object-dtype column of ``data`` with at most 100 distinct values, a
    LabelEncoder and a PCA are fitted on the training column; the resulting
    ``min(10, ceil(arity / 2))`` components are appended as columns named
    ``<feature>_<i>``. The same fitted encoder and PCA are then applied to
    ``test_data``. The original categorical columns are kept in both frames.

    :param data: training data; the encoders and PCA are fitted on it.
    :param test_data: test data; only transformed with the objects fitted on ``data``.
    :return: (data, test_data), each with the principal component representation of
             each categorical feature appended.
    :raises ValueError: from ``LabelEncoder.transform`` when ``test_data`` holds a
             category that does not occur in ``data``.
    '''
    from sklearn.decomposition import PCA

    # Collect the new column groups and concatenate once at the end.
    train_parts = [data]
    test_parts = [test_data]

    for feature in data.columns:
        if data[feature].dtype != 'O':
            continue

        le = preprocessing.LabelEncoder()
        le.fit(data[feature])
        arity = len(le.classes_)
        # Skip high-cardinality features before building their one-hot matrix.
        if arity > 100:
            continue

        num = le.transform(data[feature])
        one_hot = to_categorical(num, num_classes=arity)
        max_components = int(np.min([10, np.ceil(arity/2)]))
        pca = PCA(n_components=max_components)
        components = pca.fit_transform(one_hot)
        component_names = ['{f}_{i}'.format(f=feature, i=i) for i in range(max_components)]
        train_parts.append(
            pd.DataFrame(data=components, columns=component_names, index=data.index))

        # now do the same for the test data, but re-apply the components from training.
        # The one-hot width must be pinned to the training arity: inferring it from the
        # test labels yields too few columns whenever the last class is absent from the
        # test set, and pca.transform then fails on the shape mismatch.
        num = le.transform(test_data[feature])
        one_hot = to_categorical(num, num_classes=arity)
        components = pca.transform(one_hot)
        test_parts.append(
            pd.DataFrame(data=components, columns=component_names, index=test_data.index))

    # With no categorical feature processed, hand the inputs back untouched.
    if len(train_parts) > 1:
        data = pd.concat(train_parts, axis=1)
        test_data = pd.concat(test_parts, axis=1)
    return data, test_data

In [6]:
# Hand-picked feature subset, kept in its original order.
# NOTE(review): the loading cell in this notebook calls the pipeline with
# selected_features=None, so this list looks unused there - confirm intent.
selected_features = [
    "gps_height",
    "latitude",
    "longitude",
    "population",
    "amount_tsh",
    "age_at_measurement",
    "payment_type",
    "management_group",
    "quality_group",
    "region",
    "basin",
    "extraction_type_class",
    "quantity_group",
    "waterpoint_type_group",
    "source_type",
    "source_class",
]

In [7]:
from data_loading import data_loading_pipeline

# Load the train/test frames from ../data and append the PCA components of the
# categorical columns to both of them in one step.
# NOTE(review): selected_features=None presumably means "keep every feature" -
# confirm against data_loading.
train_df, test_df = categorical_factorisation(
    *data_loading_pipeline('../data', selected_features=None)
)

Label distribution in training set:  Counter({0: 23519, 2: 16750, 1: 2922})
Label distribution in testing set:  Counter({0: 7870, 2: 5518, 1: 1009})


In [8]:
# Model inputs: every non-object column of the factorised training frame except
# the target column 'status_group'.
all_numeric_features = [
    c for c in train_df.columns
    if train_df[c].dtype != 'O' and c != 'status_group'
]

train_x = train_df[all_numeric_features]
test_x = test_df[all_numeric_features]

# Series.as_matrix() was removed in pandas 1.0; .values returns the same
# NumPy array and is available in older pandas releases as well.
train_y = train_df.status_group.values
test_y = test_df.status_group.values

# Use random forests to reduce the feature set using recursive feature elimination (RFE).

In [None]:
# Seed NumPy's global RNG, as the model cells further down do, so that the
# forest used inside RFE - and therefore the selected feature set that all
# later cells depend on - is reproducible across runs.
np.random.seed(42)
algo = RandomForestClassifier(n_estimators=100)
selector = RFE(estimator=algo)

selector.fit(train_x, train_y)

# NOTE: with the default n_features_to_select, RFE keeps half of the features;
# n_features_ is that count, not a cross-validated optimum.
print("Optimal number of features : %d" % selector.n_features_)

In [None]:
# numpy is already imported in the first cell, so it is not re-imported here.
# Boolean mask over all_numeric_features: True where RFE kept the feature.
support_mask = np.asarray(selector.get_support(), dtype=bool)
feature_names = np.array(all_numeric_features)

print('Selected features: ')
print(feature_names[support_mask])
print('\n\nDiscarded features: ')
print(feature_names[~support_mask])

In [None]:
# Build the reduced matrices from train_df/test_df instead of from train_x/test_x
# themselves: the RFE mask has one entry per name in all_numeric_features, so
# re-running this cell on already reduced frames would fail on a length mismatch.
selected_columns = [
    name
    for name, keep in zip(all_numeric_features, selector.get_support())
    if keep
]
train_x = train_df[selected_columns]
test_x = test_df[selected_columns]

# Fit a large random forest classifier on selected features and evaluate it

In [None]:
# Baseline: a 1000-tree forest on the selected features. The global NumPy seed
# is fixed first because the classifier is created without a random_state.
np.random.seed(42)
initial_model = RandomForestClassifier(
    n_jobs=4,
    n_estimators=1000,
).fit(train_x, train_y)
eval_model(test_x, test_y, initial_model)

# Balancing for the minority class by automatically adjusting class weights

In [None]:
# Same forest as the baseline, but with class_weight='balanced'.
np.random.seed(42)
rfc = RandomForestClassifier(
    class_weight='balanced',
    n_jobs=4,
    n_estimators=1000,
).fit(train_x, train_y)
eval_model(test_x, test_y, rfc)

# Undersampling the majority class

In [None]:
from imblearn.under_sampling import ClusterCentroids

# Under-sample the training set with ClusterCentroids.
# fit_sample() is the old name of fit_resample(); the alias has been removed
# from current imbalanced-learn releases.
cc = ClusterCentroids(random_state=0)
X_resampled, y_resampled = cc.fit_resample(train_x, train_y)

np.random.seed(42)
rfc = RandomForestClassifier(n_estimators=1000, n_jobs=4)

# Train on the resampled data, evaluate on the untouched test set.
rfc.fit(X_resampled, y_resampled)
eval_model(test_x, test_y, rfc)

# Minority class over-sampling

In [None]:
from imblearn.over_sampling import SMOTE

# Over-sample the training set with SMOTE using 10 nearest neighbours.
# - `k` is the removed spelling of `k_neighbors`.
# - fit_sample() is the removed alias of fit_resample().
# - random_state is set because the resampling runs before np.random.seed(42)
#   below and was therefore unseeded.
# - n_jobs is left out: it is deprecated on SMOTE in recent imbalanced-learn
#   releases and only affected the neighbour-search speed.
smote = SMOTE(k_neighbors=10, random_state=42)
train_x_smote, train_y_smote = smote.fit_resample(train_x, train_y)

np.random.seed(42)
rfc = RandomForestClassifier(n_estimators=1000, n_jobs=4)

# Train on the resampled data, evaluate on the untouched test set.
rfc.fit(train_x_smote, train_y_smote)
eval_model(test_x, test_y, rfc)