# Testing the random forest CGPM

In [1]:
import numpy as np
import pandas as pd

In [2]:
prng = np.random.RandomState(10)

#### Prepare the population schema

In [3]:
schema = [
    ('categorical', {'k': 79}),  # 0 Country_of_Operator
    ('categorical', {'k': 346}), # 1 Operator_Owner
    ('categorical', {'k': 18}),  # 2 Users
    ('categorical', {'k': 46}),  # 3 Purpose
    ('categorical', {'k': 4}),   # 4 Class_of_Orbit
    ('categorical', {'k': 7}),   # 5 Type_of_Orbit
    ('normal', None),            # 6 Perigee_km
    ('normal', None),            # 7 Apogee_km
    ('normal', None),            # 8 Eccentricity
    ('normal', None),            # 9 Period_minutes
    ('normal', None),            # 10 Launch_Mass_kg
    ('normal', None),            # 11 Dry_Mass_kg
    ('normal', None),            # 12 Power_watts
    ('normal', None),            # 13 Date_of_Launch
    ('normal', None),            # 14 Anticipated_Lifetime
    ('categorical', {'k': 282}), # 15 Contractor
    ('categorical', {'k': 54}),  # 16 Country_of_Contractor
    ('categorical', {'k': 25}),  # 17 Launch_Site
    ('categorical', {'k': 141}), # 18 Launch_Vehicle
    ('categorical', {'k': 38}),  # 19 Source_Used_for_Orbital_Data
    ('normal', None),            # 20 longitude_radians_of_geo
    ('normal', None),            # 21 Inclination_radians
]

#### Load observations from .csv file

In [4]:
from sklearn.preprocessing import Imputer

In [5]:
df = pd.read_csv('/tmp/satellites.coded.csv', index_col=False)
variables = range(len(df.columns))

In [6]:
target = 2
covariates = [v for v in variables if v != target]

In [7]:
X = (df.values)[:,covariates]
y = (df.values)[:,target]

In [8]:
def impute_column(X, i):
    stattype, _distargs = schema[i]
    strategy = 'mean' if stattype is 'normal' else 'most_frequent'
    imputer = Imputer(strategy=strategy)
    X_reshape = np.reshape(X[:,i], (-1,1))
    X_imputed = imputer.fit_transform(X_reshape)
    return np.ravel(X_imputed)

In [9]:
for i, _covariate in enumerate(covariates):
    X[:,i] = impute_column(X, i)

#### Run classification using sklearn forest.

In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [11]:
def get_predictive_accuracy():
    rowids = np.arange(len(y))
    train_rows = prng.choice(rowids, replace=False, size=800)
    test_rows = rowids[~np.isin(rowids, train_rows)]
    X_train = X[train_rows]
    y_train = y[train_rows]
    X_test = X[test_rows]
    y_test = y[test_rows]
    seed = prng.randint(2**32-1,)
    forest = RandomForestClassifier(random_state=seed)
    forest.fit(X_train, y_train)
    y_prediction = forest.predict(X_test)
    return accuracy_score(y_prediction, y_test)

In [12]:
accuracies0 = [get_predictive_accuracy() for _i in xrange(10)]

#### Run classification using cgpm forest.

In [13]:
from cgpm2.forest import RandomForest
from collections import Counter

In [14]:
def get_predictive_accuracy_cgpm():
    # Test set.
    rowids = np.arange(len(y))
    train_rows = prng.choice(rowids, replace=False, size=800)
    test_rows = rowids[~np.isin(rowids, train_rows)]
    X_train = X[train_rows]
    y_train = y[train_rows]
    X_test = X[test_rows]
    y_test = y[test_rows]
    # Build forest.
    levels = [s[1]['k'] if s[1] else None for (i, s) in enumerate(schema) if i != target]
    assert len(levels) == len(covariates)
    distargs = {'k':18, 'levels':levels}
    forest = RandomForest(outputs=[target], inputs=covariates, distargs=distargs, rng=prng)
    # Incorporate data.
    for rowid, (value_output, values_inputs) in enumerate(zip(y_train, X_train)):
        inputs = dict(zip(covariates, values_inputs))
        observation = {target: value_output}
        forest.incorporate(rowid, observation ,inputs)
    forest.transition()
    # Fetch predictions (rebuild for testing only).
    metadata = forest.to_metadata()
    forest = RandomForest.from_metadata(metadata, prng)
    y_prediction = []
    for (value_output, values_inputs) in zip(y_test, X_test):
        inputs = dict(zip(covariates, values_inputs))
        samples_dict = forest.simulate(None, forest.outputs, inputs=inputs, N=100)
        samples_list = [s[target] for s in samples_dict]
        counter = Counter(samples_list)
        prediction = counter.most_common(1)[0][0]
        y_prediction.append(prediction)
    return accuracy_score(y_prediction, y_test)

In [15]:
accuracies1 = [get_predictive_accuracy_cgpm() for _i in xrange(10)]

In [16]:
accuracies1

[0.8228882833787466,
 0.8119891008174387,
 0.784741144414169,
 0.7956403269754768,
 0.8010899182561307,
 0.7983651226158038,
 0.8147138964577657,
 0.8174386920980926,
 0.8147138964577657,
 0.782016348773842]