In [40]:
import pandas as pd
import numpy as np
%matplotlib inline
from sklearn.metrics import f1_score
import warnings
warnings.filterwarnings('ignore')
import matplotlib
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler, SMOTE


plt.style.use('ggplot')


s=100
matplotlib.rcParams['font.size'] = 14
matplotlib.rcParams['figure.figsize'] = (18,8)

In [41]:
df = pd.read_csv('datasets/preprocessed_final.csv')

In [42]:
X = df.drop(columns=['STAT_CAUSE_CODE', 'STAT_CAUSE_DESCR'])
y = df['STAT_CAUSE_DESCR']

X_train, X_test, y_train, y_test = train_test_split(X, y)

## Baselining

#### Predicting the most common train class

In [43]:
rf= RandomForestClassifier(n_estimators=10, max_depth=2)
rf.fit(X_train,y_train)
print(classification_report(y_test,rf.predict(X_test)))

                   precision    recall  f1-score   support

            Arson       0.00      0.00      0.00       235
         Campfire       0.00      0.00      0.00        63
         Children       0.00      0.00      0.00        47
   Debris Burning       0.30      0.79      0.43       358
    Equipment Use       0.00      0.00      0.00       110
        Fireworks       0.00      0.00      0.00        11
        Lightning       0.53      0.72      0.61       246
    Miscellaneous       0.38      0.34      0.36       283
Missing/Undefined       1.00      0.19      0.32       142
        Powerline       0.00      0.00      0.00        11
         Railroad       0.00      0.00      0.00        18
          Smoking       0.00      0.00      0.00        42
        Structure       0.00      0.00      0.00         2

         accuracy                           0.37      1568
        macro avg       0.17      0.16      0.13      1568
     weighted avg       0.31      0.37      0.29      

### find best classifier for our problem

In [44]:
def find_classifier(X_train, y_train, X_test, y_test):
    names = [
        "Nearest Neighbors",
        "Random Forest",
        "Neural Net",
        "AdaBoost"
    ]

    classifiers = [
        KNeighborsClassifier(3),
        RandomForestClassifier(),
        MLPClassifier(alpha=1, max_iter=1000),
        AdaBoostClassifier(),
    ]

    for idx, clf in enumerate(classifiers):
        print(f"Starting Classifier {names[idx]}...")
        clf.fit(X_train, y_train)
        clf.predict(X_test)
        preds = clf.predict(X_test)
        print(names[idx] + " Evaluation:")
        print(classification_report(y_test, preds))



In [45]:
find_classifier(X_train, y_train, X_test, y_test)

Starting Classifier Nearest Neighbors...
Nearest Neighbors Evaluation:
                   precision    recall  f1-score   support

            Arson       0.29      0.47      0.36       235
         Campfire       0.08      0.08      0.08        63
         Children       0.05      0.06      0.06        47
   Debris Burning       0.38      0.44      0.41       358
    Equipment Use       0.14      0.12      0.13       110
        Fireworks       0.20      0.09      0.13        11
        Lightning       0.65      0.64      0.64       246
    Miscellaneous       0.41      0.24      0.30       283
Missing/Undefined       0.84      0.73      0.78       142
        Powerline       0.00      0.00      0.00        11
         Railroad       0.30      0.17      0.21        18
          Smoking       0.00      0.00      0.00        42
        Structure       0.00      0.00      0.00         2

         accuracy                           0.40      1568
        macro avg       0.26      0.23    

# Find best parameters using random search and cross validation

In [46]:
def get_random_search_parameters():

    # Number of trees in random forest
    n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
    # Number of features to consider at every split
    max_features = ['auto', 'sqrt']
    # Maximum number of levels in tree
    max_depth = np.linspace(10, 100, num = 5,dtype=int).tolist() + [None]
    # Minimum number of samples required to split a node
    min_samples_split = [2, 4,8, 20,40]
    # Minimum number of samples required at each leaf node
    min_samples_leaf = [2,4, 10,20]
    # Method of selecting samples for training each tree
    bootstrap = [True, False]


    # Create the random grid
    random_grid = {'n_estimators': n_estimators,
                   'max_features': max_features,
                   'max_depth': max_depth,
                   'min_samples_split': min_samples_split,
                   'min_samples_leaf': min_samples_leaf,
                   'bootstrap': bootstrap,
                   'random_state': [666]}
    return random_grid

def parameter_search_classifier(X_train, y_train):
    random_grid = get_random_search_parameters()
    # Use the random grid to search for best hyperparameters
    # First create the base model to tune
    rf = RandomForestClassifier()
    rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid, n_iter=100, cv=3, verbose=2,
                                   random_state=666, n_jobs=-1)
    # Fit the random search model
    rf_random.fit(X_train, y_train)

    best_params = rf_random.best_params_
    return best_params

# trying models with cross validation and PCA

In [47]:
# try neural network

# try alpha = 1e-5, 1e-4, ..., 1e-1
# try hidden_layer_sizes in [100], [100, 100], [100, 100, 100], [100, 100, 100, 100], [1000], [1000, 100]
def try_mlp(X, y, n_components=None):
    best_acc = 0
    best_h = None
    best_alpha = None
    for h in [[100], [100, 100], [100, 100, 100], [100, 100, 100, 100], [1000], [1000, 100]]:
        for alpha in [1e-5, 1e-4, 1e-3, 1e-2, 1e-1]:
            accs = []
            kf = KFold(n_splits=2)
            for i, (train_index, test_index) in enumerate(kf.split(X)):  # k fold cross validation
                X_train = X[train_index]
                y_train = y[train_index]
                X_test = X[test_index]
                y_test = y[test_index]
                if n_components:  # if we use dimensionality reduction
                    pca = PCA(n_components)
                    pca.fit(X_train)
                    X_train = pca.transform(X_train)
                    X_test = pca.transform(X_test)
                    
                mlp = MLPClassifier(h, alpha=alpha)
                mlp.fit(X_train, y_train)
                acc = (mlp.predict(X_test) == y_test).sum() / len(y_test)
                accs.append(acc)
            mean_acc = np.mean(accs)
            if mean_acc > best_acc:
                print(f"got mean acc: {mean_acc} with h={h} and alpha={alpha}, when PCA = {n_components is None}")
                best_acc = mean_acc
                best_h = h
                best_alpha = alpha
    
    return best_acc, best_alpha, best_h

In [48]:
# trying KNN
def try_KNN(X, y, n_components=None):
    best_acc = 0
    for num_neigh in [1,2,4,8,16]:
        for metric in ['l1', 'l2', 'cosine']:
            best_num_neigh, best_metric = 0, None
            accs = []
            kf = KFold(n_splits=2)
            for i, (train_index, test_index) in enumerate(kf.split(X)):  # k fold cross validation
                X_train = X[train_index]
                y_train = y[train_index]
                X_test = X[test_index]
                y_test = y[test_index]

                if n_components:  # if we use dimensionality reduction
                    pca = PCA(n_components)
                    pca.fit(X_train)
                    X_train = pca.transform(X_train)
                    X_test = pca.transform(X_test)

                neigh = KNeighborsClassifier(n_neighbors=num_neigh, metric=metric)
                neigh.fit(X_train, y_train)
                acc = (neigh.predict(X_test) == y_test).sum() / len(y_test)
                accs.append(acc)
            mean_acc = np.mean(accs)
            if mean_acc > best_acc:
                print(f"got mean acc: {mean_acc} with neighbors={num_neigh} and metric={metric}, when PCA = {n_components is None}")
                best_acc = mean_acc
                best_num_neigh = num_neigh
                best_metric = metric

    return best_acc, best_C, best_kernel

In [49]:
# trying SVM
def try_SVM(X, y, n_components=None):
    for c in [1e-3, 1e-2, 1e-1, 1, 10]:
        for kernel in ['linear', 'rbf']:
            best_C, best_kernel = 0, None
            accs = []
            kf = KFold(n_splits=2)
            for i, (train_index, test_index) in enumerate(kf.split(X)):  # k fold cross validation
                X_train = X[train_index]
                y_train = y[train_index]
                X_test = X[test_index]
                y_test = y[test_index]

                if n_components:  # if we use dimensionality reduction
                    pca = PCA(n_components)
                    pca.fit(X_train)
                    X_train = pca.transform(X_train)
                    X_test = pca.transform(X_test)

                svm = SVC(C=c, kernel=kernel,)
                svm.fit(X_train, y_train)
                acc = (svm.predict(X_test) == y_test).sum() / len(y_test)
                accs.append(acc)
            mean_acc = np.mean(accs)
            if mean_acc > best_acc:
                print(f"got mean acc: {mean_acc} with C={c} and kernel={kernel}, when PCA = {n_components is None}")
                best_acc = mean_acc
                best_C = c
                best_kernel = kernel

    return best_acc, best_C, best_kernel

In [50]:
# try max_depth
# try min_samples_split with max_depth=None
def try_random_forest(X, y, n_components=None):
    max_depth = [10, 20, 30]
    min_samples_split =[2, 4, 6, 8, 10]

    best_acc, best_depth, best_samples_split = 0, 0, 0
    for depth in max_depth:
        for samples_split in min_samples_split:
            accs = []
            kf = KFold(n_splits=2)
            for i, (train_index, test_index) in enumerate(kf.split(X)):  # k fold cross validation
                X_train = X[train_index]
                y_train = y[train_index]
                X_test = X[test_index]
                y_test = y[test_index]
                if n_components:  # if we use dimensionality reduction
                    pca = PCA(n_components)
                    pca.fit(X_train)
                    X_train = pca.transform(X_train)
                    X_test = pca.transform(X_test)
                    
                rf = RandomForestClassifier(max_depth=depth, min_samples_split=samples_split, n_jobs=-1, oob_score=True)
                rf.fit(X_train, y_train)
                acc = (rf.predict(X_test) == y_test).sum() / len(y_test)
                accs.append(acc)
            mean_acc = np.mean(accs)
            if mean_acc > best_acc:
                # Record the OOB error for each `n_estimators=i` setting.
                oob_error = 1 - rf.oob_score_
                print(f"got mean acc: {mean_acc} with max depth={depth},OOB={oob_error}  and min samples split={samples_split} ,when PCA = {n_components is None}")
                best_acc = mean_acc
                best_depth = depth
                best_sapmles_split = samples_split
    
    return best_acc, best_depth, best_sapmles_split

## Confusion Matrix Based Mixture of Experts


We will now make a confusion matrix of best classifier.
Then perform class clustering using confusion matrix as class similarity, learn a classifier for each cluster - 
use two level classification at test time: first classify into classes,
then use the classifier of the cluster of the predicted class

In [51]:
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import confusion_matrix
from sklearn.base import BaseEstimator, ClassifierMixin


class ConfusionMatrixMixtureOfExperts:
    def __init__(self, initial_model, initial_model_params, distance_threshold):
        self.initial_model = initial_model(**initial_model_params)
        self.clusters = None
        self.class2expert_classifier = {}
        self.distance_threshold = distance_threshold
        self.label2idx = {}
        self.idx2label = {}
        self.class2cluster = None
        self.cluster2classes = {}
    
    def get_labels(self, y_train):
        for idx, label in enumerate(set(y_train)):
            self.idx2label[idx] = label
            self.label2idx[label] = idx
            
    def compute_confusion_matrix(self, X, y):
        kf = KFold(n_splits=2)
        cms = []
        for i, (train_index, test_index) in enumerate(kf.split(X)):
            X_train = X[train_index]
            y_train = y[train_index]
            X_test = X[test_index]
            y_test = y[test_index]
            self.initial_model.fit(X_train, y_train)
            pred = self.initial_model.predict(X_test)
            cm = confusion_matrix(y_test, pred)
            cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
            cms.append(cm)
        cm = np.mean(cms, axis=0)
        return cm
    
    def choose_clf(self, X, y):
        X_train, X_test, y_train, y_test = train_test_split(X, y)
        best_clf = None
        best_f1 = 0
        # random forest:
        max_depth = [10, 20, 30, 40, 50]
        print("training random forest...")
        for depth in max_depth:
            rf = RandomForestClassifier(max_depth=depth, n_jobs=-1)
            rf.fit(X_train, y_train)
            f1 = f1_score(y_test, rf.predict(X_test), average='weighted')
            if f1 > best_f1:
                best_clf = rf
                best_f1 = f1
        # logistic regression
        print("training logistic regression...")
        for penalty in ['l2']:
            for c in [1e-4, 1e-3, 1e-2, 1e-1]:
                clf = LogisticRegression(penalty=penalty, C=c)
                clf.fit(X_train, y_train)
                acc = f1_score(y_test, clf.predict(X_test), average='weighted')
                if f1 > best_f1:
                    best_clf = clf
                    best_f1 = f1
        
        print(f"best with {best_f1}")
        return best_clf
    
    def fit(self, X, y):
        self.get_labels(y)
        y = np.array(list(map(self.label2idx.get, y)))
        self.initial_model.fit(X, y)
        y_pred = self.initial_model.predict(X)
        cm = confusion_matrix(y_pred, y)
        dists = 1 - cm
        cm = (cm + cm.transpose()) / 2
        self.class2cluster = AgglomerativeClustering(n_clusters=None, linkage='average', distance_threshold=self.distance_threshold,
                                                     affinity='precomputed').fit_predict(dists)
        # self.class2cluster[i] is cluster of class i
        for class_, cluster in enumerate(self.class2cluster):
            classes = self.cluster2classes.get(cluster, [])
            self.cluster2classes[cluster] = classes + [class_]
        # self.cluster2classes[c] is classes belonging to cluster c
        print(self.cluster2classes)
        print(f"found {len(set(self.class2cluster))} cluster")
        for cluster, classes in self.cluster2classes.items():
            if len(classes) > 1:
                # selecting examples with class value in clsuter
                X_cluster = X[[y_ in classes for y_ in y]]  
                y_cluster = y[[y_ in classes for y_ in y]]
                # training an expert for classes
                expert = self.choose_clf(X_cluster, y_cluster)
                print(f"fitting expert for cluster {cluster}")
                expert.fit(X_cluster, y_cluster)
                for y_ in classes:
                    self.class2expert_classifier[y_] = expert
            else:
                # the cluster contains a single class, no need for expert
                y_ = classes[0]
                self.class2expert_classifier[y_] = None
                
        return self

    def predict(self, X):
        y_pred_initial = self.initial_model.predict(X)
        y_pred_final = y_pred_initial.copy()
        
        for i, y_init in enumerate(y_pred_initial):
            # getting expert for predicted class, and predicting with expert
            expert = self.class2expert_classifier[y_init]
            if expert is not None:
                y = expert.predict([X[i]])[0]
                y_pred_final[i] = y
                
        y_pred_final = np.array(list(map(self.idx2label.get, y_pred_final)))
                
        return y_pred_final

In [52]:
X = df.drop(columns=['STAT_CAUSE_CODE', 'STAT_CAUSE_DESCR'])
y = df['STAT_CAUSE_DESCR']

X_train = np.array(X)
y_train = np.array(y)

ros = RandomOverSampler(random_state=0)
X_train, y_train = ros.fit_resample(X_train, y_train)

print(f"=== starting with clustering threshold: 0.9===")
cmmoe = ConfusionMatrixMixtureOfExperts(RandomForestClassifier, {"n_jobs":-1}, distance_threshold=0.9)
cmmoe.fit(X_train, y_train)

=== starting with clustering threshold: 0.9===
{12: [0], 7: [1], 11: [2], 10: [3], 9: [4], 8: [5], 3: [6], 6: [7], 5: [8], 4: [9], 1: [10], 2: [11], 0: [12]}
found 13 cluster


<__main__.ConfusionMatrixMixtureOfExperts at 0x167642fe4c0>

In [53]:
X = df.drop(columns=['STAT_CAUSE_CODE', 'STAT_CAUSE_DESCR'])
y = df['STAT_CAUSE_DESCR']

X_train, X_test, y_train, y_test = train_test_split(X, y)

X_train = np.array(X_train)
y_train = np.array(y_train)
X_test = np.array(X_test)
y_test = np.array(y_test)

ros = SMOTE(random_state=0)
X_train, y_train = ros.fit_resample(X_train, y_train)

for th in [0.85, 0.9, 0.95, 0.97]:
    cmmoe = ConfusionMatrixMixtureOfExperts(RandomForestClassifier, {"n_jobs":-1}, distance_threshold=th)
    cmmoe.fit(X_train, y_train)
    preds = cmmoe.predict(X_test)
    print(f"=== clustering threshold: {th} with score: {f1_score(y_test, preds, average='weighted')}===")

{12: [0], 7: [1], 11: [2], 10: [3], 9: [4], 8: [5], 3: [6], 6: [7], 5: [8], 4: [9], 1: [10], 2: [11], 0: [12]}
found 13 cluster
=== clustering threshold: 0.85 with score: 0.4662751085387782===
{12: [0], 7: [1], 11: [2], 10: [3], 9: [4], 8: [5], 3: [6], 6: [7], 5: [8], 4: [9], 1: [10], 2: [11], 0: [12]}
found 13 cluster
=== clustering threshold: 0.9 with score: 0.4647967189843515===
{12: [0], 7: [1], 11: [2], 10: [3], 9: [4], 8: [5], 3: [6], 6: [7], 5: [8], 4: [9], 1: [10], 2: [11], 0: [12]}
found 13 cluster
=== clustering threshold: 0.95 with score: 0.46166642552333725===
{12: [0], 7: [1], 11: [2], 10: [3], 9: [4], 8: [5], 3: [6], 6: [7], 5: [8], 4: [9], 1: [10], 2: [11], 0: [12]}
found 13 cluster
=== clustering threshold: 0.97 with score: 0.46440699810337477===


In [54]:
X = df.drop(columns=['STAT_CAUSE_CODE', 'STAT_CAUSE_DESCR'])
y = df['STAT_CAUSE_DESCR']

X_train, X_test, y_train, y_test = train_test_split(X, y)

X_train = np.array(X_train)
y_train = np.array(y_train)
X_test = np.array(X_test)
y_test = np.array(y_test)

for th in [0.85, 0.9, 0.95, 0.97]:
    cmmoe = ConfusionMatrixMixtureOfExperts(RandomForestClassifier, {"n_jobs":-1}, distance_threshold=th)
    cmmoe.fit(X_train, y_train)
    preds = cmmoe.predict(X_test)
    print(f"=== clustering threshold: {th} with score: {f1_score(y_test, preds, average='weighted')}===")

{12: [0], 7: [1], 11: [2], 10: [3], 9: [4], 8: [5], 3: [6], 6: [7], 5: [8], 4: [9], 1: [10], 2: [11], 0: [12]}
found 13 cluster
=== clustering threshold: 0.85 with score: 0.45710574678790195===
{12: [0], 7: [1], 11: [2], 10: [3], 9: [4], 8: [5], 3: [6], 6: [7], 5: [8], 4: [9], 1: [10], 2: [11], 0: [12]}
found 13 cluster
=== clustering threshold: 0.9 with score: 0.4575742626040208===
{12: [0], 7: [1], 11: [2], 10: [3], 9: [4], 8: [5], 3: [6], 6: [7], 5: [8], 4: [9], 1: [10], 2: [11], 0: [12]}
found 13 cluster
=== clustering threshold: 0.95 with score: 0.4620447394824607===
{12: [0], 7: [1], 11: [2], 10: [3], 9: [4], 8: [5], 3: [6], 6: [7], 5: [8], 4: [9], 1: [10], 2: [11], 0: [12]}
found 13 cluster
=== clustering threshold: 0.97 with score: 0.455982595159328===
