# 2018/19 CeNAT Model

## Project Set-Up

In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive; the CSV paths used below live under
# this mount point, so this cell must run before the data is read.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Scikit-learn original paper: https://jmlr.csail.mit.edu/papers/v12/pedregosa11a.html
# Scikit-learn documentation: https://scikit-learn.org/stable/

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import train_test_split, cross_val_score, \
                                    RandomizedSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, \
                            classification_report, roc_curve, auc, \
                            roc_auc_score

# Seaborn: https://seaborn.pydata.org/index.html
# Scipy: https://docs.scipy.org/doc/scipy/
# MatplotLib: https://matplotlib.org/stable/index.html
# NumPy: https://numpy.org/doc/1.26/
# Pandas:  https://pandas.pydata.org/docs/
# GraphViz: https://graphviz.org/documentation/

import seaborn as sns
from scipy.stats import randint
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import graphviz

# read and display data
# Two candidate locations of cleaned_combined.csv on the mounted Drive;
# only the one passed to read_csv below is actually used.
sms_filepath = '/content/drive/MyDrive/CS 131 final project/training_data/cleaned_combined.csv'
hn_filepath = '/content/drive/MyDrive/cleaned_combined.csv'

# specify your filepath! (swap in whichever variable above matches your Drive)
df = pd.read_csv(hn_filepath)
# Last expression of the cell, so the first rows are rendered as a table.
df.head()

Unnamed: 0,B11,B2,B3,B4,B8,NDMI,NDVI,NDWI,SAVI,pineapple,latitude,longitude,year
0,0.3015,0.0438,0.07915,0.06085,0.3104,0.014545,0.672189,-0.593634,0.429641,0,9.728799,-85.211897,2018
1,0.0415,0.0062,0.0145,0.0087,0.1038,0.428768,0.845333,-0.754861,0.232898,0,10.017698,-83.27495,2018
2,0.0456,0.0126,0.019,0.0095,0.1473,0.527216,0.878827,-0.771497,0.314708,0,9.418162,-84.093854,2018
3,0.1271,0.02305,0.0453,0.0201,0.3301,0.444007,0.885208,-0.758657,0.54693,0,9.882322,-83.724646,2018
4,0.451,0.2554,0.3,0.3024,0.4976,0.049125,0.244,-0.247743,0.225231,0,10.006289,-83.30163,2018


## Collect Data
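Split the dataframe into band columns (`B*`), derived index features (NDMI, NDVI, NDWI, SAVI), coordinates, and the `pineapple` labels.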

In [None]:
# get basic bands data: every column whose name starts with 'B'
bands = [col for col in df.columns if col.startswith('B')]
X_bands = df[bands].to_numpy()

# get feature data (derived index columns)
feats = ['NDMI', 'NDVI', 'NDWI', 'SAVI']
X_feats = df[feats].to_numpy()

# get coords data
# NOTE: X_coords is extracted here but is not part of the combined matrix X.
coords = ['latitude', 'longitude']
X_coords = df[coords].to_numpy()

# combined data: band columns followed by index-feature columns, side by side
X = np.hstack((X_bands, X_feats))

# get labels (the 'pineapple' column) for every row, not just a test subset
y = df['pineapple'].to_numpy()

## Util Functions

In [None]:
"""
Analyze classifier predictions using confusion matrix, bar graph,
and ROC curve, and print out evaluation metrics.

Args:
    y_test: test data labels
    y_pred: predicted labels
    y_scores: probabilities for predicted labels
"""
def analyze_results(y_test, y_pred, y_scores):
  """
  Print a classification report, then draw the confusion matrix, the
  class-count bar graph, and the ROC curve for one set of predictions.

  Args:
      y_test: test data labels
      y_pred: predicted labels
      y_scores: probabilities for predicted labels
  """
  # text summary first (six decimal places per metric)
  report = classification_report(y_test, y_pred, digits=6)
  print(report)

  # the two label-based plots share the same inputs
  for plot_labels in (plt_confusion_matrix, plt_bar_graph):
    plot_labels(y_test, y_pred)

  # the ROC curve needs scores rather than hard predictions
  plt_ROC_curve(y_test, y_scores)


def plt_confusion_matrix(y_true, y_pred):
  """
  Draw the confusion matrix of predicted vs. true labels as an annotated
  heatmap.

  Args:
      y_true: true labels
      y_pred: predicted labels
  """
  # tick names in alphabetical order, used for both axes
  class_names = ['Non-pineapple', 'Pineapple']
  matrix = confusion_matrix(y_true, y_pred)

  plt.figure(figsize=(10, 7))
  sns.heatmap(matrix,
              annot=True,
              fmt='g',
              cmap='Reds',
              xticklabels=class_names,
              yticklabels=class_names)
  plt.xlabel('Predicted labels')
  plt.ylabel('True labels')
  plt.title('Confusion Matrix')
  plt.show()


def plt_bar_graph(y_true, y_pred):
  """
  Plot side-by-side bars comparing how many samples of each class appear
  in the true labels versus the predicted labels.

  Args:
      y_true: true labels
      y_pred: predicted labels
  """
  y_true = np.asarray(y_true)
  y_pred = np.asarray(y_pred)

  # Count over the union of classes so both bar series have exactly one
  # entry per tick, even when a class is missing from the predictions.
  # (Calling np.unique on each array separately yields arrays of different
  # lengths in that case, which ax.bar cannot plot against the same x.)
  classes = np.union1d(y_true, y_pred).tolist()
  counts_true = [np.count_nonzero(y_true == c) for c in classes]
  counts_pred = [np.count_nonzero(y_pred == c) for c in classes]

  # Explicit class -> name mapping. Iterating a set of strings has no
  # guaranteed order, so tick labels built from a set could be swapped.
  class_names = {0: 'Non-pineapple', 1: 'Pineapple'}
  tick_labels = [class_names.get(c, str(c)) for c in classes]

  x = np.arange(len(classes))  # the label locations
  width = 0.3                  # the width of the bars

  fig, ax = plt.subplots()
  ax.bar(x - width/2, counts_true, width, label='True', color='#279989')
  ax.bar(x + width/2, counts_pred, width, label='Predicted', color='#8C1515')

  # label and show the bar graph
  ax.set_ylabel('Counts')
  ax.set_title('Counts by Class and Type')
  ax.set_xticks(x)
  ax.set_xticklabels(tick_labels)
  ax.legend()
  fig.tight_layout()
  plt.show()


def plt_ROC_curve(y_test, y_scores):
  """
  Plot the ROC curve for the given scores, with its area under the curve
  shown in the legend and a dashed diagonal for reference.

  Args:
      y_test: test data labels
      y_scores: probabilities for predicted labels
  """
  # thresholds are returned by roc_curve but not needed for the plot
  false_pos, true_pos, _ = roc_curve(y_test, y_scores)
  area = auc(false_pos, true_pos)

  fig, ax = plt.subplots()
  ax.plot(false_pos, true_pos,
          color='#8C1515', lw=2,
          label='ROC curve (area = %0.2f)' % area)
  # dashed diagonal from (0, 0) to (1, 1)
  ax.plot([0, 1], [0, 1], color='#279989', lw=2, linestyle='--')

  ax.set_xlim([0.0, 1.0])
  ax.set_ylim([0.0, 1.0])
  ax.set_xlabel('False Positive Rate')
  ax.set_ylabel('True Positive Rate')
  ax.set_title('ROC Curve')
  ax.legend(loc="lower right")
  plt.show()

# Decision Tree Models

## Decision Tree, Random Forest, Gradient Boosting

### Tune Hyperparameters

In [None]:
"""
Return random hyperparameter distribution across pre-set ranges for
specified classifier.

Args:
    Classifier: decision tree estimator being considered

Returns:
  param_dist: distribution of parameters to test
"""
def tune_hyperparams(Classifier):
  """
  Look up the randomized-search parameter space for an estimator class.

  Args:
      Classifier: estimator class being considered

  Returns:
      Dict mapping parameter names to either a scipy `randint` distribution
      or a list of candidate values. An empty dict is returned for any class
      other than DecisionTreeClassifier, RandomForestClassifier, or
      GradientBoostingClassifier.
  """
  if Classifier == DecisionTreeClassifier:
    return dict(max_depth=randint(1, 20),
                min_samples_split=randint(2, 20),
                min_samples_leaf=randint(1, 20))

  if Classifier == RandomForestClassifier:
    return dict(n_estimators=randint(50, 500),
                max_depth=randint(1, 20),
                min_samples_split=randint(2, 20),
                min_samples_leaf=randint(1, 20),
                max_features=['sqrt', 'log2', None])

  if Classifier == GradientBoostingClassifier:
    return dict(n_estimators=randint(50, 500),
                max_depth=randint(1, 20),
                learning_rate=[0.01, 0.1, 0.2, 0.3])

  # no search space defined for any other estimator
  return {}

In [None]:
"""
Get tuned classifier with hard-coded parameters.

Args:
    Classifier: decision tree estimator being considered
    depth: max-depth for tree model

Returns:
  TunedClassifier: tuned classifier ready to be fit to data
"""
def get_tuned_classifier(Classifier, depth):
  """
  Build an unfitted estimator of the requested class using hard-coded
  hyperparameters and the given maximum depth.

  Args:
      Classifier: DecisionTreeClassifier, RandomForestClassifier, or
          GradientBoostingClassifier (the class itself, not an instance)
      depth: value passed as max_depth

  Returns:
      Estimator instance with random_state=0, ready to be fit to data.

  Raises:
      ValueError: if Classifier is not one of the three supported classes.
  """
  if Classifier == DecisionTreeClassifier:
    # Values follow the randomized-search output recorded in this notebook's
    # notes: {'min_samples_leaf': 11, 'min_samples_split': 2}. They were
    # previously transposed here (split=11, leaf=2).
    return DecisionTreeClassifier(max_depth=depth,
                                  min_samples_split=2,
                                  min_samples_leaf=11,
                                  random_state=0)

  if Classifier == RandomForestClassifier:
    return RandomForestClassifier(max_depth=depth,
                                  max_features='log2',
                                  min_samples_leaf=10,
                                  min_samples_split=6,
                                  n_estimators=434,
                                  random_state=0)

  if Classifier == GradientBoostingClassifier:
    return GradientBoostingClassifier(max_depth=depth,
                                      learning_rate=0.2,
                                      n_estimators=458,
                                      random_state=0)

  # Fail with a clear message instead of an UnboundLocalError on return.
  raise ValueError(
      f"Unsupported classifier {Classifier!r}; expected DecisionTreeClassifier, "
      "RandomForestClassifier, or GradientBoostingClassifier")

In [None]:
"""
Find best depth to fit decision tree to.

Args:
    X: feature data
    y: datapoint labels
    Classifier: decision tree estimator being considered

Returns:
    best_depth: depth of decision tree classifier that maximizes accuracy
"""
def tune_max_depth(X, y, Classifier, max_depths=range(1, 21), cv=5):
  """
  Pick the max_depth with the highest mean cross-validated accuracy, and
  plot the mean accuracy at every depth tried.

  Args:
      X: feature data
      y: datapoint labels
      Classifier: estimator class accepting max_depth and random_state
      max_depths: depths to evaluate (default 1 through 20)
      cv: cross-validation setting passed to cross_val_score (default 5)

  Returns:
      best_depth: depth with the highest mean accuracy; on ties the first
          (i.e. earliest listed) depth wins.

  Raises:
      ValueError: if max_depths is empty.
  """
  max_depths = list(max_depths)
  if not max_depths:
    raise ValueError("max_depths must contain at least one depth")

  cv_scores = []

  # score every candidate depth
  for depth in max_depths:
    clf = Classifier(max_depth=depth, random_state=0)
    # cross_val_score fits its own clones of clf on each fold, so fitting
    # clf on the full data first would only add a wasted training run.
    scores = cross_val_score(clf, X, y, cv=cv, scoring='accuracy')
    cv_scores.append(np.mean(scores))

  # plot performance across depths
  plot_depth_tune(max_depths, cv_scores)

  # find best depth
  best_depth = max_depths[int(np.argmax(cv_scores))]
  print(f"\nBest max depth: {best_depth}")

  return best_depth



"""
Plot performance across range of possible DT depths.

Args:
    max_depths: range of maximum depths considered
    cv_scores: cross validation scores (accuracies)
"""
def plot_depth_tune(max_depths, cv_scores):
  """
  Draw a line chart of cross-validation score against max depth, with one
  x-axis tick per depth.

  Args:
      max_depths: maximum depths considered
      cv_scores: cross-validation score for each depth, in the same order
  """
  fig, ax = plt.subplots(figsize=(10, 6))
  ax.plot(max_depths, cv_scores, marker='o', color='#8C1515')

  ax.set_xlabel('Max Depth')
  ax.set_ylabel('Average Cross-Validation Score')
  ax.set_title('Classifier Performance at Different Max Depths')

  # one tick per evaluated depth, over a background grid
  ax.set_xticks(max_depths)
  ax.grid(True)
  plt.show()

### Train and Test Classifiers

In [None]:
"""
Build, train, and test classifier derived from best model produced by
Randomized Search Cross Validation for tuning various hyperparameters.

Args:
    X: feature data
    y: datapoint labels
    Classifier: decision tree estimator being considered

Returns:
    y_test: test data labels
    y_pred: predicted labels
    y_scores: probabilities for predicted labels
"""
def decision_tree_classifier_param_tuning(X, y, Classifier):
  """
  Tune an estimator with RandomizedSearchCV on an 80% training split, then
  evaluate the best estimator found on the remaining 20%.

  Args:
      X: feature data
      y: datapoint labels
      Classifier: estimator class accepting random_state, with a search
          space defined in tune_hyperparams

  Returns:
      y_test: test data labels
      y_pred: predicted labels
      y_scores: predicted probability of the second class (column 1 of
          predict_proba) for each test sample
  """
  # 80-20 split data into train and test sets
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

  # tune hyperparameters and build classifier
  param_dist = tune_hyperparams(Classifier)
  # Seed the estimator as well: the search's random_state only fixes which
  # parameter settings are sampled, not the estimator's own randomness, so
  # an unseeded estimator makes runs non-reproducible.
  dt = RandomizedSearchCV(Classifier(random_state=0),
                          param_distributions=param_dist,
                          n_iter=10,
                          cv=10,
                          scoring='accuracy',
                          random_state=0)

  # fit classifier to data (search runs on the training split only)
  dt.fit(X_train, y_train)

  # best parameter set
  print("Best parameters found: ", dt.best_params_)
  print("Best accuracy found: ", dt.best_score_)
  best_dt = dt.best_estimator_

  # get probability of data point being pineapple
  y_scores = best_dt.predict_proba(X_test)[:, 1]

  # predict labels of test set
  y_pred = best_dt.predict(X_test)

  # display accuracy of classifier on the held-out test split
  accuracy = accuracy_score(y_test, y_pred)
  print(f"\nAccuracy: {accuracy}\n")

  return y_test, y_pred, y_scores

In [None]:
"""
Build, train, and test classifier derived from best model produced by
tuning max-depth parameter alone.

Args:
    X: feature data
    y: datapoint labels
    Classifier: decision tree estimator being considered

Returns:
    y_test: test data labels
    y_pred: predicted labels
    y_scores: probabilities for predicted labels
"""
def decision_tree_classifier_max_depth(X, y, Classifier):
  """
  Choose max_depth by cross-validation on an 80% training split, fit the
  hard-coded tuned classifier at that depth, and evaluate it on the
  remaining 20%.

  Args:
      X: feature data
      y: datapoint labels
      Classifier: estimator class supported by get_tuned_classifier

  Returns:
      y_test: test data labels
      y_pred: predicted labels
      y_scores: predicted probability of the second class (column 1 of
          predict_proba) for each test sample
  """
  # 80-20 split data into train and test sets
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

  # Find the best depth using the training split only. Tuning on the full
  # X, y would let the held-out test rows influence model selection and
  # inflate the accuracy reported below.
  best_depth = tune_max_depth(X_train, y_train, Classifier)
  dt = get_tuned_classifier(Classifier, best_depth)

  # fit classifier to data
  dt.fit(X_train, y_train)

  # get probability of data point being pineapple
  y_scores = dt.predict_proba(X_test)[:, 1]

  # predict labels of test set
  y_pred = dt.predict(X_test)

  # display accuracy of classifier on the held-out test split
  accuracy = accuracy_score(y_test, y_pred)
  print(f"\nAccuracy: {accuracy}\n")

  return y_test, y_pred, y_scores

**Runtimes on Complete Data (Max-Depth)**
- Decision Tree: 12s, max-depth of 10 for 85.3% accuracy
- Random Forest: 4m18s, max-depth of 19 for 87.9% accuracy
- Gradient Boosting: 21m6s, max-depth of 7 for 87.5% accuracy

**Runtimes on Complete Data (Generalized Tuning)**
- Decision Tree: 7s, 86.2% accuracy
  - best parameters found:  {'max_depth': 10, 'min_samples_leaf': 11, 'min_samples_split': 2}
- Random Forest: 12m22s, 88.1% accuracy
  - best parameters found:  {'max_depth': 15, 'max_features': 'log2', 'min_samples_leaf': 10, 'min_samples_split': 6, 'n_estimators': 434}
- Gradient Boosting: 45m44s, 87.35308% accuracy
  - best parameters found:  {'learning_rate': 0.2, 'max_depth': 7, 'n_estimators': 458}


In [None]:
# Display name -> estimator class for the tree-based models in this section.
sklearn_classifiers = {"Decision Tree": DecisionTreeClassifier,
                       "Random Forest": RandomForestClassifier,
                       "Gradient Boosting": GradientBoostingClassifier}

# change this to change model type! (must be one of the keys above)
classifier_type = "Decision Tree"
Classifier = sklearn_classifiers[classifier_type]

# Optional run on the band columns only. Uncomment the prints and ONE of the
# two pipeline calls (randomized search or max-depth tuning) to enable it.
# print("\n\n====================================================")
# print(f"Running {classifier_type} classifier on bands data...")
# print("====================================================\n")
# bands_test, bands_pred, bands_scores = decision_tree_classifier_param_tuning(X_bands, y, Classifier)
# bands_test, bands_pred, bands_scores = decision_tree_classifier_max_depth(X_bands, y, Classifier)

# Optional run on the index features only; same toggling as above.
# print("\n\n====================================================")
# print(f"Running {classifier_type} classifier on features data...")
# print("====================================================\n")
# feats_test, feats_pred, feats_scores = decision_tree_classifier_param_tuning(X_feats, y, Classifier)
# feats_test, feats_pred, feats_scores = decision_tree_classifier_max_depth(X_feats, y, Classifier)

# Active run: combined matrix X (bands + index features) through the
# max-depth pipeline. Swap which of the two calls below is commented out to
# use randomized search instead.
print("\n\n====================================================")
print(f"Running {classifier_type} classifier on complete data...")
print("====================================================\n")
# complete_test, complete_pred, complete_scores = decision_tree_classifier_param_tuning(X, y, Classifier)
complete_test, complete_pred, complete_scores = decision_tree_classifier_max_depth(X, y, Classifier)

### Analyze Results

In [None]:
# Enable these only if the matching bands/features runs were uncommented in
# the previous cell; otherwise bands_* / feats_* are not defined.
# analyze_results(bands_test, bands_pred, bands_scores)
# analyze_results(feats_test, feats_pred, feats_scores)

# Report and plots for the complete-data run produced by the previous cell.
print("\n\n====================================================")
print(f"Analyzing {classifier_type} classifier on complete data...")
print("====================================================\n")
analyze_results(complete_test, complete_pred, complete_scores)

# Non-Decision Tree Models

## Logistic Regression, K-Nearest Neighbors

### Train and Test Classifiers

In [None]:
"""
Build, train, and test non-tree based classifier (either KNN or
Logistic Regression).

Args:
    X: feature data
    y: datapoint labels
    Classifier: non-tree estimator class being considered (e.g. LogisticRegression or KNeighborsClassifier)

Returns:
    y_test: test data labels
    y_pred: predicted labels
    y_scores: probabilities for predicted labels
"""
def non_decision_tree_classifier(X, y, Classifier):
  """
  Fit a default-configured estimator on an 80% training split and evaluate
  it on the remaining 20%.

  Args:
      X: feature data
      y: datapoint labels
      Classifier: estimator class that can be constructed with no arguments
          and provides fit, predict, and predict_proba

  Returns:
      y_test: test data labels
      y_pred: predicted labels
      y_scores: predicted probability of the second class (column 1 of
          predict_proba) for each test sample
  """
  # fixed-seed 80/20 train/test partition
  split = train_test_split(X, y, test_size=0.2, random_state=0)
  X_train, X_test, y_train, y_test = split

  # instantiate the estimator with its default settings and train it
  model = Classifier()
  model.fit(X_train, y_train)

  # hard label predictions, then pineapple-class probabilities
  y_pred = model.predict(X_test)
  y_scores = model.predict_proba(X_test)[:, 1]

  # report accuracy on the held-out split
  accuracy = accuracy_score(y_test, y_pred)
  print(f"\nAccuracy: {accuracy}\n")

  return y_test, y_pred, y_scores

**Performance on Complete Data**

- Logistic Regression: 0s, 86.69488% accuracy
- K-Nearest Neighbors: 0s, 86.60085% accuracy

In [None]:
# Display name -> estimator class for the non-tree models in this section.
# NOTE: this cell rebinds sklearn_classifiers, classifier_type, Classifier and
# the complete_* results that the tree-model section also assigns, so any
# analysis cell run after this one reports on this model.
sklearn_classifiers = {"Logistic Regression": LogisticRegression,
                       "K-Nearest Neighbors": KNeighborsClassifier}

# change this to change model type! (must be one of the keys above)
classifier_type = "Logistic Regression"
Classifier = sklearn_classifiers[classifier_type]

# Optional run on the band columns only; uncomment to enable.
# print("\n\n====================================================")
# print(f"Running {classifier_type} classifier on bands data...")
# print("====================================================\n")
# bands_test, bands_pred, bands_scores = non_decision_tree_classifier(X_bands, y, Classifier)

# Optional run on the index features only; uncomment to enable.
# print("\n\n====================================================")
# print(f"Running {classifier_type} classifier on features data...")
# print("====================================================\n")
# feats_test, feats_pred, feats_scores = non_decision_tree_classifier(X_feats, y, Classifier)

# Active run: combined matrix X (bands + index features).
print("\n\n====================================================")
print(f"Running {classifier_type} classifier on complete data...")
print("====================================================\n")
complete_test, complete_pred, complete_scores = non_decision_tree_classifier(X, y, Classifier)

### Analyze Results

In [None]:
# Enable these only if the matching bands/features runs were uncommented in
# the previous cell; otherwise bands_* / feats_* may be undefined or stale.
# analyze_results(bands_test, bands_pred, bands_scores)
# analyze_results(feats_test, feats_pred, feats_scores)

# Report and plots for the complete-data run produced by the previous cell.
print("\n\n====================================================")
print(f"Analyzing {classifier_type} classifier on complete data...")
print("====================================================\n")
analyze_results(complete_test, complete_pred, complete_scores)