In [1]:
import pickle
from itertools import cycle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, LabelBinarizer
from sklearn.tree import DecisionTreeClassifier


# --- 1. Load Data ---
print("Loading data...")

# Location of the input CSV. Written as a raw string so the backslashes of the
# Windows path are taken literally: '\E' and '\p' are not valid escape
# sequences and make Python emit a SyntaxWarning in a normal string literal.
DATA_PATH = r'D:\EDUNet\p1-1.csv'

# Columns read from the CSV: the six model inputs plus the 'condition' target.
features_to_load = [
    'common_name', 
    'ward_name', 
    'ownership', 
    'girth_cm', 
    'height_m', 
    'canopy_dia_m', 
    'condition'
]

# Model inputs, grouped by the preprocessing branch each one goes through.
numerical_features = ['girth_cm', 'height_m', 'canopy_dia_m']
categorical_features = ['common_name', 'ward_name', 'ownership']

try:
    df = pd.read_csv(DATA_PATH, usecols=features_to_load)
except FileNotFoundError:
    # Report the path that was actually tried, then re-raise so the cell stops
    # with a traceback. exit() is avoided: in a notebook it asks the kernel to
    # shut down instead of simply aborting this cell.
    print(f"ERROR: {DATA_PATH} not found. Check the DATA_PATH setting.")
    raise
except ValueError as e:
    # pandas raises ValueError when a name in `usecols` is absent from the file.
    print(f"ERROR: A required column is missing from {DATA_PATH}. {e}")
    raise




  df = pd.read_csv('D:\EDUNet\p1-1.csv', usecols=features_to_load)


Loading data...


In [2]:
# --- 2. Data Validation & Pre-Cleaning ---
# Normalises column dtypes, reports and removes rows without a usable label,
# then builds X / y, the sorted class list and an 80/20 train-test split.
print("\n--- Data Validation Report ---")

print("1. Checking Data Types (Original):")
print(df.dtypes)

for col in numerical_features:
    if df[col].dtype == 'object':
        print(f"Warning: Column '{col}' is 'object', converting to numeric.")
        # Unparseable entries become NaN and are imputed later in the pipeline.
        df[col] = pd.to_numeric(df[col], errors='coerce')

for col in categorical_features:
    # Stringify real values only. A plain astype(str) would turn NaN into the
    # literal text 'nan', which hides missing values from isnull()/dropna()
    # below; keeping them as NaN lets the report and the row filter see them.
    df[col] = df[col].astype(str).where(df[col].notna(), np.nan)

print("\nData types after conversion:")
print(df.dtypes)

print("\n2. Checking for Null Values (Before Cleaning):")
print(df.isnull().sum())
print("-------------------------------\n")

print("Cleaning data...")
print(f"Initial row count: {len(df)}")

# Rows without a target or without a species name cannot be used.
# Missing values in the other columns are left for the imputers.
df = df.dropna(subset=['condition', 'common_name'])

print(f"Row count after dropping missing labels: {len(df)}")

if len(df) == 0:
    # Raise rather than exit(): exit() in a notebook requests a kernel
    # shutdown instead of cleanly stopping this cell.
    raise ValueError(
        "ERROR: No data left after cleaning. Please check the input CSV "
        "for valid 'condition' and 'common_name' fields."
    )

X = df.drop('condition', axis=1)
y = df['condition']

# Sorted label list, reused later for reports and plot axes.
CLASS_NAMES = np.sort(y.unique())

print("Creating train-test split (80/20)...")
try:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, 
        test_size=0.2,
        random_state=42, 
        stratify=y
    )
except ValueError:
    # Stratification fails when a class has too few members to appear in both
    # partitions; fall back to a plain random split in that case.
    print("Warning: Not enough samples for a stratified split. Using a non-stratified split.")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, 
        test_size=0.2,
        random_state=42
    )
print(f"Training samples: {len(X_train)}, Testing samples: {len(X_test)}")





--- Data Validation Report ---
1. Checking Data Types (Original):
girth_cm        float64
height_m        float64
canopy_dia_m    float64
condition        object
ownership        object
ward_name       float64
common_name      object
dtype: object

Data types after conversion:
girth_cm        float64
height_m        float64
canopy_dia_m    float64
condition        object
ownership        object
ward_name        object
common_name      object
dtype: object

2. Checking for Null Values (Before Cleaning):
girth_cm        1
height_m        1
canopy_dia_m    1
condition       1
ownership       0
ward_name       0
common_name     0
dtype: int64
-------------------------------

Cleaning data...
Initial row count: 1000001
Row count after dropping missing labels: 1000000
Creating train-test split (80/20)...
Training samples: 800000, Testing samples: 200000


In [3]:
# --- 3. Define Preprocessing Pipelines ---
print("Defining preprocessing pipelines...")

# Numeric branch: the only step is filling gaps with the column median.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
]
numerical_transformer = Pipeline(steps=numeric_steps)

# Categorical branch: fill gaps with the placeholder 'Unknown', then map each
# category to an integer code. Categories not seen during fit encode as -1.
categorical_steps = [
    ('imputer', SimpleImputer(fill_value='Unknown', strategy='constant')),
    ('encoder', OrdinalEncoder(unknown_value=-1, handle_unknown='use_encoded_value')),
]
categorical_transformer = Pipeline(steps=categorical_steps)



Defining preprocessing pipelines...


In [4]:
# --- 4. Create the Full Preprocessor ---
# Each entry is (branch name, transformer, columns fed to that transformer).
column_branches = [
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_branches)



In [5]:
# --- 5. Define Models to Compare ---
print("Defining models to compare...")

# Candidate classifiers keyed by display name. Insertion order is the order in
# which they are evaluated; every model is seeded for repeatable results.
models_to_compare = dict([
    ("Logistic Regression", LogisticRegression(random_state=42, max_iter=1000)),
    ("Decision Tree", DecisionTreeClassifier(max_depth=20, random_state=42)),
    ("Random Forest", RandomForestClassifier(n_jobs=-1, random_state=42, n_estimators=100)),
    ("Gradient Boosting", GradientBoostingClassifier(random_state=42)),
])

# Bookkeeping filled in by the comparison step: per-model accuracy, plus the
# best fitted pipeline seen so far and its score.
best_model_accuracy = 0.0
best_model_pipeline = None
model_results = {}



Defining models to compare...


In [6]:
# --- 6. Compare Models using Train-Test Split ---
# Fits every candidate on the training split, scores it on the held-out split
# and remembers the pipeline with the highest test accuracy.
print("\n--- Model Comparison (using 80/20 split) ---")

# Start from a clean slate so re-running this cell gives the same outcome as
# the first run instead of comparing against scores left over in the kernel.
model_results = {}
best_model_pipeline = None
best_model_accuracy = 0.0

for name, model in models_to_compare.items():
    print(f"Testing model: {name}...")
    # Clone both parts so each pipeline owns its fitted state. Without this all
    # pipelines share one preprocessor object, and any later fit (here or when
    # the final model is trained) would silently change the stored best one.
    pipeline = Pipeline(steps=[
        ('preprocessor', clone(preprocessor)),
        ('classifier', clone(model))
    ])

    try:
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)
        test_accuracy = accuracy_score(y_test, y_pred)
    except Exception as e:
        # Best effort: a failing candidate is reported and scored 0.0 so the
        # remaining candidates are still evaluated.
        print(f"Could not evaluate model {name}. Error: {e}")
        model_results[name] = 0.0
        continue

    model_results[name] = test_accuracy
    print(f"{name}: Test Accuracy = {test_accuracy*100:.2f}%")

    # Strict '>' keeps the earliest candidate on ties.
    if test_accuracy > best_model_accuracy:
        best_model_accuracy = test_accuracy
        best_model_pipeline = pipeline

print("------------------------")




--- Model Comparison (using 80/20 split) ---
Testing model: Logistic Regression...


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression: Test Accuracy = 91.72%
Testing model: Decision Tree...
Decision Tree: Test Accuracy = 93.64%
Testing model: Random Forest...
Random Forest: Test Accuracy = 94.08%
Testing model: Gradient Boosting...
Gradient Boosting: Test Accuracy = 93.84%
------------------------


In [7]:
# --- 7. Find and Evaluate the Best Model ---
# Prints a classification report for the best pipeline and writes two figures:
# confusion_matrix.png and roc_auc_curves.png.

# Stop before anything touches the pipeline. Raising is used instead of exit():
# in a notebook exit() requests a kernel shutdown rather than aborting the cell,
# so the statements below could still run against a None pipeline.
if best_model_pipeline is None:
    raise RuntimeError("ERROR: All models failed to train. Exiting.")

best_model_name = max(model_results, key=model_results.get)

print(f"\n--- Detailed Report for Best Model: {best_model_name} ---")
print(f"Test Accuracy: {best_model_accuracy*100:.2f}%")

y_pred = best_model_pipeline.predict(X_test)
y_prob = best_model_pipeline.predict_proba(X_test)

print("\n1. Classification Report:")
print(classification_report(y_test, y_pred, labels=CLASS_NAMES))

print("2. Generating Confusion Matrix (saving to confusion_matrix.png)...")
try:
    cm = confusion_matrix(y_test, y_pred, labels=CLASS_NAMES)
    fig, ax = plt.subplots(figsize=(10, 7))
    try:
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                    xticklabels=CLASS_NAMES, yticklabels=CLASS_NAMES, ax=ax)
        ax.set_title(f'Confusion Matrix for {best_model_name}')
        ax.set_ylabel('Actual Condition')
        ax.set_xlabel('Predicted Condition')
        fig.savefig('confusion_matrix.png')
    finally:
        # Release the figure even when drawing or saving fails.
        plt.close(fig)
    print("Saved confusion_matrix.png")
except Exception as e:
    print(f"Error generating confusion matrix: {e}")

print("3. Generating ROC/AUC Curves (saving to roc_auc_curves.png)...")
try:
    n_classes = len(CLASS_NAMES)
    if y_prob.shape[1] != n_classes:
        # Happens when a class is absent from the training split; the per-class
        # curves below would otherwise pair labels with the wrong columns.
        raise ValueError("predict_proba columns do not match CLASS_NAMES")

    lb = LabelBinarizer()
    lb.fit(CLASS_NAMES)
    y_test_binarized = lb.transform(y_test)
    if n_classes == 2:
        # For two classes LabelBinarizer returns one column (1 = second class).
        # Expand it to one indicator column per class to line up with y_prob.
        y_test_binarized = np.hstack([1 - y_test_binarized, y_test_binarized])

    fpr = dict()
    tpr = dict()
    roc_auc = dict()

    # One-vs-rest curve per class.
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_test_binarized[:, i], y_prob[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Micro-average: pool every (sample, class) decision into one curve.
    fpr["micro"], tpr["micro"], _ = roc_curve(y_test_binarized.ravel(), y_prob.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    fig, ax = plt.subplots(figsize=(12, 8))
    try:
        ax.plot(fpr["micro"], tpr["micro"],
                label=f'Micro-average ROC curve (area = {roc_auc["micro"]:0.2f})',
                color='deeppink', linestyle=':', linewidth=4)

        # cycle() repeats the palette when there are more classes than colours.
        colors = cycle(['aqua', 'darkorange', 'cornflowerblue', 'green', 'red', 'purple'])
        for i, color in zip(range(n_classes), colors):
            ax.plot(fpr[i], tpr[i], color=color, lw=2,
                    label=f'ROC curve of class {CLASS_NAMES[i]} (area = {roc_auc[i]:0.2f})')

        # Diagonal = chance-level reference.
        ax.plot([0, 1], [0, 1], 'k--', lw=2)
        ax.set_xlim([0.0, 1.0])
        ax.set_ylim([0.0, 1.05])
        ax.set_xlabel('False Positive Rate')
        ax.set_ylabel('True Positive Rate')
        ax.set_title(f'Multi-Class ROC/AUC Curves for {best_model_name}')
        ax.legend(loc="lower right")
        fig.savefig('roc_auc_curves.png')
    finally:
        # Release the figure even when drawing or saving fails.
        plt.close(fig)
    print("Saved roc_auc_curves.png")

except Exception as e:
    print(f"Error generating ROC/AUC curves: {e}")
    if (y_prob.shape[1] != len(CLASS_NAMES)):
        print(f"Debug Info: y_prob shape {y_prob.shape[1]} != n_classes {len(CLASS_NAMES)}")





--- Detailed Report for Best Model: Random Forest ---
Test Accuracy: 94.08%

1. Classification Report:
              precision    recall  f1-score   support

     Average       0.65      0.46      0.54     13223
        Dead       0.84      0.83      0.83      4285
     Healthy       0.96      0.98      0.97    181771
        Poor       0.19      0.03      0.06       721

    accuracy                           0.94    200000
   macro avg       0.66      0.58      0.60    200000
weighted avg       0.93      0.94      0.94    200000

2. Generating Confusion Matrix (saving to confusion_matrix.png)...
Saved confusion_matrix.png
3. Generating ROC/AUC Curves (saving to roc_auc_curves.png)...
Saved roc_auc_curves.png


In [8]:
# --- 8. Train and Save the FINAL Model ---
# Refits the winning model type on the complete dataset and pickles the
# resulting pipeline to tree_model_pipeline.pkl.
print(f"\nTraining the final best model ({best_model_name}) on ALL data (100%)...")

# Work on fresh, unfitted copies. Reusing the objects from the comparison step
# would refit them in place on data that includes the test rows, silently
# replacing the evaluated model that `best_model_pipeline` refers to.
final_classifier = clone(models_to_compare[best_model_name])

final_pipeline = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('classifier', final_classifier)
])

final_pipeline.fit(X, y)
print("Final model training complete.")

# NOTE: pickle files execute code when loaded; only load this file from a
# trusted location.
pipeline_filename = 'tree_model_pipeline.pkl'
with open(pipeline_filename, 'wb') as file:
    pickle.dump(final_pipeline, file)

print(f"Best model pipeline ({best_model_name}) saved to {pipeline_filename}")


Training the final best model (Random Forest) on ALL data (100%)...
Final model training complete.
Best model pipeline (Random Forest) saved to tree_model_pipeline.pkl
