In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
import xgboost as xgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

In [2]:
# Load the data
data = pd.read_csv('/kaggle/input/plant-growth-data-classification/plant_growth_data.csv')

In [3]:
# Preprocessing
le = LabelEncoder()
data['Soil_Type'] = le.fit_transform(data['Soil_Type'])
data['Water_Frequency'] = le.fit_transform(data['Water_Frequency'])
data['Fertilizer_Type'] = le.fit_transform(data['Fertilizer_Type'])

In [4]:
# Split the data
X = data.drop('Growth_Milestone', axis=1)
y = data['Growth_Milestone']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert data into DMatrix format
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

In [8]:
# Define the hyperparameter search space
space = {
    'max_depth': hp.quniform("max_depth", 3, 18, 1),
    'gamma': hp.uniform('gamma', 1, 9),
    'reg_alpha': hp.quniform('reg_alpha', 40, 180, 1),
    'reg_lambda': hp.uniform('reg_lambda', 0, 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),
    'min_child_weight': hp.quniform('min_child_weight', 0, 10, 1),
    'n_estimators': hp.quniform('n_estimators', 100, 1000, 1),
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.2)
}

def objective(space):
    params = {
        'max_depth': int(space['max_depth']),
        'gamma': space['gamma'],
        'reg_alpha': int(space['reg_alpha']),
        'reg_lambda': space['reg_lambda'],
        'colsample_bytree': space['colsample_bytree'],
        'min_child_weight': int(space['min_child_weight']),
        'n_estimators': int(space['n_estimators']),
        'learning_rate': space['learning_rate'],
        'objective': 'binary:logistic',
        'tree_method': 'hist',  # Changed from 'gpu_hist' to 'hist'
        'device': 'cuda'  # Specify 'cuda' to use GPU
    }
    
    cv_result = xgb.cv(params, dtrain, num_boost_round=1000, nfold=3, 
                       metrics=['error'], early_stopping_rounds=50, seed=42)
    
    return {'loss': cv_result['test-error-mean'].min(), 'status': STATUS_OK}

In [9]:
# Run hyperparameter optimization
trials = Trials()
best_hyperparams = fmin(fn=objective,
                        space=space,
                        algo=tpe.suggest,
                        max_evals=100,
                        trials=trials)

print("Best hyperparameters:", best_hyperparams)

# Train the final model with best hyperparameters
best_params = {
    'max_depth': int(best_hyperparams['max_depth']),
    'gamma': best_hyperparams['gamma'],
    'reg_alpha': int(best_hyperparams['reg_alpha']),
    'reg_lambda': best_hyperparams['reg_lambda'],
    'colsample_bytree': best_hyperparams['colsample_bytree'],
    'min_child_weight': int(best_hyperparams['min_child_weight']),
    'n_estimators': int(best_hyperparams['n_estimators']),
    'learning_rate': best_hyperparams['learning_rate'],
    'objective': 'binary:logistic',
    'tree_method': 'hist',  # Changed from 'gpu_hist' to 'hist'
    'device': 'cuda'  # Specify 'cuda' to use GPU
}

model = xgb.train(best_params, dtrain)


  1%|          | 1/100 [00:00<00:17,  5.58trial/s, best loss: 0.5451231774761186]

Parameters: { "n_estimators" } are not used.




  4%|▍         | 4/100 [00:00<00:17,  5.64trial/s, best loss: 0.5451231774761186]

Parameters: { "n_estimators" } are not used.




 10%|█         | 10/100 [00:01<00:15,  5.67trial/s, best loss: 0.5451231774761186]

Parameters: { "n_estimators" } are not used.




 15%|█▌        | 15/100 [00:02<00:15,  5.58trial/s, best loss: 0.5451231774761186]

Parameters: { "n_estimators" } are not used.




 21%|██        | 21/100 [00:03<00:14,  5.47trial/s, best loss: 0.5451231774761186]

Parameters: { "n_estimators" } are not used.




 26%|██▌       | 26/100 [00:04<00:14,  5.12trial/s, best loss: 0.5451231774761186]

Parameters: { "n_estimators" } are not used.




 31%|███       | 31/100 [00:05<00:13,  5.13trial/s, best loss: 0.5451231774761186]

Parameters: { "n_estimators" } are not used.




 36%|███▌      | 36/100 [00:06<00:12,  5.13trial/s, best loss: 0.5451231774761186]

Parameters: { "n_estimators" } are not used.




 41%|████      | 41/100 [00:07<00:11,  4.96trial/s, best loss: 0.5451231774761186]

Parameters: { "n_estimators" } are not used.




 46%|████▌     | 46/100 [00:08<00:10,  5.03trial/s, best loss: 0.5451231774761186]

Parameters: { "n_estimators" } are not used.




 51%|█████     | 51/100 [00:09<00:09,  5.05trial/s, best loss: 0.5451231774761186]

Parameters: { "n_estimators" } are not used.




 56%|█████▌    | 56/100 [00:10<00:08,  5.00trial/s, best loss: 0.5451231774761186]

Parameters: { "n_estimators" } are not used.




 61%|██████    | 61/100 [00:11<00:07,  4.98trial/s, best loss: 0.5451231774761186]

Parameters: { "n_estimators" } are not used.




 66%|██████▌   | 66/100 [00:12<00:06,  4.97trial/s, best loss: 0.5451231774761186]

Parameters: { "n_estimators" } are not used.




 71%|███████   | 71/100 [00:13<00:05,  5.00trial/s, best loss: 0.5451231774761186]

Parameters: { "n_estimators" } are not used.




 76%|███████▌  | 76/100 [00:14<00:04,  4.89trial/s, best loss: 0.5451231774761186]

Parameters: { "n_estimators" } are not used.




 81%|████████  | 81/100 [00:15<00:03,  4.76trial/s, best loss: 0.5451231774761186]

Parameters: { "n_estimators" } are not used.




 86%|████████▌ | 86/100 [00:16<00:02,  4.89trial/s, best loss: 0.5451231774761186]

Parameters: { "n_estimators" } are not used.




 90%|█████████ | 90/100 [00:17<00:02,  4.77trial/s, best loss: 0.5451231774761186]

Parameters: { "n_estimators" } are not used.




 95%|█████████▌| 95/100 [00:18<00:01,  4.78trial/s, best loss: 0.5451231774761186]

Parameters: { "n_estimators" } are not used.




100%|██████████| 100/100 [00:19<00:00,  5.07trial/s, best loss: 0.5451231774761186]
Best hyperparameters: {'colsample_bytree': 0.5367557121732325, 'gamma': 4.619419723903299, 'learning_rate': 0.0608180667583487, 'max_depth': 15.0, 'min_child_weight': 3.0, 'n_estimators': 353.0, 'reg_alpha': 169.0, 'reg_lambda': 0.17699361203500708}


Parameters: { "n_estimators" } are not used.




In [10]:
# Make predictions
y_pred = model.predict(dtest)
y_pred_binary = [1 if y > 0.5 else 0 for y in y_pred]

In [11]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred_binary)
print(f"Accuracy: {accuracy}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_binary))

# Feature importance
importance = model.get_score(importance_type='weight')
importance = sorted(importance.items(), key=lambda x: x[1], reverse=True)
print("\nFeature Importance:")
for feature, score in importance:
    print(f"{feature}: {score}")

Accuracy: 0.4358974358974359

Classification Report:
              precision    recall  f1-score   support

           0       0.44      1.00      0.61        17
           1       0.00      0.00      0.00        22

    accuracy                           0.44        39
   macro avg       0.22      0.50      0.30        39
weighted avg       0.19      0.44      0.26        39


Feature Importance:


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [15]:
import pickle

# Save the model as a pickle file
with open('xgboost_gpu_plant_growth_model.pkl', 'wb') as f:
    pickle.dump(model, f)

print("\nModel saved as 'xgboost_gpu_plant_growth_model.pkl'")



Model saved as 'xgboost_gpu_plant_growth_model.pkl'


In [12]:
# Save the model
model.save_model('xgboost_gpu_plant_growth_model.json')
print("\nModel saved as 'xgboost_gpu_plant_growth_model.json'")


Model saved as 'xgboost_gpu_plant_growth_model.json'


# Inference

In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
import xgboost as xgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

# Load the data
data = pd.read_csv('/kaggle/input/plant-growth-data-classification/plant_growth_data.csv')

# Preprocessing
le_dict = {}
for col in ['Soil_Type', 'Water_Frequency', 'Fertilizer_Type']:
    le_dict[col] = LabelEncoder()
    data[col] = le_dict[col].fit_transform(data[col])

# Split the data
X = data.drop('Growth_Milestone', axis=1)
y = data['Growth_Milestone']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


def predict_single_datapoint(model, single_data, le_dict):
    # Create a DataFrame from the single data point
    df = pd.DataFrame([single_data])
    
    # Transform categorical variables
    for col, le in le_dict.items():
        if col in df.columns:
            try:
                df[col] = le.transform(df[col])
            except ValueError:
                print(f"Warning: Unseen label in {col}. Using -1 as a placeholder.")
                df[col] = -1  # Use -1 for unseen categories
    
    # Convert to DMatrix
    dsingle = xgb.DMatrix(df)
    
    # Make prediction
    prediction = model.predict(dsingle)[0]
    
    # Convert to binary prediction
    binary_prediction = 1 if prediction > 0.5 else 0
    
    return prediction, binary_prediction

# Example of a single data point
single_data_point = {
    'Soil_Type': 'clay',
    'Sunlight_Hours': 6.5,
    'Water_Frequency': 'daily',
    'Fertilizer_Type': 'chemical',
    'Temperature': 25.0,
    'Humidity': 60.0
}

# Make prediction on the single data point
prob_prediction, binary_prediction = predict_single_datapoint(model, single_data_point, le_dict)

print("\nPrediction for single data point:")
print(f"Probability: {prob_prediction:.4f}")
print(f"Binary Prediction: {binary_prediction}")
print(f"Predicted Growth Milestone: {'Reached' if binary_prediction == 1 else 'Not Reached'}")

# Function to get user input for a single data point
def get_user_input():
    print("\nEnter values for a single plant:")
    soil_type = input("Soil Type (e.g., clay, sandy): ")
    sunlight_hours = float(input("Sunlight Hours: "))
    water_frequency = input("Water Frequency (e.g., daily, weekly): ")
    fertilizer_type = input("Fertilizer Type (e.g., chemical, organic): ")
    temperature = float(input("Temperature: "))
    humidity = float(input("Humidity: "))

    return {
        'Soil_Type': soil_type,
        'Sunlight_Hours': sunlight_hours,
        'Water_Frequency': water_frequency,
        'Fertilizer_Type': fertilizer_type,
        'Temperature': temperature,
        'Humidity': humidity
    }

# Get user input and make prediction
user_data_point = get_user_input()
user_prob_prediction, user_binary_prediction = predict_single_datapoint(model, user_data_point, le_dict)

print("\nPrediction for user input data point:")
print(f"Probability: {user_prob_prediction:.4f}")
print(f"Binary Prediction: {user_binary_prediction}")
print(f"Predicted Growth Milestone: {'Reached' if user_binary_prediction == 1 else 'Not Reached'}")


Prediction for single data point:
Probability: 0.4805
Binary Prediction: 0
Predicted Growth Milestone: Not Reached

Enter values for a single plant:


Soil Type (e.g., clay, sandy):  clay
Sunlight Hours:  8
Water Frequency (e.g., daily, weekly):  daily
Fertilizer Type (e.g., chemical, organic):  organic
Temperature:  32
Humidity:  63



Prediction for user input data point:
Probability: 0.4805
Binary Prediction: 0
Predicted Growth Milestone: Not Reached
