<a href="https://colab.research.google.com/github/sanjayattelli29/AQI-ML-Streamlit/blob/master/Python%20code%20for%20training%20a%20model%20using%20an%20expanded%20CSV%20dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np

# Load the file
from google.colab import files
import io

# How many times larger than the uploaded dataset the expanded one should be.
EXPANSION_FACTOR = 6

uploaded = files.upload()
if not uploaded:
    # Nothing to index into (e.g. the upload dialog was dismissed): fail with a
    # clear message instead of an IndexError on the first key lookup.
    raise ValueError("No file was uploaded; please upload a CSV file to expand.")

# Only the first uploaded file is used.
file_name = next(iter(uploaded))

df = pd.read_csv(io.BytesIO(uploaded[file_name]))

# Determine the target number of rows
target_rows = len(df) * EXPANSION_FACTOR

# Function to generate nearby values
def generate_nearby_values(original_df, target_size, max_variation=0.05, random_state=None):
    """Expand a DataFrame to ``target_size`` rows with jittered copies of its rows.

    The source rows are cycled through in order. For every synthetic row each
    numeric column is multiplied by an independent factor drawn uniformly from
    ``[1 - max_variation, 1 + max_variation)``; non-numeric columns (e.g. the
    label) are copied unchanged. The original rows come first in the result,
    followed by the synthetic ones.

    Parameters
    ----------
    original_df : pandas.DataFrame
        Source data. It is not modified.
    target_size : int
        Desired total number of rows (original + synthetic).
    max_variation : float, default 0.05
        Maximum relative perturbation applied to numeric values (0.05 = +/-5%).
    random_state : int, numpy.random.Generator or None, default None
        Seed (or generator) for reproducible output. ``None`` gives a fresh,
        unseeded generator, so every call yields different values.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index. If ``target_size`` does not
        exceed ``len(original_df)``, a copy of the original rows is returned.

    Raises
    ------
    ValueError
        If ``original_df`` is empty while ``target_size`` is positive; there is
        nothing to sample from, and looping over zero rows would never finish.
    """
    n_original = len(original_df)
    n_extra = target_size - n_original

    if n_extra <= 0:
        return original_df.reset_index(drop=True)
    if n_original == 0:
        raise ValueError("Cannot expand an empty DataFrame to a positive target_size.")

    rng = np.random.default_rng(random_state)

    # Computed once: the set of numeric columns does not change from row to row.
    numeric_cols = original_df.select_dtypes(include=[np.number]).columns

    # Row positions 0, 1, ..., n-1, 0, 1, ... until n_extra rows are selected.
    positions = np.arange(n_extra) % n_original
    augmented = original_df.iloc[positions].reset_index(drop=True)

    # One independent multiplicative factor per (synthetic row, numeric column).
    factors = 1.0 + rng.uniform(
        -max_variation, max_variation, size=(n_extra, len(numeric_cols))
    )
    for j, col in enumerate(numeric_cols):
        augmented[col] = augmented[col] * factors[:, j]

    return pd.concat([original_df, augmented], ignore_index=True)

# Generate new data
expanded_df = generate_nearby_values(df, target_rows)

# Save the new CSV in Colab
output_path = "expanded_agriculture_data.csv"
expanded_df.to_csv(output_path, index=False)

print(f"Expanded dataset saved as '{output_path}', you can now download it.")

# Trigger the browser download of the saved CSV.
# `files` is already imported from google.colab at the top of this cell,
# so it is not imported a second time here.
files.download(output_path)


Saving agriculture data to see which crop to cultivate.csv to agriculture data to see which crop to cultivate (1).csv
Expanded dataset saved as 'expanded_agriculture_data.csv', you can now download it.


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import pandas as pd
import numpy as np
import pickle
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
from flask import Flask, request, jsonify
from google.colab import files
import io
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, log_loss, mean_absolute_error, mean_squared_error, r2_score, cohen_kappa_score, balanced_accuracy_score, hinge_loss
from scipy.stats import chi2_contingency

# Upload file
uploaded = files.upload()
if not uploaded:
    # Nothing to index into (e.g. the upload dialog was dismissed): fail with a
    # clear message instead of an IndexError on the first key lookup.
    raise ValueError("No file was uploaded; please upload the training CSV.")

# Only the first uploaded file is used.
filename = next(iter(uploaded))

# Load dataset
data = pd.read_csv(io.BytesIO(uploaded[filename]))

# Columns the rest of this cell relies on; checked up front so a wrong CSV is
# reported with the full list of what is missing.
FEATURE_COLUMNS = ['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall']
TARGET_COLUMN = 'label'
missing_columns = [
    col for col in FEATURE_COLUMNS + [TARGET_COLUMN] if col not in data.columns
]
if missing_columns:
    raise ValueError(f"Uploaded CSV is missing required column(s): {missing_columns}")

# Extract features and target
X = data[FEATURE_COLUMNS]
y = data[TARGET_COLUMN]

# Encode labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Save label encoder
joblib.dump(label_encoder, 'label_encoder.pkl')

# Split the dataset BEFORE scaling so the scaler's mean/std are learned from the
# training rows only. Fitting the scaler on all rows first would leak test-set
# statistics into preprocessing. The same test_size/random_state select the
# same rows for the test set as splitting the pre-scaled array did.
# NOTE(review): if the uploaded CSV is the augmented file produced by the
# previous cell, jittered copies of one source row can land on both sides of
# this split and inflate the reported scores — confirm whether the split
# should be done on the original rows instead.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42
)

# Standardize features: fit on the training rows, then apply to the test rows
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Full scaled matrix, kept under its original name for any later use.
X_scaled = scaler.transform(X)
joblib.dump(scaler, 'scaler.pkl')

# Save test data (features are saved in their scaled form)
pd.DataFrame(X_test, columns=X.columns).to_csv("X_test.csv", index=False)
pd.DataFrame(y_test, columns=["label"]).to_csv("y_test.csv", index=False)

# Train models
# Four classifiers are fitted on the same training split, scored on the same
# test split, and each fitted model is pickled to its own file.
models = {
    "Naive Bayes": GaussianNB(),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    # probability=True is what gives SVC a predict_proba method.
    "SVM": SVC(probability=True, random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42)
}

# trained_models maps model name -> path of the pickled model file (not the model object).
trained_models = {}
# model_metrics maps model name -> dict of metric name -> value.
model_metrics = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Class probabilities are only computed for models that expose predict_proba;
    # every probability-based metric below falls back to None otherwise.
    y_proba = model.predict_proba(X_test) if hasattr(model, "predict_proba") else None

    accuracy = accuracy_score(y_test, y_pred)
    # Precision / recall / F1 are averaged across classes, weighted by class support.
    precision = precision_score(y_test, y_pred, average="weighted")
    recall = recall_score(y_test, y_pred, average="weighted")
    f1 = f1_score(y_test, y_pred, average="weighted")
    # One-vs-rest multiclass ROC-AUC computed from the predicted probabilities.
    roc_auc = roc_auc_score(y_test, y_proba, multi_class="ovr") if y_proba is not None else None
    logloss = log_loss(y_test, y_proba) if y_proba is not None else None
    # NOTE(review): y_test / y_pred hold the integer codes produced by LabelEncoder,
    # so MAE / MSE / RMSE / R² treat class ids as ordinal numbers — confirm these
    # regression metrics are meaningful for this report.
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    kappa = cohen_kappa_score(y_test, y_pred)
    balanced_acc = balanced_accuracy_score(y_test, y_pred)
    # NOTE(review): hinge_loss is documented to take decision_function outputs;
    # here it receives predict_proba probabilities — confirm this is intended.
    hinge = hinge_loss(y_test, y_proba) if y_proba is not None else None
    # Chi-square statistic of the true-vs-predicted contingency table; the p-value,
    # degrees of freedom and expected frequencies are discarded.
    chi_square_stat, _, _, _ = chi2_contingency(pd.crosstab(y_test, y_pred))
    # Gini coefficient derived from the AUC as 2 * AUC - 1.
    gini = 2 * roc_auc - 1 if roc_auc is not None else None

    model_metrics[name] = {
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1-Score": f1,
        "ROC-AUC": roc_auc,
        "Log Loss": logloss,
        "MAE": mae,
        "MSE": mse,
        "RMSE": rmse,
        "R² Score": r2,
        "Cohen’s Kappa": kappa,
        "Balanced Accuracy": balanced_acc,
        "Hinge Loss": hinge,
        "Gini Coefficient": gini,
        # NOTE(review): the key reads "Z²" but the value is the chi-square statistic
        # from chi2_contingency — renaming would change the saved CSV/pickle keys.
        "Chi-Square (Z²)": chi_square_stat
    }

    # File name is the model name lower-cased with spaces replaced by underscores,
    # e.g. "Random Forest" -> "random_forest_model.pkl".
    model_filename = f'{name.lower().replace(" ", "_")}_model.pkl'
    with open(model_filename, 'wb') as f:
        pickle.dump(model, f)
    trained_models[name] = model_filename

    print(f'{name} Model Accuracy: {accuracy * 100:.2f}%')

print('✅ All models trained and saved.')

# Save model metrics to CSV (one row per model, one column per metric)
metrics_df = pd.DataFrame(model_metrics).T
metrics_df.to_csv("model_performance_metrics.csv", index=True)

# Save model metrics
joblib.dump(model_metrics, "model_metrics.pkl")

# Load trained models
# Each pickle is opened in a `with` block so its file handle is closed as soon
# as the model is loaded, instead of being left open.
# These files were written earlier in this cell; never unpickle untrusted files.
loaded_models = {}
for model_label, model_path in trained_models.items():
    with open(model_path, 'rb') as model_file:
        loaded_models[model_label] = pickle.load(model_file)

app = Flask(__name__)

# Feature order must match the columns the scaler and the models were fitted on.
FEATURE_COLUMNS = ['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall']


@app.route('/predict', methods=['POST'])
def predict():
    """Recommend a crop for the soil/weather readings in a JSON POST body.

    Expected JSON object:
        N, P, K, temperature, humidity, ph, rainfall : numeric feature values
        model : optional model name, defaults to 'Random Forest'

    Responses (always JSON):
        200  {'recommended_crop': <label>, 'model_used': <model name>}
        400  {'error': <message>} for a malformed body, an unknown model name,
             or missing / non-numeric features
        500  {'error': <message>} if scaling or prediction fails unexpectedly
    """
    # silent=True yields None for a missing or invalid JSON body instead of raising.
    payload = request.get_json(silent=True)
    if not isinstance(payload, dict):
        return jsonify({'error': 'Request body must be a JSON object.'}), 400

    # Validate the model choice first so a bad name is reported as such even
    # when the features are also incomplete.
    model_name = payload.get('model', 'Random Forest')
    if not isinstance(model_name, str) or model_name not in loaded_models:
        return jsonify({'error': 'Invalid model name. Choose from: Naive Bayes, KNN, SVM, Random Forest'}), 400

    missing = [col for col in FEATURE_COLUMNS if col not in payload]
    if missing:
        return jsonify({'error': 'Missing feature(s): ' + ', '.join(missing)}), 400

    try:
        values = [float(payload[col]) for col in FEATURE_COLUMNS]
    except (TypeError, ValueError):
        return jsonify({'error': 'All feature values must be numeric.'}), 400

    try:
        # A one-row DataFrame with named columns matches how the scaler was
        # fitted (on a DataFrame), avoiding the feature-name mismatch warning
        # a bare ndarray triggers.
        features = pd.DataFrame([values], columns=FEATURE_COLUMNS)
        scaled_features = scaler.transform(features)

        prediction = loaded_models[model_name].predict(scaled_features)
        # str() turns the numpy string scalar into a plain Python str for JSON.
        predicted_label = str(label_encoder.inverse_transform(prediction)[0])
        return jsonify({'recommended_crop': predicted_label, 'model_used': model_name})
    except Exception as e:
        # Unexpected failure in scaling/prediction: report it as a server error.
        return jsonify({'error': str(e)}), 500

# Start Flask's built-in server when this code runs as the main module (the
# recorded cell output shows the app being served as '__main__').
# host='0.0.0.0' makes the server listen on all network interfaces, on port 5000.
# NOTE(review): app.run() blocks until the server is stopped, so code placed
# after this call in the same cell will not run while it is serving — confirm
# that is acceptable for the notebook workflow.
if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)


Saving expanded_agriculture_data.csv to expanded_agriculture_data.csv
Naive Bayes Model Accuracy: 99.09%
KNN Model Accuracy: 99.17%
SVM Model Accuracy: 98.79%
Random Forest Model Accuracy: 99.39%
✅ All models trained and saved.
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5000
 * Running on http://172.28.0.12:5000
INFO:werkzeug:Press CTRL+C to quit
