In [None]:
#Import necessary libraries.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from google.colab import files
import zipfile
from google.colab import files
import os

In [None]:
# Install the Kaggle API library and its dependencies.
!pip install kaggle --upgrade --quiet

In [None]:
# Upload the Kaggle API token (kaggle.json) through Colab's file picker.
print("Please upload your kaggle.json API file.")
# Bind the result instead of leaving the call as the cell's last expression:
# files.upload() returns a {filename: file contents} dict, so echoing it would
# write the API key stored in kaggle.json into the notebook's saved output.
uploaded_files = files.upload()

In [None]:
# Set up the Kaggle API client credentials.
# Create the ~/.kaggle directory (-p: no error if it already exists).
!mkdir -p ~/.kaggle
# Move the uploaded kaggle.json out of the working directory into ~/.kaggle/.
!mv kaggle.json ~/.kaggle/
# Restrict the token file to owner read/write only (mode 600).
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# --- Download configuration -------------------------------------------------
# Directory the archive's contents are extracted into.
EXTRACTION_DIR = "./CRD-dataset"
# Name of the zip archive the download is expected to produce.
ZIP_FILE_NAME = "crop-recommendation-dataset.zip"
# Kaggle CLI command that fetches the crop-recommendation dataset.
DATASET_API_COMMAND = "kaggle datasets download varshitanalluri/crop-recommendation-dataset"

In [None]:
print("\nDownloading dataset...")
!$DATASET_API_COMMAND

In [None]:
# Unpack the downloaded archive into EXTRACTION_DIR, then delete the archive.
if not os.path.exists(ZIP_FILE_NAME):
    # Nothing to extract: report it and leave the filesystem untouched.
    print(f"Error: {ZIP_FILE_NAME} not found. Please check the dataset API command.")
else:
    os.makedirs(EXTRACTION_DIR, exist_ok=True)
    with zipfile.ZipFile(ZIP_FILE_NAME, 'r') as archive:
        archive.extractall(path=EXTRACTION_DIR)
    print(f"Extraction of {ZIP_FILE_NAME} completed to {EXTRACTION_DIR}/")
    # The archive is no longer needed once its contents are on disk.
    os.remove(ZIP_FILE_NAME)

In [None]:
#Verify the files are extracted
print("\nExtracted files are available at:", EXTRACTION_DIR)
print("Listing contents of the extraction directory:")
!ls -F {EXTRACTION_DIR}

In [None]:
# Path of the crop-recommendation CSV inside the extraction directory
# (EXTRACTION_DIR is defined in the download-configuration cell and filled by the unzip cell).
crd_dataset_file_path = os.path.join(
    EXTRACTION_DIR,
    'Crop_Recommendation.csv',
)

In [None]:
# Load the CSV into a DataFrame and preview its first five rows.
data = pd.read_csv(filepath_or_buffer=crd_dataset_file_path)
data.head(n=5)

In [None]:
# Inspect the target column: list the distinct crops, then count the rows per
# crop to see whether the classes are imbalanced.
# Only a cell's last expression is displayed, so the unique values are printed
# explicitly (the original bare expression was silently discarded).
print(data['Crop'].unique())
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
data['Crop'].value_counts()

In [None]:
# Separate the features from the target.
# The target is dropped by name rather than by position, so the split stays
# correct even if 'Crop' is not the last column of the CSV.
x, y = data.drop(columns='Crop'), data['Crop']
x.head()

In [None]:
# Pairwise correlation of the features, drawn as an annotated heatmap to spot
# multicollinearity between them.
x_corr = x.corr()
sns.heatmap(
    data=x_corr,
    cmap='coolwarm',
    annot=True,
)

In [None]:
# Drop the Phosphorus column, which the notebook treats as highly correlated
# with Potassium (see the correlation heatmap).
# errors="ignore" keeps this cell idempotent: because `x` is reassigned here,
# re-running the cell would otherwise raise KeyError on the already-removed column.
x = x.drop(columns="Phosphorus", errors="ignore")
x.head()

In [None]:
# Standardize every feature column with StandardScaler and rebuild a DataFrame
# that keeps the original column names.
# NOTE(review): the scaler is fit on all rows here, before the train/test split
# performed in a later cell, so test-set statistics are used for scaling.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(x.to_numpy())
x_scaled = pd.DataFrame(data=scaled_data, columns=x.columns)
x_scaled.head()

In [None]:
#Model selection.
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import GridSearchCV

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x_scaled,
    y,
    test_size=0.2,
    random_state=42,
)
# Hyperparameter candidates explored by the decision-tree grid search.
param_grid = dict(
    criterion=['gini', 'entropy'],
    max_depth=[None, 5, 10],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

In [None]:
# Grid-search the decision tree: 5-fold cross-validated accuracy for every
# parameter combination in param_grid.
# random_state pins the tree's internal randomness so that a Restart & Run All
# reproduces the same scores and the same best parameters.
clf = DecisionTreeClassifier(random_state=42)
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(x_train, y_train)
# Tabulate the per-combination cross-validation results and preview them.
results = pd.DataFrame(grid_search.cv_results_)
results.head()

In [None]:
# Parameter combination with the highest mean cross-validated accuracy.
best_param = grid_search.best_params_
print(
    'Best Hyperparameter:',
    best_param,
)

In [None]:
# Train a DecisionTreeClassifier with the best grid-search parameters and
# evaluate it on the held-out test set.
# random_state makes the fitted tree, and therefore the reported accuracy,
# reproducible across runs.
best_clf = DecisionTreeClassifier(**best_param, random_state=42)
best_clf.fit(x_train, y_train)
y_pred = best_clf.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)

In [None]:
# Gradient-boosted tree classifier from the xgboost library.
from xgboost import XGBClassifier
# Label-encode the target (this is label encoding, not one-hot encoding):
# each crop name is mapped to an integer class id, which is what the
# XGBoost cells train on.
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
label_encoder.fit(y)
y_encoded = label_encoder.transform(y)

In [None]:
# Same 80/20 split and seed as the decision-tree cells, but with the
# integer-encoded target.
x_train, x_test, y_trainxg, y_testxg = train_test_split(
    x_scaled,
    y_encoded,
    test_size=0.2,
    random_state=42,
)
# Hyperparameter candidates explored by the XGBoost grid search.
param_gridxg = dict(
    max_depth=[3, 5, 7],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[100, 200, 300],
)

In [None]:
# Grid-search XGBoost: 5-fold cross-validated accuracy for every parameter
# combination in param_gridxg.
grid_searchXG = GridSearchCV(
    estimator=XGBClassifier(),
    param_grid=param_gridxg,
    cv=5,
    scoring='accuracy',
)
grid_searchXG.fit(x_train, y_trainxg)
# Tabulate the per-combination cross-validation results and preview them.
resultsxg = pd.DataFrame(data=grid_searchXG.cv_results_)
resultsxg.head()

In [None]:
# Parameter combination with the highest mean cross-validated accuracy for XGBoost.
best_paramsxg = grid_searchXG.best_params_
print(
    'Best Hyperparameter:',
    best_paramsxg,
)

In [None]:
# Train an XGBClassifier with the best grid-search parameters and score it on
# the held-out test set.
best_xgb_clf = XGBClassifier(**best_paramsxg)
best_xgb_clf.fit(X=x_train, y=y_trainxg)
y_predxg = best_xgb_clf.predict(x_test)
accuracyxg = accuracy_score(y_true=y_testxg, y_pred=y_predxg)
print('Accuracy:', accuracyxg)