## Importing all the required libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import missingno as msno
from sklearn.metrics import classification_report , accuracy_score,confusion_matrix
import warnings
warnings.filterwarnings('ignore')

Installing the Intel oneAPI extension for scikit-learn (scikit-learn-intelex)

In [None]:
!pip install scikit-learn-intelex

In [None]:
from sklearnex import patch_sklearn
# Apply the scikit-learn-intelex patch before the sklearn imports that follow.
# NOTE(review): sklearn.metrics was already imported in the first cell, i.e. before
# this patch runs — confirm whether those names need to be re-imported after patching.
patch_sklearn()
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

In [None]:
!pip install modin

In [None]:
import modin.pandas as md

In [None]:
# Load the crop dataset with plain pandas (the modin alias `md` imported above is not used here).
df =pd.read_csv('Crop_recommendation.csv')

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# Show column dtypes and non-null counts.
df.info()

In [None]:
# Rename the columns to descriptive names. The assignment is positional, so it
# assumes the CSV has exactly these eight columns in this order — TODO confirm.
df.columns = ['Nitrogen','Phosphorus','Potassium','Temperature','Humidity','pH','Rainfall','Label']

In [None]:
# Count missing values per column.
df.isna().sum()

In [None]:
# Display the concrete type of the frame (it was loaded via `pd`, not the modin alias `md`).
type(df)

In [None]:
# Preview again to confirm the renamed columns.
df.head()

In [None]:
# Horizontal bar chart of the number of rows per crop label.
df["Label"].value_counts().plot.barh()
plt.show()

In [None]:
# Summary statistics of the frame (describe() with its default settings).
df.describe()

In [None]:
# Distribution overview: for every column except the last one (Label), draw a
# histogram with KDE, a violin plot and a box plot side by side in one figure.
plt.style.use('fast')
sns.set_palette("Set2")
feature_names = df.columns[:-1]
for feature in feature_names:
    fig, (hist_ax, violin_ax, box_ax) = plt.subplots(1, 3, figsize=(18, 4))
    sns.histplot(data=df, x=feature, kde=True, bins=20, ax=hist_ax)
    sns.violinplot(data=df, x=feature, ax=violin_ax)
    sns.boxplot(data=df, x=feature, ax=box_ax)
    plt.suptitle(f'Visualizing {feature}', size=20)

In [None]:
# Mean of every remaining column per crop label; reset_index() turns Label back
# into a regular first column, which the ranking cells below rely on.
grouped = df.groupby(by = 'Label').mean().reset_index()
grouped

In [None]:
# For each averaged feature (every column after Label), list the five crops
# with the highest mean value.
for feature in grouped.columns[1:]:
    print(f'Top 5 Most {feature} requiring crops :')
    highest_five = grouped.sort_values(by=feature, ascending=False).iloc[:5]
    for crop, mean_value in highest_five[['Label', feature]].values:
        print(f'{crop}-->{mean_value}')
    print('********************************')

In [None]:

# For each averaged feature (every column after Label), list the five crops
# with the lowest mean value, framed by separator lines.
separator = '********************************'
for feature in grouped.columns[1:]:
    print(f'Top 5 Least {feature} requiring crops:')
    print(separator)
    lowest_five = grouped.sort_values(by=feature).iloc[:5]
    for crop, mean_value in lowest_five[['Label', feature]].values:
        print(f'{crop} --> {mean_value}')
    print(separator)

In [None]:
# Preview the frame again before plotting.
df.head()

In [None]:
import time

# Rows drawn for the (slow) swarm and pair plots. Capped at the frame size so
# sample() cannot raise when the dataset has fewer than 1000 rows.
n_plot_rows = min(1000, len(df))
# One fixed-seed sample shared by both plots, so the figures are identical on
# every Restart & Run All instead of changing with each execution.
plot_sample = df.sample(n=n_plot_rows, random_state=42)

# perf_counter() is monotonic, so the measured duration cannot be skewed by
# system clock adjustments.
start_time = time.perf_counter()

# Swarm plot of Phosphorus against Potassium, coloured by crop label
sns.catplot(data=plot_sample, x="Potassium", y="Phosphorus", hue="Label", kind="swarm")
plt.show()

# Measure execution time and display
print("Execution time for swarm plot:", time.perf_counter() - start_time, "seconds")

start_time = time.perf_counter()

# Pairwise relationships between the three nutrient features
sns.pairplot(data=plot_sample, hue='Label', vars=['Nitrogen', 'Phosphorus', 'Potassium'])
plt.show()

# Measure execution time and display
print("Execution time for pair plot:", time.perf_counter() - start_time, "seconds")

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Correlation heatmap. Only numeric columns are kept, so a non-numeric column
# (such as a string Label) is dropped rather than encoded.
# Alternative that was considered: one-hot encode instead with pd.get_dummies(df).
numeric_df = df.select_dtypes(include=['number'])
corr_matrix = numeric_df.corr()

plt.figure(figsize=(12, 6))
sns.heatmap(corr_matrix, annot=True)
plt.show()



# Data

As observed from the heat map, Potassium and Phosphorus have a high correlation value of 0.74.


In [None]:
!pip install nbformat

In [None]:
from sklearn.decomposition import PCA
# Project every column except Label onto two principal components and colour
# the points by crop label.
# NOTE(review): the features are not standardised before PCA — confirm that is intended.
pca = PCA(n_components=2)
feature_frame = df.drop(columns=['Label'])
df_pca = pd.DataFrame(pca.fit_transform(feature_frame))
fig = px.scatter(
    x=df_pca[0],
    y=df_pca[1],
    color=df['Label'],
    title="Decomposed Using PCA",
)
fig.show()



In [None]:
# Same projection with three components, shown as a 3-D scatter coloured by label.
# The title reports the summed explained-variance ratio of the three components as a percentage.
pca3=PCA(n_components=3)
df_pca3=pca3.fit_transform(df.drop(['Label'],axis=1))
df_pca3=pd.DataFrame(df_pca3)
fig = px.scatter_3d(x=df_pca3[0],y=df_pca3[1],z=df_pca3[2],color=df['Label'],title=f"Variance Explained : {pca3.explained_variance_ratio_.sum() * 100}%")
fig.show()

In [None]:
# Scatter of Nitrogen against Phosphorus, coloured by crop label.
fig = px.scatter(x=df['Nitrogen'],y=df['Phosphorus'],color=df['Label'],title="Nitrogen VS Phosphorus")
fig.show()

In [None]:
# Scatter of Phosphorus against Potassium, coloured by crop label.
fig = px.scatter(x=df['Phosphorus'],y=df['Potassium'],color=df['Label'],title="Phosphorus VS Potassium")
fig.show()

In [None]:
# Capture the distinct label values before they are replaced by integer codes.
# NOTE(review): unique() keeps order of first appearance, which may not match the
# encoder's class order — prefer encoder.classes_ when mapping codes back to names.
names = df['Label'].unique()
from sklearn.preprocessing import LabelEncoder
encoder=LabelEncoder()
# Overwrites df['Label'] in place with the encoded values; cells above that expect
# the original label values will see codes if re-run after this cell.
df['Label']=encoder.fit_transform(df['Label'])
df.head()

In [None]:
# Features are every column except Label; the target is the encoded Label column.
X=df.drop(['Label'],axis=1)
y=df['Label']
# Splitting into training and test set
from sklearn.model_selection import train_test_split
# 70/30 split, shuffled, stratified on y, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3,shuffle = True, random_state = 42,stratify=y)

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise the features. The scaler is fitted on the training rows only and the
# same fitted transform is then applied to the test rows, so both sets share one
# scale and no test-set statistics leak into the fit.
scaler=StandardScaler()
# Keep the original row index so X_train / X_test stay aligned with y_train / y_test.
X_train=pd.DataFrame(scaler.fit_transform(X_train),columns=X.columns,index=X_train.index)
X_test=pd.DataFrame(scaler.transform(X_test),columns=X.columns,index=X_test.index)
X_train.head()

In [None]:
from sklearn.model_selection import train_test_split
# NOTE(review): this re-splits the raw X and y (10% test, seeded, no stratify) and
# rebinds X_train/X_test/y_train/y_test, replacing the stratified 70/30 split and the
# scaled X_train from the cells above. The fit_and_score call further down therefore
# runs on this unscaled split — confirm that is intended.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.1,random_state=42)

In [None]:
!pip install catboost

In [None]:
!pip install lightgbm

In [None]:
pip install --upgrade pandas


In [None]:
pip install xgboost

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.svm import NuSVC
from xgboost import XGBClassifier
#import lightgbm as lgb
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from catboost import CatBoostClassifier


In [None]:
pip install tqdm

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

In [None]:

# Import other necessary classifiers
from tqdm import tqdm

# Define your models
# Candidate classifiers, keyed by display name. All use library defaults except
# CatBoost, which is silenced (verbose=False) to match the later model dictionaries
# in this notebook and to keep its training log out of the progress-bar output.
models = {
    'Logistic Regression': LogisticRegression(),
    'KNN': KNeighborsClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Decision Tree': DecisionTreeClassifier(),
    'SVC': SVC(),
    'LinearSVC': LinearSVC(),
    'NuSVC': NuSVC(),
    'XGBoost': XGBClassifier(),
    'Linear Discriminant Analysis': LinearDiscriminantAnalysis(),
    'Gaussian Naive Bayes': GaussianNB(),
    'Bagging': BaggingClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'Extra Trees': ExtraTreesClassifier(),
    'CatBoost': CatBoostClassifier(verbose=False),
}

# Fit models and evaluate accuracy
def fit_and_score(models, X_train, X_test, y_train, y_test):
    """Fit every model on the training data and collect its test-set score.

    Args:
        models: mapping of display name -> estimator exposing ``fit`` and ``score``.
        X_train, y_train: data passed to each estimator's ``fit``.
        X_test, y_test: data passed to each estimator's ``score``.

    Returns:
        dict mapping each display name to the value returned by ``score``.

    The estimators are fitted in place. NumPy's global seed is set to 42 first.
    A tqdm bar tracks progress and shows the latest model name and score.
    """
    np.random.seed(42)
    results = {}
    bar = tqdm(models.items(), desc="Fitting and scoring models", total=len(models))
    for label, estimator in bar:
        estimator.fit(X_train, y_train)
        current = estimator.score(X_test, y_test)
        results[label] = current
        # Surface the most recent result next to the progress bar.
        bar.set_postfix({"Current model": label, "Score": current})
    return results

# Fit and score every model in `models` on the current split, then print the
# resulting name -> score dictionary.
model_scores = fit_and_score(models, X_train, X_test, y_train, y_test)
print(model_scores)



In [None]:
# Display the type of the score container returned by fit_and_score.
type(model_scores)

In [None]:
# Second definition of `models`, with different display names and some explicit
# hyperparameters (LinearSVC C=2, KNN n_neighbors=5/p=2, CatBoost verbose=False).
# NOTE(review): this rebinds `models` to fresh, unfitted estimators, and the notebook
# does not call fit_and_score again afterwards, so `model_scores` still holds the
# results of the first dictionary.
models = {'Logistic Regression': LogisticRegression(),
         'Random Forest': RandomForestClassifier(),
         'Tree': DecisionTreeClassifier(),
         "SVC": SVC(),
          "Linear SVC":LinearSVC(C=2),
          "NU SVC":NuSVC(),
         "XGBoost": XGBClassifier(),
         "KNN":KNeighborsClassifier(n_neighbors = 5, p=2),
          #"Light GBM": lgb.LGBMClassifier(),
          "LDA":LinearDiscriminantAnalysis(),
          "Gaussian NB":GaussianNB(),
          "AdaBoost":AdaBoostClassifier(),
          "Gradient Boosting":GradientBoostingClassifier(),
          "Bagging":BaggingClassifier(),
          "Extra Trees":ExtraTreesClassifier(),
          "Cat Boost":CatBoostClassifier(verbose=False)}

def fit_and_score(models,X_train,X_test,y_train,y_test):
    """Fit every model on the training data and return its test-set score.

    Args:
        models: mapping of display name -> estimator exposing ``fit`` and ``score``.
        X_train, y_train: data passed to each estimator's ``fit``.
        X_test, y_test: data passed to each estimator's ``score``.

    Returns:
        dict mapping each display name (in the order of ``models``) to the value
        returned by that estimator's ``score``.

    The estimators are fitted in place. NumPy's global seed is set to 42 first.
    This definition replaces the earlier progress-bar version of the same name.
    """
    np.random.seed(42)
    results = dict.fromkeys(models)
    for label in results:
        estimator = models[label]
        estimator.fit(X_train, y_train)
        results[label] = estimator.score(X_test, y_test)
    return results

In [None]:
def plot_dict_as_bar(dict_data, title=None):
    """Draw a bar chart with one bar per dictionary entry and show it.

    Args:
        dict_data: mapping whose keys become the x tick labels and whose values
            become the bar heights.
        title: optional chart title; no title is set when it is falsy.

    NOTE: a later cell defines a function with this same name, which replaces
    this one once that cell has run.
    """
    bar_labels, bar_heights = list(dict_data.keys()), list(dict_data.values())

    plt.bar(bar_labels, bar_heights)
    plt.xlabel('Models')
    plt.ylabel('Accuracy')
    if title:
        plt.title(title)

    # Slanted, right-aligned tick labels keep long names readable.
    plt.xticks(rotation=45, ha='right')
    # Recompute the layout so the rotated labels are not clipped.
    plt.tight_layout()
    plt.show()

In [None]:
def plot_dict_as_bar(model_scores, title='Bar Plot'):
    """Draw a 10x6 sky-blue bar chart of scores, one bar per entry, and show it.

    Args:
        model_scores: mapping of name -> score; keys label the bars, values set
            their heights.
        title: chart title.

    The y axis is fixed to the range 0..1.
    """
    import matplotlib.pyplot as plt

    bar_labels, bar_heights = list(model_scores.keys()), list(model_scores.values())

    plt.figure(figsize=(10, 6))
    plt.bar(bar_labels, bar_heights, color='skyblue')
    plt.title(title)
    plt.xlabel('Model')
    plt.ylabel('Accuracy')
    plt.ylim(0, 1)  # fixed 0..1 range on the y axis
    plt.xticks(rotation=45, ha='right')
    plt.show()

# Print the score dictionary so the plotted entries can be checked by eye.
print("Model Scores:", model_scores)

# Plot the scores as a bar chart
plot_dict_as_bar(model_scores, title='Bar Plot of Accuracy')


In [None]:
def cm_and_score(models,X_train,X_test,y_train,y_test):
    """Show a confusion matrix, classification report and accuracy for each model.

    Args:
        models: mapping of display name -> estimator exposing ``predict``.
        X_train, y_train: not used by this function; kept so the signature
            matches the other evaluation helpers in this notebook.
        X_test, y_test: evaluation features and true labels.

    The estimators are NOT fitted here — only ``predict`` is called, so every
    estimator must already be fitted before this function is used.
    """
    np.random.seed(42)
    for name,model in models.items():
        print('**************   '+ name + '   ***********')
        y_pred = model.predict(X_test)

        # sklearn metrics take (y_true, y_pred) in that order.
        Acc = accuracy_score(y_test,y_pred)
        # No `labels` restriction: the matrix has to cover every class present in
        # the data, not just classes 0 and 1.
        cm = confusion_matrix(y_test,y_pred)
        print('Confusion Matrix')
        # fmt='g' prints the cell counts as plain numbers.
        sns.heatmap(cm,cmap = 'Greens',annot = True,fmt = 'g',cbar_kws = {'orientation':'horizontal'})
        plt.show()

        print(classification_report(y_test,y_pred))
        print('.:.'+ name +' Accuracy'+'\033[1m {:.3f}%'.format(Acc*100)+' .:.')
        print('      ')
        print('      ')
        print('      ')


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, BaggingClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split

# Import LinearSVC
from sklearn.svm import LinearSVC

# Split the data into training and testing sets: 80/20, seeded, not stratified.
# NOTE(review): this is the third split of the same X and y in this notebook and
# rebinds X_train/X_test/y_train/y_test once more.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define models: third definition of `models`, again with fresh unfitted estimators.
# Compared with the previous dictionary, NuSVC is omitted and LinearSVC uses its defaults.
models = {'Logistic Regression': LogisticRegression(),
          'Random Forest': RandomForestClassifier(),
          'DecisionTreeClassifier': DecisionTreeClassifier(),
          'SVC': SVC(),
          'Linear SVC': LinearSVC(),  # Include LinearSVC in the models
          'XGBoost': XGBClassifier(),
          'KNN': KNeighborsClassifier(n_neighbors=5, p=2),
          'LDA': LinearDiscriminantAnalysis(),
          'Gaussian NB': GaussianNB(),
          'AdaBoost': AdaBoostClassifier(),
          'Gradient Boosting': GradientBoostingClassifier(),
          'Bagging': BaggingClassifier(),
          'Extra Trees': ExtraTreesClassifier(),
          'Cat Boost': CatBoostClassifier(verbose=False)}

def cm_and_score(models, X_train, X_test, y_train, y_test):
    """Fit each model, then print its report, plot its confusion matrix and print accuracy.

    Args:
        models: mapping of display name -> estimator exposing ``fit``,
            ``predict`` and ``score``.
        X_train, y_train: data passed to each estimator's ``fit``.
        X_test, y_test: data used for prediction and scoring.

    For every model, in order: a banner line, the classification report
    (two digits), a 10x8 confusion-matrix heatmap, and the accuracy as a
    percentage. The estimators are fitted in place.
    """
    for name, model in models.items():
        banner = '**************   ' + name + '   ***********'
        print(banner)

        # Train on the training data, then predict the held-out rows.
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        print(classification_report(y_test, predictions, digits=2))

        # Confusion matrix drawn as an annotated heatmap.
        matrix = confusion_matrix(y_test, predictions)
        plt.figure(figsize=(10, 8))
        sns.heatmap(matrix, annot=True, cmap='Blues', fmt='g')
        plt.title('Confusion Matrix')
        plt.xlabel('Predicted')
        plt.ylabel('Actual')
        plt.show()

        accuracy = model.score(X_test, y_test)
        print(f"{name} Accuracy: {accuracy*100:.2f}%")

# Fit and evaluate every model in `models` on the 80/20 split defined above in this cell.
cm_and_score(models, X_train, X_test, y_train, y_test)


In [None]:
# Show the current column names of the frame.
print(df.columns)


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix

# Stand-alone toy example on five hand-written binary labels; it does not use the
# crop data or any of the models above.
# NOTE(review): it rebinds y_pred, report and cm at notebook scope.
y_true = [1, 0, 1, 1, 0]  # Actual labels
y_pred = [1, 0, 1, 0, 1]  # Predicted labels

# Generate classification report
report = classification_report(y_true, y_pred)
print("Classification Report:")
print(report)

# Generate confusion matrix
cm = confusion_matrix(y_true, y_pred)
print("Confusion Matrix:")
print(cm)

# Plot confusion matrix as a heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, cmap='Blues', fmt='g')
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()


In [None]:
import joblib

# Save each trained model under its own filename. Writing every model to one shared
# 'model.pkl' would overwrite the file on each iteration and keep only the last model.
for name, model in models.items():
    joblib.dump(model, f'{name}_model.pkl')

# Save the preprocessing object this notebook actually creates: the LabelEncoder
# fitted on the Label column, needed to turn predicted codes back into crop names.
joblib.dump(encoder, 'label_encoder.pkl')

In [None]:
import joblib

# Save the trained models with unique filenames
# The dictionary key is used verbatim in the filename, so keys containing spaces
# produce filenames containing spaces.
for name, model in models.items():
    filename = f'{name}_model.pkl'  # Unique filename for each model
    joblib.dump(model, filename)
