In [226]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

%matplotlib inline

In [227]:
df = pd.read_csv('/Users/rudrasawant/Downloads/Disease Prediction/data/yield.csv')
df.head()

Unnamed: 0,District,Block,No of samples,Acidic (%),Neutral (%),Alkaline (%),Normal EC (%),OC (%),P (%),K (%),Ca (%),Mg (%),S (%),Zn (%),B (%),Fe (%),Cu (%),Mn (%),Yield (kg/ha)
0,Angul,Angul,150,51,21,29,100,42,45,7,0,11,46,62,76,3,0,7,2634
1,Angul,Athmalik,130,47,31,22,100,37,42,9,5,24,28,55,70,2,0,2,2446
2,Angul,Banarpal,90,30,26,44,100,39,39,0,1,9,13,56,73,3,1,16,2429
3,Angul,Chhendipada,100,60,22,18,100,33,43,13,9,34,24,82,70,11,7,23,2587
4,Angul,Kaniha,150,93,1,5,100,34,49,25,11,35,33,76,80,1,2,17,2641


In [228]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 310 entries, 0 to 309
Data columns (total 19 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   District       310 non-null    object
 1   Block          310 non-null    object
 2   No of samples  310 non-null    int64 
 3   Acidic (%)     310 non-null    int64 
 4   Neutral (%)    310 non-null    int64 
 5   Alkaline (%)   310 non-null    int64 
 6   Normal EC (%)  310 non-null    int64 
 7   OC (%)         310 non-null    int64 
 8   P (%)          310 non-null    int64 
 9   K (%)          310 non-null    int64 
 10  Ca (%)         310 non-null    int64 
 11  Mg (%)         310 non-null    int64 
 12  S (%)          310 non-null    int64 
 13  Zn (%)         310 non-null    int64 
 14  B (%)          310 non-null    int64 
 15  Fe (%)         310 non-null    int64 
 16  Cu (%)         310 non-null    int64 
 17  Mn (%)         310 non-null    int64 
 18  Yield (kg/ha)  310 non-null   

In [229]:
# Numerical Features
numerical_features = [feature for feature in df.columns if df[feature].dtypes != 'object']
print("Numerical Features: ", len(numerical_features))

# Categorical Features
categorical_features = [feature for feature in df.columns if df[feature].dtypes == 'object']
print("Categorical Features: ", len(categorical_features))

# Discrete Features
discrete_features = [feature for feature in numerical_features if len(df[feature].unique()) <= 25]
print("Discrete Features: ", len(discrete_features))

# Continuous Features
continuous_features = [feature for feature in numerical_features if feature not in discrete_features]
print("Continuous Features: ", len(continuous_features))

Numerical Features:  17
Categorical Features:  2
Discrete Features:  4
Continuous Features:  13


In [230]:
df.columns = df.columns.str.strip()

In [231]:
# Independent & Dependent Features
X = df.drop(columns=['Yield (kg/ha)'])
Y = df['Yield (kg/ha)']

In [232]:
X.head()

Unnamed: 0,District,Block,No of samples,Acidic (%),Neutral (%),Alkaline (%),Normal EC (%),OC (%),P (%),K (%),Ca (%),Mg (%),S (%),Zn (%),B (%),Fe (%),Cu (%),Mn (%)
0,Angul,Angul,150,51,21,29,100,42,45,7,0,11,46,62,76,3,0,7
1,Angul,Athmalik,130,47,31,22,100,37,42,9,5,24,28,55,70,2,0,2
2,Angul,Banarpal,90,30,26,44,100,39,39,0,1,9,13,56,73,3,1,16
3,Angul,Chhendipada,100,60,22,18,100,33,43,13,9,34,24,82,70,11,7,23
4,Angul,Kaniha,150,93,1,5,100,34,49,25,11,35,33,76,80,1,2,17


In [233]:
Y.head()

0    2634
1    2446
2    2429
3    2587
4    2641
Name: Yield (kg/ha), dtype: int64

In [234]:
# Applying Label Encoding to Categorical Features
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
X['District'] = label_encoder.fit_transform(X['District'])
X['Block'] = label_encoder.fit_transform(X['Block'])

# Create a ColumnTransformer for One-Hot Encoding
numerical_features = X.select_dtypes(exclude=['object']).columns
categorical_features = X.select_dtypes(include=['object']).columns

preprocessor = ColumnTransformer(
    [
        ('OneHotEncoder', OneHotEncoder(), categorical_features),
        ('StandardScaler', StandardScaler(), numerical_features)
    ], remainder='passthrough'
)

In [235]:
# Train-test split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [236]:
# Scaling the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## Model Training

In [237]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard

import datetime

In [238]:
model = Sequential([
    # Input Layer connected to the first hidden layer
    # Starts with a reasonable number of neurons (e.g., 128)
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    # Add a Dropout layer to prevent overfitting
    Dropout(0.3),

    # Second hidden layer to learn more complex patterns
    Dense(64, activation='relu'),
    # Add another Dropout layer
    Dropout(0.3),

    # Output layer for binary classification
    Dense(1, activation="linear")
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [239]:
model.summary()

In [240]:
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
loss = tf.keras.losses.MeanSquaredError()
model.compile(optimizer=optimizer, loss=loss, metrics=['mean_squared_error'])
metrics = ["mae", "mse"]

In [241]:
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorflow_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)

In [242]:
early_stopping_callback = EarlyStopping(
    monitor='val_loss', # Monitor validation loss for early stopping
    patience=20, # Number of epochs with no improvement after which training will be stopped
    restore_best_weights=True # Restore model weights from the epoch with the best value of the monitored quantity
)

In [243]:
history = model.fit(
    X_train,
    Y_train,
    validation_data=(X_test, Y_test),
    epochs=150,
    callbacks=[tensorflow_callback, early_stopping_callback]
)

Epoch 1/150
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 18ms/step - loss: 8625591.0000 - mean_squared_error: 8625591.0000 - val_loss: 8329832.5000 - val_mean_squared_error: 8329833.0000
Epoch 2/150
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 8438273.0000 - mean_squared_error: 8438273.0000 - val_loss: 7987261.0000 - val_mean_squared_error: 7987261.0000
Epoch 3/150
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 7911069.0000 - mean_squared_error: 7911069.0000 - val_loss: 7161958.5000 - val_mean_squared_error: 7161958.5000
Epoch 4/150
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 6807951.5000 - mean_squared_error: 6807951.5000 - val_loss: 5601734.0000 - val_mean_squared_error: 5601734.0000
Epoch 5/150
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 4842077.5000 - mean_squared_error: 4842077.5000 - val_loss: 3322232.7500 - val_mean_squared_erro

In [246]:
model.save('/Users/rudrasawant/Downloads/Disease Prediction/models/historical_model.h5')

