# Step 1) Data Import and Preprocessing

In [2]:
#import Necessary libraries
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout, SimpleRNN


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.1.3 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "C:\Users\Shagun\anaconda3\Lib\site-packages\ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "C:\Users\Shagun\anaconda3\Lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "C:\Users\Shagun\anaconda3\Lib\site-packages\ipykernel\kernelapp.py", line 701, in start
    self.io_loop.start()
  File "C:\Users\Shagun\anaconda3\Lib\site-

ImportError: 
A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.1.3 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.



ImportError: numpy.core.multiarray failed to import

In [None]:
# Read the symptom/prognosis CSV into a DataFrame.
# The path is an absolute location on the author's machine.
csv_path = "C:\\Users\\rahul\\Downloads\\symbipredict_2022.csv"
data = pd.read_csv(csv_path)

In [None]:
# Display basic information about the dataset.
# DataFrame.info() prints its summary directly (it returns None), so it is
# called on its own line rather than wrapped in print().
print("Data Information:\n")
data.info()

In [None]:
# Data-quality check: report missing values and duplicated rows.

# Number of missing entries per column (isna is the pandas alias of isnull).
missing_values = data.isna().sum()

# Keep only the columns that actually contain missing entries.
columns_with_gaps = missing_values.loc[missing_values > 0]
print("Missing Values:\n", columns_with_gaps)

# Number of rows that exactly repeat an earlier row.
duplicates = data.duplicated().sum()
print(f"Number of Duplicate Rows: {duplicates}")

In [None]:
# Clean the dataset:
#   1. drop rows that exactly duplicate an earlier row;
#   2. replace every remaining missing value with 0, on the assumption that a
#      missing entry means the symptom is absent (reasonable for binary
#      symptom flags; mean/median/mode would be alternatives for other data).
data = data.drop_duplicates().fillna(0)

# Total number of missing cells across the whole frame; 0 confirms that
# nothing is left to impute.
remaining_missing = data.isnull().sum().sum()
print("Remaining Missing Values:\n", remaining_missing)


# Step 2) Exploratory Data Analysis (EDA)

In [None]:
# (rows, columns) of the DataFrame at this point in the notebook.
data.shape

In [None]:
# Preview the first rows of the dataset.
# Leaving data.head() as the last expression of the cell lets Jupyter render it
# as a formatted table. Wrapping it in print() would fall back to plain-text
# output without the table layout.
data.head()

In [None]:
# Summary statistics for the numerical columns: count, mean, standard
# deviation, min, quartiles and max. Helps to understand the range and
# distribution of each symptom column.
data.describe()

In [None]:
# Number of rows recorded for each prognosis.
prognosis_counts = data['prognosis'].value_counts()
print(prognosis_counts)


In [None]:
# Summarise the target column: how many distinct prognoses exist, and which.
target_column = data['prognosis']

# Count of distinct target classes.
unique_prognoses = target_column.nunique()
print(f"Number of unique prognoses: {unique_prognoses}")
print('\n')

# The distinct target classes themselves, in order of first appearance.
prognoses_list = target_column.unique()
print("Unique prognoses:\n")
print(prognoses_list)

In [None]:
# Horizontal bar chart of how many rows belong to each prognosis.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(y='prognosis', data=data, palette="Set2", ax=ax)
ax.set_title('Distribution of Prognosis')
ax.set_xlabel('Count')
ax.set_ylabel('Prognosis')
plt.show()

In [None]:
# Print the dtype of every column, to confirm which columns are numeric before
# computing correlations or training models.
print(data.dtypes)


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Pairwise correlation between the symptom columns. The target column
# 'prognosis' is dropped first so that only the feature columns remain.
numeric_data = data.drop(columns=['prognosis'])
correlation_matrix = numeric_data.corr()

# Annotated heatmap of the correlation matrix (adjust figsize as needed).
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    fmt='.2f',
    linewidths=0.5,
    ax=ax,
)
ax.set_title('Correlation Matrix of Symptoms')
plt.show()


In [None]:
# Rank the symptoms by Random Forest feature importance and plot the top 30.

# Step 1: Prepare data
X = data.drop('prognosis', axis=1)  # Features
y = data['prognosis']  # Target

# Step 2: Train a Random Forest model to get feature importance
# NOTE(review): the forest is fitted on every row, including rows that later
# cells hold out for testing, so the ranking has already seen the test data.
model = RandomForestClassifier(random_state=42)
model.fit(X, y)

# Step 3: Calculate and display feature importance
importances = model.feature_importances_

# One row per feature, most important first
feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

# Display top 40 important features
print("Top 40 Important Features:")
print(feature_importance.head(40))

# Step 4: Visualize the top 30 features as horizontal bars.
# DataFrame.plot() opens its own figure unless it is given an Axes, so the
# sized figure is created first and passed in through `ax`. Otherwise the
# requested figsize applies to an empty figure and the chart uses the default.
fig, ax = plt.subplots(figsize=(14, 12))
feature_importance.head(30).plot(
    kind='barh', x='Feature', y='Importance', legend=False, color='skyblue', ax=ax
)

# Larger fonts for readability; y tick labels stay horizontal
ax.set_xlabel('Importance Score', fontsize=14)
ax.set_ylabel('Features', fontsize=14)
ax.set_title('Top 30 Feature Importance', fontsize=16)
ax.tick_params(axis='y', labelrotation=0, labelsize=12)

# barh draws the first row at the bottom; flip so the most important is on top
ax.invert_yaxis()

# tight_layout recomputes the margins, so no manual subplots_adjust is needed
fig.tight_layout()

# Show the plot
plt.show()



In [None]:
# Redraw the symptom correlation heatmap on a larger canvas.
fig, ax = plt.subplots(figsize=(15, 12))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    fmt='.2f',
    linewidths=0.5,
    ax=ax,
)

# Vertical x labels and horizontal y labels keep long symptom names legible.
plt.xticks(rotation=90)
plt.yticks(rotation=0)

ax.set_title('Correlation Matrix of Symptoms')
plt.show()


# Step 4: Random Forest Classifier on top 30 features

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

# Restrict the feature matrix to the 30 highest-ranked symptoms.
top_30_features = feature_importance['Feature'].head(30)
X_top_30 = X.loc[:, top_30_features]

# 70% of the rows for training, 30% held out for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X_top_30, y, test_size=0.3, random_state=42
)

# Random Forest with 100 trees, fitted on the training rows only.
rfc_top_30 = RandomForestClassifier(n_estimators=100, random_state=42)
rfc_top_30.fit(X_train, y_train)

# Predictions for the held-out rows.
y_pred = rfc_top_30.predict(X_test)

# --- Evaluation ---

# Accuracy on the test set.
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {test_accuracy:.4f}")

# Accuracy on the training set (for comparison against the test accuracy).
train_accuracy = rfc_top_30.score(X_train, y_train)
print(f"Training Accuracy: {train_accuracy:.4f}")

# 5-fold cross-validation on the training rows.
cv_scores = cross_val_score(rfc_top_30, X_train, y_train, cv=5)
print(f"Cross-validation scores: {cv_scores}")
print(f"Mean Cross-validation score: {cv_scores.mean():.4f}")

# Per-class precision, recall and F1-score on the test set.
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Random Forest Classifier on top 40 features:

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_score, train_test_split

# Features and target taken from the cleaned DataFrame.
X = data.drop('prognosis', axis=1)
y = data['prognosis']

# Fit a forest on all rows purely to rank the features.
rfc = RandomForestClassifier(n_estimators=100, random_state=42)
rfc.fit(X, y)

# One row per feature, most important first.
feature_importances = pd.DataFrame(
    {'Feature': X.columns, 'Importance': rfc.feature_importances_}
).sort_values(by='Importance', ascending=False)

# Keep the 40 highest-ranked features.
top_40_features = feature_importances['Feature'].head(40)
X_selected = X.loc[:, top_40_features]

# 70/30 train/test split on the reduced feature matrix.
X_train, X_test, y_train, y_test = train_test_split(
    X_selected, y, test_size=0.3, random_state=42
)

# class_weight='balanced' reweights classes inversely to their frequency.
rfc_top_40 = RandomForestClassifier(
    n_estimators=100, random_state=42, class_weight='balanced'
)
rfc_top_40.fit(X_train, y_train)

# Predictions and accuracy on the held-out rows.
y_pred_top_40 = rfc_top_40.predict(X_test)
accuracy = accuracy_score(y_test, y_pred_top_40)
print(f"Accuracy with top 40 features: {accuracy:.4f}")

# zero_division=0 reports undefined precision/recall as 0 instead of warning.
print("Classification Report:")
print(classification_report(y_test, y_pred_top_40, zero_division=0))

# 3-fold cross-validation on the training rows.
cv_scores = cross_val_score(rfc_top_40, X_train, y_train, cv=3, scoring='accuracy')
print(f"Cross-validation scores: {cv_scores}")
print(f"Mean cross-validation score: {cv_scores.mean():.4f}")




In [None]:
# First CNN attempt, using the top 40 features

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, Flatten
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

# Step 1: Prepare the data for CNN
# Conv1D expects (samples, steps, channels). The number of steps is read from
# X_selected instead of being hard-coded to 40: with a different column count
# a fixed reshape either fails or silently mixes values from different rows.
n_selected_features = X_selected.shape[1]
X_cnn = X_selected.values.reshape(-1, n_selected_features, 1)
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
y_cnn = to_categorical(y_encoded)  # One-hot encode target variable

X_train_cnn, X_test_cnn, y_train_cnn, y_test_cnn = train_test_split(X_cnn, y_cnn, test_size=0.3, random_state=42)

# Step 2: Define the CNN model
cnn_model = Sequential([
    Conv1D(32, kernel_size=3, activation='relu', input_shape=(n_selected_features, 1)),
    Flatten(),
    Dense(64, activation='relu'),
    Dense(y_cnn.shape[1], activation='softmax')  # One output unit per class
])

cnn_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Step 3: Train the CNN model
cnn_model.fit(X_train_cnn, y_train_cnn, epochs=10, batch_size=16, verbose=1)

# Step 4: Evaluate the CNN model (evaluate returns [loss, accuracy])
cnn_accuracy = cnn_model.evaluate(X_test_cnn, y_test_cnn, verbose=0)[1]
print(f"CNN Accuracy: {cnn_accuracy:.4f}")

# Step 5: Classification Report
# argmax turns softmax probabilities / one-hot rows back into class indices
y_pred_cnn = cnn_model.predict(X_test_cnn).argmax(axis=1)
y_test_true = y_test_cnn.argmax(axis=1)
print("Classification Report (CNN):")
print(classification_report(y_test_true, y_pred_cnn))


In [None]:
# First RNN attempt, using the top 40 features

In [None]:
from tensorflow.keras.layers import SimpleRNN

# Step 1: Prepare the data for RNN (same layout as the CNN input).
# The step count is read from X_selected rather than hard-coded to 40.
# NOTE: X_rnn is not used below; training and evaluation reuse the
# X_train_cnn / X_test_cnn split created in the CNN cell.
X_rnn = X_selected.values.reshape(-1, X_selected.shape[1], 1)

# Step 2: Define the RNN model
# input_shape follows the tensor that is actually fed to fit(), so the model
# always matches its training data.
rnn_model = Sequential([
    SimpleRNN(64, activation='relu', input_shape=X_train_cnn.shape[1:]),
    Dense(64, activation='relu'),
    Dense(y_cnn.shape[1], activation='softmax')  # One output unit per class
])

rnn_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Step 3: Train the RNN model
rnn_model.fit(X_train_cnn, y_train_cnn, epochs=10, batch_size=16, verbose=1)

# Step 4: Evaluate the RNN model (evaluate returns [loss, accuracy])
rnn_accuracy = rnn_model.evaluate(X_test_cnn, y_test_cnn, verbose=0)[1]
print(f"RNN Accuracy: {rnn_accuracy:.4f}")

# Step 5: Classification Report
y_pred_rnn = rnn_model.predict(X_test_cnn).argmax(axis=1)
print("Classification Report (RNN):")
print(classification_report(y_test_true, y_pred_rnn))


In [None]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Final summary of the three models trained on the top 40 features
print("for 40 features: \n")
print("\nModel Comparison:")
print(f"Random Forest Accuracy: {accuracy * 100:.2f}%")
print(f"CNN Accuracy: {cnn_accuracy * 100:.2f}%")
print(f"RNN Accuracy: {rnn_accuracy * 100:.2f}%")

# Random Forest on all features versus the top 40 features.
# An earlier version of this cell fitted `rfc_all` here, before the split
# below. At that point X_train still held the top-40 split from a previous
# cell, so the "all features" figures it printed were really top-40 results.
# That stale fit is removed; the comparison below uses one consistent split.

# Step 1: Prepare the data
X = data.drop('prognosis', axis=1)  # Features
y = data['prognosis']  # Target variable

# Step 2: Get feature importances from Random Forest
# NOTE(review): the ranking is fitted on all rows, including the rows held
# out below, so the feature selection has already seen the test data.
rfc = RandomForestClassifier(n_estimators=100, random_state=42)
rfc.fit(X, y)

# Get feature importances and sort them, most important first
feature_importances = pd.DataFrame({
    'Feature': X.columns,
    'Importance': rfc.feature_importances_
})
feature_importances = feature_importances.sort_values(by='Importance', ascending=False)

# Step 3: Select the top 40 important features
top_40_features = feature_importances.head(40)['Feature']

# Step 4: Feature matrix restricted to the top 40 features
X_selected = X[top_40_features]

# Split once on all features, then slice the same rows down to the top 40
# columns so both models are trained and tested on identical rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_train_top_40, X_test_top_40 = X_train[top_40_features], X_test[top_40_features]

# Step 5: Train the Random Forest Classifier with all features
rfc_all = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
rfc_all.fit(X_train, y_train)
y_pred_all = rfc_all.predict(X_test)

# Accuracy and report for model with all features
accuracy_all = accuracy_score(y_test, y_pred_all)
print(f"Accuracy with all features: {accuracy_all:.4f}")

print("Classification Report for all features:")
print(classification_report(y_test, y_pred_all, zero_division=0))

# Cross-validation with all features
cv_scores_all = cross_val_score(rfc_all, X_train, y_train, cv=3, scoring='accuracy')
print(f"Cross-validation scores (All Features): {cv_scores_all}")
print(f"Mean cross-validation score (All Features): {cv_scores_all.mean():.4f}")

# Step 6: Train the Random Forest Classifier with top 40 features
rfc_top_40 = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
rfc_top_40.fit(X_train_top_40, y_train)
y_pred_top_40 = rfc_top_40.predict(X_test_top_40)

# Accuracy and report for model with top 40 features
accuracy_top_40 = accuracy_score(y_test, y_pred_top_40)
print(f"Accuracy with top 40 features: {accuracy_top_40:.4f}")

print("Classification Report for top 40 features:")
print(classification_report(y_test, y_pred_top_40, zero_division=0))

# Cross-validation with top 40 features
cv_scores_top_40 = cross_val_score(rfc_top_40, X_train_top_40, y_train, cv=3, scoring='accuracy')
print(f"Cross-validation scores (Top 40 Features): {cv_scores_top_40}")
print(f"Mean cross-validation score (Top 40 Features): {cv_scores_top_40.mean():.4f}")


In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Every class that occurs in either the true or the predicted labels, sorted,
# to be used as the tick labels of the plot.
unique_labels = sorted(set(y_test).union(y_pred_top_40))

# Confusion matrix of the top-40-feature Random Forest on the test rows.
cm = confusion_matrix(y_test, y_pred_top_40)

# Render the matrix with a blue colour map.
cmd = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=unique_labels)
cmd.plot(cmap='Blues')
plt.title("Confusion Matrix for Top 40 Features")
plt.show()


In [None]:
# Step 2: Building and Training a CNN Model

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature column (zero mean, unit variance), which usually
# helps neural-network training.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Add a trailing channel axis so each sample becomes a 1-D sequence with one
# channel: (samples, features) -> (samples, features, 1), as Conv1D expects.
sample_count, feature_count = X_scaled.shape
X_scaled = X_scaled.reshape(sample_count, feature_count, 1)


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense

# 1-D CNN: convolution -> max pooling -> flatten -> dense -> softmax.
# The output layer has one unit per distinct class in y.
cnn_model = Sequential([
    Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(X_scaled.shape[1], 1)),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(units=128, activation='relu'),
    Dense(units=len(np.unique(y)), activation='softmax'),
])

# sparse_categorical_crossentropy expects integer class indices as targets.
cnn_model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)


In [None]:

# Import necessary libraries
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, Flatten, Dropout
from tensorflow.keras.utils import to_categorical

# One-hot encoding the target variable
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(data['prognosis'])
y_onehot = to_categorical(y_encoded)

# Normalizing feature data by each column's maximum.
# A column whose maximum is 0 (a symptom that never occurs) would give 0/0 = NaN
# and poison training, so such columns are divided by 1 and stay all-zero.
X = data.drop(columns=['prognosis'])
column_max = X.max().replace(0, 1)
X_normalized = X / column_max

# Splitting data (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X_normalized, y_onehot, test_size=0.2, random_state=42)

# Reshaping input for CNN: (samples, features) -> (samples, features, 1)
X_train_cnn = X_train.values.reshape(X_train.shape[0], X_train.shape[1], 1)
X_test_cnn = X_test.values.reshape(X_test.shape[0], X_test.shape[1], 1)

# Building the CNN model
cnn_model = Sequential([
    Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(X_train_cnn.shape[1], 1)),
    Dropout(0.3),
    Flatten(),
    Dense(128, activation='relu'),
    Dense(y_onehot.shape[1], activation='softmax')  # One output unit per class
])

# Compiling the model (one-hot targets -> categorical_crossentropy)
cnn_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Training the model
# NOTE(review): the test split doubles as validation data here.
cnn_history = cnn_model.fit(X_train_cnn, y_train, validation_data=(X_test_cnn, y_test), epochs=20, batch_size=32)
    

In [None]:

# LSTM layer for the recurrent model
from tensorflow.keras.layers import LSTM

# Present each sample as a sequence of one timestep holding all features:
# (samples, features) -> (samples, 1, features).
rnn_feature_count = X_train.shape[1]
X_train_rnn = X_train.values.reshape(X_train.shape[0], 1, rnn_feature_count)
X_test_rnn = X_test.values.reshape(X_test.shape[0], 1, X_test.shape[1])

# LSTM -> dense -> softmax with one output unit per class.
rnn_model = Sequential()
rnn_model.add(LSTM(64, activation='relu', input_shape=(1, X_train_rnn.shape[2])))
rnn_model.add(Dense(128, activation='relu'))
rnn_model.add(Dense(y_onehot.shape[1], activation='softmax'))

# One-hot targets -> categorical_crossentropy.
rnn_model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Train for 20 epochs, tracking metrics on the test split after each epoch.
rnn_history = rnn_model.fit(
    X_train_rnn,
    y_train,
    validation_data=(X_test_rnn, y_test),
    epochs=20,
    batch_size=32,
)
    

In [None]:

# Evaluate CNN model on the held-out test data.
# The model is compiled with metrics=['accuracy'], so evaluate() returns
# [loss, accuracy].
cnn_loss, cnn_accuracy = cnn_model.evaluate(X_test_cnn, y_test)
print(f"CNN Accuracy: {cnn_accuracy*100:.2f}%")

# Evaluate RNN model on the same test rows (reshaped for the recurrent input)
rnn_loss, rnn_accuracy = rnn_model.evaluate(X_test_rnn, y_test)
print(f"RNN Accuracy: {rnn_accuracy*100:.2f}%")

# Compare with Random Forest (from earlier work)
# Placeholder reminder only: no Random Forest metric is printed by this cell.
print("Include your Random Forest metrics here for comparison.")
    

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, Flatten, Dropout, LSTM, Bidirectional
from tensorflow.keras.utils import to_categorical
from sklearn.utils.class_weight import compute_class_weight

# The dataset is expected to be loaded already as `data` by an earlier cell.

# Encode target variable: class names -> integer indices -> one-hot rows
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(data['prognosis'])
y_onehot = to_categorical(y_encoded)

# Feature matrix (every column except the target)
X = data.drop(columns=['prognosis'])

# Split data into train and test sets (with stratification) BEFORE scaling,
# so that the scaler's mean/variance are learned from the training rows only
# and no test-set statistics leak into preprocessing.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y_onehot, test_size=0.2, stratify=y_encoded, random_state=42
)

# Fit the scaler on the training rows, then apply the same transform to test
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Whole dataset scaled with the training statistics; kept so that code which
# refers to X_normalized continues to work.
X_normalized = scaler.transform(X)


In [None]:
# Add a trailing channel axis for Conv1D: (samples, features, 1).
X_train_cnn = np.expand_dims(X_train, axis=-1)
X_test_cnn = np.expand_dims(X_test, axis=-1)

# 'balanced' weights are inversely proportional to class frequency; Keras
# wants them as a {class_index: weight} mapping.
balanced_weights = compute_class_weight(
    'balanced', classes=np.unique(y_encoded), y=y_encoded
)
class_weights = dict(enumerate(balanced_weights))

# Convolution -> dropout -> flatten -> dense -> softmax (one unit per class).
cnn_model = Sequential()
cnn_model.add(Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(X_train_cnn.shape[1], 1)))
cnn_model.add(Dropout(0.3))
cnn_model.add(Flatten())
cnn_model.add(Dense(128, activation='relu'))
cnn_model.add(Dense(y_onehot.shape[1], activation='softmax'))

cnn_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train for 50 epochs with the class weights applied to the loss; the test
# split is used as validation data.
cnn_history = cnn_model.fit(
    X_train_cnn,
    y_train,
    validation_data=(X_test_cnn, y_test),
    epochs=50,
    batch_size=32,
    class_weight=class_weights,
)

# evaluate() returns [loss, accuracy] for this compile configuration.
cnn_loss, cnn_accuracy = cnn_model.evaluate(X_test_cnn, y_test)
print(f"CNN Accuracy: {cnn_accuracy*100:.2f}%")


In [None]:
# Insert a single-timestep axis for the recurrent layer: (samples, 1, features).
X_train_rnn = np.expand_dims(X_train, axis=1)
X_test_rnn = np.expand_dims(X_test, axis=1)

# Bidirectional LSTM -> dropout -> dense -> softmax (one unit per class).
rnn_model = Sequential()
rnn_model.add(
    Bidirectional(
        LSTM(64, activation='relu', return_sequences=False),
        input_shape=(1, X_train.shape[1]),
    )
)
rnn_model.add(Dropout(0.3))
rnn_model.add(Dense(128, activation='relu'))
rnn_model.add(Dense(y_onehot.shape[1], activation='softmax'))

rnn_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train for 50 epochs with the class weights computed in the CNN cell; the
# test split is used as validation data.
rnn_history = rnn_model.fit(
    X_train_rnn,
    y_train,
    validation_data=(X_test_rnn, y_test),
    epochs=50,
    batch_size=32,
    class_weight=class_weights,
)

# evaluate() returns [loss, accuracy] for this compile configuration.
rnn_loss, rnn_accuracy = rnn_model.evaluate(X_test_rnn, y_test)
print(f"RNN Accuracy: {rnn_accuracy*100:.2f}%")


In [None]:
# Print metrics for the three model families side by side.
# NOTE(review): `accuracy` is not assigned in this cell; it is whatever an
# earlier Random Forest cell last stored (the top-40-feature run with a 70/30
# split), whereas the CNN/RNN figures come from an 80/20 stratified split on
# all features - confirm this is the intended comparison.
print(f"Random Forest Accuracy: {accuracy*100:.2f}%")
print(f"CNN Accuracy: {cnn_accuracy*100:.2f}%")
print(f"RNN Accuracy: {rnn_accuracy*100:.2f}%")


In [None]:
import matplotlib.pyplot as plt

# One figure per model: training versus validation accuracy across epochs.
for model_label, fit_history in (("CNN", cnn_history), ("RNN", rnn_history)):
    curves = fit_history.history
    plt.plot(curves['accuracy'], label=f'{model_label} Train Accuracy')
    plt.plot(curves['val_accuracy'], label=f'{model_label} Validation Accuracy')
    plt.title(f'{model_label} Accuracy')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.show()


In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Assuming data is already loaded in 'X' (features) and 'y' (labels)

# Step 1: Reshape data for RNN (since it's not time-series, we add 1 timestep)
X_reshaped = X.values.reshape((X.shape[0], 1, X.shape[1]))

# Step 2: Encode the target variable as integer class indices
encoder = LabelEncoder()
y_encoded = encoder.fit_transform(y)
n_classes = len(encoder.classes_)

# Step 3: Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_reshaped, y_encoded, test_size=0.2, random_state=42)

# Step 4: Build the RNN model
model = Sequential()
model.add(SimpleRNN(64, activation='relu', input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dropout(0.2))
model.add(Dense(32, activation='relu'))
model.add(Dense(n_classes, activation='softmax'))  # One output unit per class

# Step 5: Compile the model (integer targets -> sparse_categorical_crossentropy)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Step 6: Train the model
history = model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_test, y_test))

# Step 7: Make predictions
y_pred_rnn = model.predict(X_test)
y_pred_rnn = np.argmax(y_pred_rnn, axis=1)  # Softmax probabilities -> class index

# Step 8: Evaluate model
accuracy_rnn = accuracy_score(y_test, y_pred_rnn)
print("\nRNN Accuracy:", accuracy_rnn)
print("\nClassification Report (RNN):")
print(classification_report(y_test, y_pred_rnn))

# Step 9: Confusion Matrix for RNN
# `labels` forces one row/column per encoded class. Without it, a class that is
# absent from both y_test and the predictions is dropped from the matrix and
# the encoder.classes_ tick labels no longer line up with the cells.
print("\nConfusion Matrix (RNN):")
cm_rnn = confusion_matrix(y_test, y_pred_rnn, labels=np.arange(n_classes))
plt.figure(figsize=(15, 12))  # Room for one tick label per class
sns.heatmap(cm_rnn, annot=True, fmt="d", cmap="Blues", xticklabels=encoder.classes_, yticklabels=encoder.classes_)
plt.title('Confusion Matrix with RNN')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Step 1: Reshape the data for CNN: (samples, features) -> (samples, features, 1)
X_reshaped_cnn = X.values.reshape((X.shape[0], X.shape[1], 1))  # Add a channel dimension

# Step 2: Encode the target variable as integer class indices
encoder = LabelEncoder()
y_encoded = encoder.fit_transform(y)
n_classes = len(encoder.classes_)

# Step 3: Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_reshaped_cnn, y_encoded, test_size=0.2, random_state=42)

# Step 4: Build the CNN model
model_cnn = Sequential()
model_cnn.add(Conv1D(64, 3, activation='relu', input_shape=(X_train.shape[1], X_train.shape[2])))
model_cnn.add(MaxPooling1D(2))
model_cnn.add(Flatten())
model_cnn.add(Dropout(0.2))
model_cnn.add(Dense(64, activation='relu'))
model_cnn.add(Dense(n_classes, activation='softmax'))  # One output unit per class

# Step 5: Compile the model (integer targets -> sparse_categorical_crossentropy)
model_cnn.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Step 6: Train the model
history_cnn = model_cnn.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_test, y_test))

# Step 7: Make predictions
y_pred_cnn = model_cnn.predict(X_test)
y_pred_cnn = np.argmax(y_pred_cnn, axis=1)  # Softmax probabilities -> class index

# Step 8: Evaluate model
accuracy_cnn = accuracy_score(y_test, y_pred_cnn)
print("\nCNN Accuracy:", accuracy_cnn)
print("\nClassification Report (CNN):")
print(classification_report(y_test, y_pred_cnn))

# Step 9: Confusion Matrix for CNN
# `labels` forces one row/column per encoded class. Without it, a class that is
# absent from both y_test and the predictions is dropped from the matrix and
# the encoder.classes_ tick labels no longer line up with the cells.
print("\nConfusion Matrix (CNN):")
cm_cnn = confusion_matrix(y_test, y_pred_cnn, labels=np.arange(n_classes))
plt.figure(figsize=(15, 12))  # Room for one tick label per class
sns.heatmap(cm_cnn, annot=True, fmt="d", cmap="Blues", xticklabels=encoder.classes_, yticklabels=encoder.classes_)
plt.title('Confusion Matrix with CNN')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()
