In [None]:
# Imports pandas, reads the Excel workbook, and combines the first two EC-number columns into a single train/test target per protein
import pandas as pd

# Location of the training workbook.
# NOTE(review): absolute, machine-specific path -- update it before running elsewhere.
file_path = '/Users/carmenshero/Desktop/Crumble_Cookie/FINAL_ACTUALLALY_READY_TO_TRAIN.xlsx'

# Load the workbook into a DataFrame
data = pd.read_excel(file_path)

# Build the prediction target "<ec_first>.<ec_second>" from the string forms of the two columns
ec_first_text = data['ec_first'].astype(str)
ec_second_text = data['ec_second'].astype(str)
data['ec_combined'] = ec_first_text + '.' + ec_second_text

# Report success, then show the leading rows as the cell's output
print("Data Loaded and EC Combined Successfully!")
data.head()

In [None]:
# Performs the train-test split on the data
from sklearn.model_selection import train_test_split

# Columns that are identifiers or (parts of) the label, and therefore not model inputs
non_feature_columns = ['id', 'ec_first', 'ec_second', 'ec_combined']

X = data.drop(columns=non_feature_columns)  # feature matrix: every remaining column
y = data['ec_combined']  # target: the combined EC label

# Hold out 20% of the rows for testing; the split is seeded and stratified on the label
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Report the dimensions of both partitions
print(f"Train set shape: {X_train.shape}, Test set shape: {X_test.shape}")

In [None]:
# Random Forest Let's goooo
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt


# Train a Random Forest with default hyperparameters (seed fixed for reproducibility)
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train, y_train)

# Predict on the held-out test set
y_pred_rf = rf_classifier.predict(X_test)

# Per-class precision / recall / F1 on the test set
print("Random Forest Classification Report:")
print(classification_report(y_test, y_pred_rf))

# Confusion matrix visualization.
# The matrix axes are pinned to the classes the model was trained on. Without
# `labels`, confusion_matrix only covers labels that occur in y_test / y_pred_rf,
# so a trained class missing from both would leave the matrix smaller than the
# tick-label list below and the heatmap ticks would not line up with the cells.
class_labels = rf_classifier.classes_
conf_matrix_rf = confusion_matrix(y_test, y_pred_rf, labels=class_labels)
sns.heatmap(conf_matrix_rf, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_labels, yticklabels=class_labels)
plt.title("Confusion Matrix for Random Forest")
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.show()


In [None]:
#GridSearch to find best hyperparameters for Random Forest
from sklearn.model_selection import GridSearchCV
import seaborn as sns
import matplotlib.pyplot as plt


# Hyperparameter grid to search: 5 * 3 * 3 * 3 = 135 combinations
param_grid = {
    'n_estimators': [100, 200, 250, 300, 400],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Exhaustive search over the grid, scored by weighted F1 with 5-fold cross-validation
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,  # 5-fold cross-validation
    scoring='f1_weighted',  # Metric for evaluation
    n_jobs=-1  # Use all processors
)

# Run the search on the training split only; the test split stays untouched
grid_search.fit(X_train, y_train)

# Best parameters and their mean cross-validated score
print("Best Parameters from GridSearchCV:", grid_search.best_params_)
print("Best F1-Weighted Score from GridSearchCV:", grid_search.best_score_)

# Model refit by GridSearchCV with the best parameters
best_rf_classifier = grid_search.best_estimator_

# Predict on the test set using the tuned model
y_pred_best_rf = best_rf_classifier.predict(X_test)

# Evaluate the tuned model
print("Tuned Random Forest Classification Report:")
print(classification_report(y_test, y_pred_best_rf))

# Confusion matrix visualization for the tuned model.
# The matrix axes are pinned to the classes the model was trained on. Without
# `labels`, confusion_matrix only covers labels that occur in y_test /
# y_pred_best_rf, so a trained class missing from both would leave the matrix
# smaller than the tick-label list and the heatmap ticks would not line up.
tuned_class_labels = best_rf_classifier.classes_
conf_matrix_best_rf = confusion_matrix(y_test, y_pred_best_rf, labels=tuned_class_labels)
sns.heatmap(conf_matrix_best_rf, annot=True, fmt='d', cmap='Blues',
            xticklabels=tuned_class_labels, yticklabels=tuned_class_labels)
plt.title("Confusion Matrix for Tuned Random Forest")
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.show()


In [None]:
# Rank the input features by how much the tuned Random Forest relies on them
feature_names = X_train.columns
importances = best_rf_classifier.feature_importances_

# One row per feature, most important first
importance_columns = {'Feature': feature_names, 'Importance': importances}
feature_importance_df = pd.DataFrame(importance_columns).sort_values(
    by='Importance',
    ascending=False,
)

# Text listing of the ranking
print("Feature Importances:")
print(feature_importance_df)

# Horizontal bar chart of the same ranking
plt.figure(figsize=(10, 8))
sns.barplot(data=feature_importance_df, x='Importance', y='Feature')
plt.title("Feature Importance from Tuned Random Forest")
plt.xlabel("Importance")
plt.ylabel("Feature")
plt.show()


In [None]:
# Feature importance for tuned and untuned Random Forest models
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Importances reported by the grid-search-tuned forest
tuned_importances = best_rf_classifier.feature_importances_

# Importances from a forest with default hyperparameters, trained on the same split
# (fit() returns the estimator itself, so construction and training can be chained)
default_rf_classifier = RandomForestClassifier(random_state=42).fit(X_train, y_train)
default_importances = default_rf_classifier.feature_importances_

# Side-by-side table, ordered by the tuned model's ranking
feature_names = X_train.columns
comparison_columns = {
    'Feature': feature_names,
    'Tuned Importance': tuned_importances,
    'Default Importance': default_importances,
}
feature_importance_df = pd.DataFrame(comparison_columns).sort_values(
    by='Tuned Importance',
    ascending=False,
)

# Text listing of the comparison
print("Feature Importances (Comparison):")
print(feature_importance_df)

# Overlay both models on one set of axes: tuned bars are drawn first,
# then the semi-transparent default bars are drawn on top of them
plt.figure(figsize=(12, 8))
bar_layers = (
    ('Tuned Importance', dict(color="blue", label="Tuned RF")),
    ('Default Importance', dict(color="orange", alpha=0.6, label="Default RF")),
)
for importance_column, bar_style in bar_layers:
    sns.barplot(x=importance_column, y='Feature', data=feature_importance_df, **bar_style)
plt.title("Feature Importance: Tuned vs. Default Random Forest")
plt.xlabel("Importance")
plt.ylabel("Feature")
plt.legend()
plt.show()


In [None]:
# Experiment to compare default and GridSearch-tuned hyperparameters
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import pandas as pd

def _weighted_avg_row(state, report):
    """Build one results row from a classification_report dictionary.

    Parameters:
        state: the random seed the model was trained with.
        report: the dict returned by classification_report(..., output_dict=True).

    Returns:
        dict with the seed plus the weighted-average precision, recall and F1.
    """
    weighted = report['weighted avg']
    return {
        'Random State': state,
        'Precision': weighted['precision'],
        'Recall': weighted['recall'],
        'F1-Score': weighted['f1-score'],
    }


# Random states to test
random_states = [0, 21, 42, 84, 123]

# One results row per seed for each model variant
default_results = []
tuned_results = []

# Train and score both variants once per seed, on the same train/test split
for state in random_states:
    print(f"\n=== Testing with Random State: {state} ===")

    # Default Random Forest
    default_rf = RandomForestClassifier(random_state=state)
    default_rf.fit(X_train, y_train)
    y_pred_default = default_rf.predict(X_test)
    default_report = classification_report(y_test, y_pred_default, output_dict=True)
    default_results.append(_weighted_avg_row(state, default_report))

    # Tuned Random Forest: pass through every parameter the grid search selected,
    # so a parameter later added to the grid is not silently left at its default.
    # The seed is set last so that it always reflects the current loop iteration.
    tuned_params = {**grid_search.best_params_, 'random_state': state}
    tuned_rf = RandomForestClassifier(**tuned_params)
    tuned_rf.fit(X_train, y_train)
    y_pred_tuned = tuned_rf.predict(X_test)
    tuned_report = classification_report(y_test, y_pred_tuned, output_dict=True)
    tuned_results.append(_weighted_avg_row(state, tuned_report))

# Convert results to DataFrames for easier analysis
default_df = pd.DataFrame(default_results)
tuned_df = pd.DataFrame(tuned_results)

# Display results
print("\nDefault Random Forest Results:")
print(default_df)
print("\nTuned Random Forest Results:")
print(tuned_df)


In [None]:
import matplotlib.pyplot as plt

# One figure per metric, each comparing the default and tuned forests across seeds.
# Each entry is (noun used in the title, results column, suffix appended to legend labels).
metric_panels = [
    ("F1-Scores", "F1-Score", ""),
    ("Precision", "Precision", " Precision"),
    ("Recall", "Recall", " Recall"),
]

for title_noun, metric_column, legend_suffix in metric_panels:
    plt.figure(figsize=(10, 6))
    # Default model first, tuned model second, so legend order is the same in every figure
    for results_frame, model_name in ((default_df, 'Default RF'), (tuned_df, 'Tuned RF')):
        plt.plot(
            results_frame['Random State'],
            results_frame[metric_column],
            label=f"{model_name}{legend_suffix}",
            marker='o',
        )
    plt.title(f"Comparison of {title_noun} Across Random States")
    plt.xlabel("Random State")
    plt.ylabel(f"{metric_column} (Weighted)")
    plt.legend()
    plt.grid()
    plt.show()
