In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import seaborn as sns


# Step 1: Load the feature table from the Excel workbook.
# The path and sheet name are named constants so they can be changed in one place;
# replace 'Sheet1' with the actual sheet name if it's different.
FEATURES_XLSX = 'Important_Features.xlsx'
FEATURES_SHEET = 'Sheet1'

df = pd.read_excel(FEATURES_XLSX, sheet_name=FEATURES_SHEET)

# Preview only the first rows instead of rendering the whole frame.
df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'Important_Features.xlsx'

In [None]:
# Label vector: fifty 0s followed by fifty 1s, in row order
target_values = [label for label in (0, 1) for _ in range(50)]

# Attach the labels to the DataFrame as the 'target' column
df['target'] = target_values
df

In [None]:
from sklearn.model_selection import train_test_split

# Step 2: Separate the labels from the feature matrix.
# The label column is expected to be named 'target'; adjust both lines if it is called something else.
y = df['target']
X = df.drop('target', axis=1, errors='ignore')

# Hold out 20% of the rows as a test set.
# train_test_split shuffles before splitting by default; random_state makes the shuffle
# reproducible and stratify=y keeps the class 0 / class 1 proportions equal in both parts.
split_options = {'test_size': 0.2, 'random_state': 42, 'stratify': y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Step 3: Fit the Random Forest on the training part only (fit returns the estimator itself)
clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Step 4: Pair every training column with its importance score
feature_names = X_train.columns
importances = clf.feature_importances_
importance_df = pd.DataFrame({'gene': feature_names, 'importance': importances})

# Step 5: Rank by importance (highest first), keep the first 100 rows and show them
ranked = importance_df.sort_values('importance', ascending=False)
top_features = ranked.iloc[:100]
print(top_features)

# Save the top features to a new Excel file (without the index);
# the next cell reads this file back.
top_features.to_excel('Top_Important_Features.xlsx', index=False)


In [None]:
# Read the saved top-feature table back from disk and display it
features = pd.read_excel(io='Top_Important_Features.xlsx')
features

In [None]:
# Values of the 'gene' column, i.e. the names of the selected features
genes_array = features.loc[:, 'gene'].values

In [None]:
# Keep only the columns of df whose label appears in genes_array.
# 'target' is not expected to be among the gene names, so it is filtered out here
# and has to be re-attached afterwards.
# .copy() makes new_df an independent frame: adding or renaming columns on it later
# then neither raises SettingWithCopyWarning nor risks writing into a view of df.
new_df = df.loc[:, df.columns.isin(genes_array)].copy()

new_df

In [None]:
# Re-attach the class labels to the reduced frame.
# The labels are reused from df['target'] (matched on the row index) instead of being
# rebuilt from hard-coded counts, so they cannot drift from the labels used for feature
# selection. assign() returns a new frame, which avoids the SettingWithCopyWarning that
# a plain `new_df['target'] = ...` triggers on a frame sliced from df, and makes the
# cell safe to re-run.
new_df = new_df.assign(target=df['target'])
new_df

In [None]:
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler

# Convert the column labels (not the data) to strings before scaling.
# This deliberately updates new_df itself, because later cells reuse it.
new_df.columns = new_df.columns.astype(str)

# Select the features by name rather than by position, so the label column is
# excluded wherever it sits in the frame.
tsne_features = new_df.drop(columns=['target'])
labels = new_df['target'].values  # or use the correct column name for your target

# Standardize the data
scaler = StandardScaler()
data_scaled = scaler.fit_transform(tsne_features)

# Run t-SNE (random_state fixed for reproducibility)
tsne = TSNE(n_components=2, random_state=42, perplexity=30)
tsne_results = tsne.fit_transform(data_scaled)

# Plot the 2-D embedding, colouring each sample by its class label.
fig, ax = plt.subplots(figsize=(10, 8))
scatter = ax.scatter(
    tsne_results[:, 0], tsne_results[:, 1],
    c=labels, cmap='coolwarm', edgecolor='k', alpha=0.7
)
ax.set_title('t-SNE visualization of top genes target color coding')
ax.set_xlabel('TSNE-1')
ax.set_ylabel('TSNE-2')
# The label is categorical, so show one legend entry per class instead of a
# continuous colorbar.
ax.legend(*scatter.legend_elements(), title='Target Label')
ax.grid(True)
plt.show()

In [None]:
from sklearn.model_selection import train_test_split

# Labels and feature matrix taken from the reduced frame
y = new_df['target']               # target column
X = new_df.drop('target', axis=1)  # every column except 'target'

# Stratified 80/20 hold-out split.
# train_test_split shuffles by default; random_state=42 makes the shuffle reproducible,
# and stratify=y gives train and test the same proportion of class 0 and class 1.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42, test_size=0.2
)

In [None]:
# Sanity check: shape of the training feature matrix
X_train.shape

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Fit a Random Forest on the training split.
# No PCA is applied anywhere in this cell; the model is fitted on X_train as it is.
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Predict the labels of the held-out test split
y_pred = rf.predict(X_test)

# Evaluate the predictions against the true test labels
holdout_accuracy = accuracy_score(y_test, y_pred)
holdout_confusion = confusion_matrix(y_test, y_pred)
holdout_report = classification_report(y_test, y_pred)

print("Accuracy:", holdout_accuracy)
print("Confusion Matrix:\n", holdout_confusion)
print("Classification Report:\n", holdout_report)


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import StratifiedKFold, cross_val_predict, cross_val_score
from sklearn.pipeline import Pipeline
import numpy as np

# Define classifier
rf = RandomForestClassifier(random_state=42)

# Cross-validate on the full feature table, not on the pre-filtered new_df.
# The columns of new_df were chosen from importances fitted on labelled samples that
# end up in the CV test folds, so scoring that matrix leaks label information and
# gives optimistic results. Here the gene selection is part of the pipeline and is
# therefore re-fitted on the training portion of every fold.
X_cv = df.drop(columns=['target'])
y_cv = df['target']

cv_model = Pipeline([
    ('select', SelectFromModel(
        RandomForestClassifier(n_estimators=100, random_state=42),
        threshold=-np.inf,                       # select purely by rank, not by a cut-off value
        max_features=min(100, X_cv.shape[1]),    # keep the 100 most important features
    )),
    ('classify', rf),  # cross-validation clones the pipeline, so rf itself stays unfitted
])

# Define Stratified K-Fold
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)  # 5 folds

# Perform Cross-Validation
scores = cross_val_score(cv_model, X_cv, y_cv, cv=kfold, scoring='accuracy')  # You can also use 'f1', 'roc_auc' etc.

print("Cross-Validation Accuracy Scores:", scores)
print("Mean Accuracy:", np.mean(scores))
print("Standard Deviation:", np.std(scores))

# cross_val_score only returns scores; cross_val_predict returns the out-of-fold
# prediction for every sample, which the confusion matrix and report need.
y_pred = cross_val_predict(cv_model, X_cv, y_cv, cv=kfold)

print("\nConfusion Matrix:\n", confusion_matrix(y_cv, y_pred))
print("\nClassification Report:\n", classification_report(y_cv, y_pred))