In [3]:

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import classification_report, accuracy_score
import warnings

warnings.filterwarnings("ignore")

# Load the dataset
df = pd.read_excel(r"C:\Users\saurabh shekhar\OneDrive\Desktop\MACHINE LEARNING\ml_qsar.xlsx")
df = df.fillna(0)

# Define features and target variable
y = df["ACTIVITY"]
X = df.drop(['ACTIVITY'], axis=1)

# Replace inf values and drop rows with NaNs
X.replace([np.inf, -np.inf], np.nan, inplace=True)
X.dropna(inplace=True)
X = X.astype(np.float32)

# Ensure y is aligned with X
y = y.loc[X.index]
X.reset_index(drop=True, inplace=True)
y.reset_index(drop=True, inplace=True)

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Feature selection
selector = SelectKBest(score_func=f_classif, k=400)
X_new = selector.fit_transform(X_scaled, y)
selected_features = selector.get_support(indices=True)

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.3, random_state=99)

# Train the RandomForestClassifier with regularization parameters
clf = RandomForestClassifier(
    n_estimators=100, 
    max_depth=10, 
    min_samples_split=5, 
    min_samples_leaf=2, 
    random_state=42
)
clf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = clf.predict(X_test)

# Calculate accuracy and classification report
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(y_test, y_pred))

# Cross-validation scores
cv = StratifiedKFold(n_splits=5)
cross_val_scores = cross_val_score(clf, X_new, y, cv=cv)

print(f'Cross-Validation Scores: {cross_val_scores}')
print(f'Mean Cross-Validation Score: {cross_val_scores.mean()}')

# Save the selected features
selected_feature_names = X.columns[selected_features]
print(f'Selected features: {selected_feature_names}')


Accuracy: 0.7890625
              precision    recall  f1-score   support

           0       0.84      0.83      0.84       249
           1       0.69      0.72      0.71       135

    accuracy                           0.79       384
   macro avg       0.77      0.77      0.77       384
weighted avg       0.79      0.79      0.79       384

Cross-Validation Scores: [0.765625   0.76171875 0.78431373 0.76078431 0.70980392]
Mean Cross-Validation Score: 0.7564491421568628
Selected features: Index(['nAcid', 'ALogP', 'apol', 'naAromAtom', 'nAromBond', 'nAtom',
       'nHeavyAtom', 'nC', 'nN', 'nO',
       ...
       'SRW10', 'TSRW', 'MW', 'WTPT-1', 'WTPT-3', 'WTPT-4', 'WTPT-5', 'WPATH',
       'WPOL', 'Zagreb'],
      dtype='object', length=400)
