In [None]:
pip install pandas openpyxl



In [None]:
import pandas as pd

# Load the dataset
df = pd.read_csv('Smiles.csv')

# Inspect the first few rows to ensure it loaded correctly
print("First few rows of the dataframe:")
print(df.head())

# Check for any missing values and handle them if necessary
if df.isnull().values.any():
    print("Handling missing values...")
    df = df.dropna()  # Drop rows with missing values
    # Alternatively, you could fill them with a default value or use other imputation methods

# Rename columns if needed (based on your image, it looks like 'Name' and 'smiles')
df.columns = ['Name', 'smiles']

# Inspect the cleaned dataframe
print("Cleaned dataframe:")
print(df.head())

# Convert to JSON
json_output = df.to_json(orient='records')

# Save to a JSON file
with open('output.json', 'w') as json_file:
    json_file.write(json_output)

print("Data has been converted to JSON and saved to output.json")


First few rows of the dataframe:
  Name                                             smiles
0   A1  O=C1c2ccc3c4c2c2-c5c6c7c(cc15)C(=O)[n]1c([n]c5...
1   A2  O=C1OC(=NN1c1[n][n]c(o1)-c1cc([n]c2ccccc21)-c1...
2   A3  N#Cc1c(C)c(/C=N/c2cccc(n2)/N=C/c2c(C)c(C#N)c3n...
3   A5  N#Cc1c(C)c(/C=N/c2ccc(c(c2)/N=C/c2c(C)c(C#N)c3...
4   A6  [O-][N+](=O)c1cc(ccc1)C1CC(=NN1c1ccc(cc1)[N+](...
Cleaned dataframe:
  Name                                             smiles
0   A1  O=C1c2ccc3c4c2c2-c5c6c7c(cc15)C(=O)[n]1c([n]c5...
1   A2  O=C1OC(=NN1c1[n][n]c(o1)-c1cc([n]c2ccccc21)-c1...
2   A3  N#Cc1c(C)c(/C=N/c2cccc(n2)/N=C/c2c(C)c(C#N)c3n...
3   A5  N#Cc1c(C)c(/C=N/c2ccc(c(c2)/N=C/c2c(C)c(C#N)c3...
4   A6  [O-][N+](=O)c1cc(ccc1)C1CC(=NN1c1ccc(cc1)[N+](...
Data has been converted to JSON and saved to output.json


In [None]:
pip install rdkit

Collecting rdkit
  Downloading rdkit-2023.9.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.9/34.9 MB[0m [31m22.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: rdkit
Successfully installed rdkit-2023.9.6


In [None]:
import pandas as pd
from rdkit import Chem
from rdkit.ML.Descriptors import MoleculeDescriptors
from rdkit.Chem import Descriptors

# Load the dataset
input_csv = 'Smiles.csv'
df = pd.read_csv(input_csv)

# List of descriptor names
descriptor_names = [desc[0] for desc in Descriptors._descList]

# Create a descriptor calculator
calculator = MoleculeDescriptors.MolecularDescriptorCalculator(descriptor_names)

# Function to calculate descriptors from a SMILES string
def calculate_descriptors(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol:
        return calculator.CalcDescriptors(mol)
    else:
        return [None] * len(descriptor_names)

# Apply the function to calculate descriptors for each SMILES
descriptor_values = df['smiles'].apply(calculate_descriptors)

# Create a DataFrame for descriptors
descriptors_df = pd.DataFrame(descriptor_values.tolist(), columns=descriptor_names)

# Concatenate the original DataFrame with the descriptors DataFrame
final_df = pd.concat([df, descriptors_df], axis=1)

# Save the final DataFrame with descriptors to a new CSV file
output_csv = 'final_output_with_descriptors.csv'
final_df.to_csv(output_csv, index=False)

print(f"Descriptors have been calculated and saved to {output_csv}")


Descriptors have been calculated and saved to final_output_with_descriptors.csv


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Load the data
df = pd.read_csv('Smilesactivity.csv')

# Check the exact column names
print("Columns in the DataFrame:", df.columns.tolist())

# Rename the 'activity' column if it has leading/trailing spaces or other issues
df.rename(columns=lambda x: x.strip(), inplace=True)

# Verify if 'activity' column is now correctly identified
if 'activity' not in df.columns:
    raise KeyError("The 'activity' column is not present in the DataFrame. Please check the column names.")

# Ensure 'activity' column is binary ("Changer" or "NoChanger")
df['activity'] = df['activity'].apply(lambda x: 'Changer' if x == 'Changer' else 'NoChanger')

# Drop unnecessary columns (keeping 'Name' and 'smiles' as features)
X = df[['Name', 'smiles']]
y = df['activity']

# Check if X and y have the correct number of samples
print("Features shape:", X.shape)
print("Target shape:", y.shape)

# Ensure X and y are not empty
if X.shape[0] == 0 or y.shape[0] == 0:
    raise ValueError("The dataset is empty. Please check the data loading and preprocessing steps.")

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Random Forest model (using only numerical features if applicable)
# Assuming 'Name' and 'smiles' are strings, we need to convert them into numerical features
# For this example, let's drop them and use a dummy feature
import numpy as np
X_train = np.random.rand(X_train.shape[0], 1)  # Dummy feature
X_test = np.random.rand(X_test.shape[0], 1)  # Dummy feature

model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Columns in the DataFrame: ['Name', 'smiles', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4', 'Unnamed: 5', 'Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9', 'Unnamed: 10', 'activity ']
Features shape: (88, 2)
Target shape: (88,)
Accuracy: 0.6111111111111112
Classification Report:
               precision    recall  f1-score   support

     Changer       0.67      0.25      0.36         8
   NoChanger       0.60      0.90      0.72        10

    accuracy                           0.61        18
   macro avg       0.63      0.57      0.54        18
weighted avg       0.63      0.61      0.56        18



In [None]:
import pandas as pd
from rdkit import Chem
from rdkit.ML.Descriptors import MoleculeDescriptors
from rdkit.Chem import Descriptors
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Load the dataset
input_csv = 'Smilesactivity.csv'
data = pd.read_csv(input_csv)

# Strip any leading or trailing spaces in column names
data.columns = data.columns.str.strip()

# Print column names to check for inconsistencies
print("Columns in the DataFrame:", data.columns)

# Ensure the DataFrame has 'smiles' and 'activity' columns
if 'smiles' not in data.columns:
    raise ValueError("The CSV file must contain 'smiles' column")
if 'activity' not in data.columns:
    raise ValueError("The CSV file must contain 'activity' column")

# Print the first few rows to verify the data
print("DataFrame head:\n", data.head())

# For demonstration, ensure 'activity' has valid data
data['activity'] = data['activity'].apply(lambda x: 1 if x.strip() == 'Changer' else 0)

# List of descriptor names
descriptor_names = [desc[0] for desc in Descriptors._descList]

# Create a descriptor calculator
calculator = MoleculeDescriptors.MolecularDescriptorCalculator(descriptor_names)

# Function to calculate descriptors from a SMILES string
def calculate_descriptors(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol:
        return calculator.CalcDescriptors(mol)
    else:
        return [None] * len(descriptor_names)

# Calculate descriptors for each molecule
data['descriptors'] = data['smiles'].apply(calculate_descriptors)

# Split descriptors into separate columns
descriptors_df = pd.DataFrame(data['descriptors'].tolist(), columns=descriptor_names)

# Rename 'activity' column in case it is mistakenly created by molecular descriptor calculations
if 'activity' in descriptors_df.columns:
    descriptors_df = descriptors_df.rename(columns={'activity': 'activity_descriptor'})

# Concatenate original data with descriptors
data = pd.concat([data, descriptors_df], axis=1).drop(columns=['descriptors'])

# Drop rows with None (NaN) values in descriptors
data = data.dropna()

# Prepare dataset for modeling
X = data[descriptor_names]
y = data['activity']

# Ensure X and y are not empty
if X.shape[0] == 0 or y.shape[0] == 0:
    raise ValueError("The dataset is empty. Please check the data loading and preprocessing steps.")

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Random Forest model
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

# Save the final DataFrame with descriptors to a new CSV file
output_csv = 'final_output_with_test.csv'
data.to_csv(output_csv, index=False)
print(f"Descriptors and data have been saved to {output_csv}")


Columns in the DataFrame: Index(['Name', 'smiles', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4',
       'Unnamed: 5', 'Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9',
       'Unnamed: 10', 'activity'],
      dtype='object')
DataFrame head:
   Name                                             smiles  Unnamed: 2  \
0   A1  O=C1c2ccc3c4c2c2-c5c6c7c(cc15)C(=O)[n]1c([n]c5...         NaN   
1   A2  O=C1OC(=NN1c1[n][n]c(o1)-c1cc([n]c2ccccc21)-c1...         NaN   
2   A3  N#Cc1c(C)c(/C=N/c2cccc(n2)/N=C/c2c(C)c(C#N)c3n...         NaN   
3   A5  N#Cc1c(C)c(/C=N/c2ccc(c(c2)/N=C/c2c(C)c(C#N)c3...         NaN   
4   A6  [O-][N+](=O)c1cc(ccc1)C1CC(=NN1c1ccc(cc1)[N+](...         NaN   

   Unnamed: 3  Unnamed: 4  Unnamed: 5  Unnamed: 6  Unnamed: 7  Unnamed: 8  \
0         NaN         NaN         NaN         NaN         NaN         NaN   
1         NaN         NaN         NaN         NaN         NaN         NaN   
2         NaN         NaN         NaN         NaN         NaN         NaN   
3        

ValueError: The dataset is empty. Please check the data loading and preprocessing steps.

In [None]:
pip install pandas openpyxl



In [None]:
import pandas as pd

# Load the workbook
excel_file = 'Padel 1.xlsx'
# Read all sheets
sheets = pd.read_excel(excel_file, sheet_name=None)

In [None]:
import pandas as pd

# Load the workbook
excel_file = 'Padel 1.xlsx'
sheets = pd.read_excel(excel_file, sheet_name=None)

# Function to run your model
def run_model(data):
    # Placeholder for model processing
    # Replace with your model code
    results = data.describe()  # Example operation
    return results

# Dictionary to hold results
results = {}

for sheet_name, data in sheets.items():
    print(f"Processing sheet: {sheet_name}")
    results[sheet_name] = run_model(data)

# Save results to a new Excel file
with pd.ExcelWriter('results.xlsx') as writer:
    for sheet_name, result in results.items():
        result.to_excel(writer, sheet_name=sheet_name)


Processing sheet: Original Data
Processing sheet: Original Data (2)
Processing sheet: Transposed
Processing sheet: Topological
Processing sheet: Geometric
Processing sheet: Fingerprint
Processing sheet: Constitutional


In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Load the workbook
excel_file = 'Padel 1.xlsx'
sheets = pd.read_excel(excel_file, sheet_name=None)

# Function to run Random Forest model
def run_random_forest(data):
    # Assuming the class label is in the last column
    X = data.iloc[:, :-1]
    y = data.iloc[:, -1]

    # Splitting the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Initializing and training the Random Forest model
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)

    # Making predictions
    y_pred = model.predict(X_test)

    # Getting classification report
    report = classification_report(y_test, y_pred, output_dict=True)

    # Returning the classification report as a DataFrame
    return pd.DataFrame(report).transpose()

# Dictionary to hold results
results = {}

for sheet_name, data in sheets.items():
    print(f"Processing sheet: {sheet_name}")
    results[sheet_name] = run_random_forest(data)

# Save results to a new Excel file
with pd.ExcelWriter('results1.xlsx') as writer:
    for sheet_name, result in results.items():
        result.to_excel(writer, sheet_name=sheet_name)

print("Processing complete. Results saved to 'results1.xlsx'.")


Processing sheet: Original Data


ValueError: could not convert string to float: 'N4'

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Load the workbook
excel_file = 'Padel 1.xlsx'
sheets = pd.read_excel(excel_file, sheet_name=None)

# Function to run Random Forest model
def run_random_forest(data):
    # Assuming the class label is in the last column
    X = data.iloc[:, :-1]
    y = data.iloc[:, -1]

    # Convert categorical variables to dummy/indicator variables
    X = pd.get_dummies(X)

    # Check if target variable is categorical and convert if necessary
    if y.dtype == 'object' or isinstance(y.dtype, pd.CategoricalDtype):
        y = pd.factorize(y)[0]

    # Splitting the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Initializing and training the Random Forest model
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)

    # Making predictions
    y_pred = model.predict(X_test)

    # Getting classification report
    report = classification_report(y_test, y_pred, output_dict=True)

    # Returning the classification report as a DataFrame
    return pd.DataFrame(report).transpose()

# Dictionary to hold results
results = {}

for sheet_name, data in sheets.items():
    print(f"Processing sheet: {sheet_name}")
    results[sheet_name] = run_random_forest(data)

# Save results to a new Excel file
with pd.ExcelWriter('resultsnew.xlsx') as writer:
    for sheet_name, result in results.items():
        result.to_excel(writer, sheet_name=sheet_name)

print("Processing complete. Results saved to 'resultsnew.xlsx'.")


Processing sheet: Original Data
Processing sheet: Original Data (2)
Processing sheet: Transposed


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


ValueError: Unknown label type: 'continuous'

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

# Load the workbook
excel_file = 'Padel 1.xlsx'
sheets = pd.read_excel(excel_file, sheet_name=None)

# Function to run Random Forest model
def run_random_forest(data):
    # Assuming the class label is in the last column
    X = data.iloc[:, :-1]
    y = data.iloc[:, -1]

    # Convert categorical variables to dummy/indicator variables
    X = pd.get_dummies(X)

    # Ensure target variable is categorical
    if y.dtype == 'float64' or y.dtype == 'int64':
        unique_values = y.nunique()
        if unique_values <= 20:  # arbitrary threshold to determine if the column is categorical
            y = y.astype('category')
        else:
            raise ValueError("Target variable seems to be continuous, cannot perform classification.")

    # Convert target variable to numeric if it's categorical
    if y.dtype.name == 'category':
        le = LabelEncoder()
        y = le.fit_transform(y)

    # Splitting the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

    # Initializing and training the Random Forest model
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)

    # Making predictions
    y_pred = model.predict(X_test)

    # Getting classification report
    report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)

    # Returning the classification report as a DataFrame
    return pd.DataFrame(report).transpose()

# Dictionary to hold results
results = {}

for sheet_name, data in sheets.items():
    try:
        print(f"Processing sheet: {sheet_name}")
        results[sheet_name] = run_random_forest(data)
    except ValueError as e:
        print(f"Skipping sheet {sheet_name} due to error: {e}")

# Save results to a new Excel file
with pd.ExcelWriter('results.xlsx') as writer:
    for sheet_name, result in results.items():
        result.to_excel(writer, sheet_name=sheet_name)

print("Processing complete. Results saved to 'results.xlsx'.")


Processing sheet: Original Data
Processing sheet: Original Data (2)
Processing sheet: Transposed
Skipping sheet Transposed due to error: Target variable seems to be continuous, cannot perform classification.
Processing sheet: Topological
Processing sheet: Geometric
Processing sheet: Fingerprint
Processing sheet: Constitutional
Processing complete. Results saved to 'results.xlsx'.


In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

# Load the workbook
excel_file = 'Padel 1.xlsx'
sheets = pd.read_excel(excel_file, sheet_name=None)

# Function to run Random Forest model
def run_random_forest(data):
    # Assuming the class label is in the last column
    X = data.iloc[:, :-1]
    y = data.iloc[:, -1]

    # Convert categorical variables to dummy/indicator variables
    X = pd.get_dummies(X)

    # Ensure target variable is categorical
    if y.dtype == 'float64' or y.dtype == 'int64':
        unique_values = y.nunique()
        if unique_values <= 20:  # arbitrary threshold to determine if the column is categorical
            y = y.astype('category')
        else:
            raise ValueError("Target variable seems to be continuous, cannot perform classification.")

    # Convert target variable to numeric if it's categorical
    if y.dtype.name == 'category':
        le = LabelEncoder()
        y = le.fit_transform(y)

    # Splitting the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # Initializing and training the Random Forest model
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)

    # Making predictions
    y_pred = model.predict(X_test)

    # Getting classification report
    report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)

    # Returning the classification report as a DataFrame
    return pd.DataFrame(report).transpose()

# Dictionary to hold results
results = {}

# Process each sheet separately
for sheet_name, data in sheets.items():
    try:
        print(f"Processing sheet: {sheet_name}")
        results[sheet_name] = run_random_forest(data)
    except ValueError as e:
        print(f"Skipping sheet {sheet_name} due to error: {e}")

# Save results to a new Excel file
with pd.ExcelWriter('results.xlsx') as writer:
    for sheet_name, result in results.items():
        result.to_excel(writer, sheet_name=sheet_name)

print("Processing complete. Results saved to 'results.xlsx'.")


Processing sheet: Original Data
Processing sheet: Original Data (2)
Processing sheet: Transposed
Skipping sheet Transposed due to error: Target variable seems to be continuous, cannot perform classification.
Processing sheet: Topological
Processing sheet: Geometric
Processing sheet: Fingerprint
Processing sheet: Constitutional
Processing complete. Results saved to 'results.xlsx'.


In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

# Load the workbook
excel_file = 'Padel 1.xlsx'
sheets = pd.read_excel(excel_file, sheet_name=None)

# Function to run Random Forest model with handling class imbalance
def run_random_forest(data):
    # Assuming the class label is in the last column
    X = data.iloc[:, :-1]
    y = data.iloc[:, -1]

    # Convert categorical variables to dummy/indicator variables
    X = pd.get_dummies(X)

    # Ensure no boolean columns are present (convert to integers)
    for col in X.columns:
        if X[col].dtype == 'bool':
            X[col] = X[col].astype(int)

    # Ensure target variable is categorical
    if y.dtype == 'float64' or y.dtype == 'int64':
        unique_values = y.nunique()
        if unique_values <= 20:  # arbitrary threshold to determine if the column is categorical
            y = y.astype('category')
        else:
            raise ValueError("Target variable seems to be continuous, cannot perform classification.")

    # Convert target variable to numeric if it's categorical
    if y.dtype.name == 'category':
        le = LabelEncoder()
        y = le.fit_transform(y)

    # Splitting the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Define the model
    model = RandomForestClassifier(random_state=42)

    # Define oversampling and undersampling strategy
    over = SMOTE(sampling_strategy='auto', random_state=42)
    under = RandomUnderSampler(sampling_strategy='auto', random_state=42)

    # Create a pipeline with oversampling and undersampling
    steps = [('o', over), ('u', under), ('m', model)]
    pipeline = Pipeline(steps=steps)

    # Define hyperparameters for Grid Search
    param_grid = {
        'm__n_estimators': [100, 200],
        'm__max_depth': [10, 20, None],
        'm__min_samples_split': [2, 5, 10]
    }

    # Perform Grid Search with cross-validation
    grid_search = GridSearchCV(pipeline, param_grid, scoring='f1_weighted', cv=5, error_score='raise')
    grid_search.fit(X_train, y_train)

    # Making predictions with the best model
    y_pred = grid_search.best_estimator_.predict(X_test)

    # Getting classification report
    report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)

    # Returning the classification report as a DataFrame
    return pd.DataFrame(report).transpose(), grid_search.best_params_

# Dictionary to hold results
results = {}

# Process each sheet separately
for sheet_name, data in sheets.items():
    try:
        print(f"Processing sheet: {sheet_name}")
        result, best_params = run_random_forest(data)
        results[sheet_name] = (result, best_params)
    except ValueError as e:
        print(f"Skipping sheet {sheet_name} due to error: {e}")
    except Exception as e:
        print(f"Skipping sheet {sheet_name} due to unexpected error: {e}")

# Save results to a new Excel file
with pd.ExcelWriter('results.xlsx') as writer:
    for sheet_name, (result, best_params) in results.items():
        result.to_excel(writer, sheet_name=sheet_name)
        # Save best parameters as an additional sheet
        params_df = pd.DataFrame([best_params])
        params_df.to_excel(writer, sheet_name=f'{sheet_name}_params')

print("Processing complete. Results saved to 'results.xlsx'.")



Processing sheet: Original Data
Processing sheet: Original Data (2)
Processing sheet: Transposed
Skipping sheet Transposed due to error: Target variable seems to be continuous, cannot perform classification.
Processing sheet: Topological
Processing sheet: Geometric
Processing sheet: Fingerprint
Processing sheet: Constitutional
Processing complete. Results saved to 'results.xlsx'.


In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the "Geometric" sheet from the Excel file
file_path = 'Padel 1.xlsx'  # Replace with your file path
geometric_data = pd.read_excel(file_path, sheet_name='Geometric')

# Preprocess the data
# Assume the last column is the target variable, and the rest are features
X = geometric_data.iloc[:, :-1]  # Features
y = geometric_data.iloc[:, -1]   # Target variable

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Random Forest classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
clf.fit(X_train, y_train)

# Make predictions
y_pred = clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Optionally, you can print feature importances
feature_importances = pd.Series(clf.feature_importances_, index=X.columns)
print(feature_importances.sort_values(ascending=False))


ValueError: could not convert string to float: 'M41'

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the "Geometric" sheet from the Excel file
file_path = 'Padel 1.xlsx'  # Replace with your file path
geometric_data = pd.read_excel(file_path, sheet_name='Geometric')

# Preprocess the data
# Assume the last column is the target variable, and the rest are features
# EXCLUDE THE 'Name' COLUMN AS IT CONTAINS NON-NUMERIC DATA
X = geometric_data.iloc[:, 1:-1]  # Features (excluding the first column 'Name')
y = geometric_data.iloc[:, -1]   # Target variable

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Random Forest classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
clf.fit(X_train, y_train)

# Make predictions
y_pred = clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Optionally, you can print feature importances
feature_importances = pd.Series(clf.feature_importances_, index=X.columns)
print(feature_importances.sort_values(ascending=False))

Accuracy: 0.56
PEOE_VSA9       0.061926
VSA_EState8     0.044844
VSA_EState1     0.033103
PEOE_VSA12      0.032693
VSA_EState4     0.029386
EState_VSA5     0.028105
SMR_VSA10       0.027707
PEOE_VSA7       0.025808
VSA_EState3     0.024745
PEOE_VSA8       0.024482
VSA_EState5     0.024384
EState_VSA7     0.023458
SMR_VSA3        0.022532
VSA_EState6     0.022163
SMR_VSA1        0.022006
EState_VSA3     0.021809
EState_VSA1     0.021528
SlogP_VSA3      0.021410
PEOE_VSA2       0.021391
SlogP_VSA5      0.021359
EState_VSA8     0.021093
LabuteASA       0.019643
SlogP_VSA8      0.019163
VSA_EState7     0.019033
SMR_VSA7        0.018971
EState_VSA4     0.018859
SlogP_VSA6      0.018618
PEOE_VSA3       0.018307
SMR_VSA9        0.018210
PEOE_VSA11      0.017787
EState_VSA6     0.017514
EState_VSA10    0.016035
PEOE_VSA13      0.015023
PEOE_VSA10      0.014626
PEOE_VSA6       0.014158
PEOE_VSA4       0.014068
SlogP_VSA1      0.013263
VSA_EState2     0.013244
EState_VSA9     0.012919
EState_VSA

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

# Function to process a given sheet and apply Random Forest
def process_sheet(sheet_name, file_path):
    print(f"Processing sheet: {sheet_name}")

    # Load the sheet from the Excel file
    data = pd.read_excel(file_path, sheet_name=sheet_name)

    # Separate features and target
    X = data.iloc[:, :-1]  # Features
    y = data.iloc[:, -1]   # Target variable

    # Encode categorical features
    X = pd.get_dummies(X)

    # Encode the target variable if it's categorical
    if y.dtype == 'object' or y.dtype.name == 'category':
        le = LabelEncoder()
        y = le.fit_transform(y)

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Initialize the Random Forest classifier
    clf = RandomForestClassifier(n_estimators=100, random_state=42)

    # Train the model
    clf.fit(X_train, y_train)

    # Make predictions
    y_pred = clf.predict(X_test)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy for {sheet_name}: {accuracy:.2f}")

    # Optionally, you can print feature importances
    feature_importances = pd.Series(clf.feature_importances_, index=X.columns)
    print(f"Feature importances for {sheet_name}:\n{feature_importances.sort_values(ascending=False)}\n")

# File path to the Excel file named "Padel 1"
file_path = 'Padel 1.xlsx'  # Replace with your actual file path if different

# Process each specified sheet
sheets = ["Geometric", "Topological", "Fingerprint", "Constitutional"]
for sheet in sheets:
    process_sheet(sheet, file_path)



Processing sheet: Geometric
Accuracy for Geometric: 0.56
Feature importances for Geometric:
PEOE_VSA12     0.047010
PEOE_VSA9      0.043220
VSA_EState6    0.040633
VSA_EState8    0.038152
PEOE_VSA7      0.031984
                 ...   
Name_M36       0.000000
Name_K24       0.000000
Name_K16       0.000000
Name_M4        0.000000
Name_M38       0.000000
Length: 146, dtype: float64

Processing sheet: Topological
Accuracy for Topological: 0.50
Feature importances for Topological:
BalabanJ        0.045299
Kappa2          0.040469
TPSA            0.039099
BCUT2D_MRLOW    0.038703
Chi4v           0.036575
                  ...   
Name_M35        0.000000
Name_K15        0.000000
Name_M39        0.000000
Name_M4         0.000000
Name_M37        0.000000
Length: 118, dtype: float64

Processing sheet: Fingerprint
Accuracy for Fingerprint: 0.50
Feature importances for Fingerprint:
fr_bicyclic              0.065348
fr_NH0                   0.061460
fr_para_hydroxylation    0.052934
fr_Ar_N      

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

# Function to process a given sheet and apply Random Forest
def process_sheet(sheet_name, file_path):
    print(f"Processing sheet: {sheet_name}")

    # Load the sheet from the Excel file
    data = pd.read_excel(file_path, sheet_name=sheet_name)

    # Separate features and target
    X = data.iloc[:, :-1]  # Features
    y = data.iloc[:, -1]   # Target variable

    # Encode categorical features
    X = pd.get_dummies(X)

    # Encode the target variable if it's categorical
    if y.dtype == 'object' or y.dtype.name == 'category':
        le = LabelEncoder()
        y = le.fit_transform(y)

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Initialize the Random Forest classifier
    clf = RandomForestClassifier(n_estimators=100, random_state=42)

    # Train the model
    clf.fit(X_train, y_train)

    # Make predictions
    y_pred = clf.predict(X_test)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy for {sheet_name}: {accuracy:.2f}")

    # Get feature importances
    feature_importances = pd.Series(clf.feature_importances_, index=X.columns)

    return accuracy, feature_importances

# File path to the Excel file named "Padel 1"
file_path = 'Padel 1.xlsx'  # Replace with your actual file path if different

# List of sheets to process
sheets = ["Geometric", "Topological", "Fingerprint", "Constitutional"]

# Initialize lists to store results
accuracies = []
feature_importances_list = []

# Process each specified sheet
for sheet in sheets:
    accuracy, feature_importances = process_sheet(sheet, file_path)
    accuracies.append({'Sheet': sheet, 'Accuracy': accuracy})
    feature_importances_list.append(feature_importances)

# Create a DataFrame for accuracies
accuracies_df = pd.DataFrame(accuracies)

# Create a dictionary of DataFrames for feature importances
feature_importances_dfs = {sheet: fi for sheet, fi in zip(sheets, feature_importances_list)}

# Write the results to an Excel file
with pd.ExcelWriter('RandomForest_Results.xlsx') as writer:
    accuracies_df.to_excel(writer, sheet_name='Accuracies', index=False)
    for sheet, fi_df in feature_importances_dfs.items():
        fi_df.to_frame(name='Importance').to_excel(writer, sheet_name=f'{sheet}_Feature_Importances')


Processing sheet: Geometric
Accuracy for Geometric: 0.56
Processing sheet: Topological
Accuracy for Topological: 0.50
Processing sheet: Fingerprint
Accuracy for Fingerprint: 0.50
Processing sheet: Constitutional
Accuracy for Constitutional: 0.56




In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

# Function to process a given sheet and apply Random Forest
def process_sheet(sheet_name, file_path):
    print(f"Processing sheet: {sheet_name}")

    # Load the sheet from the Excel file
    data = pd.read_excel(file_path, sheet_name=sheet_name)

    # Separate features and target
    X = data.iloc[:, :-1]  # Features
    y = data.iloc[:, -1]   # Target variable

    # Encode categorical features
    X = pd.get_dummies(X)

    # Encode the target variable if it's categorical
    if y.dtype == 'object' or y.dtype.name == 'category':
        le = LabelEncoder()
        y = le.fit_transform(y)

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Initialize the Random Forest classifier
    clf = RandomForestClassifier(n_estimators=100, random_state=42)

    # Train the model
    clf.fit(X_train, y_train)

    # Make predictions
    y_pred = clf.predict(X_test)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy for {sheet_name}: {accuracy:.2f}")

    # Calculate feature importances
    feature_importances = pd.Series(clf.feature_importances_, index=X.columns)

    # Return the results
    return accuracy, feature_importances

# File path to the Excel file named "Padel 1"
file_path = 'Padel 1.xlsx'

# Process each specified sheet and collect results
sheets = ["Geometric", "Topological", "Fingerprint", "Constitutional"]
results = []

for sheet in sheets:
    accuracy, feature_importances = process_sheet(sheet, file_path)
    results.append({
        'sheet': sheet,
        'accuracy': accuracy,
        'feature_importances': feature_importances
    })

# Create a new Excel writer object
output_file_path = 'results.xlsx'
with pd.ExcelWriter(output_file_path, engine='openpyxl') as writer:
    # Write accuracy results
    accuracies_df = pd.DataFrame({
        'Sheet': [result['sheet'] for result in results],
        'Accuracy': [result['accuracy'] for result in results]
    })
    accuracies_df.to_excel(writer, sheet_name='Accuracies', index=False)

    # Write feature importances for each sheet
    for result in results:
        feature_importances_df = result['feature_importances'].reset_index()
        feature_importances_df.columns = ['Feature', 'Importance']
        feature_importances_df.to_excel(writer, sheet_name=f"{result['sheet']} Importances", index=False)

print(f"Results saved to '{output_file_path}'")


In [None]:
pip install lazypredict

Collecting lazypredict
  Downloading lazypredict-0.2.12-py2.py3-none-any.whl (12 kB)
Installing collected packages: lazypredict
Successfully installed lazypredict-0.2.12


In [None]:
import pandas as pd
from lazypredict.Supervised import LazyClassifier
from sklearn.model_selection import train_test_split

# Function to process a given sheet and apply LazyPredict
def process_sheet(sheet_name, file_path):
    print(f"Processing sheet: {sheet_name}")

    # Load the sheet from the Excel file
    data = pd.read_excel(file_path, sheet_name=sheet_name)

    # Separate features and target
    X = data.iloc[:, :-1]  # Features
    y = data.iloc[:, -1]   # Target variable

    # Encode categorical features
    X = pd.get_dummies(X)

    # Encode the target variable if it's categorical
    if y.dtype == 'object' or y.dtype.name == 'category':
        y = pd.factorize(y)[0]

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Initialize the LazyClassifier
    clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)

    # Train and evaluate multiple models
    models, predictions = clf.fit(X_train, X_test, y_train, y_test)

    # Return the models performance
    return models

# File path to the Excel file named "Padel 1"
file_path = 'Padel 1.xlsx'  # Replace with your actual file path if different

# Process each specified sheet and collect results
sheets = ["Geometric", "Topological", "Fingerprint", "Constitutional"]
results = {}

for sheet in sheets:
    models = process_sheet(sheet, file_path)
    results[sheet] = models

# Create a new Excel writer object
output_file_path = 'results_lazy.xlsx'
with pd.ExcelWriter(output_file_path, engine='openpyxl') as writer:
    # Write model performance results for each sheet
    for sheet, models in results.items():
        models_df = models.reset_index()
        models_df.to_excel(writer, sheet_name=sheet, index=False)

print(f"Results saved to '{output_file_path}'")


Processing sheet: Geometric


100%|██████████| 29/29 [00:03<00:00,  7.29it/s]

[LightGBM] [Info] Number of positive: 17, number of negative: 53
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000894 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 873
[LightGBM] [Info] Number of data points in the train set: 70, number of used features: 47
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.242857 -> initscore=-1.137079
[LightGBM] [Info] Start training from score -1.137079
Processing sheet: Topological



100%|██████████| 29/29 [00:03<00:00,  8.91it/s]

[LightGBM] [Info] Number of positive: 17, number of negative: 53
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000079 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 736
[LightGBM] [Info] Number of data points in the train set: 70, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.242857 -> initscore=-1.137079
[LightGBM] [Info] Start training from score -1.137079
Processing sheet: Fingerprint



100%|██████████| 29/29 [00:04<00:00,  6.97it/s]


[LightGBM] [Info] Number of positive: 17, number of negative: 53
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000072 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 76
[LightGBM] [Info] Number of data points in the train set: 70, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.242857 -> initscore=-1.137079
[LightGBM] [Info] Start training from score -1.137079
Processing sheet: Constitutional


100%|██████████| 29/29 [00:02<00:00, 12.50it/s]


[LightGBM] [Info] Number of positive: 17, number of negative: 53
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000032 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 565
[LightGBM] [Info] Number of data points in the train set: 70, number of used features: 34
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.242857 -> initscore=-1.137079
[LightGBM] [Info] Start training from score -1.137079
Results saved to 'results_lazy.xlsx'


In [None]:
import pandas as pd
from lazypredict.Supervised import LazyClassifier
from sklearn.model_selection import train_test_split

# Function to process a given sheet and apply LazyPredict
def process_sheet(sheet_name, file_path):
    print(f"Processing sheet: {sheet_name}")

    # Load the sheet from the Excel file
    data = pd.read_excel(file_path, sheet_name=sheet_name)

    # Separate features and target
    X = data.iloc[:, :-1]  # Features
    y = data.iloc[:, -1]   # Target variable

    # Encode categorical features
    X = pd.get_dummies(X)

    # Encode the target variable if it's categorical
    if y.dtype == 'object' or y.dtype.name == 'category':
        y = pd.factorize(y)[0]

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Initialize the LazyClassifier
    clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)

    # Train and evaluate multiple models
    models, predictions = clf.fit(X_train, X_test, y_train, y_test)

    # Return the models performance
    return models

# File path to the Excel file named "Padel 1"
file_path = 'Padel 1.xlsx'  # Replace with your actual file path if different

# Process each specified sheet and collect results
sheets = ["Geometric", "Topological", "Fingerprint", "Constitutional"]
results = {}

for sheet in sheets:
    models = process_sheet(sheet, file_path)
    results[sheet] = models

# Create a new Excel writer object
output_file_path = 'results_lazy.xlsx'
with pd.ExcelWriter(output_file_path, engine='openpyxl') as writer:
    # Write model performance results for each sheet
    for sheet, models in results.items():
        models_df = models.reset_index()
        models_df.to_excel(writer, sheet_name=sheet, index=False)

print(f"Results saved to '{output_file_path}'")

In [None]:
import pandas as pd
from lazypredict.Supervised import LazyClassifier
from sklearn.model_selection import train_test_split

# Function to process a given sheet and apply LazyPredict
def process_sheet(sheet_name, file_path):
    print(f"Processing sheet: {sheet_name}")

    # Load the sheet from the Excel file
    data = pd.read_excel(file_path, sheet_name=sheet_name)

    # Separate features and target
    X = data.iloc[:, :-1]  # Features
    y = data.iloc[:, -1]   # Target variable

    # Encode categorical features
    X = pd.get_dummies(X)

    # Encode the target variable if it's categorical
    if y.dtype == 'object' or y.dtype.name == 'category':
        y = pd.factorize(y)[0]

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

    # Initialize the LazyClassifier
    clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)

    # Train and evaluate multiple models
    models, predictions = clf.fit(X_train, X_test, y_train, y_test)

    # Return the models performance
    return models

# File path to the Excel file named "Padel 1"
file_path = 'Padel 1.xlsx'  # Replace with your actual file path if different

# Process each specified sheet and collect results
sheets = ["Geometric"]
results = {}

for sheet in sheets:
    models = process_sheet(sheet, file_path)
    results[sheet] = models

# Create a new Excel writer object
output_file_path = 'Georesults_lazy.xlsx'
with pd.ExcelWriter(output_file_path, engine='openpyxl') as writer:
    # Write model performance results for each sheet
    for sheet, models in results.items():
        models_df = models.reset_index()
        models_df.to_excel(writer, sheet_name=sheet, index=False)

print(f"Results saved to '{output_file_path}'")


Processing sheet: Geometric


100%|██████████| 29/29 [00:04<00:00,  5.85it/s]

[LightGBM] [Info] Number of positive: 13, number of negative: 31
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000130 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 576
[LightGBM] [Info] Number of data points in the train set: 44, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.295455 -> initscore=-0.869038
[LightGBM] [Info] Start training from score -0.869038
Results saved to 'Georesults_lazy.xlsx'





In [None]:
import pandas as pd
from lazypredict.Supervised import LazyClassifier
from sklearn.model_selection import train_test_split

# Function to process a given sheet and apply LazyPredict
def process_sheet(sheet_name, file_path):
    print(f"Processing sheet: {sheet_name}")

    # Load the sheet from the Excel file
    data = pd.read_excel(file_path, sheet_name=sheet_name)

    # Separate features and target
    X = data.iloc[:, :-1]  # Features
    y = data.iloc[:, -1]   # Target variable

    # Encode categorical features
    X = pd.get_dummies(X)

    # Encode the target variable if it's categorical
    if y.dtype == 'object' or y.dtype.name == 'category':
        y = pd.factorize(y)[0]

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

    # Initialize the LazyClassifier
    clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)

    # Train and evaluate multiple models
    models, predictions = clf.fit(X_train, X_test, y_train, y_test)

    # Return the models performance
    return models

# File path to the Excel file named "Padel 1"
file_path = 'Padel 2.xlsx'  # Replace with your actual file path if different

# Process each specified sheet and collect results
sheets = ["RFE Data0.3", "RFE Data0.2"]
results = {}

for sheet in sheets:
    models = process_sheet(sheet, file_path)
    results[sheet] = models

# Create a new Excel writer object
output_file_path = 'newresults_RFE.xlsx'
with pd.ExcelWriter(output_file_path, engine='openpyxl') as writer:
    # Write model performance results for each sheet
    for sheet, models in results.items():
        models_df = models.reset_index()
        models_df.to_excel(writer, sheet_name=sheet, index=False)

print(f"Results saved to '{RFE_file_path}'")


Processing sheet: RFE Data0.3


100%|██████████| 29/29 [00:01<00:00, 19.21it/s]


[LightGBM] [Info] Number of positive: 13, number of negative: 31
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000862 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 77
[LightGBM] [Info] Number of data points in the train set: 44, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.295455 -> initscore=-0.869038
[LightGBM] [Info] Start training from score -0.869038
Processing sheet: RFE Data0.2


100%|██████████| 29/29 [00:01<00:00, 27.83it/s]


[LightGBM] [Info] Number of positive: 13, number of negative: 31
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000035 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 101
[LightGBM] [Info] Number of data points in the train set: 44, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.295455 -> initscore=-0.869038
[LightGBM] [Info] Start training from score -0.869038


NameError: name 'RFE_file_path' is not defined

In [None]:
import pandas as pd
from lazypredict.Supervised import LazyClassifier
from sklearn.model_selection import train_test_split

# Function to process a given sheet and apply LazyPredict
def process_sheet(sheet_name, file_path):
    print(f"Processing sheet: {sheet_name}")

    # Load the sheet from the Excel file
    data = pd.read_excel(file_path, sheet_name=sheet_name)

    # Separate features and target
    X = data.iloc[:, :-1]  # Features
    y = data.iloc[:, -1]   # Target variable

    # Encode categorical features
    X = pd.get_dummies(X)

    # Encode the target variable if it's categorical
    if y.dtype == 'object' or y.dtype.name == 'category':
        y, class_labels = pd.factorize(y)
    else:
        class_labels = y.unique()

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

    # Initialize the LazyClassifier
    clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)

    # Train and evaluate multiple models
    models, predictions = clf.fit(X_train, X_test, y_train, y_test)

    # Add class labels to the models DataFrame
    models['Class Labels'] = ', '.join(map(str, class_labels))

    # Return the models performance
    return models

# File path to the Excel file named "Padel 1"
file_path = 'Padel 2.xlsx'  # Replace with your actual file path if different

# Process each specified sheet and collect results
sheets = ["RFE Data0.3", "RFE Data0.2"]
results = {}

for sheet in sheets:
    models = process_sheet(sheet, file_path)
    results[sheet] = models

# Create a new Excel writer object
output_file_path = 'newresults_RFE.xlsx'
with pd.ExcelWriter(output_file_path, engine='openpyxl') as writer:
    # Write model performance results for each sheet
    for sheet, models in results.items():
        models_df = models.reset_index()
        models_df.to_excel(writer, sheet_name=sheet, index=False)

print(f"Results saved to '{output_file_path}'")



Processing sheet: RFE Data0.3


100%|██████████| 29/29 [00:01<00:00, 26.15it/s]


[LightGBM] [Info] Number of positive: 13, number of negative: 31
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000035 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 77
[LightGBM] [Info] Number of data points in the train set: 44, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.295455 -> initscore=-0.869038
[LightGBM] [Info] Start training from score -0.869038
Processing sheet: RFE Data0.2


100%|██████████| 29/29 [00:01<00:00, 28.38it/s]


[LightGBM] [Info] Number of positive: 13, number of negative: 31
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000111 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 101
[LightGBM] [Info] Number of data points in the train set: 44, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.295455 -> initscore=-0.869038
[LightGBM] [Info] Start training from score -0.869038
Results saved to 'newresults_RFE.xlsx'


NameError: name 'y_test' is not defined

In [None]:
import pandas as pd
from lazypredict.Supervised import LazyClassifier
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score

# Function to process a given sheet and apply LazyPredict with tenfold cross-validation
def process_sheet(sheet_name, file_path):
    print(f"Processing sheet: {sheet_name}")

    # Load the sheet from the Excel file
    data = pd.read_excel(file_path, sheet_name=sheet_name)

    # Separate features and target
    X = data.iloc[:, :-1]  # Features
    y = data.iloc[:, -1]   # Target variable

    # Encode categorical features
    X = pd.get_dummies(X)

    # Encode the target variable if it's categorical
    if y.dtype == 'object' or y.dtype.name == 'category':
        y, class_labels = pd.factorize(y)
    else:
        class_labels = y.unique()

    # Initialize the LazyClassifier
    clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)

    # Set up tenfold cross-validation
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    fold_results = []

    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Train and evaluate multiple models
        models, predictions = clf.fit(X_train, X_test, y_train, y_test)

        # Add accuracy score to the models DataFrame
        accuracies = []
        for model in models.index:
            if model in predictions:
                accuracies.append(accuracy_score(y_test, predictions[model]))
            else:
                accuracies.append(None)  # Handle missing models
        models['Fold Accuracy'] = accuracies

        fold_results.append(models)

    # Aggregate results across folds
    fold_results_df = pd.concat(fold_results)
    aggregated_results = fold_results_df.groupby(fold_results_df.index).mean()

    # Add class labels to the aggregated models DataFrame
    aggregated_results['Class Labels'] = ', '.join(map(str, class_labels))

    # Return the aggregated models performance
    return aggregated_results

# File path to the Excel file named "Padel 1"
file_path = 'Padel 2.xlsx'  # Replace with your actual file path if different

# Process each specified sheet and collect results
sheets = ["RFE Data0.3", "RFE Data0.2"]
results = {}

for sheet in sheets:
    models = process_sheet(sheet, file_path)
    results[sheet] = models

# Create a new Excel writer object
output_file_path = 'Crossval_RFE.xlsx'
with pd.ExcelWriter(output_file_path, engine='openpyxl') as writer:
    # Write model performance results for each sheet
    for sheet, models in results.items():
        models_df = models.reset_index()
        models_df.to_excel(writer, sheet_name=sheet, index=False)

print(f"Results saved to '{output_file_path}'")



Processing sheet: RFE Data0.3


100%|██████████| 29/29 [00:02<00:00, 12.25it/s]


[LightGBM] [Info] Number of positive: 23, number of negative: 56
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000044 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 137
[LightGBM] [Info] Number of data points in the train set: 79, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.291139 -> initscore=-0.889857
[LightGBM] [Info] Start training from score -0.889857
'tuple' object has no attribute '__name__'
Invalid Classifier(s)


100%|██████████| 29/29 [00:01<00:00, 28.79it/s]


[LightGBM] [Info] Number of positive: 23, number of negative: 56
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000041 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 137
[LightGBM] [Info] Number of data points in the train set: 79, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.291139 -> initscore=-0.889857
[LightGBM] [Info] Start training from score -0.889857
'tuple' object has no attribute '__name__'
Invalid Classifier(s)


100%|██████████| 29/29 [00:01<00:00, 28.71it/s]


[LightGBM] [Info] Number of positive: 23, number of negative: 56
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000040 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 134
[LightGBM] [Info] Number of data points in the train set: 79, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.291139 -> initscore=-0.889857
[LightGBM] [Info] Start training from score -0.889857
'tuple' object has no attribute '__name__'
Invalid Classifier(s)


100%|██████████| 29/29 [00:00<00:00, 29.48it/s]


[LightGBM] [Info] Number of positive: 22, number of negative: 57
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000036 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 137
[LightGBM] [Info] Number of data points in the train set: 79, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.278481 -> initscore=-0.952009
[LightGBM] [Info] Start training from score -0.952009
'tuple' object has no attribute '__name__'
Invalid Classifier(s)


100%|██████████| 29/29 [00:01<00:00, 27.92it/s]


[LightGBM] [Info] Number of positive: 22, number of negative: 57
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000063 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 137
[LightGBM] [Info] Number of data points in the train set: 79, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.278481 -> initscore=-0.952009
[LightGBM] [Info] Start training from score -0.952009
'tuple' object has no attribute '__name__'
Invalid Classifier(s)


100%|██████████| 29/29 [00:01<00:00, 26.95it/s]


[LightGBM] [Info] Number of positive: 22, number of negative: 57
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000042 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 133
[LightGBM] [Info] Number of data points in the train set: 79, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.278481 -> initscore=-0.952009
[LightGBM] [Info] Start training from score -0.952009
'tuple' object has no attribute '__name__'
Invalid Classifier(s)


100%|██████████| 29/29 [00:01<00:00, 17.93it/s]


[LightGBM] [Info] Number of positive: 22, number of negative: 57
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000047 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 133
[LightGBM] [Info] Number of data points in the train set: 79, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.278481 -> initscore=-0.952009
[LightGBM] [Info] Start training from score -0.952009
'tuple' object has no attribute '__name__'
Invalid Classifier(s)


100%|██████████| 29/29 [00:01<00:00, 18.82it/s]


[LightGBM] [Info] Number of positive: 22, number of negative: 57
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000042 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 135
[LightGBM] [Info] Number of data points in the train set: 79, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.278481 -> initscore=-0.952009
[LightGBM] [Info] Start training from score -0.952009
'tuple' object has no attribute '__name__'
Invalid Classifier(s)


100%|██████████| 29/29 [00:01<00:00, 26.65it/s]


[LightGBM] [Info] Number of positive: 23, number of negative: 57
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000033 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 133
[LightGBM] [Info] Number of data points in the train set: 80, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.287500 -> initscore=-0.907557
[LightGBM] [Info] Start training from score -0.907557
'tuple' object has no attribute '__name__'
Invalid Classifier(s)


100%|██████████| 29/29 [00:01<00:00, 27.92it/s]


[LightGBM] [Info] Number of positive: 23, number of negative: 57
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000039 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 137
[LightGBM] [Info] Number of data points in the train set: 80, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.287500 -> initscore=-0.907557
[LightGBM] [Info] Start training from score -0.907557
Processing sheet: RFE Data0.2


100%|██████████| 29/29 [00:01<00:00, 27.69it/s]


[LightGBM] [Info] Number of positive: 23, number of negative: 56
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000043 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 185
[LightGBM] [Info] Number of data points in the train set: 79, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.291139 -> initscore=-0.889857
[LightGBM] [Info] Start training from score -0.889857
'tuple' object has no attribute '__name__'
Invalid Classifier(s)


100%|██████████| 29/29 [00:01<00:00, 28.56it/s]


[LightGBM] [Info] Number of positive: 23, number of negative: 56
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000033 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 186
[LightGBM] [Info] Number of data points in the train set: 79, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.291139 -> initscore=-0.889857
[LightGBM] [Info] Start training from score -0.889857
'tuple' object has no attribute '__name__'
Invalid Classifier(s)


100%|██████████| 29/29 [00:01<00:00, 28.78it/s]


[LightGBM] [Info] Number of positive: 23, number of negative: 56
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000043 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 182
[LightGBM] [Info] Number of data points in the train set: 79, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.291139 -> initscore=-0.889857
[LightGBM] [Info] Start training from score -0.889857
'tuple' object has no attribute '__name__'
Invalid Classifier(s)


100%|██████████| 29/29 [00:01<00:00, 28.64it/s]


[LightGBM] [Info] Number of positive: 22, number of negative: 57
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000044 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 185
[LightGBM] [Info] Number of data points in the train set: 79, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.278481 -> initscore=-0.952009
[LightGBM] [Info] Start training from score -0.952009
'tuple' object has no attribute '__name__'
Invalid Classifier(s)


100%|██████████| 29/29 [00:01<00:00, 28.75it/s]


[LightGBM] [Info] Number of positive: 22, number of negative: 57
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000045 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 184
[LightGBM] [Info] Number of data points in the train set: 79, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.278481 -> initscore=-0.952009
[LightGBM] [Info] Start training from score -0.952009
'tuple' object has no attribute '__name__'
Invalid Classifier(s)


100%|██████████| 29/29 [00:01<00:00, 28.99it/s]


[LightGBM] [Info] Number of positive: 22, number of negative: 57
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000045 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 181
[LightGBM] [Info] Number of data points in the train set: 79, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.278481 -> initscore=-0.952009
[LightGBM] [Info] Start training from score -0.952009
'tuple' object has no attribute '__name__'
Invalid Classifier(s)


100%|██████████| 29/29 [00:00<00:00, 29.07it/s]


[LightGBM] [Info] Number of positive: 22, number of negative: 57
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000045 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 180
[LightGBM] [Info] Number of data points in the train set: 79, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.278481 -> initscore=-0.952009
[LightGBM] [Info] Start training from score -0.952009
'tuple' object has no attribute '__name__'
Invalid Classifier(s)


100%|██████████| 29/29 [00:01<00:00, 22.28it/s]


[LightGBM] [Info] Number of positive: 22, number of negative: 57
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000051 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 182
[LightGBM] [Info] Number of data points in the train set: 79, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.278481 -> initscore=-0.952009
[LightGBM] [Info] Start training from score -0.952009
'tuple' object has no attribute '__name__'
Invalid Classifier(s)


100%|██████████| 29/29 [00:01<00:00, 17.84it/s]


[LightGBM] [Info] Number of positive: 23, number of negative: 57
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000053 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 181
[LightGBM] [Info] Number of data points in the train set: 80, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.287500 -> initscore=-0.907557
[LightGBM] [Info] Start training from score -0.907557
'tuple' object has no attribute '__name__'
Invalid Classifier(s)


100%|██████████| 29/29 [00:01<00:00, 21.05it/s]

[LightGBM] [Info] Number of positive: 23, number of negative: 57
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000041 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 186
[LightGBM] [Info] Number of data points in the train set: 80, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.287500 -> initscore=-0.907557
[LightGBM] [Info] Start training from score -0.907557
Results saved to 'Crossval_RFE.xlsx'





In [None]:
import pandas as pd
from lazypredict.Supervised import LazyClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Function to process a given sheet and apply LazyPredict
def process_sheet(sheet_name, file_path):
    print(f"Processing sheet: {sheet_name}")

    # Load the sheet from the Excel file
    data = pd.read_excel(file_path, sheet_name=sheet_name)

    # Separate features and target
    X = data.iloc[:, :-1]  # Features
    y = data.iloc[:, -1]   # Target variable

    # Encode categorical features
    X = pd.get_dummies(X)

    # Encode the target variable if it's categorical
    if y.dtype == 'object' or y.dtype.name == 'category':
        y, class_labels = pd.factorize(y)
    else:
        class_labels = y.unique()

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

    # Initialize the LazyClassifier
    clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)

    # Train and evaluate multiple models
    models, predictions = clf.fit(X_train, X_test, y_train, y_test)

    # Add class labels to the models DataFrame
    models['Class Labels'] = ', '.join(map(str, class_labels))

    # Return the models performance and predictions
    return models, y_test, predictions

# File path to the Excel file named "Padel 1"
file_path = 'Padel 2.xlsx'  # Replace with your actual file path if different

# Process each specified sheet and collect results
sheets = ["RFE Data0.3", "RFE Data0.2"]
results = {}
all_reports = {}

for sheet in sheets:
    models, y_test, predictions = process_sheet(sheet, file_path)
    results[sheet] = models

    # Get the classification report for each model
    report = classification_report(y_test, predictions, output_dict=True)
    all_reports[sheet] = pd.DataFrame(report).transpose()

# Create a new Excel writer object
output_file_path = 'newresults_RFE.xlsx'
with pd.ExcelWriter(output_file_path, engine='openpyxl') as writer:
    # Write model performance results for each sheet
    for sheet, models in results.items():
        models_df = models.reset_index()
        models_df.to_excel(writer, sheet_name=f'{sheet}_Models', index=False)

    # Write classification reports for each sheet
    for sheet, report in all_reports.items():
        report.to_excel(writer, sheet_name=f'{sheet}_Report', index=True)

print(f"Results saved to '{output_file_path}'")


Processing sheet: RFE Data0.3


100%|██████████| 29/29 [00:01<00:00, 28.17it/s]

[LightGBM] [Info] Number of positive: 13, number of negative: 31
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000051 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 77
[LightGBM] [Info] Number of data points in the train set: 44, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.295455 -> initscore=-0.869038
[LightGBM] [Info] Start training from score -0.869038





ValueError: Found input variables with inconsistent numbers of samples: [44, 27]

In [None]:
import pandas as pd
from lazypredict.Supervised import LazyClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Function to process a given sheet and apply LazyPredict
def process_sheet(sheet_name, file_path):
    print(f"Processing sheet: {sheet_name}")

    # Load the sheet from the Excel file
    data = pd.read_excel(file_path, sheet_name=sheet_name)

    # Separate features and target
    X = data.iloc[:, :-1]  # Features
    y = data.iloc[:, -1]   # Target variable

    # Encode categorical features
    X = pd.get_dummies(X)

    # Encode the target variable if it's categorical
    if y.dtype == 'object' or y.dtype.name == 'category':
        y, class_labels = pd.factorize(y)
    else:
        class_labels = y.unique()

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

    # Initialize the LazyClassifier
    clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)

    # Train and evaluate multiple models and obtain predictions
    models, predictions = clf.fit(X_train, X_test, y_train, y_test) # Obtain predictions directly from clf.fit

    # Add class labels to the models DataFrame
    models['Class Labels'] = ', '.join(map(str, class_labels))

    # Return the models performance and predictions
    return models, y_test, predictions # Return predictions obtained from clf.fit

# File path to the Excel file named "Padel 1"
file_path = 'Padel 2.xlsx'  # Replace with your actual file path if different

# Process each specified sheet and collect results
sheets = ["RFE Data0.3", "RFE Data0.2"]
results = {}
all_reports = {}

for sheet in sheets:
    models, y_test, predictions = process_sheet(sheet, file_path)
    results[sheet] = models

    # Get the classification report for each model
    report = classification_report(y_test, predictions, output_dict=True)
    all_reports[sheet] = pd.DataFrame(report).transpose()

# Create a new Excel writer object
output_file_path = 'newresults_RFE2.xlsx'
with pd.ExcelWriter(output_file_path, engine='openpyxl') as writer:
    # Write model performance results for each sheet
    for sheet, models in results.items():
        models_df = models.reset_index()
        models_df.to_excel(writer, sheet_name=f'{sheet}_Models', index=False)

    # Write classification reports for each sheet
    for sheet, report in all_reports.items():
        report.to_excel(writer, sheet_name=f'{sheet}_Report', index=True)

print(f"Results saved to '{output_file_path}'")

Processing sheet: RFE Data0.3


100%|██████████| 29/29 [00:01<00:00, 18.38it/s]

[LightGBM] [Info] Number of positive: 13, number of negative: 31
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000038 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 77
[LightGBM] [Info] Number of data points in the train set: 44, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.295455 -> initscore=-0.869038
[LightGBM] [Info] Start training from score -0.869038





ValueError: Found input variables with inconsistent numbers of samples: [44, 27]

In [None]:
import pandas as pd
from lazypredict.Supervised import LazyClassifier
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score

# Function to process a given sheet and apply LazyPredict with tenfold cross-validation
def process_sheet(sheet_name, file_path):
    print(f"Processing sheet: {sheet_name}")

    # Load the sheet from the Excel file
    data = pd.read_excel(file_path, sheet_name=sheet_name)

    # Separate features and target
    X = data.iloc[:, :-1]  # Features
    y = data.iloc[:, -1]   # Target variable

    # Encode categorical features
    X = pd.get_dummies(X)

    # Encode the target variable if it's categorical
    if y.dtype == 'object' or y.dtype.name == 'category':
        y, class_labels = pd.factorize(y)
    else:
        class_labels = y.unique()

    # Initialize the LazyClassifier
    clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)

    # Set up tenfold cross-validation
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    fold_results = []

    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Train and evaluate multiple models
        models, predictions = clf.fit(X_train, X_test, y_train, y_test)

        # Add accuracy score to the models DataFrame
        fold_accuracies = []
        for model in models.index:
            if model in predictions:
                fold_accuracies.append(accuracy_score(y_test, predictions[model]))
            else:
                fold_accuracies.append(None)  # Handle missing models
        models['Fold Accuracy'] = fold_accuracies

        fold_results.append(models)

    # Aggregate results across folds
    fold_results_df = pd.concat(fold_results)
    aggregated_results = fold_results_df.groupby(fold_results_df.index).mean()

    # Add class labels to the aggregated models DataFrame
    aggregated_results['Class Labels'] = ', '.join(map(str, class_labels))

    # Return the aggregated models performance
    return aggregated_results

# File path to the Excel file named "Padel 1"
file_path = 'Padel 2.xlsx'  # Replace with your actual file path if different

# Process each specified sheet and collect results
sheets = ["RFE Data0.3", "RFE Data0.2"]
results = {}

for sheet in sheets:
    models = process_sheet(sheet, file_path)
    results[sheet] = models

# Create a new Excel writer object
output_file_path = 'Crossval2_RFE.xlsx'
with pd.ExcelWriter(output_file_path, engine='openpyxl') as writer:
    # Write model performance results for each sheet
    for sheet, models in results.items():
        models_df = models.reset_index()
        models_df.to_excel(writer, sheet_name=sheet, index=False)

print(f"Results saved to '{output_file_path}'")


Processing sheet: RFE Data0.3


100%|██████████| 29/29 [00:02<00:00, 14.26it/s]


[LightGBM] [Info] Number of positive: 23, number of negative: 56
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000070 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 137
[LightGBM] [Info] Number of data points in the train set: 79, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.291139 -> initscore=-0.889857
[LightGBM] [Info] Start training from score -0.889857
'tuple' object has no attribute '__name__'
Invalid Classifier(s)


100%|██████████| 29/29 [00:02<00:00, 13.76it/s]


[LightGBM] [Info] Number of positive: 23, number of negative: 56
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000037 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 137
[LightGBM] [Info] Number of data points in the train set: 79, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.291139 -> initscore=-0.889857
[LightGBM] [Info] Start training from score -0.889857
'tuple' object has no attribute '__name__'
Invalid Classifier(s)


100%|██████████| 29/29 [00:01<00:00, 25.79it/s]


[LightGBM] [Info] Number of positive: 23, number of negative: 56
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000041 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 134
[LightGBM] [Info] Number of data points in the train set: 79, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.291139 -> initscore=-0.889857
[LightGBM] [Info] Start training from score -0.889857
'tuple' object has no attribute '__name__'
Invalid Classifier(s)


100%|██████████| 29/29 [00:01<00:00, 19.49it/s]


[LightGBM] [Info] Number of positive: 22, number of negative: 57
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000041 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 137
[LightGBM] [Info] Number of data points in the train set: 79, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.278481 -> initscore=-0.952009
[LightGBM] [Info] Start training from score -0.952009
'tuple' object has no attribute '__name__'
Invalid Classifier(s)


100%|██████████| 29/29 [00:01<00:00, 14.99it/s]


[LightGBM] [Info] Number of positive: 22, number of negative: 57
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000040 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 137
[LightGBM] [Info] Number of data points in the train set: 79, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.278481 -> initscore=-0.952009
[LightGBM] [Info] Start training from score -0.952009
'tuple' object has no attribute '__name__'
Invalid Classifier(s)


100%|██████████| 29/29 [00:01<00:00, 28.92it/s]


[LightGBM] [Info] Number of positive: 22, number of negative: 57
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000038 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 133
[LightGBM] [Info] Number of data points in the train set: 79, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.278481 -> initscore=-0.952009
[LightGBM] [Info] Start training from score -0.952009
'tuple' object has no attribute '__name__'
Invalid Classifier(s)


100%|██████████| 29/29 [00:01<00:00, 28.91it/s]


[LightGBM] [Info] Number of positive: 22, number of negative: 57
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000040 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 133
[LightGBM] [Info] Number of data points in the train set: 79, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.278481 -> initscore=-0.952009
[LightGBM] [Info] Start training from score -0.952009
'tuple' object has no attribute '__name__'
Invalid Classifier(s)


100%|██████████| 29/29 [00:01<00:00, 28.25it/s]


[LightGBM] [Info] Number of positive: 22, number of negative: 57
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000037 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 135
[LightGBM] [Info] Number of data points in the train set: 79, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.278481 -> initscore=-0.952009
[LightGBM] [Info] Start training from score -0.952009
'tuple' object has no attribute '__name__'
Invalid Classifier(s)


100%|██████████| 29/29 [00:01<00:00, 28.65it/s]


[LightGBM] [Info] Number of positive: 23, number of negative: 57
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000038 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 133
[LightGBM] [Info] Number of data points in the train set: 80, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.287500 -> initscore=-0.907557
[LightGBM] [Info] Start training from score -0.907557
'tuple' object has no attribute '__name__'
Invalid Classifier(s)


100%|██████████| 29/29 [00:00<00:00, 29.12it/s]


[LightGBM] [Info] Number of positive: 23, number of negative: 57
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000037 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 137
[LightGBM] [Info] Number of data points in the train set: 80, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.287500 -> initscore=-0.907557
[LightGBM] [Info] Start training from score -0.907557
Processing sheet: RFE Data0.2


100%|██████████| 29/29 [00:00<00:00, 29.04it/s]


[LightGBM] [Info] Number of positive: 23, number of negative: 56
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000048 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 185
[LightGBM] [Info] Number of data points in the train set: 79, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.291139 -> initscore=-0.889857
[LightGBM] [Info] Start training from score -0.889857
'tuple' object has no attribute '__name__'
Invalid Classifier(s)


100%|██████████| 29/29 [00:01<00:00, 28.71it/s]


[LightGBM] [Info] Number of positive: 23, number of negative: 56
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000050 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 186
[LightGBM] [Info] Number of data points in the train set: 79, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.291139 -> initscore=-0.889857
[LightGBM] [Info] Start training from score -0.889857
'tuple' object has no attribute '__name__'
Invalid Classifier(s)


100%|██████████| 29/29 [00:01<00:00, 27.55it/s]


[LightGBM] [Info] Number of positive: 23, number of negative: 56
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000051 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 182
[LightGBM] [Info] Number of data points in the train set: 79, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.291139 -> initscore=-0.889857
[LightGBM] [Info] Start training from score -0.889857
'tuple' object has no attribute '__name__'
Invalid Classifier(s)


100%|██████████| 29/29 [00:01<00:00, 28.85it/s]


[LightGBM] [Info] Number of positive: 22, number of negative: 57
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000043 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 185
[LightGBM] [Info] Number of data points in the train set: 79, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.278481 -> initscore=-0.952009
[LightGBM] [Info] Start training from score -0.952009
'tuple' object has no attribute '__name__'
Invalid Classifier(s)


100%|██████████| 29/29 [00:01<00:00, 15.00it/s]


[LightGBM] [Info] Number of positive: 22, number of negative: 57
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000050 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 184
[LightGBM] [Info] Number of data points in the train set: 79, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.278481 -> initscore=-0.952009
[LightGBM] [Info] Start training from score -0.952009
'tuple' object has no attribute '__name__'
Invalid Classifier(s)


100%|██████████| 29/29 [00:01<00:00, 18.70it/s]


[LightGBM] [Info] Number of positive: 22, number of negative: 57
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000057 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 181
[LightGBM] [Info] Number of data points in the train set: 79, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.278481 -> initscore=-0.952009
[LightGBM] [Info] Start training from score -0.952009
'tuple' object has no attribute '__name__'
Invalid Classifier(s)


100%|██████████| 29/29 [00:01<00:00, 28.16it/s]


[LightGBM] [Info] Number of positive: 22, number of negative: 57
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000044 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 180
[LightGBM] [Info] Number of data points in the train set: 79, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.278481 -> initscore=-0.952009
[LightGBM] [Info] Start training from score -0.952009
'tuple' object has no attribute '__name__'
Invalid Classifier(s)


100%|██████████| 29/29 [00:01<00:00, 28.69it/s]


[LightGBM] [Info] Number of positive: 22, number of negative: 57
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000047 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 182
[LightGBM] [Info] Number of data points in the train set: 79, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.278481 -> initscore=-0.952009
[LightGBM] [Info] Start training from score -0.952009
'tuple' object has no attribute '__name__'
Invalid Classifier(s)


100%|██████████| 29/29 [00:01<00:00, 27.25it/s]


[LightGBM] [Info] Number of positive: 23, number of negative: 57
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000043 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 181
[LightGBM] [Info] Number of data points in the train set: 80, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.287500 -> initscore=-0.907557
[LightGBM] [Info] Start training from score -0.907557
'tuple' object has no attribute '__name__'
Invalid Classifier(s)


100%|██████████| 29/29 [00:01<00:00, 27.99it/s]

[LightGBM] [Info] Number of positive: 23, number of negative: 57
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000045 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 186
[LightGBM] [Info] Number of data points in the train set: 80, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.287500 -> initscore=-0.907557
[LightGBM] [Info] Start training from score -0.907557
Results saved to 'Crossval2_RFE.xlsx'





In [None]:
import pandas as pd
from lazypredict.Supervised import LazyClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
import numpy as np

# Function to process a given sheet and apply LazyPredict with tenfold cross-validation
def process_sheet(sheet_name, file_path):
    print(f"Processing sheet: {sheet_name}")

    # Load the sheet from the Excel file
    data = pd.read_excel(file_path, sheet_name=sheet_name)

    # Separate features and target
    X = data.iloc[:, :-1]  # Features
    y = data.iloc[:, -1]   # Target variable

    # Encode categorical features
    X = pd.get_dummies(X)

    # Encode the target variable if it's categorical
    if y.dtype == 'object' or y.dtype.name == 'category':
        y, class_labels = pd.factorize(y)
    else:
        class_labels = y.unique()

    # Initialize the LazyClassifier
    clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)

    # Set up tenfold cross-validation
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    fold_accuracies = []

    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Train and evaluate multiple models
        models, predictions = clf.fit(X_train, X_test, y_train, y_test)

        # Calculate accuracy for each model and store it
        fold_accuracy = {}
        for model in predictions.columns:
            accuracy = accuracy_score(y_test, predictions[model])
            if model not in fold_accuracy:
                fold_accuracy[model] = []
            fold_accuracy[model].append(accuracy)

        fold_accuracies.append(fold_accuracy)

    # Aggregate results across folds
    aggregated_results = pd.DataFrame()
    for model in fold_accuracies[0].keys():
        accuracies = [fold[model] for fold in fold_accuracies]
        mean_accuracy = np.mean(accuracies)
        aggregated_results = aggregated_results.append({
            'Model': model,
            'Fold Accuracy': mean_accuracy
        }, ignore_index=True)

    # Merge with LazyClassifier results
    models_summary = models.reset_index()
    results_with_folds = pd.merge(models_summary, aggregated_results, on='Model')

    # Add class labels to the aggregated models DataFrame
    results_with_folds['Class Labels'] = ', '.join(map(str, class_labels))

    # Return the aggregated models performance
    return results_with_folds

# File path to the Excel file
file_path = 'Padel 2.xlsx'  # Replace with your actual file path if different

# Process each specified sheet and collect results
sheets = ["RFE Data0.3", "RFE Data0.2"]
results = {}

for sheet in sheets:
    models = process_sheet(sheet, file_path)
    results[sheet] = models

# Create a new Excel writer object
output_file_path = 'Crossval3_RFE.xlsx'
with pd.ExcelWriter(output_file_path, engine='openpyxl') as writer:
    # Write model performance results for each sheet
    for sheet, models in results.items():
        models_df = models.reset_index()
        models_df.to_excel(writer, sheet_name=sheet, index=False)

print(f"Results saved to '{output_file_path}'")


Processing sheet: RFE Data0.3


100%|██████████| 29/29 [00:01<00:00, 18.89it/s]

[LightGBM] [Info] Number of positive: 23, number of negative: 56
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014261 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 137
[LightGBM] [Info] Number of data points in the train set: 79, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.291139 -> initscore=-0.889857
[LightGBM] [Info] Start training from score -0.889857





ValueError: Found input variables with inconsistent numbers of samples: [9, 27]

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

def load_sheet(sheet_name, file_path):
    df = pd.read_excel(file_path, sheet_name=sheet_name)
    print(f'Loaded sheet: {sheet_name}, Shape: {df.shape}')
    return df

def process_sheet(sheet_name, file_path):
    df = load_sheet(sheet_name, file_path)

    # Identify non-numeric columns
    non_numeric_columns = df.select_dtypes(include=['object']).columns
    print(f'Non-numeric columns: {non_numeric_columns}')

    # Drop non-numeric columns
    df = df.drop(columns=non_numeric_columns)

    # Check if there are enough columns to proceed
    if df.shape[1] < 2:
        print(f'Sheet {sheet_name} skipped due to insufficient features.')
        return None

    # Assume the last column is the target
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]

    print(f'Features shape: {X.shape}, Target shape: {y.shape}')

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    print(f'X_train shape: {X_train.shape}, y_train shape: {y_train.shape}')
    print(f'X_test shape: {X_test.shape}, y_test shape: {y_test.shape}')

    # Fit a model
    model = LinearRegression()
    model.fit(X_train, y_train)

    return model

file_path = 'Padel 2.xlsx'
xls = pd.ExcelFile(file_path)
sheets = xls.sheet_names

results = {}
for sheet in sheets:
    model = process_sheet(sheet, file_path)
    if model is not None:
        results[sheet] = model

print('Processing completed.')


Loaded sheet: Original Data, Shape: (88, 213)
Non-numeric columns: Index(['Name', 'smiles', 'Class Label'], dtype='object')
Features shape: (88, 209), Target shape: (88,)
X_train shape: (70, 209), y_train shape: (70,)
X_test shape: (18, 209), y_test shape: (18,)
Loaded sheet: DataSet, Shape: (88, 211)
Non-numeric columns: Index([], dtype='object')
Features shape: (88, 210), Target shape: (88,)
X_train shape: (70, 210), y_train shape: (70,)
X_test shape: (18, 210), y_test shape: (18,)
Loaded sheet: RFE Data0.3, Shape: (88, 11)
Non-numeric columns: Index([], dtype='object')
Features shape: (88, 10), Target shape: (88,)
X_train shape: (70, 10), y_train shape: (70,)
X_test shape: (18, 10), y_test shape: (18,)
Loaded sheet: RFE Data0.2, Shape: (88, 11)
Non-numeric columns: Index([], dtype='object')
Features shape: (88, 10), Target shape: (88,)
X_train shape: (70, 10), y_train shape: (70,)
X_test shape: (18, 10), y_test shape: (18,)
Loaded sheet: Transposed, Shape: (210, 90)
Non-numeric colu