In [59]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

class RandomForestClassifier:
    """Bagged ensemble of scikit-learn decision trees for 0/1 targets.

    Every tree is trained on a bootstrap sample (rows drawn with replacement)
    of the training data. Predictions are the per-sample mean of the tree
    outputs rounded to the nearest integer, which is a majority vote only
    when the labels are encoded as 0/1 (or False/True). A 2-D 0/1 target
    (e.g. a one-hot frame) is voted on column by column.

    Parameters
    ----------
    n_estimators : int, default=100
        Number of trees to train.
    max_depth, min_samples_split, min_samples_leaf
        Forwarded unchanged to each ``DecisionTreeClassifier``.
    random_state : int or None, default=None
        Seed for the bootstrap sampling. ``None`` draws from NumPy's global
        random stream (controlled by ``np.random.seed``); an integer makes
        repeated ``fit`` calls on the same data draw identical samples.
    """

    def __init__(self, n_estimators=100, max_depth=None, min_samples_split=2,
                 min_samples_leaf=1, random_state=None):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.random_state = random_state
        self.trees = []

    @staticmethod
    def _take_rows(data, indices):
        """Select rows by position from a pandas object or any array-like."""
        if hasattr(data, "iloc"):
            return data.iloc[indices]
        return np.asarray(data)[indices]

    def fit(self, X, y):
        """Train ``n_estimators`` trees, each on its own bootstrap sample.

        Any trees left over from a previous ``fit`` are discarded.

        Parameters
        ----------
        X : DataFrame or array-like, one row per sample
        y : Series/DataFrame or array-like with the same number of rows as X

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If X is empty or X and y have different lengths.
        """
        n_samples = len(X)
        if n_samples == 0:
            raise ValueError("Cannot fit on an empty dataset.")
        if len(y) != n_samples:
            raise ValueError(
                f"X and y have different lengths: {n_samples} != {len(y)}."
            )

        # With no seed, keep drawing from the global stream so existing
        # np.random.seed(...) calls continue to control the sampling.
        if self.random_state is None:
            rng = np.random
        else:
            rng = np.random.RandomState(self.random_state)

        # Build into a fresh list: refitting must replace the ensemble,
        # not extend it with trees trained on older data.
        trees = []
        for _ in range(self.n_estimators):
            bootstrap_indices = rng.choice(n_samples, size=n_samples, replace=True)
            X_sampled = self._take_rows(X, bootstrap_indices)
            y_sampled = self._take_rows(y, bootstrap_indices)

            tree = DecisionTreeClassifier(
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf,
                # Fixed per-tree seed; diversity between trees comes from the
                # bootstrap samples above.
                random_state=42
            )
            tree.fit(X_sampled, y_sampled)
            trees.append(tree)

        self.trees = trees
        return self

    def predict(self, X):
        """Return the rounded mean of the individual tree predictions.

        ``np.round`` rounds halves to the nearest even value, so an exact
        tie between 0 and 1 votes resolves to 0.

        Raises
        ------
        ValueError
            If the forest holds no trees (``fit`` has not been called, or
            ``n_estimators`` was 0).
        """
        if not self.trees:
            raise ValueError(
                "This RandomForestClassifier has no trees; call fit() first."
            )
        predictions = np.array([tree.predict(X) for tree in self.trees])
        return np.round(np.mean(predictions, axis=0))

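# --- Disabled experiment: 6-fold cross-validation on the Financial Coverage data ---
# (kept for reference only; everything down to the "Average Accuracy" print is commented out)
# Read the dataset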
# df = pd.read_csv('Financial_Coverage-2.csv')

# # Perform one-hot encoding for categorical variables
# df_encoded = pd.get_dummies(df)

# # Separate features (X) and target variable (y)
# X = df_encoded.drop(columns=['smoker_yes'], axis=1)  # Remove the target variable column
# y = df_encoded['smoker_yes']  # Target variable

# # Perform cross-validation
# n_splits = 6
# indices = np.arange(len(X))
# np.random.shuffle(indices)

# fold_indices = np.array_split(indices, n_splits)

# accuracies = []

# for i in range(n_splits):
#     test_indices = fold_indices[i]
#     train_indices = np.concatenate(fold_indices[:i] + fold_indices[i+1:])

#     X_train, X_test = X.iloc[train_indices], X.iloc[test_indices]
#     y_train, y_test = y.iloc[train_indices], y.iloc[test_indices]

#     # Train the random forest classifier
#     n_trees = 100
#     random_forest = RandomForestClassifier(n_estimators=n_trees)
#     random_forest.fit(X_train, y_train)

#     # Make predictions
#     rf_predictions = random_forest.predict(X_test)

#     # Calculate accuracy
#     accuracy = accuracy_score(y_test, rf_predictions)
#     accuracies.append(accuracy)

# # Print the average accuracy over all folds
# print("Average Accuracy:", np.mean(accuracies))



###for dlbcl data
from sklearn.model_selection import train_test_split

# Read the dataset
df = pd.read_csv('DLBCL-2.csv')

# Split features and target variable
X = df.drop(columns=['target'])
y = df['target']

# One-hot encode the target: each class becomes its own indicator column, so
# the trees are fit on a 2-D (multi-output) target and the forest votes on
# every column separately. accuracy_score below then counts a row as correct
# only when all of its columns match.
y_encoded = pd.get_dummies(y)

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Seed NumPy's global random stream before training: the forest draws its
# bootstrap samples from it, so without a seed the reported accuracy changes
# from run to run.
np.random.seed(42)

# Initialize the random forest classifier
n_trees = 100
random_forest = RandomForestClassifier(n_estimators=n_trees)

# Fit the classifier
random_forest.fit(X_train, y_train)

# Make predictions on the test set
rf_predictions = random_forest.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, rf_predictions)
print("Accuracy:", accuracy)



Accuracy: 0.9565217391304348


In [51]:
from sklearn.metrics import confusion_matrix
# Confusion matrix for the held-out predictions. Re-run this cell after the
# training cell so that y_test / rf_predictions are current.
def _as_class_labels(values):
    """Return 1-D class labels; 2-D one-hot input is collapsed with argmax.

    confusion_matrix rejects one-hot (multilabel-indicator) targets, so each
    row is reduced to the index of its largest entry. A row with no positive
    entry therefore maps to class 0.
    """
    labels = np.asarray(values)
    return labels.argmax(axis=1) if labels.ndim == 2 else labels

cm = confusion_matrix(_as_class_labels(y_test), _as_class_labels(rf_predictions))

# scikit-learn layout: rows are the true class, columns the predicted class,
# with class 0 (negative) first. The indexing below assumes a binary problem
# where both classes occur, i.e. cm is 2x2.
tn = cm[0, 0]
fp = cm[0, 1]
fn = cm[1, 0]
tp = cm[1, 1]
accuracy = (tp + tn) / (tp + fp + fn + tn)
print(accuracy)

1.0
