In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Load the dataset
data = pd.read_csv("C:/Users/sreya/Downloads/Dataset/CANCER.csv")

# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Transform the 'name' column into TF-IDF features
X_tfidf = tfidf_vectorizer.fit_transform(data['name'])

# Target variable
y = data['suitable_for_cancer']

# Splitting data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

# Individual models
log_clf = LogisticRegression(random_state=42)
rf_clf = RandomForestClassifier(random_state=42)
svm_clf = SVC(probability=True, random_state=42)

# Ensemble model using VotingClassifier
ensemble_model = VotingClassifier(
    estimators=[
        ('lr', log_clf),
        ('rf', rf_clf),
        ('svc', svm_clf)
    ],
    voting='soft'  # 'soft' voting considers predicted probabilities, which is often more accurate
)

# Model Training
ensemble_model.fit(X_train, y_train)

# Prediction
y_pred = ensemble_model.predict(X_test)

# Evaluating the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Ensembled Model Accuracy: {accuracy:.2f}")


Ensembled Model Accuracy: 0.82
