In [48]:
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score
import warnings
# NOTE(review): with no category or module given, this silences every warning
# for the rest of the session, so library diagnostics raised by later cells
# never surface. Consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

In [38]:
# Load the dataset
# NOTE(review): this import lives mid-notebook; moving it next to the other
# imports would make the notebook's dependencies visible in one place.
from ucimlrepo import fetch_ucirepo 

# Fetch the dataset registered under id 15 in the UCI ML repository.
# NOTE(review): presumably requires network access at run time - confirm.
breast_cancer_wisconsin_original = fetch_ucirepo(id=15) 

# Data (as pandas DataFrames): X holds the feature columns, y the target column.
X = breast_cancer_wisconsin_original.data.features 
y = breast_cancer_wisconsin_original.data.targets 
  
# # metadata 
# print(breast_cancer_wisconsin_original.metadata) 
  
# # variable information 
# print(breast_cancer_wisconsin_original.variables) 


In [21]:
# Summarise the feature frame: column names, dtypes and non-null counts
# (a non-null count below the row count reveals missing values).
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 9 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Clump_thickness              699 non-null    int64  
 1   Uniformity_of_cell_size      699 non-null    int64  
 2   Uniformity_of_cell_shape     699 non-null    int64  
 3   Marginal_adhesion            699 non-null    int64  
 4   Single_epithelial_cell_size  699 non-null    int64  
 5   Bare_nuclei                  683 non-null    float64
 6   Bland_chromatin              699 non-null    int64  
 7   Normal_nucleoli              699 non-null    int64  
 8   Mitoses                      699 non-null    int64  
dtypes: float64(1), int64(8)
memory usage: 49.3 KB


In [22]:
# Summarise the target frame: column names, dtypes and non-null counts.
y.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   Class   699 non-null    int64
dtypes: int64(1)
memory usage: 5.6 KB


In [39]:
# Tally the missing entries per feature column and report them.
missing_per_column = X.isna().sum()
print("The number of null values in each column:\n", missing_per_column)

The number of null values in each column:
 Clump_thickness                 0
Uniformity_of_cell_size         0
Uniformity_of_cell_shape        0
Marginal_adhesion               0
Single_epithelial_cell_size     0
Bare_nuclei                    16
Bland_chromatin                 0
Normal_nucleoli                 0
Mitoses                         0
dtype: int64


In [40]:
# Collect the row labels whose 'Bare_nuclei' entry is missing.
bare_nuclei_missing = X['Bare_nuclei'].isna()
nan_indices = X.index[bare_nuclei_missing.to_numpy()]

In [41]:
# Discard every row that has at least one missing feature value, then show
# the per-column null counts to confirm none remain.
X = X.dropna(axis=0, how="any")
X.isna().sum()

Clump_thickness                0
Uniformity_of_cell_size        0
Uniformity_of_cell_shape       0
Marginal_adhesion              0
Single_epithelial_cell_size    0
Bare_nuclei                    0
Bland_chromatin                0
Normal_nucleoli                0
Mitoses                        0
dtype: int64

In [42]:
# Keep only the target rows whose labels are still present in X.
# Selecting by X's index (instead of dropping a precomputed label list) makes
# this cell safe to re-run: y.drop(labels) raises KeyError on a second run
# because the labels have already been removed.
y = y.loc[X.index]

# Guard against features and targets drifting out of row alignment.
assert y.index.equals(X.index), "features and targets must stay row-aligned"

In [43]:
# Report the dimensions of the feature and target frames side by side.
for label, frame in (("X shape: ", X), ("Y shape: ", y)):
    print(label, frame.shape)

X shape:  (683, 9)
Y shape:  (683, 1)


In [44]:
# Split into training and testing sets
# test_size=0.2 holds out 20% of the rows for testing; the fixed random_state
# pins the shuffle so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [45]:
# Define the model
# max_iter=200 sets the solver's iteration cap (scikit-learn documents the
# default as 100).
model = LogisticRegression(max_iter=200)

In [46]:
# Initialize RFE
# The selector wraps `model` as its estimator and is configured to retain
# n_features_to_select features.
n_features_to_select = 2 # Number of features to keep
rfe = RFE(estimator=model, n_features_to_select=n_features_to_select)

In [49]:
# Fit RFE
# y_train is a single-column DataFrame, but scikit-learn expects a 1-D target.
# Flatten it explicitly rather than relying on the implicit conversion, which
# emits a DataConversionWarning.
rfe.fit(X_train, y_train.to_numpy().ravel())

In [50]:
# Pull the selector's results in one step: a boolean keep/drop mask per
# feature, and the feature rankings (rank 1 marks a selected feature).
selected_features, ranking = rfe.support_, rfe.ranking_

In [51]:
# Reduce both splits to the columns the selector kept.
X_train_rfe, X_test_rfe = (rfe.transform(split) for split in (X_train, X_test))

In [52]:
# Train the model on the selected features
# y_train is a single-column DataFrame; flatten it to the 1-D shape
# scikit-learn expects for a target instead of relying on the implicit
# conversion, which emits a DataConversionWarning.
model.fit(X_train_rfe, y_train.to_numpy().ravel())

In [53]:
# Score the reduced-feature model on the held-out split, then report the
# selection mask, the feature ranking and the resulting accuracy.
y_pred = model.predict(X_test_rfe)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
report_lines = (
    f"Selected Features Mask: {selected_features}",
    f"Feature Ranking: {ranking}",
    f"Model Accuracy with Selected Features: {accuracy}",
)
print(*report_lines, sep="\n")

Selected Features Mask: [False False  True False False False  True False False]
Feature Ranking: [3 8 1 5 7 4 1 6 2]
Model Accuracy with Selected Features: 0.8978102189781022
