#### Import the required libraries

In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

## Data Preprocessing

The dataset is loaded, unnecessary columns are dropped, missing values are checked, and the diagnosis column is encoded.

#### Loading the dataset

In [2]:
# Location of the raw CSV, resolved relative to the notebook's working directory.
file_path = 'data.csv'
# Parse the CSV into the DataFrame that the following cells operate on.
data = pd.read_csv(filepath_or_buffer=file_path)

#### Drop unnecessary columns

In [3]:
# Drop the two columns the analysis does not use: 'id' and 'Unnamed: 32'.
# errors='ignore' keeps this cell re-runnable: `data` is overwritten here, so a
# second execution would otherwise raise KeyError because the columns are already gone.
data = data.drop(columns=['id', 'Unnamed: 32'], errors='ignore')

#### Check for missing values

In [4]:
# Count nulls per column, then report only the columns that actually contain any.
missing_values = data.isna().sum()
print("Missing Values:")
print(missing_values.loc[missing_values.gt(0)])

Missing Values:
Series([], dtype: int64)


#### Encoding the 'diagnosis' column: Malignant (M) as 1 and Benign (B) as 0

In [5]:
# Encode the target as integers: Malignant ('M') -> 1, Benign ('B') -> 0.
# The identity entries (1 -> 1, 0 -> 0) make the cell idempotent: re-running it on an
# already-encoded column would otherwise map every label to NaN.
diagnosis_codes = {'M': 1, 'B': 0, 1: 1, 0: 0}
encoded_diagnosis = data['diagnosis'].map(diagnosis_codes)
# Series.map yields NaN for any value missing from the mapping; fail here with a clear
# message rather than letting NaN labels reach model fitting.
if encoded_diagnosis.isna().any():
    raise ValueError("Unexpected or missing label in 'diagnosis'; expected only 'M' or 'B'.")
data['diagnosis'] = encoded_diagnosis

#### Displaying the first few rows of the dataset

In [6]:
# Preview the top of the frame; sep="\n" places the heading and the table on separate lines.
print("\nFirst few rows of the dataset:", data.head(), sep="\n")


First few rows of the dataset:
   diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
0          1        17.99         10.38          122.80     1001.0   
1          1        20.57         17.77          132.90     1326.0   
2          1        19.69         21.25          130.00     1203.0   
3          1        11.42         20.38           77.58      386.1   
4          1        20.29         14.34          135.10     1297.0   

   smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \
0          0.11840           0.27760          0.3001              0.14710   
1          0.08474           0.07864          0.0869              0.07017   
2          0.10960           0.15990          0.1974              0.12790   
3          0.14250           0.28390          0.2414              0.10520   
4          0.10030           0.13280          0.1980              0.10430   

   symmetry_mean  ...  radius_worst  texture_worst  perimeter_worst  \
0         0.2

## Normalization and Data Splitting

The features are normalized, and the dataset is split into training and testing sets.

#### Normalize the features

In [8]:
# Separate the predictors from the target column.
y = data['diagnosis']
X = data.drop(columns='diagnosis')
# Standardize the predictors with a scaler fitted on X.
# NOTE(review): the scaler is fitted on all rows before the train/test split happens,
# so test-set statistics influence the scaling — consider fitting on the training
# split only (or inside a Pipeline).
scaler = StandardScaler().fit(X)
X_normalized = scaler.transform(X)

#### Splitting the dataset into training and testing sets

In [9]:
# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_normalized,
    y,
    random_state=42,
    test_size=0.2,
)

#### Displaying the shapes of the splits

In [10]:
# Report the dimensions of each split, in the order train/test for X and then y.
print("\nShapes of the Training and Testing Sets:")
for split_name, split_values in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"{split_name} shape:", split_values.shape)


Shapes of the Training and Testing Sets:
X_train shape: (455, 30)
X_test shape: (114, 30)
y_train shape: (455,)
y_test shape: (114,)


## Logistic Regression Model

Initializing, training, and evaluating the Logistic Regression model.

#### Initialize and train the Logistic Regression model

In [11]:
# Baseline classifier; max_iter is raised to 1000 for the solver.
log_reg = LogisticRegression(max_iter=1000)
# Fit on the training split only.
log_reg.fit(X=X_train, y=y_train)

#### Predicting and evaluating the model

In [12]:
# Score the fitted model on the held-out test split.
y_pred = log_reg.predict(X_test)
accuracy = accuracy_score(y_test, y_pred) * 100  # fraction -> percentage
classification_rep = classification_report(y_test, y_pred)
confusion_mat = confusion_matrix(y_test, y_pred)

print("Logistic Regression Model Evaluation")
print(f"Accuracy: {accuracy:.2f}%")
# "\n" must be a single-backslash escape; a doubled backslash prints the two characters
# literally. Heading and body are printed by separate calls so that print()'s separator
# space does not shift the first row of the report / matrix out of alignment.
print("\nClassification Report:")
print(classification_rep)
print("\nConfusion Matrix:")
print(confusion_mat)

Logistic Regression Model Evaluation
Accuracy: 97.37%
\nClassification Report:\n               precision    recall  f1-score   support

           0       0.97      0.99      0.98        71
           1       0.98      0.95      0.96        43

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114

\nConfusion Matrix:\n [[70  1]
 [ 2 41]]


## Hyperparameter Tuning and Cross-Validation

Fine-tuning the model's hyperparameters and validating its performance with cross-validation.

#### Define the range of hyperparameters for tuning

In [13]:
# Search space: six C values from 0.001 to 100, crossed with two solvers.
param_grid = dict(
    C=[0.001, 0.01, 0.1, 1, 10, 100],
    solver=['lbfgs', 'liblinear'],
)

#### Perform GridSearchCV with cross-validation

In [14]:
# 5-fold cross-validated search over param_grid, ranking candidates by accuracy.
grid_search = GridSearchCV(
    estimator=log_reg,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
grid_search.fit(X_train, y_train)

#### Displaying the best hyperparameters

In [15]:
# Parameter combination that scored best in the grid search.
best_params = grid_search.best_params_
print(f"Best Hyperparameters: {best_params}")

Best Hyperparameters: {'C': 0.1, 'solver': 'liblinear'}


#### Training with the best parameters

In [16]:
# Rebuild the classifier with the tuned settings. max_iter=1000 is carried over
# explicitly: best_params only holds the searched keys ('C', 'solver'), so without it
# the model would fall back to the default iteration cap rather than the one the grid
# search was run with. Merging through a dict lets a tuned 'max_iter' (if one is ever
# added to the grid) take precedence instead of raising a duplicate-keyword error.
log_reg = LogisticRegression(**{'max_iter': 1000, **best_params})
log_reg.fit(X_train, y_train)

#### Cross-validation scores

In [18]:
# Cross-validate on the unscaled features with scaling done inside the pipeline, so the
# scaler is re-fitted on the training part of every fold. Scoring X_normalized instead
# would leak each held-out fold into the normalization statistics, because that array
# was scaled using all rows at once.
cv_model = make_pipeline(StandardScaler(), log_reg)
cv_scores = cross_val_score(cv_model, X, y, cv=5) * 100  # fractions -> percentages
print("Cross-Validation Accuracy Scores:")
for fold_number, score in enumerate(cv_scores, start=1):
    print(f" Fold {fold_number}: {score:.2f}%")
print(f"Mean CV Accuracy: {cv_scores.mean():.2f}%")

Cross-Validation Accuracy Scores:
 Fold 1: 98.25%
 Fold 2: 98.25%
 Fold 3: 99.12%
 Fold 4: 97.37%
 Fold 5: 98.23%
Mean CV Accuracy: 98.24%
