In [17]:
# Perform ETL (extract, transform, load) on the cancer dataset
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Step 1: Extract
# Read the raw dataset from the path below.
RAW_CSV_PATH = '/content/drive/MyDrive/Cancer_Data.csv'
TRANSFORMED_CSV_PATH = '/content/drive/MyDrive/transformed_breast_cancer.csv'
data = pd.read_csv(RAW_CSV_PATH)

# Step 2: Transform
# 1. Drop non-feature columns BEFORE imputing.
#    SimpleImputer discards columns that contain no observed values, so an
#    entirely empty column (e.g. 'Unnamed: 32') would make the imputed array
#    narrower than `numerical_columns` and break the DataFrame construction.
#    'id' is an identifier, not a feature, so there is no point imputing it.
NON_FEATURE_COLUMNS = ['id', 'Unnamed: 32', 'diagnosis']
feature_source = data.drop(columns=NON_FEATURE_COLUMNS, errors='ignore')

# Keep only numeric columns that have at least one observed value.
numerical_columns = feature_source.select_dtypes(include=['number']).columns
has_observations = feature_source[numerical_columns].notna().any().to_numpy()
numerical_columns = numerical_columns[has_observations]

# 2. Handle missing values: replace NaNs with the column mean.
imputer = SimpleImputer(strategy='mean')
data_imputed_numerical = pd.DataFrame(
    imputer.fit_transform(feature_source[numerical_columns]),
    columns=numerical_columns,
    index=data.index,  # keep rows aligned with data['diagnosis'] for the concat below
)

# 3. Encode 'diagnosis' to numerical values (B -> 0, M -> 1).
#    Series.map turns any label missing from the dict into NaN, so fail loudly
#    instead of writing NaN targets to disk.
diagnosis_encoded = data['diagnosis'].map({'B': 0, 'M': 1})
unmapped_labels = data.loc[diagnosis_encoded.isna(), 'diagnosis']
if not unmapped_labels.empty:
    raise ValueError(
        "Unexpected values in 'diagnosis' (expected 'B' or 'M'): "
        f"{unmapped_labels.unique().tolist()}"
    )

# Combine the encoded target with the imputed numerical features
data_imputed = pd.concat([diagnosis_encoded, data_imputed_numerical], axis=1)

# 4. Standardize features
# Separate features and target
X = data_imputed.drop(columns=['diagnosis'])
y = data_imputed['diagnosis']

# NOTE(review): the imputer and scaler are fit on the full dataset, before the
# train/test split done in the modelling cell, so test-set statistics leak into
# the features. Consider splitting first and fitting both on the training rows.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Create a new DataFrame with scaled features
X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)

# Combine scaled features with target (target is the last column)
transformed_data = pd.concat([X_scaled_df, y.reset_index(drop=True)], axis=1)

# Step 3: Load
# Save the transformed data to a new CSV file
transformed_data.to_csv(TRANSFORMED_CSV_PATH, index=False)

# Display the first few rows of the transformed dataset
transformed_data.head()


Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,diagnosis
0,1.097064,-2.073335,1.269934,0.984375,1.568466,3.283515,2.652874,2.532475,2.217515,2.255747,...,-1.359293,2.303601,2.001237,1.307686,2.616665,2.109526,2.296076,2.750622,1.937015,1
1,1.829821,-0.353632,1.685955,1.908708,-0.826962,-0.487072,-0.023846,0.548144,0.001392,-0.868652,...,-0.369203,1.535126,1.890489,-0.375612,-0.430444,-0.146749,1.087084,-0.24389,0.28119,1
2,1.579888,0.456187,1.566503,1.558884,0.94221,1.052926,1.363478,2.037231,0.939685,-0.398008,...,-0.023974,1.347475,1.456285,0.527407,1.082932,0.854974,1.955,1.152255,0.201391,1
3,-0.768909,0.253732,-0.592687,-0.764464,3.283553,3.402909,1.915897,1.451707,2.867383,4.910919,...,0.133984,-0.249939,-0.550021,3.394275,3.893397,1.989588,2.175786,6.046041,4.93501,1
4,1.750297,-1.151816,1.776573,1.826229,0.280372,0.53934,1.371011,1.428493,-0.00956,-0.56245,...,-1.46677,1.338539,1.220724,0.220556,-0.313395,0.613179,0.729259,-0.868353,-0.3971,1


In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

# Reload the dataset written by the ETL step
transformed_data = pd.read_csv('/content/drive/MyDrive/transformed_breast_cancer.csv')

# 'diagnosis' is the target (y); every remaining column is a feature (X)
y = transformed_data['diagnosis']
X = transformed_data.drop('diagnosis', axis='columns')

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a k-nearest-neighbours classifier (k=5; adjust the number of neighbors as needed)
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Classify the held-out rows
y_pred = knn.predict(X_test)

# Build the per-class precision/recall/F1 summary, labelling the two classes
# as Benign and Malignant in that order
report = classification_report(
    y_true=y_test,
    y_pred=y_pred,
    target_names=['Benign', 'Malignant'],
)

# Show the report
print(report)


              precision    recall  f1-score   support

      Benign       0.96      0.96      0.96        71
   Malignant       0.93      0.93      0.93        43

    accuracy                           0.95       114
   macro avg       0.94      0.94      0.94       114
weighted avg       0.95      0.95      0.95       114



In [21]:
import numpy as np
from sklearn.preprocessing import StandardScaler

# Interactive single-sample prediction.
# Relies on objects created by earlier cells: `X` (for the feature names and
# their order), `scaler` (fitted StandardScaler) and `knn` (fitted classifier).


def _prompt_for_float(label):
    """Prompt for `label` until the user enters a finite number; return it as a float."""
    while True:
        raw_value = input(f"{label}: ")
        try:
            number = float(raw_value)
        except ValueError:
            # Re-prompt instead of aborting and losing the values already typed
            print(f"'{raw_value}' is not a valid number, please try again.")
            continue
        if np.isfinite(number):
            return number
        # float() accepts 'nan' / 'inf'; these cannot be scaled or classified meaningfully
        print("Please enter a finite number.")


# Ask the user to input values for each feature
print("Please enter values for each feature:")
feature_names = list(X.columns)
user_input = [_prompt_for_float(feature) for feature in feature_names]

# Convert the user input into a format suitable for prediction
user_input = np.array(user_input).reshape(1, -1)  # Reshape into a 2D array (1 row)

# The scaler and the classifier were both fitted on DataFrames, so pass named
# columns: this avoids scikit-learn's "X does not have valid feature names"
# warning and lets it verify the column names and order.
user_input_df = pd.DataFrame(user_input, columns=feature_names)

# Scale the user input using the same scaler used during training
user_input_scaled = scaler.transform(user_input_df)

# Predict using the trained model
prediction = knn.predict(pd.DataFrame(user_input_scaled, columns=feature_names))[0]

# Convert prediction to diagnosis
diagnosis = 'Malignant' if prediction == 1 else 'Benign'

# Output the prediction
print("Predicted Diagnosis:", diagnosis)

Please enter values for each feature:
radius_mean: 17.99
texture_mean: 10.38
perimeter_mean: 121.1
area_mean: 1200
smoothness_mean: 0.1184
compactness_mean: 0.21
concavity_mean: 0.31
concave points_mean: 0.13
symmetry_mean: 0.2419
fractal_dimension_mean: 0.07871
radius_se: 1.095
texture_se: 0.91
perimeter_se: 8.6
area_se: 154
smoothness_se: 0.006399
compactness_se: 0.049
concavity_se: 0.056
concave points_se: 0.01587
symmetry_se: 0.03004
fractal_dimension_se: 0.0061
radius_worst: 25.38
texture_worst: 17.33
perimeter_worst: 184.5
area_worst: 2019
smoothness_worst: 0.167
compactness_worst: 0.6667
concavity_worst: 0.76
concave points_worst: 0.2654
symmetry_worst: 0.46
fractal_dimension_worst: 0.119
Predicted Diagnosis: Malignant


