In [None]:
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import precision_recall_fscore_support
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import mode
from sklearn.metrics import accuracy_score

## Part 1

In [None]:
# Load the Cleveland dataset; the relative path is resolved against the
# notebook's current working directory.
df = pd.read_csv("cleveland.csv")

In [None]:
# Rename the raw `num` column to `disease` and cap it at 1 so every value
# above 1 collapses into class 1.
# Cells holding "?" are turned into NA and the rows containing them dropped.
df = (
    df.rename(columns={"num": "disease"})
    .assign(disease=lambda frame: frame["disease"].clip(upper=1))
    .replace("?", pd.NA)
    .dropna()
    # Independent frame, so the column assignments in later cells are not
    # treated by pandas as writes to a slice of the unfiltered data.
    .copy()
)
df

### Standardization

In [None]:
# ca and thal need an explicit cast to float before any arithmetic
# (presumably because they were read as text -- TODO confirm against the CSV).
for text_column in ("ca", "thal"):
    df[text_column] = df[text_column].astype("float")

# z-score each feature: subtract the column mean and divide by the column's
# standard deviation (pandas' default sample std).
feature_columns = [
    "thalach", "exang", "oldpeak", "ca", "thal", "age", "sex",
    "cp", "trestbps", "chol", "fbs", "restecg", "slope",
]
for column in feature_columns:
    values = df[column]
    df[column] = (values - values.mean()) / values.std()

### Correlation plots: which features have a large regression slope against the disease label?

In [None]:
# One regression panel per feature, filled row by row on a 3x5 grid.
fig, axes = plt.subplots(3, 5, figsize=(35, 13), sharex=False)
fig.suptitle('Correlation between features and disease', fontsize=20)

panel_features = [
    'age', 'cp', 'trestbps', 'chol', 'sex',
    'fbs', 'restecg', 'thalach', 'exang', 'oldpeak',
    'slope', 'ca', 'thal',
]
for ax, feature in zip(axes.flat, panel_features):
    # The feature is plotted on x and the label on y so that the plotted data
    # match the axis labels set below.
    sns.regplot(x=feature, y='disease', data=df, ax=ax)
    ax.set_xlabel(feature, fontsize=20)
    ax.set_ylabel("disease", fontsize=20)

# 13 features on 15 panels: remove the two panels that stay empty.
for unused_ax in axes.flat[len(panel_features):]:
    fig.delaxes(unused_ax)

### Finding the best features based on correlation

In [None]:
# Split the frame into feature columns and the class label, selecting the
# label by name so the result does not depend on column order.
features = df.drop(columns="disease")
class_label = df["disease"]

slopes = {}

# Least-squares line disease = slope * feature + intercept, one fit per feature.
for feature_name, feature_data in features.items():
    slope, _ = np.polyfit(feature_data, class_label, 1)
    slopes[feature_name] = slope

# Rank by magnitude: a steep negative slope is as strong a relationship as a
# steep positive one, so sorting by the signed value would bury it at the end.
slopes_sorted = dict(sorted(slopes.items(), key=lambda item: abs(item[1]), reverse=True))
print(slopes_sorted)

#### Based on the charts, the features thalach, exang, oldpeak, ca, and thal have the largest absolute slope values.

### Distribution of the features over class label

In [None]:
# Display the column names of the cleaned frame.
df.columns

In [None]:
# Split the rows by class; these two frames are reused by the t-test cell.
disease_1 = df[df.disease==1]
disease_0 = df[df.disease==0]
figs, axes = plt.subplots(2, 7, figsize=(40, 10), sharex=False)
figs.suptitle('Distribution plot of each feature', fontsize=30)

# (feature, KDE bandwidth, (row, column) of its panel)
panels = [
    ("age", 0.5, (0, 0)), ("sex", 1, (0, 1)),
    ("cp", 0.5, (1, 0)), ("trestbps", 0.5, (1, 1)),
    ("chol", 0.5, (0, 2)), ("fbs", 1, (1, 2)),
    ("restecg", 1, (0, 3)), ("thalach", 0.5, (1, 3)),
    ("exang", 1, (0, 4)), ("oldpeak", 0.5, (1, 4)),
    ("slope", 1, (0, 5)), ("ca", 0.5, (1, 5)),
    ("thal", 1, (0, 6)),
]
# Label and colour of each class: D = disease == 1, H = disease == 0.
classes = ((disease_1, 'D', 'C0'), (disease_0, 'H', 'C1'))

for feature, bandwidth, position in panels:
    ax = axes[position]
    for subset, label, color in classes:
        # histplot replaces the deprecated distplot; bw_method is the current
        # name of the KDE bandwidth option formerly passed as `bw`.
        sns.histplot(
            subset[feature], label=label, color=color, alpha=0.4,
            stat="density", bins=5, kde=True,
            kde_kws={'bw_method': bandwidth}, ax=ax,
        )
    ax.set_xlabel(feature, fontsize=20)
    ax.set_ylabel("Density", fontsize=20)
    # Without a legend the D/H labels are never drawn and the two
    # distributions cannot be told apart.
    ax.legend()

# 13 features on 14 panels: remove the empty one.
figs.delaxes(axes[1, 6])


### T-test comparing the two groups {0, 1} for each feature


In [None]:
from scipy.stats import ttest_ind

# Two-sample t-test per feature between the disease == 1 and disease == 0
# rows; only features with p < 0.05 are kept.
significant_results = {}
feature_names = df.columns[:-1]  # every column except the last one

for feature in feature_names:
    stat, p = ttest_ind(disease_1[feature], disease_0[feature])
    if not (p < 0.05):
        continue
    significant_results[feature] = {'T-Statistic': stat, 'P-Value': p}

print(significant_results)

### Selecting our features for further analysis

In [None]:

# Two candidate feature sets: the four-feature set, and the same set plus exang.
top_four = ['thalach', 'oldpeak', 'ca', 'thal']
top_five = ['thalach', 'exang', 'oldpeak', 'ca', 'thal']

X_first_time = df[top_four].to_numpy()
X = df[top_five].to_numpy()

# Class label as a plain array.
y = df["disease"].to_numpy()



## k-nearest neighbors function

In [None]:
def knn(n_neighbors, X_train, y_train, X_test, y_test):
    """Classify X_test by majority vote of its nearest training samples and score the result.

    Parameters
    ----------
    n_neighbors : int
        Number of neighbours that vote for each test sample.
    X_train, X_test : array-like
        Feature matrices for fitting and for prediction.
    y_train, y_test : array-like
        Binary labels (0 or 1) for the training and test samples. A pandas
        Series is accepted; it is read positionally.

    Returns
    -------
    list
        ``[precision, recall, f_score, support, accuracy]``. The first four
        are per-class arrays ordered as labels ``[0, 1]``; accuracy is the
        overall accuracy on the test samples.
    """
    # kneighbors returns positional row indices. Converting the labels to a
    # plain array guarantees positional lookup; indexing a pandas Series with
    # those indices would be label-based instead.
    train_labels = np.asarray(y_train)

    nn = NearestNeighbors(n_neighbors=n_neighbors, metric="euclidean", algorithm="auto")
    nn.fit(X_train)
    _, indices = nn.kneighbors(X_test)

    # One row of neighbour labels per test sample.
    neighbor_labels = train_labels[indices]
    ones = (neighbor_labels == 1).sum(axis=1)
    zeros = (neighbor_labels == 0).sum(axis=1)
    # A tie between the two classes is resolved in favour of class 1.
    y_pred = (ones >= zeros).astype(int)

    p, r, f, s = precision_recall_fscore_support(y_test, y_pred, labels=[0, 1])
    acc = accuracy_score(y_test, y_pred)
    return [p, r, f, s, acc]
    

# Finding the best K

In [None]:
# For a given value of k, use k-fold cross validation (10 folds by default) to
# compute an average f-score
def averageFScore(kValue, X, y, k_fold=10):
    """Return the mean class-1 f-score of `knn` over a shuffled k-fold cross-validation.

    Parameters
    ----------
    kValue : int
        Number of neighbours passed to `knn`.
    X, y : array-like
        Feature matrix and labels; both are converted to NumPy arrays.
    k_fold : int, default 10
        Number of folds.

    Returns
    -------
    float
        Mean over the folds of the f-score that `knn` reports for class 1.

    Raises
    ------
    ValueError
        If `k_fold` is below 2 or exceeds the number of samples.
    """
    X = np.array(X)
    y = np.array(y)
    if k_fold < 2 or k_fold > len(X):
        raise ValueError(
            f"k_fold must be between 2 and the number of samples ({len(X)}), got {k_fold}"
        )

    # Shuffle the sample indices, then cut them into k_fold nearly equal
    # parts. np.array_split spreads the remainder over the first folds, so
    # every sample is tested exactly once; a fixed fold size of
    # len(X) // k_fold would never test the last len(X) % k_fold samples.
    indices = np.arange(len(X))
    np.random.shuffle(indices)
    folds = np.array_split(indices, k_fold)

    f_score_1 = []
    for fold, test_idx in enumerate(folds):
        # Train on every fold except the held-out one.
        train_idx = np.concatenate(folds[:fold] + folds[fold + 1:])
        result = knn(kValue, X[train_idx], y[train_idx], X[test_idx], y[test_idx])
        # result[2] is the per-class f-score array; index 1 is class 1.
        f_score_1.append(result[2][1])

    return np.mean(f_score_1)

k_range = range(1, 51)
best_Ks = []
plt.figure(figsize=(10, 5))
# Run the search twice (each run reshuffles the folds) and keep the winning k
# of every run.
for run in range(2):
    # Average f-score for every candidate k in this run.
    f_scores = []
    for kValue in k_range:
        f_scores.append(averageFScore(kValue, X, y))
    # First k that reaches the highest average f-score.
    best_Ks.append(k_range[f_scores.index(max(f_scores))])
    plt.plot(k_range, f_scores)
# Note: this prints the largest of the per-run winners.
print(f"best_k without average {max(best_Ks)}")

plt.title("Average f-score vs. K value", fontsize=20)
plt.xlabel("K", fontsize=20)
plt.ylabel("f-score", fontsize=20)
plt.show()

# Average of the per-run winners, truncated to an integer.
best_K = int(np.average(best_Ks))
print(f"Average best K: {best_K}")
sns.displot(data=best_Ks, kde=True)
plt.title("Distribution of Best K Values")
plt.xlabel("K Value")
plt.ylabel("Count")




In the file `FindBestK.py`, there is code which runs a 10-fold cross-validation 1000 times. We will use the best k found by that code.

In [None]:
# Override the value found above: best_K is fixed at 32, the result taken
# from the repeated cross-validation runs.
best_K = 32

### For the top 4 selected features

In [None]:
# Hold out 20% of the four-feature data and score k-NN with best_K neighbours.
X_train, X_test, y_train, y_test = train_test_split(X_first_time, y, test_size=.2)
result = knn(best_K, X_train, y_train, X_test, y_test)
precision, recall, f_score, support, acc = result
print(f'precision={precision}, recall={recall}, f-score={f_score}, support={support}')
print(f'accuracy={acc}')


### For the top 5 selected features

In [None]:
# Hold out 20% of the five-feature data and score k-NN with best_K neighbours.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)
result = knn(best_K, X_train, y_train, X_test, y_test)
precision, recall, f_score, support, acc = result
print(f'precision={precision}, recall={recall}, f-score={f_score}, support={support}')
print(f'accuracy={acc}')


### Using 10-fold Cross Validation for Evaluation of Model with Best K

In [None]:
k_fold = 10

X = np.array(X)
y = np.array(y)

# Shuffle the sample indices once and cut them into k_fold nearly equal folds.
# np.array_split spreads the remainder over the first folds, so every sample
# is tested exactly once; a fixed fold size of len(X) // k_fold would never
# test the last len(X) % k_fold samples.
indices = np.arange(len(X))
np.random.shuffle(indices)
folds = np.array_split(indices, k_fold)

# Per-fold metrics; the _0 / _1 suffix is the class the metric refers to.
precision_0, precision_1 = [], []
recall_0, recall_1 = [], []
f_score_0, f_score_1 = [], []
support_0, support_1 = [], []
accuracy = []

for fold, test_idx in enumerate(folds):
    # Train on every fold except the held-out one.
    train_idx = np.concatenate(folds[:fold] + folds[fold + 1:])
    result = knn(best_K, X[train_idx], y[train_idx], X[test_idx], y[test_idx])
    precision_0.append(result[0][0])
    precision_1.append(result[0][1])
    recall_0.append(result[1][0])
    recall_1.append(result[1][1])
    f_score_0.append(result[2][0])
    f_score_1.append(result[2][1])
    support_0.append(result[3][0])
    support_1.append(result[3][1])
    accuracy.append(result[4])

print("--- Recall ---")
print(f"recall_0 -> {(recall_0)}")
print(f"mean recall_0 -> {np.mean(recall_0)}")
print(f"recall_1 -> {(recall_1)}")
print(f"mean recall_1 -> {np.mean(recall_1)}")
print("\n--- Precision ---")
print(f"precision_0 -> {(precision_0)}")
print(f"mean precision_0 -> {np.mean(precision_0)}")
print(f"precision_1 -> {(precision_1)}")
print(f"mean precision_1 -> {np.mean(precision_1)}")
print("\n--- F-Score ---")
print(f"f-score_0 -> {(f_score_0)}")
print(f"mean f-score_0 -> {np.mean(f_score_0)}")
print(f"f-score_1 -> {(f_score_1)}")
print(f"mean f-score_1 -> {np.mean(f_score_1)}")
# Accuracy is computed over all test samples of a fold, not per class.
print("\n--- Accuracy ---")
print(f"accuracy -> {accuracy}")
print(f"mean accuracy -> {np.mean(accuracy)}")
print("\n--- Support ---")
print(f"support 0 -> {support_0}")
print(f"support 1 -> {support_1}")
    

In [None]:
k_fold = 10

X_first_time = np.array(X_first_time)
y = np.array(y)

# Shuffle the sample indices once and cut them into k_fold nearly equal folds.
# np.array_split spreads the remainder over the first folds, so every sample
# is tested exactly once; a fixed fold size of len(X_first_time) // k_fold
# would never test the trailing remainder samples.
indices = np.arange(len(X_first_time))
np.random.shuffle(indices)
folds = np.array_split(indices, k_fold)

# Per-fold metrics; the _0 / _1 suffix is the class the metric refers to.
precision_0, precision_1 = [], []
recall_0, recall_1 = [], []
f_score_0, f_score_1 = [], []
support_0, support_1 = [], []
accuracy = []

for fold, test_idx in enumerate(folds):
    # Train on every fold except the held-out one.
    train_idx = np.concatenate(folds[:fold] + folds[fold + 1:])
    result = knn(
        best_K,
        X_first_time[train_idx], y[train_idx],
        X_first_time[test_idx], y[test_idx],
    )
    precision_0.append(result[0][0])
    precision_1.append(result[0][1])
    recall_0.append(result[1][0])
    recall_1.append(result[1][1])
    f_score_0.append(result[2][0])
    f_score_1.append(result[2][1])
    support_0.append(result[3][0])
    support_1.append(result[3][1])
    accuracy.append(result[4])

print("--- Recall ---")
print(f"recall_0 -> {(recall_0)}")
print(f"mean recall_0 -> {np.mean(recall_0)}")
print(f"recall_1 -> {(recall_1)}")
print(f"mean recall_1 -> {np.mean(recall_1)}")
print("\n--- Precision ---")
print(f"precision_0 -> {(precision_0)}")
print(f"mean precision_0 -> {np.mean(precision_0)}")
print(f"precision_1 -> {(precision_1)}")
print(f"mean precision_1 -> {np.mean(precision_1)}")
print("\n--- F-Score ---")
print(f"f-score_0 -> {(f_score_0)}")
print(f"mean f-score_0 -> {np.mean(f_score_0)}")
print(f"f-score_1 -> {(f_score_1)}")
print(f"mean f-score_1 -> {np.mean(f_score_1)}")
# Accuracy is computed over all test samples of a fold, not per class.
print("\n--- Accuracy ---")
print(f"accuracy -> {accuracy}")
print(f"mean accuracy -> {np.mean(accuracy)}")
print("\n--- Support ---")
print(f"support 0 -> {support_0}")
print(f"support 1 -> {support_1}")
    

## Part 2

### Breast cancer prediction

In [None]:
# Website -> https://www.kaggle.com/datasets/fatemehmehrparvar/breast-cancer-prediction
# Read the breast-cancer CSV and discard the "S/N" and "Year" columns.
cancerData = pd.read_csv("breast-cancer-dataset.csv").drop(columns=["S/N", "Year"])
cancerData

### Pre-process

In [None]:
# Drop rows with missing values, then rows in which any cell contains "#".
cancerData = cancerData.dropna()
contains_hash = cancerData.apply(lambda row: row.astype(str).str.contains('#').any(), axis=1)
# .copy() makes the filtered frame independent, so the column assignments
# below do not raise SettingWithCopyWarning.
cancerData = cancerData[~contains_hash].copy()

# Remove surrounding whitespace so the quadrant names match the mapping below.
cancerData["Breast Quadrant"] = cancerData["Breast Quadrant"].str.strip()
cancerData = cancerData.rename({"Diagnosis Result": "diagnosis"}, axis=1)

# Integer codes for the text-valued columns. Values missing from a mapping
# are left unchanged by replace.
category_codes = {
    "Breast": {"Left": 0, "Right": 1},
    "Breast Quadrant": {"Lower inner": 0, "Lower outer": 1, "Upper inner": 2, "Upper outer": 3},
    "diagnosis": {"Benign": 0, "Malignant": 1},
}
for column, codes in category_codes.items():
    cancerData[column] = cancerData[column].replace(codes)

display(cancerData)

## Standardization

In [None]:
# Cast every feature to float and z-score it: subtract the column mean and
# divide by the column's standard deviation (pandas' default sample std).
feature_columns = [
    "Menopause", "Age", "Tumor Size (cm)", "Inv-Nodes",
    "Breast", "Metastasis", "Breast Quadrant", "History",
]
for column in feature_columns:
    as_float = cancerData[column].astype("float")
    cancerData[column] = (as_float - as_float.mean()) / as_float.std()


In [None]:
# Display the column names after standardization.
cancerData.columns

## Feature Selection

In [None]:
# One regression panel per feature on a 2x4 grid, filled column by column.
fig, axes = plt.subplots(2, 4, figsize=(30, 10))
fig.suptitle('Correlation between features and diagnosis', fontsize=20)

panel_features = [
    'Age', 'Menopause',
    'Tumor Size (cm)', 'Inv-Nodes',
    'Breast', 'Metastasis',
    'Breast Quadrant', 'History',
]
# axes.T.flat walks the grid column by column: (0,0), (1,0), (0,1), (1,1), ...
for ax, feature in zip(axes.T.flat, panel_features):
    # The feature is plotted on x and the label on y so that the plotted data
    # match the axis labels set below.
    sns.regplot(x=feature, y='diagnosis', data=cancerData, ax=ax)
    ax.set_xlabel(feature, fontsize=20)
    ax.set_ylabel("diagnosis", fontsize=20)

In [None]:
# Split the frame into feature columns and the class label, selecting the
# label by name so the result does not depend on column order.
features = cancerData.drop(columns="diagnosis")
class_label = cancerData["diagnosis"]

slopes = {}

# Least-squares line diagnosis = slope * feature + intercept, one fit per feature.
for feature_name, feature_data in features.items():
    slope, _ = np.polyfit(feature_data, class_label, 1)
    slopes[feature_name] = slope

# Rank by magnitude: a steep negative slope is as strong a relationship as a
# steep positive one, so sorting by the signed value would bury it at the end.
slopes_sorted = dict(sorted(slopes.items(), key=lambda item: abs(item[1]), reverse=True))
print(slopes_sorted)

In [None]:
# Split the rows by class; these two frames are reused by the t-test cell.
disease_1 = cancerData[cancerData.diagnosis==1]
disease_0 = cancerData[cancerData.diagnosis==0]
figs, axes = plt.subplots(2, 4, figsize=(30, 10), sharex=False)
figs.suptitle('Distribution plot of each feature', fontsize=20)

# (feature, KDE bandwidth), laid out row by row on the 2x4 grid.
panels = [
    ("History", 1), ("Breast Quadrant", 1), ("Metastasis", 0.5), ("Breast", 1),
    ("Inv-Nodes", 0.5), ("Tumor Size (cm)", 1), ("Menopause", 0.5), ("Age", 1),
]
# Label and colour of each class: D = diagnosis == 1, H = diagnosis == 0.
classes = ((disease_1, 'D', 'C0'), (disease_0, 'H', 'C1'))

for ax, (feature, bandwidth) in zip(axes.flat, panels):
    for subset, label, color in classes:
        # histplot replaces the deprecated distplot; bw_method is the current
        # name of the KDE bandwidth option formerly passed as `bw`.
        sns.histplot(
            subset[feature], label=label, color=color, alpha=0.4,
            stat="density", bins=5, kde=True,
            kde_kws={'bw_method': bandwidth}, ax=ax,
        )
    ax.set_xlabel(feature, fontsize=20)
    ax.set_ylabel("density", fontsize=20)
    # Without a legend the D/H labels are never drawn and the two
    # distributions cannot be told apart.
    ax.legend()

### T-test to confirm the p-values mentioned in the report

In [None]:
from scipy.stats import ttest_ind

# Two-sample t-test per feature between the diagnosis == 1 and diagnosis == 0
# rows; only features with p < 0.05 are kept.
significant_results = {}
feature_names = cancerData.columns[:-1]  # every column except the last one

for feature in feature_names:
    stat, p = ttest_ind(disease_1[feature], disease_0[feature])
    if not (p < 0.05):
        continue
    significant_results[feature] = {'T-Statistic': stat, 'P-Value': p}

print(significant_results)

For this part, as we can see, the features Menopause, Metastasis, Age, and Tumor Size are suitable for further analysis.

In [None]:
# Display the column names to pick the model features from.
cancerData.columns

## Preparing Training and Testing Data

In [None]:
# Feature columns fed to the model; edit this list to compare other subsets.
selected_features = ['Menopause', 'Metastasis', 'Age', 'Tumor Size (cm)', 'Inv-Nodes']

X = cancerData[selected_features].to_numpy()
y = cancerData["diagnosis"].to_numpy()

# Hold out 20% of the rows for the single-split evaluation further down.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)

## Finding Best K

In [None]:
k_range = range(1, 51)
best_Ks = []
plt.figure(figsize=(10, 5))
# Run the search twice (each run reshuffles the folds) and keep the winning k
# of every run.
for i in range(2):
    # For k-values 1-50, compute an average f-score and take the k with the highest average f-score
    f_scores = [averageFScore(kValue, X, y) for kValue in k_range]
    best_Ks.append(f_scores.index(max(f_scores))+1)
    plt.plot(k_range,f_scores)

# Most frequent winner across the runs (the smallest one on a tie). Counted
# with NumPy because recent SciPy returns a scalar from mode() for 1-D input,
# so `mode(best_Ks).mode[0]` raises there.
best_K = int(np.bincount(best_Ks).argmax())

plt.title("Average f-score vs. K value", fontsize=20)
plt.xlabel("K", fontsize=20)
plt.ylabel("f-score", fontsize=20)
plt.show()
# Note: this prints the largest of the per-run winners.
print(f"best_k without average {max(best_Ks)}")

print(f"Best K: {best_K}")
sns.displot(data=best_Ks,kde=True)
plt.title("Distribution of Best K Values")
plt.xlabel("K Value")
plt.ylabel("Count")

**NOTE:** The distribution has one bin that is dominating the others. In a situation like this, it is most suitable to use the mode for the best k (because the mode is the best value of k so much more often than the median and mean)

In the file `FindBestK.py`, there is code which runs a 10-fold cross-validation 1000 times. We will use the best k found by that code.

In [None]:
# Override the value found above: FindBestK.py reported 10 as the best k,
# so that value is used for the evaluation below.
best_K = 10

## Evaluating

In [None]:
# Score k-NN with best_K neighbours on the 80/20 split prepared earlier.
result = knn(best_K, X_train, y_train, X_test, y_test)
precision, recall, f_score, support, acc = result
print(f'precision={precision}, recall={recall}, f-score={f_score}, support={support}')
print(f'accuracy={acc}')

# 10-Fold Cross-Validation

In [None]:
k_fold = 10

X = np.array(X)
y = np.array(y)

# Shuffle the sample indices once and cut them into k_fold nearly equal folds.
# np.array_split spreads the remainder over the first folds, so every sample
# is tested exactly once; a fixed fold size of len(X) // k_fold would never
# test the last len(X) % k_fold samples.
indices = np.arange(len(X))
np.random.shuffle(indices)
folds = np.array_split(indices, k_fold)

# Per-fold metrics; the _0 / _1 suffix is the class the metric refers to.
precision_0, precision_1 = [], []
recall_0, recall_1 = [], []
f_score_0, f_score_1 = [], []
support_0, support_1 = [], []
accuracy = []

for fold, test_idx in enumerate(folds):
    # Train on every fold except the held-out one.
    train_idx = np.concatenate(folds[:fold] + folds[fold + 1:])
    result = knn(best_K, X[train_idx], y[train_idx], X[test_idx], y[test_idx])
    precision_0.append(result[0][0])
    precision_1.append(result[0][1])
    recall_0.append(result[1][0])
    recall_1.append(result[1][1])
    f_score_0.append(result[2][0])
    f_score_1.append(result[2][1])
    support_0.append(result[3][0])
    support_1.append(result[3][1])
    accuracy.append(result[4])

print("--- Recall ---")
print(f"recall_0 -> {(recall_0)}")
print(f"mean recall_0 -> {np.mean(recall_0)}")
print(f"recall_1 -> {(recall_1)}")
print(f"mean recall_1 -> {np.mean(recall_1)}")
print("\n--- Precision ---")
print(f"precision_0 -> {(precision_0)}")
print(f"mean precision_0 -> {np.mean(precision_0)}")
print(f"precision_1 -> {(precision_1)}")
print(f"mean precision_1 -> {np.mean(precision_1)}")
print("\n--- F-Score ---")
print(f"f-score_0 -> {(f_score_0)}")
print(f"mean f-score_0 -> {np.mean(f_score_0)}")
print(f"f-score_1 -> {(f_score_1)}")
print(f"mean f-score_1 -> {np.mean(f_score_1)}")
# Accuracy is computed over all test samples of a fold, not per class.
print("\n--- Accuracy ---")
print(f"accuracy -> {accuracy}")
print(f"mean accuracy -> {np.mean(accuracy)}")
print("\n--- Support ---")
print(f"support 0 -> {support_0}")
print(f"support 1 -> {support_1}")


### Because of the nature of this dataset and the careful feature selection, we get high values on all of these metrics. However, adding a feature that is not strongly correlated with the disease may degrade the results.