<a href="https://colab.research.google.com/github/sayahashemian/parkinson-data-balancing-methods/blob/main/parkinson_data_balancing_methods.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive so the dataset file stored there can be read.
# NOTE: `google.colab` is only importable inside a Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.utils import shuffle
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from scipy.spatial import distance
import seaborn as sns
from sklearn.metrics import f1_score
from sklearn.neighbors import NearestNeighbors
from sklearn.tree import DecisionTreeClassifier
import random
from imblearn.over_sampling import SMOTE
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

## Parkinson's Dataset: Data Loading

In [None]:
# Location of the dataset on the mounted Google Drive (Colab-specific absolute path).
file_path = '/content/drive/My Drive/parkinsons.data'
# Read the file with pandas' default CSV settings.
df = pd.read_csv(file_path)
# Preview the first rows.
df.head()

Unnamed: 0,name,MDVP:Fo(Hz),MDVP:Fhi(Hz),MDVP:Flo(Hz),MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP,MDVP:Shimmer,...,Shimmer:DDA,NHR,HNR,status,RPDE,DFA,spread1,spread2,D2,PPE
0,phon_R01_S01_1,119.992,157.302,74.997,0.00784,7e-05,0.0037,0.00554,0.01109,0.04374,...,0.06545,0.02211,21.033,1,0.414783,0.815285,-4.813031,0.266482,2.301442,0.284654
1,phon_R01_S01_2,122.4,148.65,113.819,0.00968,8e-05,0.00465,0.00696,0.01394,0.06134,...,0.09403,0.01929,19.085,1,0.458359,0.819521,-4.075192,0.33559,2.486855,0.368674
2,phon_R01_S01_3,116.682,131.111,111.555,0.0105,9e-05,0.00544,0.00781,0.01633,0.05233,...,0.0827,0.01309,20.651,1,0.429895,0.825288,-4.443179,0.311173,2.342259,0.332634
3,phon_R01_S01_4,116.676,137.871,111.366,0.00997,9e-05,0.00502,0.00698,0.01505,0.05492,...,0.08771,0.01353,20.644,1,0.434969,0.819235,-4.117501,0.334147,2.405554,0.368975
4,phon_R01_S01_5,116.014,141.781,110.655,0.01284,0.00011,0.00655,0.00908,0.01966,0.06425,...,0.1047,0.01767,19.649,1,0.417356,0.823484,-3.747787,0.234513,2.33218,0.410335


In [None]:
# Column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 195 entries, 0 to 194
Data columns (total 24 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   name              195 non-null    object 
 1   MDVP:Fo(Hz)       195 non-null    float64
 2   MDVP:Fhi(Hz)      195 non-null    float64
 3   MDVP:Flo(Hz)      195 non-null    float64
 4   MDVP:Jitter(%)    195 non-null    float64
 5   MDVP:Jitter(Abs)  195 non-null    float64
 6   MDVP:RAP          195 non-null    float64
 7   MDVP:PPQ          195 non-null    float64
 8   Jitter:DDP        195 non-null    float64
 9   MDVP:Shimmer      195 non-null    float64
 10  MDVP:Shimmer(dB)  195 non-null    float64
 11  Shimmer:APQ3      195 non-null    float64
 12  Shimmer:APQ5      195 non-null    float64
 13  MDVP:APQ          195 non-null    float64
 14  Shimmer:DDA       195 non-null    float64
 15  NHR               195 non-null    float64
 16  HNR               195 non-null    float64
 1

In [None]:
# Number of missing values in each column.
df.isnull().sum()

name                0
MDVP:Fo(Hz)         0
MDVP:Fhi(Hz)        0
MDVP:Flo(Hz)        0
MDVP:Jitter(%)      0
MDVP:Jitter(Abs)    0
MDVP:RAP            0
MDVP:PPQ            0
Jitter:DDP          0
MDVP:Shimmer        0
MDVP:Shimmer(dB)    0
Shimmer:APQ3        0
Shimmer:APQ5        0
MDVP:APQ            0
Shimmer:DDA         0
NHR                 0
HNR                 0
status              0
RPDE                0
DFA                 0
spread1             0
spread2             0
D2                  0
PPE                 0
dtype: int64

In [None]:
# Class distribution of the target column 'status' (shows how imbalanced the data is).
df['status'].value_counts()

1    147
0     48
Name: status, dtype: int64

In [None]:
# Feature matrix: every column except the recording identifier and the target.
non_feature_columns = ['name', 'status']
X = df.drop(columns=non_feature_columns)
# Target vector: the 'status' column.
y = df.loc[:, 'status']

In [None]:
# Hold out 20% of the rows as the test set. `stratify=y` keeps the class ratio in both
# parts and the fixed `random_state` makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [None]:
# Class counts in the training split only.
label_counts = y_train.value_counts()
print(label_counts)

1    118
0     38
Name: status, dtype: int64


In [None]:
# Separate the training features by class label so each class can be resampled on its own.
X_train_negative = X_train[y_train == 0]  # rows with status == 0
X_train_positive = X_train[y_train == 1]  # rows with status == 1

## Method 1: Random undersampling — picking random samples from `X_train_positive`

In [None]:
np.random.seed(42)  # Ensure reproducibility

# Draw as many positive rows as there are negative rows. The size is derived from the
# data (instead of a hard-coded 38) so the cell stays correct if the split changes, and
# it is capped by the number of positives because sampling is done without replacement.
n_selected = min(len(X_train_negative), len(X_train_positive))
random_indices = np.random.choice(X_train_positive.index, n_selected, replace=False)
selected_positive_samples = X_train_positive.loc[random_indices]

# Label vectors matching the two row groups: 1 for the sampled positives, 0 for all negatives.
selected_positive_labels = np.ones(len(selected_positive_samples))
negative_labels = np.zeros(len(X_train_negative))

In [None]:
# Stack the sampled positive rows on top of all negative rows; the labels are
# concatenated in the same order so that row i of X matches element i of y.
X_train_balanced = pd.concat([selected_positive_samples, X_train_negative], axis=0)
y_train_balanced = np.concatenate([selected_positive_labels, negative_labels])

# Re-attach the row index of X so features and labels stay aligned when shuffled.
y_train_balanced = pd.Series(y_train_balanced, index=X_train_balanced.index)

# Shuffle the balanced dataset
X_train_balanced, y_train_balanced = shuffle(X_train_balanced, y_train_balanced, random_state=42)

# Verify label distribution
label_counts = y_train_balanced.value_counts()
print(label_counts)

1.0    38
0.0    38
dtype: int64


In [None]:
# Fit a logistic regression on the balanced training set (seeded, up to 1000 solver iterations).
# NOTE(review): the features are passed unscaled although StandardScaler is imported — confirm this is intended.
log_reg = LogisticRegression(max_iter=1000, random_state=42)
log_reg.fit(X_train_balanced, y_train_balanced)

In [None]:
# Predict the untouched test split with the model trained on the balanced data.
y_pred = log_reg.predict(X_test)

# Output the classification report and confusion matrix
print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.47      0.80      0.59        10
           1       0.91      0.69      0.78        29

    accuracy                           0.72        39
   macro avg       0.69      0.74      0.69        39
weighted avg       0.80      0.72      0.74        39

Confusion Matrix:
[[ 8  2]
 [ 9 20]]


## Method 2: K-means (samples nearest to the cluster centers)
*   Number of clusters = 10
*   Pick the 4 samples nearest to the center of each cluster



In [None]:
n_clusters = 10
samples_per_cluster = 4

# Apply K-Means clustering to the positive class
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
kmeans.fit(X_train_positive)
centers = kmeans.cluster_centers_
labels = kmeans.labels_

# For every cluster keep the (up to) `samples_per_cluster` members closest to its center.
# The search is restricted to the cluster's own members: searching all positive rows lets
# a small cluster borrow rows from its neighbours and lets the same row be selected for
# two centers, which silently duplicates rows in the "balanced" training set.
nearest_samples_indices = []
for cluster_id, center in enumerate(centers):
    # Positional (row-number) indices of the rows assigned to this cluster
    member_positions = np.flatnonzero(labels == cluster_id)
    members = X_train_positive.iloc[member_positions]
    distances = distance.cdist(members, [center], 'euclidean').flatten()
    closest = np.argsort(distances)[:samples_per_cluster]
    nearest_samples_indices.extend(member_positions[closest])

# Positions refer to rows of X_train_positive, hence .iloc
selected_positive_samples = X_train_positive.iloc[nearest_samples_indices]

# Label vectors matching the two row groups: 1 for selected positives, 0 for all negatives
selected_positive_labels = np.ones(len(selected_positive_samples))
negative_labels = np.zeros(len(X_train_negative))

X_train_balanced = pd.concat([selected_positive_samples, X_train_negative], axis=0)
y_train_balanced = np.concatenate([selected_positive_labels, negative_labels])

# Re-attach the row index of X so features and labels stay aligned when shuffled
y_train_balanced = pd.Series(y_train_balanced, index=X_train_balanced.index)

# Shuffle the balanced dataset
X_train_balanced, y_train_balanced = shuffle(X_train_balanced, y_train_balanced, random_state=42)

# Verify label distribution (the positive count can be below
# n_clusters * samples_per_cluster when a cluster has fewer members than requested)
label_counts = y_train_balanced.value_counts()
print(label_counts)

1.0    40
0.0    38
dtype: int64




In [None]:
# Fit a seeded logistic regression on the K-means-balanced training set.
log_reg = LogisticRegression(max_iter=1000, random_state=42)
log_reg.fit(X_train_balanced, y_train_balanced)

In [None]:
# Predict the untouched test split with the logistic regression fitted above.
y_pred = log_reg.predict(X_test)

# Per-class precision/recall/F1 followed by the raw confusion matrix.
print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

           0       0.47      0.80      0.59        10
           1       0.91      0.69      0.78        29

    accuracy                           0.72        39
   macro avg       0.69      0.74      0.69        39
weighted avg       0.80      0.72      0.74        39

Confusion Matrix:
[[ 8  2]
 [ 9 20]]


In [None]:
# Fit a decision tree on the balanced training set and predict the test split.
# The tree is seeded (like the logistic-regression cells) because an unseeded
# DecisionTreeClassifier can return a different tree, and therefore different
# metrics, on every run.
DTC = DecisionTreeClassifier(random_state=42)
DTC.fit(X_train_balanced, y_train_balanced)
y_pred_DTC = DTC.predict(X_test)

In [None]:
# Report the decision tree's test-set performance: per-class metrics, then the confusion matrix.
print("Classification Report Decision Tree:")
print(classification_report(y_test, y_pred_DTC))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_DTC))

Classification Report Decision Tree:
              precision    recall  f1-score   support

           0       0.45      0.90      0.60        10
           1       0.95      0.62      0.75        29

    accuracy                           0.69        39
   macro avg       0.70      0.76      0.68        39
weighted avg       0.82      0.69      0.71        39

Confusion Matrix:
[[ 9  1]
 [11 18]]


## Method 3: K-means (random samples within each cluster)
*   Number of clusters = 10
*   Pick 4 random samples from each cluster



In [None]:
n_clusters = 10
samples_per_cluster = 4

# Re-seed NumPy here so the draw below does not depend on which cells ran before this one
np.random.seed(42)

# Apply K-Means clustering to the positive class
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
kmeans.fit(X_train_positive)
labels = kmeans.labels_

# Select up to `samples_per_cluster` distinct random samples from each cluster.
# Sampling is done WITHOUT replacement: drawing with replacement can pick the same row
# several times, which puts duplicate rows into the undersampled training set.
selected_indices = []
for i in range(n_clusters):
    # Positional (row-number) indices of the rows assigned to cluster i
    cluster_indices = np.where(labels == i)[0]
    n_pick = min(samples_per_cluster, len(cluster_indices))
    selected_indices.extend(np.random.choice(cluster_indices, n_pick, replace=False))


# Use .iloc to ensure we're getting the right rows and keep track of indices
selected_positive_samples = X_train_positive.iloc[selected_indices]

# For labels, explicitly create Series or arrays representing the selected labels
selected_positive_labels = np.ones(len(selected_positive_samples))  # Create a labels array with all ones for positive samples
negative_labels = np.zeros(len(X_train_negative))  # Create a labels array with all zeros for negative samples

# Combine the datasets
X_train_balanced = pd.concat([selected_positive_samples, X_train_negative], axis=0)

# Combine the labels accordingly
y_train_balanced = np.concatenate([selected_positive_labels, negative_labels])

# Re-attach the row index of X so features and labels stay aligned when shuffled
y_train_balanced = pd.Series(y_train_balanced, index=X_train_balanced.index)

# Shuffle the balanced dataset
X_train_balanced, y_train_balanced = shuffle(X_train_balanced, y_train_balanced, random_state=42)

# Verify label distribution (the positive count can be below
# n_clusters * samples_per_cluster when a cluster has fewer members than requested)
label_counts = y_train_balanced.value_counts()
print(label_counts)

1.0    40
0.0    38
dtype: int64




In [None]:
# Fit a seeded logistic regression on the training set balanced by random per-cluster sampling.
log_reg = LogisticRegression(max_iter=1000, random_state=42)
log_reg.fit(X_train_balanced, y_train_balanced)

In [None]:
# Predict the untouched test split with the logistic regression fitted above.
y_pred = log_reg.predict(X_test)

# Per-class precision/recall/F1 followed by the raw confusion matrix.
print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

           0       0.44      0.80      0.57        10
           1       0.90      0.66      0.76        29

    accuracy                           0.69        39
   macro avg       0.67      0.73      0.67        39
weighted avg       0.79      0.69      0.71        39

Confusion Matrix:
[[ 8  2]
 [10 19]]


In [None]:
# Fit a decision tree on the balanced training set and predict the test split.
# The tree is seeded (like the logistic-regression cells) because an unseeded
# DecisionTreeClassifier can return a different tree, and therefore different
# metrics, on every run.
DTC = DecisionTreeClassifier(random_state=42)
DTC.fit(X_train_balanced, y_train_balanced)
y_pred_DTC = DTC.predict(X_test)

In [None]:
# Report the decision tree's test-set performance: per-class metrics, then the confusion matrix.
print("Classification Report Decision Tree:")
print(classification_report(y_test, y_pred_DTC))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_DTC))

Classification Report Decision Tree:
              precision    recall  f1-score   support

           0       0.44      0.80      0.57        10
           1       0.90      0.66      0.76        29

    accuracy                           0.69        39
   macro avg       0.67      0.73      0.67        39
weighted avg       0.79      0.69      0.71        39

Confusion Matrix:
[[ 8  2]
 [10 19]]


## SMOTE Oversampling

In [None]:
# Oversample with SMOTE (seeded). Only the training split is resampled; the test split is left untouched.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [None]:
# Class counts after SMOTE, to confirm that both classes now have the same size.
num_status=y_train_smote.value_counts()
num_status

0    118
1    118
Name: status, dtype: int64

In [None]:
# Fit a seeded logistic regression on the SMOTE-resampled training set and predict the test split.
log_reg = LogisticRegression(max_iter=1000, random_state=42)
log_reg.fit(X_train_smote, y_train_smote)
y_pred = log_reg.predict(X_test)

In [None]:
# Report the logistic regression's test-set performance: per-class metrics, then the confusion matrix.
print("Classification Report Logistic:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Classification Report Logistic:
              precision    recall  f1-score   support

           0       0.57      0.80      0.67        10
           1       0.92      0.79      0.85        29

    accuracy                           0.79        39
   macro avg       0.75      0.80      0.76        39
weighted avg       0.83      0.79      0.80        39

Confusion Matrix:
[[ 8  2]
 [ 6 23]]


In [None]:
# Fit a decision tree on the SMOTE-resampled training set and predict the test split.
# The tree is seeded (like the logistic-regression cells) because an unseeded
# DecisionTreeClassifier can return a different tree, and therefore different
# metrics, on every run.
DTC = DecisionTreeClassifier(random_state=42)
DTC.fit(X_train_smote, y_train_smote)
y_pred_DTC = DTC.predict(X_test)

In [None]:
# Report the decision tree's test-set performance: per-class metrics, then the confusion matrix.
print("Classification Report Decision Tree:")
print(classification_report(y_test, y_pred_DTC))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_DTC))

Classification Report Decision Tree:
              precision    recall  f1-score   support

           0       0.62      0.80      0.70        10
           1       0.92      0.83      0.87        29

    accuracy                           0.82        39
   macro avg       0.77      0.81      0.78        39
weighted avg       0.84      0.82      0.83        39

Confusion Matrix:
[[ 8  2]
 [ 5 24]]


## Oversampling with a genetic algorithm

In [None]:
# Split the training features by label: status == 0 and status == 1.
# NOTE(review): the "fraud" naming does not match this dataset and these two variables
# hold the same rows as X_train_negative / X_train_positive; they do not appear to be
# used by the visible cells — confirm before removing.
X_train_non_fraud = X_train[y_train == 0]
X_train_fraud = X_train[y_train == 1]

In [None]:
def evaluate_dataset(X_train, y_train, X_test, y_test, random_state=42):
    """Train a decision tree on one dataset and score it on another.

    Parameters
    ----------
    X_train, y_train : features and labels used to fit the tree.
    X_test, y_test : features and labels used to score it.
    random_state : seed passed to the tree. It is fixed by default because the
        genetic algorithm compares the scores of two nearly identical training
        sets; with an unseeded tree the difference would be dominated by the
        tree's own randomness rather than by the added sample.

    Returns
    -------
    float
        ``f1_score(y_test, y_pred)`` with scikit-learn's defaults, i.e. the
        binary F1 of the class labelled 1.
    """
    classifier = DecisionTreeClassifier(random_state=random_state)
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    return f1_score(y_test, y_pred)

def calculate_fitness(individual, X_train, y_train, X_test, y_test, k_neighbors, beta, minority_class=None):
    """Score one candidate minority sample for the genetic oversampler.

    The fitness is ``beta * weight + (1 - beta) * delta_f1`` where

    * ``weight`` is a random number drawn from a band chosen by how many of the
      candidate's ``k_neighbors`` nearest training rows belong to the majority
      class (more majority neighbours -> higher band), and
    * ``delta_f1`` is the change in ``evaluate_dataset`` score when the candidate
      is appended to the training set with the minority label.

    Parameters
    ----------
    individual : 1-D feature vector of the candidate sample.
    X_train, y_train : current training features and labels.
    X_test, y_test : data on which the F1 change is measured.
    k_neighbors : number of nearest training rows inspected.
    beta : mixing factor between the neighbourhood weight and the F1 change.
    minority_class : label given to the candidate. When omitted it is taken to be
        the least frequent label in ``y_train``, so the function no longer depends
        on a module-level ``minority_class`` variable being defined.

    Returns
    -------
    float
        The fitness value (not deterministic: the weight is drawn with ``random``).
    """
    # Positional indexing below requires a plain array (a pandas Series would index by label).
    y_train = np.asarray(y_train)
    if minority_class is None:
        classes, counts = np.unique(y_train, return_counts=True)
        minority_class = classes[np.argmin(counts)]

    nbrs = NearestNeighbors(n_neighbors=k_neighbors).fit(X_train)
    _, indices = nbrs.kneighbors([individual])
    # Share of the nearest training rows that are NOT minority (instead of a hard-coded label 1)
    majority_neighbors_ratio = np.mean(y_train[indices[0]] != minority_class)

    if majority_neighbors_ratio >= 0.75:
        minority_label_weight = random.uniform(0.8, 1.0)
    elif majority_neighbors_ratio >= 0.5:
        minority_label_weight = random.uniform(0.6, 0.8)
    elif majority_neighbors_ratio >= 0.25:
        minority_label_weight = random.uniform(0.4, 0.6)
    else:
        minority_label_weight = random.uniform(0.2, 0.4)

    # F score (change in F1 score) caused by adding the candidate to the training set.
    # NOTE(review): the baseline score is recomputed (one extra tree fit) on every call.
    original_f1_score = evaluate_dataset(X_train, y_train, X_test, y_test)
    temp_X_train = np.vstack((X_train, individual))
    temp_y_train = np.append(y_train, minority_class)
    new_f1_score = evaluate_dataset(temp_X_train, temp_y_train, X_test, y_test)
    delta_f1_score = new_f1_score - original_f1_score

    fitness = beta * minority_label_weight + (1 - beta) * delta_f1_score
    return fitness

def gen_sample(X_train, y_train, X_test, y_test, beta, minority_class, k_neighbors):
    """Oversample the minority class with a small genetic algorithm.

    Each iteration takes the fittest individual of the population, picks a second
    parent among its nearest original minority rows, creates two children on the
    line segment between the parents and keeps the fitter child as a new
    synthetic sample.

    Parameters
    ----------
    X_train, y_train : training features (2-D) and labels (1-D); converted to
        NumPy arrays internally.
    X_test, y_test : data passed to the fitness / evaluation functions.
        NOTE(review): if this is the final test set, sample generation is guided
        by the data later used for reporting — consider passing a validation split.
    beta : weight of the neighbourhood term in the fitness (see calculate_fitness).
    minority_class : label of the class to oversample.
    k_neighbors : neighbourhood size for fitness and for second-parent selection.

    Returns
    -------
    (ndarray, ndarray)
        Training features and labels with the synthetic minority rows appended.
        When there are no minority rows the inputs are returned unchanged (copies).
    """
    X_train = np.asarray(X_train)
    y_train = np.asarray(y_train)
    minority_examples = X_train[y_train == minority_class]
    n_minority = len(minority_examples)

    # Nothing to breed from: stacking an empty sample list onto X_train would raise.
    if n_minority == 0:
        return X_train.copy(), y_train.copy()

    # Initialize population
    population = minority_examples.copy()
    fitness_scores = [calculate_fitness(ind, X_train, y_train, X_test, y_test, k_neighbors, beta) for ind in population]

    # Number of synthetic rows to ADD: twice the original minority count,
    # i.e. the minority class ends up three times its original size.
    target_new_samples = 2 * n_minority
    new_samples = []
    # NOTE(review): prev_eval_measure is never updated, so the second loop condition
    # below is always true (an F1 score is never negative) and the loop only stops on
    # the sample target. Left unchanged because the intended criterion is unclear.
    prev_eval_measure = 0
    curr_eval_measure = evaluate_dataset(X_train, y_train, X_test, y_test)

    # The neighbour model over the original minority rows never changes, so fit it once.
    # One extra neighbour is requested (capped by the number of rows) because an
    # original minority row is returned as its own nearest neighbour.
    n_query = min(k_neighbors + 1, n_minority)
    minority_nbrs = NearestNeighbors(n_neighbors=n_query).fit(minority_examples)

    while len(new_samples) < target_new_samples and curr_eval_measure >= prev_eval_measure:
        # Selection based on fitness
        fittest_index = np.argmax(fitness_scores)
        fittest1 = population[fittest_index]
        fitness_scores.pop(fittest_index)
        population = np.delete(population, fittest_index, axis=0)

        # Finding k-nearest neighbors among minority examples
        distances, indices = minority_nbrs.kneighbors([fittest1])

        # Drop zero-distance matches: crossing a point with itself yields children
        # identical to the parent, i.e. an exact duplicate instead of a new sample.
        candidates = indices[0][distances[0] > 0]
        if len(candidates) == 0:
            # Only identical rows available (e.g. a single minority row): fall back.
            candidates = indices[0]
        candidates = candidates[:k_neighbors]

        # Randomly select one of the k-nearest neighbors as the second parent
        fittest2_index = random.choice(candidates)
        fittest2 = minority_examples[fittest2_index]

        # Crossover: two points on the segment between the parents, mirrored around its middle
        lambda_value = random.uniform(0, 1)
        child1 = fittest1 + (fittest2 - fittest1) * lambda_value
        child2 = fittest1 + (fittest2 - fittest1) * (1 - lambda_value)

        # Evaluate children
        fit1_score = calculate_fitness(child1, X_train, y_train, X_test, y_test, k_neighbors, beta)
        fit2_score = calculate_fitness(child2, X_train, y_train, X_test, y_test, k_neighbors, beta)

        # Add the fitter child to the new samples and population
        if fit1_score > fit2_score:
            new_samples.append(child1)
            population = np.vstack((population, child1))
        else:
            new_samples.append(child2)
            population = np.vstack((population, child2))

        # Recalculate the fitness scores for the updated population
        fitness_scores = [calculate_fitness(ind, X_train, y_train, X_test, y_test, k_neighbors, beta) for ind in population]

        # Update the evaluation measure
        curr_eval_measure = evaluate_dataset(np.vstack((X_train, new_samples)), np.hstack((y_train, [minority_class] * len(new_samples))), X_test, y_test)

    # Return the dataset with the new samples added
    return np.vstack((X_train, new_samples)), np.hstack((y_train, [minority_class] * len(new_samples)))

# Seed both random sources: the genetic algorithm draws its weights, parent choices and
# crossover points from Python's `random` module, which was never seeded before.
random.seed(42)
np.random.seed(42)

# Splitting the data into training and test sets.
# The parameters match the split used by the earlier methods (20% test, stratified,
# same seed) so that (a) this method is scored on the same test rows as the others and
# (b) re-running an earlier cell afterwards does not silently see a different split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)


# Label of the class to oversample
minority_class = 0

# Fitness mixing factor and neighbourhood size passed to the genetic algorithm
beta = 0.5
k_neighbors = 5

# NOTE(review): the test split is passed to gen_sample, which uses it to score candidate
# samples; metrics reported on the same split afterwards are therefore optimistic.
# Consider carving a validation split out of the training data for this purpose.
X_train_resampled, y_train_resampled = gen_sample(X_train.to_numpy(), y_train.to_numpy(), X_test.to_numpy(), y_test.to_numpy(), beta, minority_class, k_neighbors)

In [None]:
# Append the label vector as the last column of the resampled feature matrix.
# Everything lives in one NumPy array, so the labels are displayed as floats (1.0 / 0.0).
resampled_data = np.hstack((X_train_resampled, y_train_resampled.reshape(-1, 1)))

# Rebuild a labelled frame using the feature column names of X plus 'status'.
resampled_df = pd.DataFrame(resampled_data, columns=list(X.columns) + ['status'])

# Preview the first rows.
resampled_df.head()

Unnamed: 0,MDVP:Fo(Hz),MDVP:Fhi(Hz),MDVP:Flo(Hz),MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP,MDVP:Shimmer,MDVP:Shimmer(dB),...,Shimmer:DDA,NHR,HNR,RPDE,DFA,spread1,spread2,D2,PPE,status
0,119.056,125.213,86.795,0.00346,3e-05,0.00169,0.0017,0.00508,0.01201,0.106,...,0.01898,0.00903,23.389,0.470972,0.721308,-5.436135,0.254909,2.51632,0.232209,1.0
1,125.641,141.068,116.346,0.03316,0.00026,0.02144,0.01522,0.06433,0.09178,0.891,...,0.16074,0.31482,8.867,0.671299,0.656846,-3.700544,0.260481,2.991063,0.370961,1.0
2,171.041,208.313,75.501,0.00455,3e-05,0.0025,0.00234,0.0075,0.01966,0.186,...,0.02666,0.01095,25.908,0.418622,0.720916,-6.18359,0.226278,2.589702,0.147403,1.0
3,119.031,127.533,109.216,0.0044,4e-05,0.00214,0.00192,0.00641,0.01033,0.098,...,0.01614,0.01724,26.842,0.457541,0.699787,-6.890021,0.152941,2.328513,0.112856,1.0
4,116.556,592.03,86.228,0.00496,4e-05,0.00254,0.00263,0.00762,0.0166,0.154,...,0.0246,0.01397,23.958,0.566424,0.667654,-6.431119,0.15331,2.161936,0.120605,0.0


In [None]:
X = resampled_df.drop(columns=['status'], axis=1)
y = resampled_df['status']

In [None]:
label_counts = y.value_counts()
print(label_counts)

1.0    99
0.0    93
Name: status, dtype: int64


In [None]:
# Fit a seeded logistic regression on the GA-resampled training set.
# The model is fitted on the NumPy arrays returned by gen_sample, not on a DataFrame.
log_reg = LogisticRegression(max_iter=1000, random_state=42)
log_reg.fit(X_train_resampled, y_train_resampled)

In [None]:
# The model was fitted on a plain NumPy array, so predict on an array as well:
# passing the DataFrame mixes inputs with and without feature names.
y_pred = log_reg.predict(X_test.to_numpy())

# Output the classification report and confusion matrix
print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.64      0.82      0.72        17
           1       0.93      0.83      0.88        48

    accuracy                           0.83        65
   macro avg       0.78      0.83      0.80        65
weighted avg       0.85      0.83      0.84        65

Confusion Matrix:
[[14  3]
 [ 8 40]]


