In [69]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.neighbors import NearestNeighbors

In [30]:
# # loading from csv
# # Read CSV and replace empty values with NaN
# df = pd.read_csv("Datasets/data.csv", na_values=["", " "])

# # Display the DataFrame after replacing empty values with NaN
# print("DataFrame with NaN values:")
# print(df)

In [31]:
# Creating example datasets
data = {
    'Category': ['A', 'A', 'B', 'B', 'C', 'C', 'B', 'A'],
    'Value': [np.nan, np.nan, 20, 25, np.nan, 30, np.nan, 15],
    'Feature1': [1.2, np.nan, 2.4, 2.5, 3.0, np.nan, 3.1, 1.5],
    'Feature2': [np.nan, 0.5, np.nan, 1.0, 1.5, 2.0, np.nan, 0.3]
}
df = pd.DataFrame(data)

print("Original DataFrame:")
print(df)

Original DataFrame:
  Category  Value  Feature1  Feature2
0        A    NaN       1.2       NaN
1        A    NaN       NaN       0.5
2        B   20.0       2.4       NaN
3        B   25.0       2.5       1.0
4        C    NaN       3.0       1.5
5        C   30.0       NaN       2.0
6        B    NaN       3.1       NaN
7        A   15.0       1.5       0.3


In [None]:
# save this as a csv
df.to_csv('Datasets/data.csv', index=False)

In [33]:
# load the csv
df = pd.read_csv('Datasets/data.csv')

In [34]:
# Identifying Missing Data
missing_values = df.isnull().sum()
print("\nMissing Values Count:")
print(missing_values)


Missing Values Count:
Category    0
Value       4
Feature1    2
Feature2    3
dtype: int64


In [35]:
# Removing Missing Values
df_dropna = df.dropna()
print("\nDataFrame after Dropping Rows with Missing Values:")
print(df_dropna)
# Can you dropna for aspecific column?
df_dropna= df.dropna(subset=['Value'])
df_dropna.reset_index(inplace=True, drop=True)
print("\nDataFrame after Dropping Rows with Missing Values:")
print(df_dropna)


DataFrame after Dropping Rows with Missing Values:
  Category  Value  Feature1  Feature2
3        B   25.0       2.5       1.0
7        A   15.0       1.5       0.3

DataFrame after Dropping Rows with Missing Values:
  Category  Value  Feature1  Feature2
0        B   20.0       2.4       NaN
1        B   25.0       2.5       1.0
2        C   30.0       NaN       2.0
3        A   15.0       1.5       0.3


### Imputation Techniques

In [36]:
def track_missing(df, col):
    """
    Adds a new column to track missing values in the specified column.
    """
    df[f"{col}_missing"] = df[col].isna().astype(int)
    return df

In [37]:
# Column Average Imputation
df_filled_mean = df.copy()
df_filled_mean['Value'] = df_filled_mean['Value'].fillna(df_filled_mean['Value'].mean())
print("\nDataFrame after Column Average Imputation:")
print(df_filled_mean)


DataFrame after Column Average Imputation:
  Category  Value  Feature1  Feature2
0        A   22.5       1.2       NaN
1        A   22.5       NaN       0.5
2        B   20.0       2.4       NaN
3        B   25.0       2.5       1.0
4        C   22.5       3.0       1.5
5        C   30.0       NaN       2.0
6        B   22.5       3.1       NaN
7        A   15.0       1.5       0.3


In [38]:
# Groupby-Based Imputation
df_grouped = df.copy()
track_missing(df_grouped, 'Value')
df_grouped['Value'] = df_grouped.groupby('Category')['Value'].transform(lambda x: x.fillna(x.mean()))
print("\nDataFrame after Groupby-Based Imputation:")
print(df_grouped)


DataFrame after Groupby-Based Imputation:
  Category  Value  Feature1  Feature2  Value_missing
0        A   15.0       1.2       NaN              1
1        A   15.0       NaN       0.5              1
2        B   20.0       2.4       NaN              0
3        B   25.0       2.5       1.0              0
4        C   30.0       3.0       1.5              1
5        C   30.0       NaN       2.0              0
6        B   22.5       3.1       NaN              1
7        A   15.0       1.5       0.3              0


In [42]:
# Filling with Specific Values
df_specific = df.copy()
track_missing(df_specific, 'Feature2')
df_specific.loc[df_specific['Feature2'].isnull(), 'Feature2'] = 999  # Filling missing Feature2 values with 1.0
print("\nDataFrame after Filling with Specific Values:")
print(df_specific)


DataFrame after Filling with Specific Values:
  Category  Value  Feature1  Feature2  Feature2_missing
0        A    NaN       1.2     999.0                 1
1        A    NaN       NaN       0.5                 0
2        B   20.0       2.4     999.0                 1
3        B   25.0       2.5       1.0                 0
4        C    NaN       3.0       1.5                 0
5        C   30.0       NaN       2.0                 0
6        B    NaN       3.1     999.0                 1
7        A   15.0       1.5       0.3                 0


In [46]:
# Filling using functions
df_funct = df.copy()
track_missing(df_funct, 'Value')

# Define a general function for filling missing values
def fill_nan_generic(row, target_column, category_column, category_means, generic_mean):
    var1 = row[target_column]  # Target column (e.g., 'Value')
    var2 = row[category_column]  # Category column (e.g., 'Category')

    # If target column value is missing, fill with the category-specific mean or generic mean
    if pd.isnull(var1):
        return category_means.get(var2, generic_mean)  # Use category mean if available, else generic mean
    return var1

# Define means (you can compute these dynamically as well)
# Define means
# element_1_mean = df_funct[df_funct['Category'] == 'A']['Value'].mean()
# element_2_mean = df_funct[df_funct['Category'] == 'B']['Value'].mean()
# generic_mean = df_funct['Value'].mean()
category_means = {
    'A': 111111,
    'B': 222222
}

generic_mean = 999999  # Global mean

# Apply function to fill 'Value' column, using generalized method
df_funct['Value'] = df_funct.apply(fill_nan_generic, axis=1, target_column='Value', category_column='Category', category_means=category_means, generic_mean=generic_mean)

print("\nDataFrame after Filling with a Generalized Function:")
print(df_funct)


DataFrame after Filling with a Generalized Function:
  Category     Value  Feature1  Feature2  Value_missing
0        A  111111.0       1.2       NaN              1
1        A  111111.0       NaN       0.5              1
2        B      20.0       2.4       NaN              0
3        B      25.0       2.5       1.0              0
4        C  999999.0       3.0       1.5              1
5        C      30.0       NaN       2.0              0
6        B  222222.0       3.1       NaN              1
7        A      15.0       1.5       0.3              0


### Clustering Methods

In [74]:
print(df)

  Category  Value  Feature1  Feature2
0        A    NaN       1.2       NaN
1        A    NaN       NaN       0.5
2        B   20.0       2.4       NaN
3        B   25.0       2.5       1.0
4        C    NaN       3.0       1.5
5        C   30.0       NaN       2.0
6        B    NaN       3.1       NaN
7        A   15.0       1.5       0.3


In [87]:
# 1. Create a copy of the original DataFrame to preserve the original data
df_knn = df.copy()

# 2. Initialize the KNN imputer with the desired number of neighbors
imputer = KNNImputer(n_neighbors=2)

# 3. Apply One-Hot Encoding to categorical data
# We convert the 'Category' column into multiple binary columns (dummy variables)
df_encoded = pd.get_dummies(df, columns=['Category'])

# 4. Apply KNN imputation to the numerical columns (exclude 'Category' columns, which are now encoded)
# Select all columns starting from the second one, which excludes the original 'Category' column
numerical_data = df_encoded

# 5. Impute missing values in the numerical data
imputed_data = imputer.fit_transform(numerical_data)

# 6. Create a new DataFrame from the imputed data
df_knn = pd.DataFrame(imputed_data, columns=numerical_data.columns)

# 7. Insert the original 'Category' column back into the DataFrame (first column)
df_knn.insert(0, 'Category', df['Category'])

# 8. Print the DataFrame after KNN imputation
print("\nDataFrame after KNN Imputation:")
print(df_knn)



DataFrame after KNN Imputation:
  Category  Value  Feature1  Feature2  Category_A  Category_B  Category_C
0        A   22.5      1.20      0.40         1.0         0.0         0.0
1        A   20.0      1.35      0.50         1.0         0.0         0.0
2        B   20.0      2.40      1.00         0.0         1.0         0.0
3        B   25.0      2.50      1.00         0.0         1.0         0.0
4        C   27.5      3.00      1.50         0.0         0.0         1.0
5        C   30.0      3.05      2.00         0.0         0.0         1.0
6        B   22.5      3.10      1.25         0.0         1.0         0.0
7        A   15.0      1.50      0.30         1.0         0.0         0.0


In [89]:
df_knn_num=df_knn.copy()
df_knn_num.drop(columns=['Category'], inplace=True)

In [None]:
# df_knn.drop(columns=['Category_A', 'Category_B', 'Category_C'], inplace=True)

Lets check that KNN we will not encode categorical here to make it easier


In [104]:
# 1. Create a copy of the original DataFrame to preserve the original data
df_knn = df.copy()

# 2. Initialize the KNN imputer with the desired number of neighbors
imputer = KNNImputer(n_neighbors=2)


# 4. Apply KNN imputation to the numerical columns (exclude 'Category' columns, which are now encoded)
# Select all columns starting from the second one, which excludes the original 'Category' column
numerical_data = df_knn.drop(columns=['Category'])

# 5. Impute missing values in the numerical data
imputed_data = imputer.fit_transform(numerical_data)

# 6. Create a new DataFrame from the imputed data
df_knn = pd.DataFrame(imputed_data, columns=numerical_data.columns)

# 7. Insert the original 'Category' column back into the DataFrame (first column)
# df_knn.insert(0, 'Category', df['Category'])

# 8. Print the DataFrame after KNN imputation
print("\nDataFrame after KNN Imputation:")
print(df_knn)



DataFrame after KNN Imputation:
   Value  Feature1  Feature2
0   17.5      1.20      0.65
1   20.0      2.00      0.50
2   20.0      2.40      1.25
3   25.0      2.50      1.00
4   27.5      3.00      1.50
5   30.0      2.75      2.00
6   22.5      3.10      1.25
7   15.0      1.50      0.30


In [105]:
print(df)

print(df_knn)

  Category  Value  Feature1  Feature2
0        A    NaN       1.2       NaN
1        A    NaN       NaN       0.5
2        B   20.0       2.4       NaN
3        B   25.0       2.5       1.0
4        C    NaN       3.0       1.5
5        C   30.0       NaN       2.0
6        B    NaN       3.1       NaN
7        A   15.0       1.5       0.3
   Value  Feature1  Feature2
0   17.5      1.20      0.65
1   20.0      2.00      0.50
2   20.0      2.40      1.25
3   25.0      2.50      1.00
4   27.5      3.00      1.50
5   30.0      2.75      2.00
6   22.5      3.10      1.25
7   15.0      1.50      0.30


In [112]:
# Check the imputation distances and neighbors (optional debugging step)

# Get the fitted neighbors for debugging purposes
neighbors = NearestNeighbors(n_neighbors=2)
neighbors.fit(df_knn)  # Use only numerical data for nearest neighbor search
distances, indices = neighbors.kneighbors(df_knn)

# Display the nearest neighbors for the first few samples (for debugging)
print("\nNearest Neighbors and Distances for the first few rows:")
print("Indices of Nearest Neighbors:\n", indices)
print("Distances to Nearest Neighbors:\n", distances)

# Loop through and display the neighbors for the first few rows to understand the imputation process
for num, ind in enumerate(indices):
    # print the first row of df_knn
    print("\nNumber:", num)
    
    # Print the original row of the DataFrame (the one being imputed)
    print("\nRow Comparision", num, "of the DataFrame:")
    print(df.iloc[num:num+1])  # Slice to keep the row in DataFrame format
    print(df_knn.iloc[num:num+1])

    print("\n")

    # Loop through and print the nearest neighbors
    print("Nearest Neighbors for Row", num, "with Indices:", ind)
    for term in ind:
        print(df.iloc[term:term+1])  # Slice to keep the row in DataFrame format
    print("\n")



Nearest Neighbors and Distances for the first few rows:
Indices of Nearest Neighbors:
 [[0 7]
 [1 2]
 [2 1]
 [3 6]
 [4 5]
 [5 4]
 [6 3]
 [7 0]]
Distances to Nearest Neighbors:
 [[0.         2.54214476]
 [0.         0.85      ]
 [0.         0.85      ]
 [0.         2.58311827]
 [0.         2.56173769]
 [0.         2.56173769]
 [0.         2.58311827]
 [0.         2.54214476]]

Number: 0

Row Comparision 0 of the DataFrame:
  Category  Value  Feature1  Feature2
0        A    NaN       1.2       NaN
   Value  Feature1  Feature2
0   17.5       1.2      0.65


Nearest Neighbors for Row 0 with Indices: [0 7]
  Category  Value  Feature1  Feature2
0        A    NaN       1.2       NaN
  Category  Value  Feature1  Feature2
7        A   15.0       1.5       0.3



Number: 1

Row Comparision 1 of the DataFrame:
  Category  Value  Feature1  Feature2
1        A    NaN       NaN       0.5
   Value  Feature1  Feature2
1   20.0       2.0       0.5


Nearest Neighbors for Row 1 with Indices: [1 2]
  C

In [None]:
# Check the imputation distances and neighbors (optional debugging step)

# Get the fitted neighbors for debugging purposes
neighbors = NearestNeighbors(n_neighbors=2)
neighbors.fit(df_knn)
distances, indices = neighbors.kneighbors(df_knn)

# Display the nearest neighbors for the first few samples (for debugging)
print("\nNearest Neighbors and Distances for the first few rows:")
print("Indices of Nearest Neighbors:\n", indices)
print("Distances to Nearest Neighbors:\n", distances)

for num, ind in enumerate(indices):
    print("number:", num)
    
    # Print the num row of the DataFrame (using the correct row based on num)
    print("\nRow", num, "of the DataFrame:")
    print(df.iloc[num:num+1])  # Slice to keep the row in DataFrame format
    print("\n")

    # Loop through and print the other neighbors as DataFrame (using the 'ind' variable for neighbors)
    for term in ind:
        print(df.iloc[term:term+1])  # Slice to keep the row in DataFrame format
    print("\n")

number: 0

Row 0 of the DataFrame:
  Category  Value  Feature1  Feature2
0        A    NaN       1.2       NaN


  Category  Value  Feature1  Feature2
0        A    NaN       1.2       NaN
  Category  Value  Feature1  Feature2
7        A   15.0       1.5       0.3


number: 1

Row 1 of the DataFrame:
  Category  Value  Feature1  Feature2
1        A    NaN       NaN       0.5


  Category  Value  Feature1  Feature2
1        A    NaN       NaN       0.5
  Category  Value  Feature1  Feature2
2        B   20.0       2.4       NaN


number: 2

Row 2 of the DataFrame:
  Category  Value  Feature1  Feature2
2        B   20.0       2.4       NaN


  Category  Value  Feature1  Feature2
2        B   20.0       2.4       NaN
  Category  Value  Feature1  Feature2
1        A    NaN       NaN       0.5


number: 3

Row 3 of the DataFrame:
  Category  Value  Feature1  Feature2
3        B   25.0       2.5       1.0


  Category  Value  Feature1  Feature2
3        B   25.0       2.5       1.0
  Category