In [23]:
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder
import numpy as np

In [24]:
# Read the source CSV into a DataFrame; edit `file_path` to point at a different input file.
file_path = 'mat,heatremoved.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

In [25]:
# Columns holding alphanumeric identifiers. They are label-encoded to integer
# codes before imputation and decoded back to labels afterwards.
alphanumeric_columns = [
    'Mat Code',
    'Heat No',
    'Wagon No',
]

In [45]:
# Label-encode each alphanumeric column, keeping one fitted encoder per column
# so the integer codes can be mapped back to the original labels later.
#
# Missing cells are deliberately left as NaN: fitting the encoder on them would
# (depending on the scikit-learn version) either raise or turn "missing" into an
# ordinary class code, hiding those cells from the KNN imputer.
#
# NOTE: this overwrites `data` in place. Re-running this cell without first
# re-running the load cell fits the encoders on already-encoded values, after
# which the original labels can no longer be recovered.
label_encoders = {}
for column in alphanumeric_columns:
    encoder = LabelEncoder()
    observed = data[column].notna()
    encoder.fit(data.loc[observed, column])

    # Float column so that NaN can coexist with the integer codes.
    encoded = pd.Series(np.nan, index=data.index, dtype=float)
    encoded.loc[observed] = encoder.transform(data.loc[observed, column])

    data[column] = encoded
    label_encoders[column] = encoder

In [27]:
# Neighbour count handed to the KNN imputer; tune this for your dataset.
n_neighbors = 5

In [46]:

# Build the KNN imputer, learn from `data`, then fill its missing entries.
# The imputed values come back without column labels, which is why the
# following cell re-wraps them in a DataFrame.
knn_imputer = KNNImputer(n_neighbors=n_neighbors)
imputed_data = knn_imputer.fit(data).transform(data)

In [47]:
# Re-wrap the imputed array as a DataFrame carrying the original column labels.
imputed_df = pd.DataFrame(data=imputed_data, columns=data.columns)


In [48]:
# Map the integer codes back to the original alphanumeric labels.
# KNN imputation averages neighbouring values, so an imputed code may be
# fractional. Round to the nearest code before decoding: a bare astype(int)
# truncates toward zero and would, for example, decode 3.8 as class 3.
for column in alphanumeric_columns:
    codes = imputed_df[column].round().astype(int)
    imputed_df[column] = label_encoders[column].inverse_transform(codes)

In [53]:
# Define a function that fills NaN values left in the numeric columns using KNN imputation
def iterative_impute(df, knn_neighbors, max_iterations=10):
    """Fill remaining NaN cells in the numeric columns of ``df`` with KNN imputation.

    All numeric columns that contain at least one observed value are imputed
    jointly, so neighbours are found using the other numeric features. Only
    cells that were NaN are written back; observed values are left untouched.

    Columns that cannot be imputed this way are skipped and keep their NaN
    cells: non-numeric columns (e.g. decoded string labels) and numeric columns
    that are entirely NaN (the imputer has nothing to learn from them).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to fill. It is modified in place.
    knn_neighbors : int
        Number of neighbours passed to ``KNNImputer``.
    max_iterations : int, default 10
        Upper bound on imputation passes. The loop stops earlier as soon as no
        NaN remains or no further cell can be filled.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    for _ in range(max_iterations):
        if not df.isna().any().any():
            break

        numeric_columns = df.select_dtypes(include='number').columns
        # An all-NaN column would be dropped by the imputer and misalign the
        # output, so only fit on columns with at least one observed value.
        fit_columns = [col for col in numeric_columns if df[col].notna().any()]
        if not fit_columns or not df[fit_columns].isna().any().any():
            # Whatever NaN is left lives in columns KNN cannot handle.
            break

        knn_imputer_iter = KNNImputer(n_neighbors=knn_neighbors)
        filled = np.asarray(knn_imputer_iter.fit_transform(df[fit_columns]))

        for position, col in enumerate(fit_columns):
            missing = df[col].isna().to_numpy()
            if missing.any():
                # Positional write-back avoids any index alignment surprises.
                df.loc[missing, col] = filled[missing, position]
    return df

In [54]:

# Run the follow-up imputation pass over imputed_df to fill any NaN cells still present.
imputed_df = iterative_impute(df=imputed_df, knn_neighbors=n_neighbors)

In [55]:
# Persist the imputed table as CSV, leaving the DataFrame index out of the file.
imputed_df.to_csv(path_or_buf='imputed_data.csv', index=False)

In [56]:
# Display the filled data
# NOTE(review): the recorded output below shows integers under 'Mat Code',
# 'Heat No' and 'Wagon No' rather than alphanumeric labels. Possibly the
# encoding cell was re-run on already-encoded data, so the inverse transform
# returned codes; confirm against the raw CSV after Restart & Run All.
print(imputed_df)

       SNO  Mat Code  Heat No  Wagon No
0      1.0         3       60        26
1      2.0         3       59        26
2      3.0         3       59        26
3      4.0         4       56        10
4      5.0         3       65        10
..     ...       ...      ...       ...
479  480.0         4       27         9
480  481.0         0       27         9
481  482.0         0       65         9
482  483.0         0       27         9
483  484.0         0       27         9

[484 rows x 4 columns]
