# Importing the necessary libraries

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
import re

# Load the data

In [3]:
# Read the training and test sets (in that order) from the CSV files
# located in the current working directory.
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv')
)


# Data Preprocessing

In [None]:
# The values in 'Total Assets' and 'Liabilities' are stored as strings, so this
# function converts them into numbers according to their unit suffix.
def get_num(string):
    """Convert an amount such as ``'2 Crore+'`` into a plain integer.

    The first run of digits in the value is multiplied according to the unit
    suffix found in it: ``Crore+`` (x 10,000,000), ``Lac+`` (x 100,000),
    ``Thou+`` (x 1,000) or ``Hund+`` (x 100). A number with no recognised
    suffix is returned unchanged instead of being discarded.

    Args:
        string: The raw cell value. Normally a ``str``; non-string values
            (for example the NaN pandas produces for an empty cell) are
            accepted as well.

    Returns:
        The amount as an ``int``. ``0`` is returned when no number can be
        extracted (empty text, text without digits, NaN or None).
    """
    if not isinstance(string, str):
        # Already-numeric cells are passed through; NaN, None and other
        # unconvertible values are treated like unparseable text.
        try:
            return int(string)
        except (TypeError, ValueError, OverflowError):
            return 0

    numeric_part = re.search(r'\d+', string)
    if numeric_part is None:
        return 0
    num = int(numeric_part.group())

    # Checked in order, largest unit first.
    suffix_multipliers = (
        ('Crore+', 10000000),
        ('Lac+', 100000),
        ('Thou+', 1000),
        ('Hund+', 100),
    )
    for suffix, multiplier in suffix_multipliers:
        if suffix in string:
            return num * multiplier

    # No unit suffix: the digits already are the amount.
    return num


# Apply get_num to the monetary columns of both train_data and test_data
for frame in (train_data, test_data):
    for column in ('Total Assets', 'Liabilities'):
        frame[column] = frame[column].apply(get_num)

# Party and state are categorical variables, so each value is assigned a unique
# integer code. Every column gets its own encoder (so no mapping is overwritten),
# and the encoder is fitted on the values of BOTH files so that a category that
# only appears in test.csv does not make transform() raise a ValueError.
# LabelEncoder sorts its classes, so the codes are identical to fitting on
# train_data alone whenever the test categories are a subset of the train ones.
label_encoders = {}
for column in ('Party', 'state'):
    column_encoder = LabelEncoder()
    column_encoder.fit(
        pd.concat([train_data[column], test_data[column]], ignore_index=True)
    )
    train_data[column] = column_encoder.transform(train_data[column])
    test_data[column] = column_encoder.transform(test_data[column])
    label_encoders[column] = column_encoder

# Keep the original name bound to the last fitted encoder (the one for 'state'),
# as it was before, in case later cells refer to it.
encoder = label_encoders['state']

# Remove the target variable and the identifier-like columns that are not used
# as features for predicting the education level
X = train_data.drop(columns=['Education', 'ID', 'Candidate', 'Constituency ∇'])
y = train_data['Education']

# Split the training data into training (80%) and validation (20%) parts;
# the fixed random_state keeps the split reproducible
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


# Defining the Model

In [None]:
# Three-step pipeline: fill missing values with the column median, standardise
# the features, then classify with k-nearest neighbours.
pipeline = make_pipeline(
    SimpleImputer(strategy='median'),
    StandardScaler(),
    KNeighborsClassifier(),
)

# Candidate hyperparameters for the KNN step. The keys use the step name that
# make_pipeline derives from the class name ('kneighborsclassifier').
param_grid = {
    'kneighborsclassifier__n_neighbors': [3, 5, 10, 15],
    'kneighborsclassifier__weights': ['uniform', 'distance'],
    'kneighborsclassifier__p': [1, 2],
}

# Exhaustive search over the grid with 5-fold cross-validation, ranking the
# combinations by weighted F1 score.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='f1_weighted',
    cv=5,
)

# Run the search on the training split only
grid_search.fit(X_train, y_train)

# The pipeline refitted with the best-scoring hyperparameter combination
best_model = grid_search.best_estimator_

# Applying the best model on the validation data

In [None]:
# Score the tuned model on the held-out validation split of train_data
y_pred = best_model.predict(X_val)
f1 = f1_score(y_true=y_val, y_pred=y_pred, average='weighted')
print("Best F1 Score on Validation Set:", f1)

# Build the test features by dropping the identifier columns, then predict
X_test = test_data.drop(['ID', 'Candidate', 'Constituency ∇'], axis=1)
test_predictions = best_model.predict(X_test)

# Write the predictions to a CSV file with the columns ID and Education
output_df = test_data[['ID']].assign(Education=test_predictions)
output_df.to_csv('KNNimproved.csv', index=False)