In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score

In [2]:
df = pd.read_csv(filepath_or_buffer='./analysis_df.csv', delimiter=',')

In [7]:
model_df = df[df['Salary'].notna()]
# Create salary ranges
salary_ranges = pd.qcut(model_df['Salary'], q=3, labels=['Low', 'Medium', 'High'])

# Create Salary Range column
model_df['Salary Range'] = salary_ranges

# Select the relevant features and the target variable
features = ['Age', 'Title', 'Formal Education', 'Coding Experience', 'ML Experience', 'Country']
target = 'Salary Range'

# Preprocess the data by converting categorical variables into numerical representation
survey_data_encoded = pd.get_dummies(model_df[features])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(survey_data_encoded, model_df[target], test_size=0.2, random_state=42)

# Create the KNN classifier
knn_classifier = KNeighborsClassifier()

# Define the parameter grid for grid search
param_grid = {'n_neighbors': list(range(10, 30, 2))}

# Perform grid search with cross-validation
grid_search = GridSearchCV(knn_classifier, param_grid, cv=len(param_grid['n_neighbors']))
grid_search.fit(X_train, y_train)

# Get the best parameter value and the corresponding classifier
best_n_neighbors = grid_search.best_params_['n_neighbors']
best_classifier = grid_search.best_estimator_

# Make predictions on the test set using the best classifier
predictions = best_classifier.predict(X_test)

# Evaluate the model using accuracy score
accuracy = accuracy_score(y_test, predictions)
print("Best n_neighbors:", best_n_neighbors)
print("Accuracy:", accuracy)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  model_df['Salary Range'] = salary_ranges


Best n_neighbors: 26
Accuracy: 0.6584500098599881
