In [1]:
# cosine similarity

from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

data = pd.read_csv('Data_After_Transformation.csv')

features = data[['Zip Code', 'Student Enrollment',
       'State Wide Assessment Results - 2023 : ELA (English Language Arts) : ALL ENROLLED - Minimally Proficient(%)',
       'State Wide Assessment Results - 2023 : ELA (English Language Arts) : ALL ENROLLED - Partially Proficient(%)',
       'State Wide Assessment Results - 2023 : ELA (English Language Arts) : ALL ENROLLED - Proficient(%)',
       'State Wide Assessment Results - 2023 : ELA (English Language Arts) : ALL ENROLLED - Highly Proficient(%)',
       'State Wide Assessment Results - 2023 All:MATH : ALL ENROLLED - Minimally Proficient(%)',
       'State Wide Assessment Results - 2023 All:MATH : ALL ENROLLED - Partially Proficient(%)',
       'State Wide Assessment Results - 2023 All:MATH : ALL ENROLLED - Proficient(%)',
       'State Wide Assessment Results - 2023 All:MATH : ALL ENROLLED - Highly Proficient(%)',
       'End of Year Promotion (%)', 'American Indian/ Alaska Native', 'Asian',
       'Black', 'Hispanic', 'Native Hawaiian/ Pacific Islander', 'White',
       'Two or More Races', 'Male', 'Female', 'Grade Levels', 'County',
       'School Type', 'Anthem', 'Avondale', 'Buckeye', 'Chandler', 'El Mirage',
       'Fountain Hills', 'Gila Bend', 'Gilbert', 'Glendale', 'Goodyear',
       'Highley', 'Laveen', 'Mesa', 'Peoria', 'Phoenix', 'Queen Creek',
       'Scottsdale', 'Surprise', 'Tempe', 'Tolleson', 'Tonopah', 'Wickenburg']].values

# Normalize the features
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# Calculate pairwise cosine similarity
similarity_matrix = cosine_similarity(features_scaled)

# Function to find similar schools
def find_similar_schools(target_school_name, similarity_matrix, data, top_n=5):
    # Find the index of the target school
    target_school_index = data.index[data['School'] == target_school_name].tolist()[0]
    
    # Get similarity scores for the target school
    similarity_scores = similarity_matrix[target_school_index]
    
    # Sort indices based on similarity scores (excluding the target school itself)
    similar_school_indices = similarity_scores.argsort()[::-1][1:top_n+1]
    
    # Get the names of similar schools
    similar_schools = data.iloc[similar_school_indices]['School'].tolist()
    
    return similar_schools

# Example usage: Find top 5 similar schools for a given target school
target_school_index = 1 
target_school_name = data.loc[target_school_index, 'School'] 
similar_schools = find_similar_schools(target_school_name, similarity_matrix, data)
print(f"top 5 similar schools with \"{target_school_name}\" using cosine similarity are:")
for i, school in enumerate(similar_schools, 1):
    print(f"{i}. {school}")


top 5 similar schools with "Great Hearts Academies - Anthem Prep" using cosine similarity are:
1. Boulder Creek High School
2. Great Hearts Academies - Archway Veritas
3. Great Hearts Academies - North Phoenix Prep
4. Great Hearts Academies - Archway Chandler
5. Basis Phoenix



In [2]:
# Jaccard Similarity

from sklearn.metrics.pairwise import pairwise_distances
from sklearn.preprocessing import binarize

# Function to compute similarity scores using Jaccard similarity
def compute_similarity_scores_jaccard(features, target_index):
    # Binarize the features matrix
    binary_features = binarize(features)

    # Extract binary features of the target school
    target_features = binary_features[target_index]

    # Compute pairwise Jaccard distances
    distances = pairwise_distances(binary_features, [target_features], metric='jaccard')

    # Convert distances to similarity scores
    similarity_scores = 1 - distances.flatten()

    return similarity_scores

# Example usage
target_school_index = 1 
similarity_scores_jaccard = compute_similarity_scores_jaccard(features_scaled, target_school_index)

# Rank schools based on Jaccard similarity scores
sorted_indices_jaccard = sorted(range(len(similarity_scores_jaccard)), key=lambda i: similarity_scores_jaccard[i], reverse=True)
top_similar_schools_jaccard = sorted_indices_jaccard[1:6]  # Exclude the target school itself

target_school_name = data.loc[target_school_index, 'School']

# Print top similar schools using Jaccard similarity
print(f"top 5 similar schools with \"{target_school_name}\" using Jaccard similarity are:")
for i, idx in enumerate(top_similar_schools_jaccard, 1):
    school_name = data.iloc[idx]['School']
    print(f"{i}. {school_name}")


top 5 similar schools with "Great Hearts Academies - Anthem Prep" using Jaccard similarity are:
1. Great Hearts Academies - Archway Chandler
2. Great Hearts Academies - Archway Veritas
3. Great Hearts Academies - Scottsdale Prep
4. Boulder Creek High School
5. Santan Junior High School




In [3]:
# Euclidean Distance

from sklearn.metrics.pairwise import euclidean_distances

# Function to compute similarity scores using Euclidean distance
def compute_similarity_scores_euclidean(features, target_index):
    target_features = features[target_index]
    similarity_scores = []

    for feature in features:
        # Compute Euclidean distance
        euclidean_dist = euclidean_distances([target_features], [feature])[0][0]

        # Convert distance to similarity score (inverse of distance)
        similarity_score = 1 / (1 + euclidean_dist)  # Adjust to your preference

        similarity_scores.append(similarity_score)

    return similarity_scores

# Example usage
target_school_index = 1 
similarity_scores_euclidean = compute_similarity_scores_euclidean(features_scaled, target_school_index)

# Rank schools based on Euclidean similarity scores
sorted_indices_euclidean = sorted(range(len(similarity_scores_euclidean)), key=lambda i: similarity_scores_euclidean[i], reverse=True)
top_similar_schools_euclidean = sorted_indices_euclidean[1:6]  # Exclude the target school itself

target_school_name = data.loc[target_school_index, 'School']

# Print top similar schools using Euclidean distance
print(f"top 5 similar schools with \"{target_school_name}\" using Euclidean distance are:")
for i, idx in enumerate(top_similar_schools_euclidean, 1):
    school_name = data.iloc[idx]['School']
    print(f"{i}. {school_name}")


top 5 similar schools with "Great Hearts Academies - Anthem Prep" using Euclidean distance are:
1. Boulder Creek High School
2. Great Hearts Academies - Archway Veritas
3. Great Hearts Academies - North Phoenix Prep
4. Great Hearts Academies - Veritas Prep
5. Basis Mesa


In [4]:
# KNN

import pandas as pd
from sklearn.neighbors import NearestNeighbors

# Load the dataset
data = pd.read_csv('Data_After_Transformation.csv')

# Fit the KNN model
k = 6  # Number of neighbors to consider
knn_model = NearestNeighbors(n_neighbors=k, algorithm='auto', metric='euclidean')
knn_model.fit(features)

# Function to find similar schools
def find_similar_schools(target_school_index, data, top_n=5):
    # Find k-nearest neighbors
    distances, indices = knn_model.kneighbors([features[target_school_index]])
    
    # Get the indices of similar schools
    similar_school_indices = indices[0][1:]  # Exclude the target school itself
    
    # Get the names of similar schools
    similar_schools = data.iloc[similar_school_indices]['School'].tolist()
    
    return similar_schools

# Example usage: Find top 5 similar schools for a given target school
target_school_index = 1  # Index of the target school in the dataset
target_school_name = data.loc[target_school_index, 'School']  # Get the name of the target school
similar_schools = find_similar_schools(target_school_index, data)
print(f"Top 5 similar schools to {target_school_name} using KNN:")
for i, school in enumerate(similar_schools, 1):
    print(f"{i}. {school}")


Top 5 similar schools to Great Hearts Academies - Anthem Prep using KNN:
1. Sossaman Middle School
2. Highland Jr High School
3. Stapley Junior High School
4. Great Hearts Academies - Scottsdale Prep
5. Great Hearts Academies - Veritas Prep


In [5]:
# SVM

import pandas as pd
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

# Load the dataset
data = pd.read_csv('Data_After_Transformation.csv')


# Standardize features
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# Train the SVM model
svm_model = SVC(kernel='linear')
svm_model.fit(features_scaled, data['School'])

# Function to find similar schools using SVM
def find_similar_schools_svm(target_school_index, data, features_scaled, svm_model, top_n=5):
    # Get the decision function values for all schools
    decision_function_values = svm_model.decision_function(features_scaled)
    
    # Calculate the distance of each school from the decision boundary
    distances = abs(decision_function_values)
    
    # Sort indices based on distance (excluding the target school itself)
    sorted_indices = distances.argsort()
    similar_school_indices = sorted_indices[sorted_indices != target_school_index][:top_n]
    
    # Get the names of similar schools
    similar_schools = data.iloc[similar_school_indices]['School'].tolist()
    
    return similar_schools

# Example usage: Find top 5 similar schools for a given target school using SVM
target_school_index = 1  # Index of the target school in the dataset
target_school_name = data.loc[target_school_index, 'School']  # Get the name of the target school
similar_schools_svm = find_similar_schools_svm(target_school_index, data, features_scaled, svm_model)
print(f"Top 5 similar schools to {target_school_name} using SVM:")
for i, school in enumerate(similar_schools_svm, 1):
    print(f"{i}. {school}")

Top 5 similar schools to Great Hearts Academies - Anthem Prep using SVM:
1. Rhodes Junior High School
2. Metropolitan Arts Institute
3. Mesa Distance Learning Program
4. East Valley High School
5. Ombudsman - Charter Northeast


In [6]:
# Decision Tree

import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler

# Load the dataset
data = pd.read_csv('Data_After_Transformation.csv')

# Standardize features
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# Train the Decision Tree model
dt_model = DecisionTreeClassifier()
dt_model.fit(features_scaled, data['School'])

# Function to find similar schools using Decision Trees
def find_similar_schools_dt(target_school_index, data, features_scaled, dt_model, top_n=5):
    # Get the feature values for the target school
    target_school_features = [features_scaled[target_school_index]]
    
    # Predict the class probabilities for all schools
    class_probabilities = dt_model.predict_proba(features_scaled)
    
    # Calculate the Euclidean distance between the target school and all other schools
    distances = ((class_probabilities - dt_model.predict_proba(target_school_features))**2).sum(axis=1)
    
    # Sort indices based on distance (excluding the target school itself)
    sorted_indices = distances.argsort()
    similar_school_indices = sorted_indices[sorted_indices != target_school_index][:top_n]
    
    # Get the names of similar schools
    similar_schools = data.iloc[similar_school_indices]['School'].tolist()
    
    return similar_schools

# Example usage: Find top 5 similar schools for a given target school using Decision Trees
target_school_index = 1  # Index of the target school in the dataset
target_school_name = data.loc[target_school_index, 'School']  # Get the name of the target school
similar_schools_dt = find_similar_schools_dt(target_school_index, data, features_scaled, dt_model)
print(f"Top 5 similar schools to {target_school_name} using Decision Trees:")
for i, school in enumerate(similar_schools_dt, 1):
    print(f"{i}. {school}")


Top 5 similar schools to Great Hearts Academies - Anthem Prep using Decision Trees:
1. Boulder Creek High School
2. Willow Canyon High School
3. Imagine Prep Surprise
4. Valley Vista High School
5. Arizona Charter Academy


In [9]:
# printing the results for Euclidean and Cosine

from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import jaccard_score
from scipy.spatial.distance import euclidean
import pandas as pd

# Load the dataset
data = pd.read_csv('Data_After_Transformation.csv')

# Normalize features
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# Function to compute similarity scores using multiple techniques
def compute_similarity_scores(features, target_index):
    target_features = features[target_index]
    cosine_similarity_scores = []
    jaccard_similarity_scores = []
    euclidean_distances = []

    for feature in features:
        # Compute cosine similarity
        cosine_sim = cosine_similarity([target_features], [feature])[0][0]
        cosine_similarity_scores.append(cosine_sim)

        # Compute Euclidean distance
        euclidean_dist = euclidean(target_features, feature)
        euclidean_distances.append(euclidean_dist)

    return cosine_similarity_scores, euclidean_distances

# Example usage
target_school_index = 0  # Index of the target school
cosine_similarity_scores, euclidean_distances = compute_similarity_scores(features_scaled, target_school_index)

# Rank schools based on cosine similarity scores
sorted_indices_cosine = sorted(range(len(cosine_similarity_scores)), key=lambda i: cosine_similarity_scores[i], reverse=True)
top_similar_schools_cosine = sorted_indices_cosine[1:6]  # Exclude the target school itself

# Rank schools based on Euclidean distances
sorted_indices_euclidean = sorted(range(len(euclidean_distances)), key=lambda i: euclidean_distances[i])
top_similar_schools_euclidean = sorted_indices_euclidean[1:6]  # Exclude the target school itself

# Print top similar schools using cosine similarity
print("Similar schools using cosine similarity:")
for i, idx in enumerate(top_similar_schools_cosine, 1):
    school_name = data.iloc[idx]['School']
    print(f"{i}. {school_name}")

# Print top similar schools using Euclidean distance
print("\nSimilar schools using Euclidean distance:")
for i, idx in enumerate(top_similar_schools_euclidean, 1):
    school_name = data.iloc[idx]['School']
    print(f"{i}. {school_name}")


Similar schools using cosine similarity:
1. Great Hearts Academies - Anthem Prep
2. Red Mountain High School
3. Desert Vista High School
4. Liberty High School
5. Mountain View High School

Similar schools using Euclidean distance:
1. Great Hearts Academies - Anthem Prep
2. Greenway, High School
3. Moon Valley High School
4. Red Mountain High School
5. Chaparral High School


In [11]:
# Getting model accuracy for Euclidean & Cosine

from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import euclidean
import pandas as pd
import numpy as np

# Load the dataset
data = pd.read_csv('Data_After_Transformation.csv')

# Select features
features = data[['Zip Code', 'Student Enrollment',
                 'State Wide Assessment Results - 2023 : ELA (English Language Arts) : ALL ENROLLED - Minimally Proficient(%)',
                 'State Wide Assessment Results - 2023 : ELA (English Language Arts) : ALL ENROLLED - Partially Proficient(%)',
                 'State Wide Assessment Results - 2023 : ELA (English Language Arts) : ALL ENROLLED - Proficient(%)',
                 'State Wide Assessment Results - 2023 : ELA (English Language Arts) : ALL ENROLLED - Highly Proficient(%)',
                 'State Wide Assessment Results - 2023 All:MATH : ALL ENROLLED - Minimally Proficient(%)',
                 'State Wide Assessment Results - 2023 All:MATH : ALL ENROLLED - Partially Proficient(%)',
                 'State Wide Assessment Results - 2023 All:MATH : ALL ENROLLED - Proficient(%)',
                 'State Wide Assessment Results - 2023 All:MATH : ALL ENROLLED - Highly Proficient(%)',
                 'End of Year Promotion (%)']].values

# Normalize features
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# Function to compute similarity scores using cosine similarity and Euclidean distance
def compute_similarity_scores(features, target_index):
    target_features = features[target_index]
    cosine_similarity_scores = []
    euclidean_distances = []

    for feature in features:
        # Compute cosine similarity
        cosine_sim = cosine_similarity([target_features], [feature])[0][0]
        cosine_similarity_scores.append(cosine_sim)

        # Compute Euclidean distance
        euclidean_dist = euclidean(target_features, feature)
        euclidean_distances.append(euclidean_dist)

    return cosine_similarity_scores, euclidean_distances

# Define the number of folds for cross-validation
k = 5

# Initialize KFold cross-validator
kf = KFold(n_splits=k)

# Initialize lists to store evaluation metrics
cosine_similarity_accuracy = []
euclidean_distance_accuracy = []

# Iterate over each fold
for train_index, test_index in kf.split(features_scaled):
    # Split data into training and testing sets for the current fold
    X_train, X_test = features_scaled[train_index], features_scaled[test_index]
    y_train, y_test = data.iloc[train_index]['School'], data.iloc[test_index]['School']
    
    # Perform similarity matching and evaluation for cosine similarity
    cosine_similarity_scores, _ = compute_similarity_scores(X_train, 0)  # Target index is assumed to be 0
    sorted_indices_cosine = sorted(range(len(cosine_similarity_scores)), key=lambda i: cosine_similarity_scores[i], reverse=True)
    top_similar_schools_cosine = sorted_indices_cosine[1:6]  # Exclude the target school itself
    
    # Compute accuracy for cosine similarity
    correct_predictions_cosine = sum(data.iloc[top_similar_schools_cosine]['School'].isin(y_test))
    accuracy_cosine = correct_predictions_cosine / len(top_similar_schools_cosine)
    cosine_similarity_accuracy.append(accuracy_cosine)
    
    # Perform similarity matching and evaluation for Euclidean distance
    _, euclidean_distances = compute_similarity_scores(X_train, 0)  # Target index is assumed to be 0
    sorted_indices_euclidean = sorted(range(len(euclidean_distances)), key=lambda i: euclidean_distances[i])
    top_similar_schools_euclidean = sorted_indices_euclidean[1:6]  # Exclude the target school itself
    
    # Compute accuracy for Euclidean distance
    correct_predictions_euclidean = sum(data.iloc[top_similar_schools_euclidean]['School'].isin(y_test))
    accuracy_euclidean = correct_predictions_euclidean / len(top_similar_schools_euclidean)
    euclidean_distance_accuracy.append(accuracy_euclidean)

# Calculate average accuracy across all folds for cosine similarity
average_accuracy_cosine = np.mean(cosine_similarity_accuracy)
print("Average Accuracy for Cosine Similarity:", average_accuracy_cosine)

# Calculate average accuracy across all folds for Euclidean distance
average_accuracy_euclidean = np.mean(euclidean_distance_accuracy)
print("Average Accuracy for Euclidean Distance:", average_accuracy_euclidean)


Average Accuracy for Cosine Similarity: 0.24
Average Accuracy for Euclidean Distance: 0.32


In [12]:
# Printing results for SVM, KNN & DT

import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler

# Load the dataset
data = pd.read_csv('Data_After_Transformation.csv')

# Select features
features = data[['Zip Code', 'Student Enrollment',
                 'State Wide Assessment Results - 2023 : ELA (English Language Arts) : ALL ENROLLED - Minimally Proficient(%)',
                 'State Wide Assessment Results - 2023 : ELA (English Language Arts) : ALL ENROLLED - Partially Proficient(%)',
                 'State Wide Assessment Results - 2023 : ELA (English Language Arts) : ALL ENROLLED - Proficient(%)',
                 'State Wide Assessment Results - 2023 : ELA (English Language Arts) : ALL ENROLLED - Highly Proficient(%)',
                 'State Wide Assessment Results - 2023 All:MATH : ALL ENROLLED - Minimally Proficient(%)',
                 'State Wide Assessment Results - 2023 All:MATH : ALL ENROLLED - Partially Proficient(%)',
                 'State Wide Assessment Results - 2023 All:MATH : ALL ENROLLED - Proficient(%)',
                 'State Wide Assessment Results - 2023 All:MATH : ALL ENROLLED - Highly Proficient(%)',
                 'End of Year Promotion (%)']].values

# Normalize features
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# Initialize KNN model
knn_model = NearestNeighbors(n_neighbors=6, algorithm='auto', metric='euclidean')
knn_model.fit(features_scaled)

# Initialize SVM model
svm_model = SVC(kernel='linear')
svm_model.fit(features_scaled, data['School'])

# Initialize Decision Tree model
dt_model = DecisionTreeClassifier()
dt_model.fit(features_scaled, data['School'])

# Function to find similar schools using KNN
def find_similar_schools_knn(target_school_index, data, features_scaled, knn_model, top_n=5):
    # Find k-nearest neighbors
    distances, indices = knn_model.kneighbors([features_scaled[target_school_index]])
    
    # Get the indices of similar schools
    similar_school_indices = indices[0][1:]  # Exclude the target school itself
    
    # Get the names of similar schools
    similar_schools = data.iloc[similar_school_indices]['School'].tolist()
    
    return similar_schools

# Function to find similar schools using SVM
def find_similar_schools_svm(target_school_index, data, features_scaled, svm_model, top_n=5):
    # Get the decision function values for all schools
    decision_function_values = svm_model.decision_function(features_scaled)
    
    # Calculate the distance of each school from the decision boundary
    distances = abs(decision_function_values)
    
    # Sort indices based on distance (excluding the target school itself)
    sorted_indices = distances.argsort()
    similar_school_indices = sorted_indices[sorted_indices != target_school_index][:top_n]
    
    # Get the names of similar schools
    similar_schools = data.iloc[similar_school_indices]['School'].tolist()
    
    return similar_schools

# Function to find similar schools using Decision Trees
def find_similar_schools_dt(target_school_index, data, features_scaled, dt_model, top_n=5):
    # Get the feature values for the target school
    target_school_features = [features_scaled[target_school_index]]
    
    # Predict the class probabilities for all schools
    class_probabilities = dt_model.predict_proba(features_scaled)
    
    # Calculate the Euclidean distance between the target school and all other schools
    distances = ((class_probabilities - dt_model.predict_proba(target_school_features))**2).sum(axis=1)
    
    # Sort indices based on distance (excluding the target school itself)
    sorted_indices = distances.argsort()
    similar_school_indices = sorted_indices[sorted_indices != target_school_index][:top_n]
    
    # Get the names of similar schools
    similar_schools = data.iloc[similar_school_indices]['School'].tolist()
    
    return similar_schools

# Example usage: Find top 5 similar schools for a given target school
target_school_index = 1  # Index of the target school in the dataset
target_school_name = data.loc[target_school_index, 'School']  # Get the name of the target school

# Find similar schools using KNN
similar_schools_knn = find_similar_schools_knn(target_school_index, data, features_scaled, knn_model)
print(f"Top 5 similar schools to {target_school_name} using KNN:")
for i, school in enumerate(similar_schools_knn, 1):
    print(f"{i}. {school}")

# Find similar schools using SVM
similar_schools_svm = find_similar_schools_svm(target_school_index, data, features_scaled, svm_model)
print(f"\nTop 5 similar schools to {target_school_name} using SVM:")
for i, school in enumerate(similar_schools_svm, 1):
    print(f"{i}. {school}")

# Find similar schools using Decision Trees
similar_schools_dt = find_similar_schools_dt(target_school_index, data, features_scaled, dt_model)
print(f"\nTop 5 similar schools to {target_school_name} using Decision Trees:")
for i, school in enumerate(similar_schools_dt, 1):
    print(f"{i}. {school}")


Top 5 similar schools to Great Hearts Academies - Anthem Prep using KNN:
1. Great Hearts Academies - Chandler Prep
2. Great Hearts Academies - Archway Veritas
3. Great Hearts Academies - Veritas Prep
4. Great Hearts Academies - North Phoenix Prep
5. Basis Phoenix Central Primary


Top 5 similar schools to Great Hearts Academies - Anthem Prep using SVM:
1. University High School
2. Sossaman Middle School
3. Mesa Academy for Advanced Studies
4. Camelback High School

5. Paragon Science Academy

Top 5 similar schools to Great Hearts Academies - Anthem Prep using Decision Trees:
1. Central High School
2. Central High School

3. Boulder Creek High School
4. Willow Canyon High School
5. Imagine Prep Surprise


In [16]:
from sklearn.model_selection import KFold
from sklearn.svm import SVC
from sklearn.neighbors import NearestNeighbors
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

# Load the dataset
data = pd.read_csv('Data_After_Transformation.csv')

# Select features
features = data[['Zip Code', 'Student Enrollment',
                 'State Wide Assessment Results - 2023 : ELA (English Language Arts) : ALL ENROLLED - Minimally Proficient(%)',
                 'State Wide Assessment Results - 2023 : ELA (English Language Arts) : ALL ENROLLED - Partially Proficient(%)',
                 'State Wide Assessment Results - 2023 : ELA (English Language Arts) : ALL ENROLLED - Proficient(%)',
                 'State Wide Assessment Results - 2023 : ELA (English Language Arts) : ALL ENROLLED - Highly Proficient(%)',
                 'State Wide Assessment Results - 2023 All:MATH : ALL ENROLLED - Minimally Proficient(%)',
                 'State Wide Assessment Results - 2023 All:MATH : ALL ENROLLED - Partially Proficient(%)',
                 'State Wide Assessment Results - 2023 All:MATH : ALL ENROLLED - Proficient(%)',
                 'State Wide Assessment Results - 2023 All:MATH : ALL ENROLLED - Highly Proficient(%)',
                 'End of Year Promotion (%)']].values

# Normalize features
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# Define the number of folds for cross-validation
k = 5

# Initialize KFold cross-validator
kf = KFold(n_splits=k)

# Initialize lists to store evaluation metrics
svm_accuracy = []
knn_accuracy = []
dt_accuracy = []

# Iterate over each fold
for train_index, test_index in kf.split(features_scaled):
    # Split data into training and testing sets for the current fold
    X_train, X_test = features_scaled[train_index], features_scaled[test_index]
    y_train, y_test = data.iloc[train_index]['School'], data.iloc[test_index]['School']
    
    # SVM
    svm_model = SVC(kernel='linear')
    svm_model.fit(X_train, y_train)
    accuracy_svm = svm_model.score(X_test, y_test)
    svm_accuracy.append(accuracy_svm)
    
    # KNN
    knn_model = NearestNeighbors(n_neighbors=6, algorithm='auto', metric='euclidean')
    knn_model.fit(X_train)
    _, indices = knn_model.kneighbors(X_test)
    knn_predictions = data.iloc[indices[:, 1:]].mode(axis=1).values.flatten()
    accuracy_knn = np.mean(knn_predictions == y_test.values)
    knn_accuracy.append(accuracy_knn)
    print("Indices shape:", indices.shape)

    
    # Decision Tree
    dt_model = DecisionTreeClassifier()
    dt_model.fit(X_train, y_train)
    accuracy_dt = dt_model.score(X_test, y_test)
    dt_accuracy.append(accuracy_dt)

# Calculate average accuracy across all folds for SVM
average_accuracy_svm = np.mean(svm_accuracy)
print("Average Accuracy for SVM:", average_accuracy_svm)

# Calculate average accuracy across all folds for KNN
average_accuracy_knn = np.mean(knn_accuracy)
print("Average Accuracy for KNN:", average_accuracy_knn)

# Calculate average accuracy across all folds for Decision Tree
average_accuracy_dt = np.mean(dt_accuracy)
print("Average Accuracy for Decision Tree:", average_accuracy_dt)


ValueError: Buffer has wrong number of dimensions (expected 1, got 2)