In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier

In [2]:
data = pd.read_csv('./data/Hobby_Data.csv')
data.drop_duplicates(inplace=True)

In [3]:
def preprocess_data(df):
    df_processed = df.copy()
    bool_columns = ['Olympiad_Participation', 'Scholarship', 'School', 'Projects', 'Medals', 'Career_sprt', 'Act_sprt', 'Fant_arts']
    for col in bool_columns:
        df_processed[col] = df_processed[col].map({'Yes': 1, 'No': 0})
    df_processed['Won_arts'] = df_processed['Won_arts'].map({'Yes': 1, 'No': 0, 'Maybe': 0.5})
    le = LabelEncoder()
    df_processed['Fav_sub'] = le.fit_transform(df_processed['Fav_sub'])
    scaler = StandardScaler()
    numerical_columns = ['Grasp_pow', 'Time_sprt', 'Time_art']
    df_processed[numerical_columns] = scaler.fit_transform(df_processed[numerical_columns])
    return df_processed


In [4]:
data_processed = preprocess_data(data)
X = data_processed.drop(['Predicted Hobby'], axis=1)
y = data_processed['Predicted Hobby']
le_hobby = LabelEncoder()
y_encoded = le_hobby.fit_transform(y)


In [12]:
data_processed.to_csv("./data/processed_hobby_data.csv")

In [5]:
# Cosine Similarity Model
def cosine_similarity_model(X_train, y_train):
    return cosine_similarity(X_train)

def predict_cosine(cosine_sim, X_test, X_train, y_train):
    similarities = cosine_similarity(X_test, X_train)
    predictions = []
    for sim in similarities:
        most_similar_index = sim.argmax()
        predictions.append(y_train.iloc[most_similar_index])
    return predictions


In [6]:
# KNN Model
def knn_model(X_train, y_train, n_neighbors=5):
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(X_train, y_train)
    return knn

# Evaluation function
def evaluate_model(y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    f1 = f1_score(y_true, y_pred, average='weighted')
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }


In [7]:
# K-fold cross-validation
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

cosine_scores = []
knn_scores = []

for fold, (train_index, val_index) in enumerate(kf.split(X), 1):
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    # Cosine Similarity
    cosine_sim = cosine_similarity_model(X_train, y_train)
    y_pred_cosine = predict_cosine(cosine_sim, X_val, X_train, y_train)
    cosine_score = evaluate_model(y_val, y_pred_cosine)
    cosine_scores.append(cosine_score)

    # KNN
    knn = knn_model(X_train, y_train)
    y_pred_knn = knn.predict(X_val)
    knn_score = evaluate_model(y_val, y_pred_knn)
    knn_scores.append(knn_score)

    print(f"Fold {fold}:")
    print(f"Cosine Similarity: {cosine_score}")
    print(f"KNN: {knn_score}")
    print()


Fold 1:
Cosine Similarity: {'accuracy': 0.8467432950191571, 'precision': np.float64(0.8516600141541096), 'recall': np.float64(0.8467432950191571), 'f1': np.float64(0.8473712329956883)}
KNN: {'accuracy': 0.8697318007662835, 'precision': np.float64(0.871678125929425), 'recall': np.float64(0.8697318007662835), 'f1': np.float64(0.8698852328221854)}

Fold 2:
Cosine Similarity: {'accuracy': 0.8544061302681992, 'precision': np.float64(0.8578781797172602), 'recall': np.float64(0.8544061302681992), 'f1': np.float64(0.8550050217609642)}
KNN: {'accuracy': 0.8735632183908046, 'precision': np.float64(0.8744606417476425), 'recall': np.float64(0.8735632183908046), 'f1': np.float64(0.873867256451975)}

Fold 3:
Cosine Similarity: {'accuracy': 0.842911877394636, 'precision': np.float64(0.8429624499084747), 'recall': np.float64(0.842911877394636), 'f1': np.float64(0.8428721507141974)}
KNN: {'accuracy': 0.8773946360153256, 'precision': np.float64(0.8806985396190795), 'recall': np.float64(0.877394636015325

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [8]:
# Calculate average scores
avg_cosine_scores = {metric: np.mean([score[metric] for score in cosine_scores]) for metric in cosine_scores[0]}
avg_knn_scores = {metric: np.mean([score[metric] for score in knn_scores]) for metric in knn_scores[0]}

print("Average Scores:")
print(f"Cosine Similarity: {avg_cosine_scores}")
print(f"KNN: {avg_knn_scores}")


Average Scores:
Cosine Similarity: {'accuracy': np.float64(0.8582375478927204), 'precision': np.float64(0.8602319833199173), 'recall': np.float64(0.8582375478927204), 'f1': np.float64(0.8582335429585479)}
KNN: {'accuracy': np.float64(0.8735632183908045), 'precision': np.float64(0.8748961013964516), 'recall': np.float64(0.8735632183908045), 'f1': np.float64(0.8734446314597764)}


In [9]:
# Select the best model
if avg_cosine_scores['f1'] > avg_knn_scores['f1']:
    print("Cosine Similarity is the better model.")
    best_model = "Cosine Similarity"
else:
    print("KNN is the better model.")
    best_model = "KNN"


KNN is the better model.


In [10]:
# Example usage with the best model
def recommend_hobby(input_data, model_type):
    input_df = pd.DataFrame([input_data])
    input_processed = preprocess_data(input_df)
    
    if model_type == "Cosine Similarity":
        input_similarity = cosine_similarity(input_processed, X)
        most_similar_index = input_similarity.argmax()
        return y.iloc[most_similar_index]
    else:  # KNN
        knn = knn_model(X, y)
        return knn.predict(input_processed)[0]


In [11]:
# Example input
input_data = {
    'Olympiad_Participation': 'Yes',
    'Scholarship': 'Yes',
    'School': 'Yes',
    'Fav_sub': 'Mathematics',
    'Projects': 'Yes',
    'Grasp_pow': 5,
    'Time_sprt': 2,
    'Medals': 'Yes',
    'Career_sprt': 'No',
    'Act_sprt': 'No',
    'Fant_arts': 'No',
    'Won_arts': 'Maybe',
    'Time_art': 3
}

recommended_hobby = recommend_hobby(input_data, best_model)
print(f"Recommended Hobby using {best_model}: {recommended_hobby}")

Recommended Hobby using KNN: Academics


In [14]:
import pickle

# Save the model to a file
with open('./models/knn_model.pkl', 'wb') as file:
    pickle.dump(knn, file)
