In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the raw hobby survey and discard exact duplicate rows in one expression
data = pd.read_csv('./data/Hobby_Data.csv').drop_duplicates()

In [3]:
# Preprocess the data
def preprocess_data(df, encoder=None, scaler=None, fit=True):
    """Turn a raw hobby-survey frame into an all-numeric feature frame.

    The input frame is copied, never modified. On the copy:

    * the Yes/No columns are mapped to 1/0,
    * ``Won_arts`` is mapped to 1 / 0 / 0.5 for Yes / No / Maybe,
    * ``Fav_sub`` is replaced by integer codes from ``encoder``,
    * ``Grasp_pow``, ``Time_sprt`` and ``Time_art`` are rescaled by ``scaler``.

    Any other columns (for example the target) are passed through untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw rows containing at least the columns named above.
    encoder : optional
        Object with ``fit_transform`` / ``transform`` used for ``Fav_sub``.
        A new ``LabelEncoder`` is created when omitted.
    scaler : optional
        Object with ``fit_transform`` / ``transform`` used for the numeric
        columns. A new ``StandardScaler`` is created when omitted.
    fit : bool, default True
        ``True`` fits ``encoder`` and ``scaler`` on ``df`` (the historical
        behaviour). ``False`` only applies transformers that were fitted
        earlier, so new rows land in the same feature space as the data the
        transformers were fitted on.

    Returns
    -------
    pandas.DataFrame
        The processed copy of ``df``.

    Raises
    ------
    ValueError
        If ``fit`` is ``False`` but ``encoder`` or ``scaler`` is missing.

    Notes
    -----
    Fitting on a tiny frame is rarely meaningful: with a single row the label
    code is always 0 and every standardized value is 0. To preprocess new rows,
    pass the transformers that were fitted on the reference data together
    with ``fit=False``.
    """
    if not fit and (encoder is None or scaler is None):
        raise ValueError(
            "fit=False requires already-fitted 'encoder' and 'scaler' objects"
        )

    # Create a copy of the dataframe so the caller's frame is left intact
    df_processed = df.copy()

    # Convert boolean columns to int. Series.map leaves any value that is not
    # a key of the mapping as NaN.
    bool_columns = ['Olympiad_Participation', 'Scholarship', 'School', 'Projects', 'Medals', 'Career_sprt', 'Act_sprt', 'Fant_arts']
    for col in bool_columns:
        df_processed[col] = df_processed[col].map({'Yes': 1, 'No': 0})

    # Handle 'Won_arts' column, which additionally allows 'Maybe'
    df_processed['Won_arts'] = df_processed['Won_arts'].map({'Yes': 1, 'No': 0, 'Maybe': 0.5})

    # Fall back to fresh transformers only when the caller supplied none
    if encoder is None:
        encoder = LabelEncoder()
    if scaler is None:
        scaler = StandardScaler()

    numerical_columns = ['Grasp_pow', 'Time_sprt', 'Time_art']

    if fit:
        # Learn the category codes and the scaling statistics from this frame
        df_processed['Fav_sub'] = encoder.fit_transform(df_processed['Fav_sub'])
        df_processed[numerical_columns] = scaler.fit_transform(df_processed[numerical_columns])
    else:
        # Reuse what was learned earlier so the rows stay comparable
        df_processed['Fav_sub'] = encoder.transform(df_processed['Fav_sub'])
        df_processed[numerical_columns] = scaler.transform(df_processed[numerical_columns])

    return df_processed


In [4]:
# Run the raw frame through the preprocessing step to get numeric features
data_processed = data.pipe(preprocess_data)


In [14]:
# Optional sanity check (disabled): list rows that are still exact duplicates after preprocessing.
# data_processed[data_processed.duplicated()]

Unnamed: 0,Olympiad_Participation,Scholarship,School,Fav_sub,Projects,Grasp_pow,Time_sprt,Medals,Career_sprt,Act_sprt,Fant_arts,Won_arts,Time_art,Predicted Hobby


In [5]:
# Separate features and target.
# Besides the target, leave out pandas' auto-named 'Unnamed: N' columns: the
# saved duplicate-check output lists an 'Unnamed: 0' column, which looks like a
# row index written into the CSV. Such a column is an identifier rather than a
# trait, is not rescaled by preprocessing, and would give X a different width
# from a query built only from the survey answers.
feature_columns = [
    col for col in data_processed.columns
    if col != 'Predicted Hobby' and not str(col).startswith('Unnamed')
]
X = data_processed[feature_columns]
y = data_processed['Predicted Hobby']

# Calculate cosine similarity matrix (every stored row against every other row)
cosine_sim = cosine_similarity(X)


In [6]:
# Create recommendation function
def recommend_hobby(input_data):
    """Return the hobby label of the stored record most similar to ``input_data``.

    Parameters
    ----------
    input_data : dict
        One person's raw answers, keyed by the raw column names that
        ``preprocess_data`` expects ('Fav_sub', 'Grasp_pow', 'Won_arts', ...).
        Key order does not matter because reference columns are selected by
        name.

    Returns
    -------
    The ``'Predicted Hobby'`` value (taken from ``y``) of the closest row of
    ``data`` under cosine similarity.

    Raises
    ------
    KeyError
        If ``input_data`` has a key that is not a column of ``data``.

    Notes
    -----
    The query is preprocessed together with the reference rows, not on its own.
    Fitting the label encoder and scaler on a single row would turn
    ``Fav_sub`` and every numeric feature into 0, making the query
    incomparable with the stored records. Preprocessing jointly puts both
    sides in one feature space, at the price of re-running the preprocessing
    on each call.
    """
    input_df = pd.DataFrame([input_data])

    # Take exactly the query's columns, by name, from the raw reference data.
    # This keeps both sides aligned and leaves out the target column.
    reference_raw = data[list(input_df.columns)]

    # Stack the query under the reference rows so one fit covers both
    combined = pd.concat([reference_raw, input_df], ignore_index=True)
    combined_processed = preprocess_data(combined)

    reference_features = combined_processed.iloc[:-1]
    query_features = combined_processed.iloc[[-1]]  # double brackets keep it 2-D

    # Calculate cosine similarity between the query and every reference row
    input_similarity = cosine_similarity(query_features, reference_features)

    # Position of the best match; reference rows keep the row order of `data`,
    # which is also the row order of `y`
    most_similar_index = input_similarity.argmax()

    return y.iloc[most_similar_index]


In [12]:
# Example usage: describe one student and ask for a hobby suggestion
input_data = dict(
    Olympiad_Participation='Yes',
    Scholarship='Yes',
    School='Yes',
    Fav_sub='Mathematics',
    Projects='Yes',
    Grasp_pow=5,
    Time_sprt=2,
    Medals='Yes',
    Career_sprt='No',
    Act_sprt='No',
    Fant_arts='No',
    Won_arts='Maybe',
    Time_art=3,
)

recommended_hobby = recommend_hobby(input_data)
print(f"Recommended Hobby: {recommended_hobby}")


Recommended Hobby: Academics


In [8]:
# Test with a few more examples.
# Both profiles share most answers, so only the varying fields are listed per
# case; each dict is built with its keys in the same order as before.
test_inputs = [
    {
        'Olympiad_Participation': 'Yes',
        'Scholarship': 'Yes',
        'School': 'Yes',
        'Fav_sub': fav_sub,
        'Projects': projects,
        'Grasp_pow': grasp_pow,
        'Time_sprt': time_sprt,
        'Medals': medals,
        'Career_sprt': 'No',
        'Act_sprt': 'No',
        'Fant_arts': 'No',
        'Won_arts': 'No',
        'Time_art': time_art,
    }
    for fav_sub, projects, grasp_pow, time_sprt, medals, time_art in (
        ('Science', 'Yes', 4, 3, 'No', 2),
        ('Any language', 'No', 3, 1, 'Yes', 1),
    )
]

# Number the cases from 1 for display
i = 0
for test_input in test_inputs:
    i += 1
    recommended_hobby = recommend_hobby(test_input)
    print(f"Test {i} - Recommended Hobby: {recommended_hobby}")

Test 1 - Recommended Hobby: Academics
Test 2 - Recommended Hobby: Academics
