In [2]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Load dataset
df = pd.read_csv("./users.csv").sample(n=2000, random_state=42)

# Extract interest columns
interests = df.iloc[:, 4:]

# Compute cosine similarity
similarity_matrix = cosine_similarity(interests)

# Convert to DataFrame for better readability
similarity_df = pd.DataFrame(similarity_matrix, index=df["UserID"], columns=df["UserID"])


In [3]:
# Define age groups and similarity weight
age_groups = ['18-24', '25-34', '35-44', '45-54', '55-64', '65-74', '75+']
age_group_weights = {0: 1.0, 1: 0.8, 2: 0.6, 3: 0.4, 4: 0.2, 5: 0.1, 6: 0.05}

# Function to get age weight between two users
def age_similarity(user1_age, user2_age):
    index1 = age_groups.index(user1_age)
    index2 = age_groups.index(user2_age)
    distance = abs(index1 - index2)
    return age_group_weights.get(distance, 0.05)  # Default to lowest weight


In [4]:
def adjusted_similarity(df, similarity_df, user_id):
    user_row = df[df["UserID"] == user_id].iloc[0]
    user_country = user_row["Country"]
    user_age = user_row["AgeGroup"]
    
    # Adjust similarity scores
    sim_scores = similarity_df[user_id].copy()

    for other_user in df["UserID"]:
        if other_user == user_id:
            continue

        other_row = df[df["UserID"] == other_user].iloc[0]
        
        # Age similarity adjustment
        age_weight = age_similarity(user_age, other_row["AgeGroup"])
        
        # Country adjustment
        if user_country != other_row["Country"]:
            sim_scores[other_user] *= 0.5  # Reduce similarity by 50% if from different country
        
        # Apply age weight
        sim_scores[other_user] *= age_weight

    return sim_scores.sort_values(ascending=False).iloc[1:6]  # Top 5 similar users




In [5]:
top_similar_users = adjusted_similarity(df, similarity_df, 59951)
print(top_similar_users)

UserID
16265    0.309839
62123    0.288675
271      0.288675
15007    0.288675
51988    0.288675
Name: 59951, dtype: float64


In [9]:
def get_similar_users(df, similarity_df, user_id, top_n=5):
    user_row = df[df["UserID"] == user_id].iloc[0]
    user_country = user_row["Country"]
    user_age = user_row["AgeGroup"]
    
    # Adjust similarity scores
    sim_scores = similarity_df[user_id].copy()

    for other_user in df["UserID"]:
        if other_user == user_id:
            continue

        other_row = df[df["UserID"] == other_user].iloc[0]
        
        # Age similarity adjustment
        age_weight = age_similarity(user_age, other_row["AgeGroup"])
        
        # Country adjustment
        if user_country != other_row["Country"]:
            sim_scores[other_user] *= 0.5  # Reduce similarity by 50% if from different country
        
        # Apply age weight
        sim_scores[other_user] *= age_weight

    # Get top N similar users
    top_users = sim_scores.sort_values(ascending=False).iloc[1:top_n+1].index

    # Retrieve full details of top similar users
    similar_users = df[df["UserID"].isin(top_users)][["UserID", "Gender", "Country", "AgeGroup"] + list(df.columns[4:])]

    # Convert interests into a single "Interests" column
    interest_cols = df.columns[4:]  # Columns containing interests
    similar_users["Interests"] = similar_users[interest_cols].apply(lambda row: ", ".join(row.index[row == 1]), axis=1)

    # Drop individual interest columns
    similar_users = similar_users.drop(columns=interest_cols)

    return similar_users

# Example: Get top 5 similar users for user 123
user_id = 59951
top_similar_users = get_similar_users(df, similarity_df, user_id)

print(top_similar_users)


       UserID  Gender    Country AgeGroup  \
270       271    Male      China    25-34   
15006   15007  Female    Belgium    25-34   
51987   51988  Female    Germany    25-34   
62122   62123  Female     France    25-34   
16264   16265  Female  Venezuela    35-44   

                                               Interests  
270                                              Fashion  
15006                                         Technology  
51987                                            Fashion  
62122                                         Technology  
16264  Fashion, Health and wellness, Photography, Pol...  
