# Dummy DataSet Creation 

In [47]:
import pandas as pd
import random
from faker import Faker
import numpy as np
import hashlib
# Create the Faker instance and pin all three random sources (Faker, the
# stdlib `random` module, NumPy) to the same seed.
_SEED = 42
fake = Faker()
Faker.seed(_SEED)
random.seed(_SEED)
np.random.seed(_SEED)

# Value pools for the categorical profile fields.

# -- Demographics --
genders = ["Male", "Female", "Non-binary", "Prefer not to say"]
nationalities = [
    "Canadian", "American", "Indian", "Chinese",
    "Nigerian", "Brazilian", "German",
]
languages = ["English", "French", "Mandarin", "Hindi", "Spanish", "Arabic"]
ethnicities = ["Asian", "Black", "White", "Latino", "Mixed", "Other"]
religions = [
    "Christian", "Muslim", "Hindu", "Jewish",
    "Buddhist", "Atheist", "Other",
]
occupations = [
    "Software Engineer", "Doctor", "Teacher", "Artist",
    "Chef", "Researcher", "Student",
]

# -- Housing --
housing_types = ["Apartment", "House", "Condo", "Others"]
living_spaces = ["Private room", "Shared room", "Entire place"]

# -- Lifestyle --
yes_no = ["Yes", "No"]  # shared pool for yes/no answers
wfh_options = ["WFH", "Go to Office", "Hybrid"]
work_schedules = ["Day", "Afternoon", "Overnight"]
diet_preferences = ["Vegetarian", "Vegan", "No restriction", "Others"]
hobbies_list = [
    "Reading", "Gaming", "Cooking", "Traveling",
    "Fitness", "Music", "Photography", "Hiking",
]

# -- GTA locations --
preferred_locations = [
    "Toronto", "Mississauga", "Brampton", "Markham",
    "Richmond Hill", "Vaughan", "Burlington", "Oakville",
    "Ajax", "Whitby", "Oshawa", "Pickering",
    "Aurora", "Newmarket", "Milton", "Caledon",
    "Halton Hills", "King", "East Gwillimbury", "Georgina",
]

# Number of synthetic profiles to generate. (The earlier comment said 2,000,
# but the loop has always produced 20,000 rows; the value is unchanged.)
NUM_PROFILES = 20000

# Column names, in the same order as the values placed in each `row` below.
columns = ["Name", "Age", "Gender", "Nationality", "Primary Language", "Ethnicity", "Religion", "Occupation",
           "Phone Number", "Email", "LinkedIn", "Location", "Move-in Date", "Move-out Date",
           "Housing Type", "Living Space", "Do you smoke", "Comfortable with smokers", "Do you have pet",
           "Work Mode", "Work Schedule", "Hobbies & Activities", "Dietary Preference"]

# Generate the synthetic profiles, one list of field values per profile.
# NOTE: the order of the random / Faker calls decides which values a seeded run
# produces, so keep the statement order stable when editing this loop.
data = []
for _ in range(NUM_PROFILES):
    name = fake.name()
    age = random.randint(18, 60)
    gender = random.choice(genders)
    nationality = random.choice(nationalities)
    language = random.choice(languages)
    ethnicity = random.choice(ethnicities)
    religion = random.choice(religions)
    occupation = random.choice(occupations)
    phone = fake.phone_number()
    email = fake.email()
    linkedin = f"https://www.linkedin.com/in/{fake.user_name()}"
    location = random.choice(preferred_locations)
    # Move-in falls within the next 30 days; move-out is drawn no earlier than
    # the move-in date.
    # NOTE(review): "+180d" looks like it is relative to today rather than to
    # move_in -- confirm against the Faker documentation.
    move_in = fake.date_between(start_date="today", end_date="+30d")
    move_out = fake.date_between(start_date=move_in, end_date="+180d")
    housing = random.choice(housing_types)
    space = random.choice(living_spaces)
    smoke = random.choice(yes_no)
    ok_with_smoke = random.choice(yes_no)
    pet = random.choice(yes_no)
    wfh = random.choice(wfh_options)
    schedule = random.choice(work_schedules)
    # Between one and four distinct hobbies, stored as a comma-separated string.
    hobbies = ", ".join(random.sample(hobbies_list, k=random.randint(1, 4)))
    diet = random.choice(diet_preferences)

    # Values must stay in the same order as `columns`.
    row = [name, age, gender, nationality, language, ethnicity, religion, occupation, phone, email, linkedin,
           location, move_in, move_out, housing, space, smoke, ok_with_smoke, pet, wfh, schedule, hobbies, diet]
    data.append(row)


# Derive a deterministic identifier from a profile's name, email and phone.
def encrypt_identity(row):
    """Return the SHA-256 hex digest of ``"<name>_<email>_<phone>"`` for *row*.

    *row* is any object with a dict-style ``.get`` (for example a DataFrame
    row passed by ``df.apply(..., axis=1)``). A missing "Name" key becomes
    ``""``; a missing or null "Email" / "Phone Number" is replaced by the
    placeholder ``"abc@xyz.com"`` / ``"123456789"``.

    Despite the name, the result is a one-way hash, not reversible encryption.
    """
    def _value_or(key, placeholder):
        # pd.notna is False for None and NaN, which is also what a missing
        # key yields, so one lookup covers "absent" and "null".
        value = row.get(key)
        return value if pd.notna(value) else placeholder

    name = row.get("Name", "")
    email = _value_or("Email", "abc@xyz.com")
    phone = _value_or("Phone Number", "123456789")
    return hashlib.sha256(f"{name}_{email}_{phone}".encode()).hexdigest()

# Turn the generated rows into a DataFrame and append each profile's hashed
# identifier as the last column.
df = pd.DataFrame(data, columns=columns)
df = df.assign(Encrypted_ID=df.apply(encrypt_identity, axis=1))

# Persist the dataset as CSV without the index column.
# NOTE(review): this is a machine-specific absolute path, while the Streamlit
# app in this file reads "synthetic_roommate_profiles.csv" from the working
# directory -- confirm the two are meant to point at the same file.
output_csv = "/Users/sohommandal/Documents/Python_test/synthetic_roommate_profiles.csv"
df.to_csv(output_csv, index=False)

# Preview the first ten profiles.
df.head(10)


Unnamed: 0,Name,Age,Gender,Nationality,Primary Language,Ethnicity,Religion,Occupation,Phone Number,Email,...,Housing Type,Living Space,Do you smoke,Comfortable with smokers,Do you have pet,Work Mode,Work Schedule,Hobbies & Activities,Dietary Preference,Encrypted_ID
0,Allison Hill,58,Male,Canadian,Arabic,White,Muslim,Doctor,321.581.9600,williamjohnson@example.org,...,Apartment,Entire place,Yes,No,Yes,WFH,Day,"Traveling, Fitness",Vegetarian,8e5aa6f2103330567f361d1037212030e3ac8c743522a3...
1,Caitlin Henderson,53,Female,Brazilian,Arabic,Other,Buddhist,Artist,223-951-1615x594,dudleynicholas@example.net,...,Others,Entire place,No,Yes,Yes,Hybrid,Afternoon,"Fitness, Gaming, Photography",No restriction,64da87c18684713979834f064544f2f184f11149d1016b...
2,Darren Roberts,24,Male,Chinese,English,White,Other,Teacher,731-564-7525,jacqueline19@example.net,...,Condo,Private room,No,Yes,No,WFH,Overnight,"Music, Fitness, Gaming",Vegetarian,c04f2e200c125828c48eafee55c46f28f7f4201459f8fc...
3,Melissa Robinson,20,Female,German,Mandarin,Asian,Other,Doctor,903-405-6413x95376,donald88@example.com,...,Others,Shared room,No,No,Yes,Go to Office,Afternoon,"Fitness, Music",Vegetarian,3713e5598308323f9bc3b8d32f5c2aae4ddac795d37fc5...
4,Johnny Campos,56,Female,Nigerian,Arabic,Black,Muslim,Artist,922-669-1669,frazierdanny@example.net,...,Condo,Entire place,Yes,No,Yes,WFH,Day,"Photography, Cooking, Reading",Vegan,ed0e10020dae08a7280c3447fc70ca58b1763934d11779...
5,Nathan Cortez,54,Non-binary,American,Arabic,Latino,Jewish,Researcher,728.914.8932x52880,rodriguezmichael@example.com,...,House,Shared room,Yes,Yes,No,Hybrid,Overnight,"Photography, Cooking, Gaming, Music",Others,77cdc3fa297b4115ab22d9e7dbba95fcff5ed55b3ec8ff...
6,Mr. Philip Cannon,23,Male,German,English,Black,Atheist,Doctor,(578)624-8963,pcarney@example.com,...,Apartment,Shared room,No,No,No,Hybrid,Day,Fitness,No restriction,b80f9d8705c826b5d3e7fdb4cdc6af722178ba86f2ba10...
7,Kevin Hall,25,Non-binary,Chinese,French,Latino,Christian,Researcher,501.603.1051,ilewis@example.net,...,House,Entire place,Yes,No,Yes,WFH,Afternoon,"Reading, Fitness",No restriction,2934a348bcd3fadaebb4dcc81bfbdd5b40d067ee4fa938...
8,Jeffrey Henderson,49,Male,Canadian,Mandarin,White,Muslim,Software Engineer,401.806.5133x387,joshuawright@example.org,...,Apartment,Private room,No,Yes,Yes,WFH,Overnight,"Cooking, Hiking, Fitness, Music",Others,11d5f1d755d77659c1a147e354ebbd90f1d5eac1886228...
9,Alexis Davis,31,Female,Brazilian,Mandarin,Latino,Atheist,Researcher,001-932-667-7360x26064,uhorton@example.net,...,Others,Entire place,No,Yes,Yes,WFH,Day,"Reading, Fitness, Photography",Vegan,5f0f676c3fd301c93bbfc43bb483f5a6eca8ff025c5323...


In [38]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# Use the previously generated dataframe or load if needed
# For demonstration, we continue with `df` from previous steps

# Step 1: Select features relevant to roommate matching
feature_columns = [
    "Gender", "Do you smoke", "Do you have pet", "Nationality",
    "Primary Language", "Dietary Preference", "Occupation", "Work Schedule"
]
features = df[feature_columns]

# Step 2: One-hot encode categorical variables
encoder = OneHotEncoder()
encoded_features = encoder.fit_transform(features).toarray()

# Step 3: Compare the chosen user (the first row) against every profile.
# Only this one row of similarities is needed, so the full n x n matrix
# (20000 x 20000 floats for the generated dataset) is never built.
query_position = 0
first_user = df["Encrypted_ID"].iloc[query_position]
similarity_scores = pd.Series(
    cosine_similarity(encoded_features[[query_position]], encoded_features).ravel()
)

# Find top 5 matches. The user's own row is removed by position: slicing
# [1:6] after sorting is wrong when another profile also scores 1.0, because
# the user is then not guaranteed to sort first.
top_matches_series = similarity_scores.drop(index=query_position).nlargest(5)
top_match_positions = top_matches_series.index.tolist()
top_match_ids = df["Encrypted_ID"].iloc[top_match_positions].tolist()
top_match_scores = top_matches_series.to_numpy().round(2)

# Fetch the matched rows by position (robust to repeated IDs), best match first
top_matches_df = df.iloc[top_match_positions][[
    "Encrypted_ID", "Gender", "Nationality", "Do you smoke", "Do you have pet",
    "Primary Language", "Dietary Preference", "Occupation", "Work Schedule"
]].reset_index(drop=True)
top_matches_df.insert(0, "Similarity Score", top_match_scores)
top_matches_df

Unnamed: 0,Similarity Score,Encrypted_ID,Gender,Nationality,Do you smoke,Do you have pet,Primary Language,Dietary Preference,Occupation,Work Schedule
0,0.87,4959a4707c3523e54d3ddb542c5786c811d02344f094de...,Male,Canadian,Yes,Yes,Arabic,Vegan,Doctor,Day
1,0.87,f5a6253e2e56d0542ff06d59801393090f18ccc87fb881...,Male,Canadian,Yes,Yes,English,Vegetarian,Doctor,Day
2,0.87,f151f54121bed58c7b5c46ec890536e82a232dd4a3aba9...,Male,Canadian,Yes,No,Arabic,Vegetarian,Doctor,Day
3,0.87,fa9bc60f4c9f1bf79bf5bcdd176e5c1c06a0a375bc6411...,Male,Canadian,Yes,Yes,Arabic,Vegetarian,Teacher,Day
4,0.87,55b664c39e1c732b6dba957b45a30fe06c89a7aa89c03e...,Male,Canadian,Yes,Yes,Arabic,Vegetarian,Artist,Day


# Test 2: Get the top 5 roommate matches

In [None]:
# Small hand-written dataset for demonstration: five profiles carrying the
# identity fields plus the eight matching features.
sample_columns = [
    "Name", "Email", "Phone Number", "Gender", "Do you smoke", "Do you have pet",
    "Nationality", "Primary Language", "Dietary Preference", "Occupation", "Work Schedule",
]
# One tuple per profile, values in the same order as `sample_columns`.
sample_rows = [
    ("Alice Smith", "alice@example.com", "555-1234", "Female", "No", "Yes",
     "Canadian", "English", "Vegan", "Doctor", "Day"),
    ("Bob Johnson", "bob@example.com", "555-2345", "Male", "Yes", "No",
     "Canadian", "English", "No restriction", "Engineer", "Afternoon"),
    ("Clara Liu", "clara@example.com", "555-3456", "Female", "No", "Yes",
     "Chinese", "Mandarin", "Vegetarian", "Artist", "Day"),
    ("David Kim", "david@example.com", "555-4567", "Male", "Yes", "No",
     "Korean", "Korean", "No restriction", "Teacher", "Overnight"),
    ("Ella Brown", "ella@example.com", "555-5678", "Female", "No", "Yes",
     "American", "English", "Vegan", "Chef", "Day"),
]

# Transpose the records into the column -> list-of-values mapping.
sample_data = {
    column: list(values)
    for column, values in zip(sample_columns, zip(*sample_rows))
}

# Create DataFrame
df = pd.DataFrame(sample_data)

# Create Encrypted_ID column
import hashlib

def encrypt_identity(row):
    """Hash a profile's name, email and phone into a SHA-256 hex string.

    *row* must offer a dict-style ``.get``. A missing or null "Email" is
    replaced by ``"abc@xyz.com"`` and a missing or null "Phone Number" by
    ``"123456789"``; a missing "Name" key becomes an empty string. The three
    values are joined with underscores before hashing.
    """
    email = row.get("Email")
    if pd.isna(email):  # None (also what an absent key yields) or NaN
        email = "abc@xyz.com"

    phone = row.get("Phone Number")
    if pd.isna(phone):
        phone = "123456789"

    name = row.get("Name", "")

    digest = hashlib.sha256()
    digest.update(f"{name}_{email}_{phone}".encode())
    return digest.hexdigest()

df["Encrypted_ID"] = df.apply(encrypt_identity, axis=1)

# Select features for similarity
feature_columns = [
    "Gender", "Do you smoke", "Do you have pet", "Nationality",
    "Primary Language", "Dietary Preference", "Occupation", "Work Schedule"
]
features = df[feature_columns]

# One-hot encode categorical variables
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder()
encoded_features = encoder.fit_transform(features).toarray()

# Compare the chosen user (the first row) against every profile; only that
# one row of similarities is needed, so no full n x n matrix is built.
from sklearn.metrics.pairwise import cosine_similarity
query_position = 0
first_user = df["Encrypted_ID"].iloc[query_position]
similarity_scores = pd.Series(
    cosine_similarity(encoded_features[[query_position]], encoded_features).ravel()
)

# Find up to 5 matches. The user's own row is removed by position rather than
# by assuming it sorts first, which fails when another profile also scores 1.0.
# With only five sample profiles this yields the four other users.
top_matches_series = similarity_scores.drop(index=query_position).nlargest(5)
top_match_positions = top_matches_series.index.tolist()
top_match_ids = df["Encrypted_ID"].iloc[top_match_positions].tolist()
top_match_scores = top_matches_series.to_numpy().round(2)

# Fetch the matched rows by position, best match first
top_matches_df = df.iloc[top_match_positions][[
    "Encrypted_ID", "Gender", "Nationality", "Do you smoke", "Do you have pet",
    "Primary Language", "Dietary Preference", "Occupation", "Work Schedule"
]].reset_index(drop=True)
top_matches_df.insert(0, "Similarity Score", top_match_scores)

# Display final result
top_matches_df


Unnamed: 0,Similarity Score,Encrypted_ID,Gender,Nationality,Do you smoke,Do you have pet,Primary Language,Dietary Preference,Occupation,Work Schedule
0,0.75,a1eb6af2b397fdf414b46e311708d1f3b6fd4207acbb03...,Female,American,No,Yes,English,Vegan,Chef,Day
1,0.5,dafbdb7917376510f3b628ea293aa80426ac3e360323c5...,Female,Chinese,No,Yes,Mandarin,Vegetarian,Artist,Day
2,0.25,2e587966a8ee705882dfbd32dade0b186d6427b040eeb0...,Male,Canadian,Yes,No,English,No restriction,Engineer,Afternoon
3,0.0,cc3ab7e75c418a83e1e468e2b8736fe81a24113767b9ba...,Male,Korean,Yes,No,Korean,No restriction,Teacher,Overnight


# --- Test Case 1 for encryption ---

In [13]:
#### Test case: identity hashing when email / phone are missing
# Re-import required modules after reset
import pandas as pd
import hashlib

# Two sample profiles: the first is complete, the second has neither an email
# nor a phone number and therefore exercises the placeholder values.
sample_data = {
    "Name": ["Alice Smith", "Bob Johnson"],
    "Email": ["alice@example.com", None],
    "Phone Number": ["555-1234", None],
}
df = pd.DataFrame(sample_data)


def encrypt_identity(row):
    """Return the SHA-256 hex digest of ``"<name>_<email>_<phone>"``.

    A missing or null "Email" / "Phone Number" is replaced by the placeholder
    ``"abc@xyz.com"`` / ``"123456789"``; a missing "Name" key becomes ``""``.
    """
    placeholders = {"Email": "abc@xyz.com", "Phone Number": "123456789"}
    contact = {}
    for key, placeholder in placeholders.items():
        value = row.get(key)
        # pd.notna is False for None and NaN (an absent key also yields None).
        contact[key] = value if pd.notna(value) else placeholder

    name = row.get("Name", "")
    combined = f"{name}_{contact['Email']}_{contact['Phone Number']}"
    return hashlib.sha256(combined.encode()).hexdigest()


# Hash every row and show the resulting frame.
df["Encrypted_ID"] = [encrypt_identity(row) for _, row in df.iterrows()]
df


Unnamed: 0,Name,Email,Phone Number,Encrypted_ID
0,Alice Smith,alice@example.com,555-1234,ca1207070f355b0e64e448e27d0ab137e73379a77b6ee4...
1,Bob Johnson,,,6555bc7884f2723e1de768a2a197a1bee3679f0324f183...


In [44]:
import hashlib
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import OneHotEncoder

def generate_encrypted_id(name, email=None, phone=None):
    """Return the SHA-256 hex digest of ``"<name>_<email>_<phone>"``.

    A missing *email* or *phone* is replaced by the placeholder
    ``"abc@xyz.com"`` / ``"123456789"``. "Missing" means ``None``, an empty
    (falsy) value, or a null such as NaN. NaN is truthy, so a plain
    ``value or default`` would hash the text "nan" and the ID would never
    match one produced by the row-based hashing, which uses ``pd.notna``.
    """
    def _or_placeholder(value, placeholder):
        # Scalar nulls (None, NaN, pd.NA, NaT) count as missing.
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return placeholder
        return value or placeholder

    email = _or_placeholder(email, "abc@xyz.com")
    phone = _or_placeholder(phone, "123456789")
    combined = f"{name}_{email}_{phone}"
    return hashlib.sha256(combined.encode()).hexdigest()

def get_top_matches(name, email, phone, df, top_n=5):
    """Return the *top_n* profiles most similar to an existing user.

    The user is located by hashing *name*, *email* and *phone* with
    ``generate_encrypted_id`` and looking that value up in
    ``df["Encrypted_ID"]``. Similarity is the cosine similarity between
    one-hot encodings of the eight matching feature columns.

    Returns a DataFrame ordered from best to worst match whose first column
    is "Similarity Score" (rounded to two decimals), followed by
    "Encrypted_ID", the contact columns and the matching features. If the
    user is not in *df*, a message is printed and an empty DataFrame is
    returned.
    """
    feature_columns = [
        "Gender", "Do you smoke", "Do you have pet", "Nationality",
        "Primary Language", "Dietary Preference", "Occupation", "Work Schedule"
    ]
    display_columns = [
        "Encrypted_ID", "Name", "Email", "Phone Number", "Gender", "Nationality",
        "Do you smoke", "Do you have pet", "Primary Language", "Dietary Preference",
        "Occupation", "Work Schedule"
    ]

    # Generate Encrypted ID for the input user
    user_id = generate_encrypted_id(name, email, phone)

    # Check if this user exists in the dataset
    is_user = (df["Encrypted_ID"] == user_id).to_numpy()
    if not is_user.any():
        print("❌ No match found for this user in the dataset.")
        return pd.DataFrame()
    user_position = int(is_user.argmax())  # first row carrying this ID

    # One-hot encode the matching features
    encoded_features = OneHotEncoder().fit_transform(df[feature_columns]).toarray()

    # Compare only the user's row against all rows: the full n x n similarity
    # matrix is unnecessary and is very large for a big dataset.
    scores = pd.Series(
        cosine_similarity(encoded_features[[user_position]], encoded_features).ravel()
    )

    # Exclude the user's own row(s) explicitly. Dropping "the first row after
    # sorting" is wrong when another profile also scores 1.0.
    top_scores = scores[~is_user].nlargest(top_n)

    # Select by position so that repeated Encrypted_ID values cannot break or
    # duplicate the lookup.
    top_matches_df = df.iloc[top_scores.index.tolist()][display_columns].reset_index(drop=True)
    top_matches_df.insert(0, "Similarity Score", top_scores.to_numpy().round(2))

    return top_matches_df



In [45]:
# Example usage: identify an existing profile by its name / email / phone and
# print its five closest matches.
lookup = {
    "name": "Allison Hill",
    "email": "williamjohnson@example.org",
    "phone": "321.581.9600",
}
# `df` is your roommate dataset with Encrypted_ID
top_matches = get_top_matches(**lookup, df=df, top_n=5)

print(top_matches)


   Similarity Score                                       Encrypted_ID  \
0              0.87  4959a4707c3523e54d3ddb542c5786c811d02344f094de...   
1              0.87  f5a6253e2e56d0542ff06d59801393090f18ccc87fb881...   
2              0.87  f151f54121bed58c7b5c46ec890536e82a232dd4a3aba9...   
3              0.87  fa9bc60f4c9f1bf79bf5bcdd176e5c1c06a0a375bc6411...   
4              0.87  55b664c39e1c732b6dba957b45a30fe06c89a7aa89c03e...   

               Name                     Email           Phone Number Gender  \
0  Kristen Thompson          jkim@example.net           397-971-8887   Male   
1     Destiny Johns        dkelly@example.net           752-313-8022   Male   
2     Pamela Lucero      yramirez@example.org      551.651.3510x5659   Male   
3       Lisa Strong  cochranjimmy@example.com  +1-305-766-8541x88292   Male   
4     Trevor Zuniga   wilsondebra@example.net  001-530-853-0877x8026   Male   

  Nationality Do you smoke Do you have pet Primary Language  \
0    Canadian    

In [46]:
import streamlit as st
import pandas as pd
import hashlib
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity

# --- Helper functions ---
def generate_encrypted_id(name, email=None, phone=None):
    """Hash *name*, *email* and *phone* into a SHA-256 hex identifier.

    The three values are joined as ``"<name>_<email>_<phone>"``. A missing
    *email* / *phone* is replaced by ``"abc@xyz.com"`` / ``"123456789"``;
    "missing" covers ``None``, empty (falsy) values and nulls such as NaN.
    NaN is truthy, so ``value or default`` alone would hash the text "nan";
    values read back from a CSV with blank cells arrive as NaN.
    """
    def _or_placeholder(value, placeholder):
        # Scalar nulls (None, NaN, pd.NA, NaT) count as missing.
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return placeholder
        return value or placeholder

    email = _or_placeholder(email, "abc@xyz.com")
    phone = _or_placeholder(phone, "123456789")
    combined = f"{name}_{email}_{phone}"
    return hashlib.sha256(combined.encode()).hexdigest()

def get_top_matches(user_profile, df, top_n=5):
    """Return the *top_n* profiles in *df* most similar to *user_profile*.

    *user_profile* maps the eight matching feature names ("Gender",
    "Do you smoke", ...) to the user's answers; it is not modified.
    Similarity is the cosine similarity between one-hot encodings, with the
    encoder fitted on the dataset rows plus the user's row so that answers
    not present in the dataset still get their own category.

    Returns a DataFrame ordered from best to worst match whose first column
    is "Similarity Score" (rounded to two decimals), followed by
    "Encrypted_ID", the contact columns and the matching features. It has no
    rows when *df* is empty.
    """
    feature_columns = [
        "Gender", "Do you smoke", "Do you have pet", "Nationality",
        "Primary Language", "Dietary Preference", "Occupation", "Work Schedule"
    ]
    display_columns = [
        "Encrypted_ID", "Name", "Email", "Phone Number", "Gender", "Nationality",
        "Do you smoke", "Do you have pet", "Primary Language", "Dietary Preference",
        "Occupation", "Work Schedule"
    ]

    # Append the user's answers as one extra row, built from a copy so the
    # caller's dict is left untouched.
    user_features = pd.DataFrame([dict(user_profile)]).reindex(columns=feature_columns)
    features = pd.concat([df[feature_columns], user_features], ignore_index=True)

    # Encode features
    encoded_features = OneHotEncoder().fit_transform(features).toarray()
    user_vector, candidate_vectors = encoded_features[-1:], encoded_features[:-1]

    # Compare the user's row against the dataset rows only. This avoids the
    # full n x n similarity matrix, and keeps the user's row out of the
    # candidates instead of relying on it sorting first (a dataset row with
    # identical answers ties at 1.0).
    if candidate_vectors.shape[0] == 0:
        scores = pd.Series(dtype=float)
    else:
        scores = pd.Series(cosine_similarity(user_vector, candidate_vectors).ravel())
    top_scores = scores.nlargest(top_n)

    # Candidate positions line up with the rows of `df`; selecting by position
    # is also safe when Encrypted_ID values repeat.
    top_matches_df = df.iloc[top_scores.index.tolist()][display_columns].reset_index(drop=True)
    top_matches_df.insert(0, "Similarity Score", top_scores.to_numpy().round(2))
    return top_matches_df

# --- Streamlit UI ---
# The sidebar collects the user's answers for the eight matching features;
# pressing the button loads the dataset and shows the closest profiles.
st.title("🏠 Roommate Recommendation System")

st.sidebar.header("🔍 Enter Your Preferences")
gender = st.sidebar.selectbox("Gender", ["Male", "Female", "Non-binary", "Prefer not to say"])
smoke = st.sidebar.selectbox("Do you smoke?", ["Yes", "No"])
pet = st.sidebar.selectbox("Do you have a pet?", ["Yes", "No"])
nationality = st.sidebar.text_input("Nationality")
language = st.sidebar.text_input("Primary Language")
diet = st.sidebar.selectbox("Dietary Preference", ["Vegetarian", "Vegan", "No restriction", "Others"])
occupation = st.sidebar.text_input("Occupation")
schedule = st.sidebar.selectbox("Work Schedule", ["Day", "Afternoon", "Overnight"])

if st.sidebar.button("Find My Top Matches"):
    # Prepare user profile. Free-text answers are stripped so that stray
    # leading/trailing spaces do not stop them matching dataset values.
    user_profile = {
        "Gender": gender,
        "Do you smoke": smoke,
        "Do you have pet": pet,
        "Nationality": nationality.strip(),
        "Primary Language": language.strip(),
        "Dietary Preference": diet,
        "Occupation": occupation.strip(),
        "Work Schedule": schedule
    }

    try:
        df = pd.read_csv("synthetic_roommate_profiles.csv")
        matches = get_top_matches(user_profile, df)
    except Exception as e:
        # Top-level UI boundary: report the failure on the page.
        st.error(f"Error: {str(e)}")
    else:
        if matches.empty:
            # Without this branch the page would stay blank with no feedback.
            st.warning("No roommate matches were found for these preferences.")
        else:
            st.success("✅ Top Roommate Matches Found:")
            st.dataframe(matches)


2025-06-09 19:11:26.998 
  command:

    streamlit run /Users/sohommandal/Library/Python/3.10/lib/python/site-packages/ipykernel_launcher.py [ARGUMENTS]
2025-06-09 19:11:27.135 Session state does not function when running a script without `streamlit run`
