# Feature Engineering

In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity

# Read the cleaned roommate dataset written by the previous step into `df`.
df = pd.read_csv(filepath_or_buffer="roommates_clean_for_tableau.csv")

# ---- 2.1: logical feature name -> column header in the CSV (MATCHED TO YOUR HEADERS)
cols = dict(
    id="Name",                      # identifier
    gender_pref="roommate_pref",
    food="food_pref",
    smoker="Smoker",
    alcohol="alcohol_friendly",
    loud="loud_music_allowed",
    housing="looking_at",
    campus="asu_campus",
    program="program_name",         # program stands in for major (no major/degree split)
    city="City",
    state="State",
    exp="work_experience",
    age="Age",
    gender="Gender",                # optional (hard filters later)
)

# ---- 2.2: ordinal mappings
map_yesno   = {"Yes": 1, "No": 0}
map_alcohol = {"No": 0, "Occasionally": 1, "Yes": 2}


def _map_levels(series, mapping, default):
    """Translate text labels into numeric levels.

    Labels are compared after trimming whitespace and lower-casing, so
    "yes", " Yes " and "YES" all resolve to the level of "Yes".  Any label
    that is not a key of ``mapping`` (missing values included) gets ``default``.
    """
    lookup = {str(label).strip().lower(): level for label, level in mapping.items()}
    cleaned = series.astype(str).str.strip().str.lower()
    return cleaned.map(lookup).fillna(default)


feat = pd.DataFrame()
feat["id"] = df[cols["id"]]

# Safe numeric mapping: unknown / missing labels fall back to a default level
feat["smoker_bin"]  = _map_levels(df[cols["smoker"]], map_yesno, 0)
feat["alcohol_lvl"] = _map_levels(df[cols["alcohol"]], map_alcohol, 1)  # neutral default
feat["loud_bin"]    = _map_levels(df[cols["loud"]], map_yesno, 0)

# min-max normalize work experience & age (robust to non-numeric entries)
for c_src, c_dst in [(cols["exp"], "exp"), (cols["age"], "age")]:
    x = pd.to_numeric(df[c_src], errors="coerce")  # non-numeric -> NaN
    lo, hi = x.min(), x.max()
    if pd.notna(lo) and hi != lo:
        x = (x - lo) / (hi - lo)
    else:
        # A constant (or entirely missing) column cannot be scaled and carries no
        # signal; zero it instead of leaving raw values that would dominate the
        # similarity computed from these features.
        x = pd.Series(0.0, index=x.index)
    feat[c_dst] = x.fillna(0)  # missing values land on the minimum of the scale

# ---- 2.3: food compatibility as ordinal
food_map = {"Vegetarian": 2.0, "Eggetarian": 1.5, "Non-vegetarian": 1.0}
# unknown / missing food preference gets 1.0, i.e. the same level as "Non-vegetarian"
feat["food_lvl"] = _map_levels(df[cols["food"]], food_map, 1.0)

# ---- 2.4: one-hot encode categoricals for soft similarity (only headers present in df)
possible_cat_cols = {
    key: cols[key] for key in ("campus", "housing", "program", "city", "state")
}
use_cats = [header for header in possible_cat_cols.values() if header in df.columns]

if not use_cats:
    # Nothing to encode: keep a zero-width block so later stacking still works.
    X_cat = np.zeros((len(df), 0))
    cat_names = np.array([])
else:
    ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    # Missing values become their own "NA" category before encoding.
    cat_frame = df[use_cats].fillna("NA")
    X_cat = ohe.fit_transform(cat_frame)
    cat_names = ohe.get_feature_names_out(use_cats)

# ---- 2.5: build final feature matrix (numeric block first, then one-hot block)
numeric_order = ["smoker_bin", "alcohol_lvl", "loud_bin", "exp", "age", "food_lvl"]
X_num = feat[numeric_order].to_numpy()
X = np.hstack([X_num, X_cat])

# optional: feature weights (tune these!)
# numeric weights follow numeric_order: smoker, alcohol, loud, exp, age, food
w_num = np.array([1.5, 1.2, 1.0, 0.5, 0.5, 1.3])

# One-hot features are weighted by the source column their name starts with;
# the first matching prefix wins, anything unmatched keeps weight 1.0.
prefix_weights = [
    (cols["campus"] + "_", 2.0),
    (cols["housing"] + "_", 1.1),
    (cols["program"] + "_", 0.9),
    (cols["state"] + "_", 0.4),
    (cols["city"] + "_", 0.2),
]
w_cat = [
    next((weight for prefix, weight in prefix_weights if name.startswith(prefix)), 1.0)
    for name in cat_names
]

if len(w_cat):
    w = np.concatenate([w_num, np.array(w_cat)])
else:
    w = w_num

Xw = X * w                 # scale each feature column by its weight
S = cosine_similarity(Xw)  # similarity matrix (n x n)

In [4]:
def passes_hard_filters(i, j):
    """
    Return True when rows ``i`` and ``j`` of ``df`` may be paired as roommates.

    Hard constraints using your schema:
      - roommate_pref (Male/Female/Doesn’t matter) vs Gender, checked in both
        directions
      - food strictness: Vegetarian blocks Non-vegetarian (both directions)
      - alcohol: "No" blocks "Yes" (both directions)
      - (optional) add more rules as needed

    Labels are compared after trimming whitespace, case-folding and unifying
    the typographic and ASCII apostrophes, so "Doesn't matter" and
    "Doesn’t matter" are treated alike.  A missing column or a missing value
    never blocks a pair.

    Parameters
    ----------
    i, j : row labels, looked up with ``df.loc``.

    Returns
    -------
    bool
        False as soon as one rule is violated, True otherwise.
    """
    # --- Safe getter: NaN when the column is absent ---
    def get(row_idx, col):
        return df.loc[row_idx, col] if col in df.columns else np.nan

    # --- Canonical text form used for every comparison; None means "missing" ---
    def norm(value):
        if pd.isna(value):
            return None
        return str(value).replace("\u2019", "'").strip().casefold()

    # --- Gender preference rule ---
    # Resolve the gender header through `cols` like every other column,
    # falling back to the literal header when the key is not configured.
    gender_col = cols.get("gender", "Gender")
    gi = norm(get(i, gender_col))
    gj = norm(get(j, gender_col))
    pi = norm(get(i, cols["gender_pref"]))   # roommate_pref
    pj = norm(get(j, cols["gender_pref"]))

    def gender_ok(pref, other_gender):
        if pref is None or pref == "doesn't matter" or other_gender is None:
            return True
        return pref == other_gender

    if not (gender_ok(pi, gj) and gender_ok(pj, gi)):
        return False

    # --- Food strictness: Vegetarian blocks Non-vegetarian (either order) ---
    fi = norm(get(i, cols["food"]))   # food_pref
    fj = norm(get(j, cols["food"]))
    if {fi, fj} == {"vegetarian", "non-vegetarian"}:
        return False

    # --- Alcohol: "No" blocks "Yes" (either order) ---
    ai = norm(get(i, cols["alcohol"]))   # alcohol_friendly
    aj = norm(get(j, cols["alcohol"]))
    if {ai, aj} == {"no", "yes"}:
        return False

    # --- (Optional) Smoker rule ---
    # If you later add a "smoker_ok" preference, apply it here.
    # Currently we only have status (Smoker Yes/No), so we keep it soft in the score.

    return True

# Match Recommendation

In [5]:
def _cell(row, col):
    """Value of ``df`` at (row, col); NaN when ``col`` is None or not a column."""
    if col is None or col not in df.columns:
        return np.nan
    return df.loc[row, col]


def _match_reasons(i, j):
    """Build the list of human-readable reasons why rows ``i`` and ``j`` align."""
    def shared(key):
        # Common non-missing value of the column configured under `key`, else None.
        a, b = _cell(i, cols.get(key)), _cell(j, cols.get(key))
        return a if pd.notna(a) and a == b else None

    reasons = []

    # campus & housing alignment (if present)
    campus = shared("campus")
    if campus is not None:
        reasons.append(f"Same campus: {campus}")
    housing = shared("housing")
    if housing is not None:
        reasons.append(f"Same housing: {housing}")

    # food: reported whenever both sides state a preference
    fi, fj = _cell(i, cols.get("food")), _cell(j, cols.get("food"))
    if pd.notna(fi) and pd.notna(fj):
        if fi == fj:
            reasons.append(f"Food: both {fi}")
        else:
            reasons.append(f"Food compatible: {fi} & {fj}")

    # alcohol (your column: alcohol_friendly)
    alcohol = shared("alcohol")
    if alcohol is not None:
        reasons.append(f"Alcohol: both {alcohol}")

    # loud music (optional)
    loud = shared("loud")
    if loud is not None:
        reasons.append(f"Loud music: both {loud}")

    # program/major: use the major column when configured and present,
    # otherwise fall back to program_name
    major_col = cols.get("major")
    if major_col and major_col in df.columns:
        major = shared("major")
        if major is not None:
            reasons.append(f"Same major: {major}")
    else:
        program = shared("program")
        if program is not None:
            reasons.append(f"Same program: {program}")

    return reasons


def top_k_matches(k=5):
    """
    Recommend up to ``k`` roommates for every row of ``df``.

    For each row, candidates are the other rows accepted by
    ``passes_hard_filters``, ranked by descending similarity ``S[i, j]``.

    Parameters
    ----------
    k : int or None, default 5
        Maximum recommendations per person.  ``None`` keeps every candidate;
        values below zero are treated as zero.

    Returns
    -------
    dict
        Maps each identifier (``cols["id"]`` column) to a list of dicts with
        keys ``me``, ``candidate``, ``score`` (rounded to 3 decimals) and
        ``reasons`` (at most four reasons joined by " | ").  Rows that share
        an identifier contribute to the same list rather than overwriting
        one another.
    """
    n = len(df)
    # A negative k would otherwise slice "all but the last |k|" candidates.
    limit = k if k is None else max(k, 0)
    results = {}
    for i in range(n):
        # candidates that pass hard filters, best similarity first
        cands = [j for j in range(n) if j != i and passes_hard_filters(i, j)]
        cands.sort(key=lambda j: S[i, j], reverse=True)

        me = df.loc[i, cols["id"]]
        rows = [
            {
                "me": me,
                "candidate": df.loc[j, cols["id"]],
                "score": round(float(S[i, j]), 3),
                "reasons": " | ".join(_match_reasons(i, j)[:4]),  # keep concise
            }
            for j in cands[:limit]
        ]
        # Identifiers are not guaranteed unique: accumulate instead of
        # replacing so no person's recommendations are silently dropped.
        results.setdefault(me, []).extend(rows)
    return results

# Build matches dict and flatten to a DataFrame
matches = top_k_matches(k=5)

# One record per (person, recommended candidate) pair.
out_rows = [rec for recs in matches.values() for rec in recs]

# Naming the columns explicitly keeps the sort below valid even when there are
# no recommendations at all (an empty frame would otherwise have no "me"/"score").
match_df = (
    pd.DataFrame(out_rows, columns=["me", "candidate", "score", "reasons"])
    .sort_values(["me", "score"], ascending=[True, False])
    .reset_index(drop=True)
)


        me                     candidate  score  \
0  A Agraw  soum Information Technologyh  0.983   
1  A Agraw                        Tejesh  0.982   
2  A Agraw                       R Mathu  0.982   
3  A Agraw                       H Bolis  0.982   
4  A Agraw                        Vidhin  0.982   
5  A Basru                       S Sutar  1.000   
6  A Basru                       A Dhava  0.983   
7  A Basru                        A Seth  0.983   
8  A Basru                       V Krish  0.982   
9  A Basru                       N Gagla  0.893   

                                             reasons  
0  Same campus: tempe campus | Same housing: off-...  
1  Same campus: tempe campus | Same housing: off-...  
2  Same campus: tempe campus | Same housing: off-...  
3  Same campus: tempe campus | Same housing: off-...  
4  Same campus: tempe campus | Same housing: off-...  
5  Same campus: tempe campus | Same housing: on-c...  
6  Same campus: tempe campus | Same housing: on-c... 

In [8]:

# Print the first ten recommendation rows, then write the whole table to CSV
# without the index column.
print(match_df.head(n=10))
match_df.to_csv(path_or_buf="roommate_top5_matches.csv", index=False)

        me                     candidate  score  \
0  A Agraw  soum Information Technologyh  0.983   
1  A Agraw                        Tejesh  0.982   
2  A Agraw                       R Mathu  0.982   
3  A Agraw                       H Bolis  0.982   
4  A Agraw                        Vidhin  0.982   
5  A Basru                       S Sutar  1.000   
6  A Basru                       A Dhava  0.983   
7  A Basru                        A Seth  0.983   
8  A Basru                       V Krish  0.982   
9  A Basru                       N Gagla  0.893   

                                             reasons  
0  Same campus: tempe campus | Same housing: off-...  
1  Same campus: tempe campus | Same housing: off-...  
2  Same campus: tempe campus | Same housing: off-...  
3  Same campus: tempe campus | Same housing: off-...  
4  Same campus: tempe campus | Same housing: off-...  
5  Same campus: tempe campus | Same housing: on-c...  
6  Same campus: tempe campus | Same housing: on-c... 