In [26]:
import pandas as pd
# Load the raw Kaggle housing file, keep only the columns this notebook works
# with, then discard rows containing missing values and exact duplicate rows.
df = (
    pd.read_csv('/kaggle/input/housing-price-dataset/Housing.csv')
    .loc[:, [
        "id",
        'price',
        'bedrooms',
        'bathrooms',
        'sqft_living',
        'floors',
        'waterfront',
        'view',
        'condition',
    ]]
    .dropna()
    .drop_duplicates()
)

In [27]:
# Interquartile-range rule on `price`: keep rows whose price lies within
# [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR], both ends included.
# Note: `df` is overwritten, so re-running this cell filters the already
# filtered frame again with freshly computed quartiles.
Q1, Q3 = df['price'].quantile([0.25, 0.75])
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

df = df[df['price'].between(lower_bound, upper_bound)]

# Report how many rows/columns survive the filter
print("New shape after removing price outliers:", df.shape)



New shape after removing price outliers: (20470, 9)


In [28]:
# Persist the cleaned frame to the current working directory. No `index`
# argument is given, so the row index is written as an unnamed first column;
# when the file is read back that column appears as 'Unnamed: 0' (a later cell
# drops it by that name).
df.to_csv('cleaned_housing.csv')

In [29]:
# Reload the cleaned data from the Kaggle working directory; from here on `df`
# is the frame parsed from the CSV (fresh 0..n-1 index, saved index as a column).
df=pd.read_csv('/kaggle/working/cleaned_housing.csv')

In [41]:
# Display the current frame (pandas truncates the middle rows).
# NOTE(review): this cell's execution count is out of order, and the captured
# output shows both 'Unnamed: 0.1' and 'Unnamed: 0' — presumably the CSV was
# written and re-read twice before this ran; confirm with Restart & Run All.
df

Unnamed: 0.1,Unnamed: 0,id,price,bedrooms,bathrooms,sqft_living,floors,waterfront,view,condition
0,0,7229300521,231300.0,2,1.00,1180,1.0,0,0,3
1,1,6414100192,538000.0,3,2.25,2570,2.0,0,0,3
2,2,5631500400,180000.0,2,1.00,770,1.0,0,0,3
3,3,2487200875,604000.0,4,3.00,1960,1.0,0,0,5
4,4,1954400510,510000.0,3,2.00,1680,1.0,0,0,3
...,...,...,...,...,...,...,...,...,...,...
20465,21608,263000018,360000.0,3,2.50,1530,3.0,0,0,3
20466,21609,6600060120,400000.0,4,2.50,2310,2.0,0,0,3
20467,21610,1523300141,402101.0,2,0.75,1020,2.0,0,0,3
20468,21611,291310100,400000.0,3,2.50,1600,2.0,0,0,3


In [36]:
import pandas as pd
import tensorflow as tf
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

# --- Loading and preprocessing ---
df = pd.read_csv('/kaggle/working/cleaned_housing.csv')

# 'id' is an identifier and every 'Unnamed: ...' column is a row index written
# out by an earlier `to_csv` call; none of them describe a house. Dropping all
# 'Unnamed' columns (rather than only 'Unnamed: 0') keeps the feature set
# stable when the CSV has been saved and re-read more than once (which adds
# 'Unnamed: 0.1'), and avoids a KeyError when the file has no index column.
non_feature_columns = ['id'] + [
    col for col in df.columns if str(col).startswith('Unnamed')
]
features = df.drop(columns=non_feature_columns)

# Standardise every remaining column (this includes 'price').
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# --- TensorFlow dataset ---
# float32 copy of the scaled matrix; row i still corresponds to df row i.
features_tensor = tf.convert_to_tensor(features_scaled, dtype=tf.float32)

# --- Triplet generation from row positions ---
def generate_triplets(features, num_triplets=5000, rng=None):
    """Sample (anchor, positive, negative) row-index triplets.

    The anchor is a uniformly random row, the positive is always the row that
    follows the anchor (wrapping to row 0 after the last row), and the negative
    is a uniformly random row different from both. Only ``len(features)`` is
    used; the feature values themselves do not influence the sampling.

    NOTE(review): choosing the *next row* as the positive treats file order as
    a similarity signal — confirm this is intended for this dataset.

    Parameters
    ----------
    features : sized collection
        Anything supporting ``len()``; its length is the number of rows.
    num_triplets : int, default 5000
        Number of triplets to draw.
    rng : numpy.random.Generator, optional
        Source of randomness for reproducible sampling. When omitted, NumPy's
        global random state (``np.random.randint``) is used.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(num_triplets, 3)`` with columns
        (anchor, positive, negative). The shape is ``(0, 3)`` when
        ``num_triplets`` is 0, so column slicing always works.

    Raises
    ------
    ValueError
        If triplets are requested from fewer than 3 rows: no negative distinct
        from both anchor and positive exists, so the rejection loop below
        would never terminate.
    """
    n = len(features)
    if num_triplets > 0 and n < 3:
        raise ValueError(
            f"generate_triplets needs at least 3 rows to pick a distinct negative, got {n}"
        )

    # Same draw(low, high) signature for the legacy global API and a Generator.
    draw = np.random.randint if rng is None else rng.integers

    triplets = []
    for _ in range(num_triplets):
        anchor = int(draw(0, n))
        positive = (anchor + 1) % n
        negative = int(draw(0, n))
        # Rejection sampling: redraw until the negative differs from both.
        while negative == anchor or negative == positive:
            negative = int(draw(0, n))
        triplets.append((anchor, positive, negative))

    # reshape keeps the (k, 3) layout even when the list is empty.
    return np.array(triplets, dtype=int).reshape(-1, 3)

# Draw the triplet row indices, then look up the matching scaled feature rows
# for each of the three roles (column 0 = anchor, 1 = positive, 2 = negative).
triplet_indices = generate_triplets(features_scaled, num_triplets=5000)
anchor, positive, negative = (
    tf.gather(features_tensor, triplet_indices[:, role]) for role in range(3)
)

# One dataset element is one (anchor, positive, negative) row triple; elements
# are served in batches of up to 3000.
triplet_dataset = (
    tf.data.Dataset
    .from_tensor_slices((anchor, positive, negative))
    .batch(3000)
)

# --- Embedding model ---
# Fully connected network: the input width follows the feature frame, two ReLU
# hidden layers (128 and 64 units), then a linear 32-dimensional output that
# serves as the embedding.
embedding_model = tf.keras.Sequential()
embedding_model.add(tf.keras.layers.Input(shape=(features.shape[1],)))
embedding_model.add(tf.keras.layers.Dense(128, activation='relu'))
embedding_model.add(tf.keras.layers.Dense(64, activation='relu'))
embedding_model.add(tf.keras.layers.Dense(32))

# --- Triplet loss ---
def triplet_loss(a, p, n, margin=1.0):
    """Mean hinge-style triplet loss over a batch of embeddings.

    For every row, the squared Euclidean distance anchor-positive and
    anchor-negative is computed (sum of squared differences along axis 1).
    The per-row penalty is ``max(d_pos - d_neg + margin, 0)`` and the function
    returns the mean penalty over the batch.

    Parameters
    ----------
    a, p, n : tensors
        Anchor, positive and negative embeddings.
        Assumes matching shapes of rank >= 2 with features on axis 1 —
        TODO confirm against callers.
    margin : float, default 1.0
        Required gap between negative and positive distance.

    Returns
    -------
    Scalar tensor holding the mean loss.
    """
    def squared_distance(u, v):
        return tf.reduce_sum(tf.square(u - v), axis=1)

    violation = squared_distance(a, p) - squared_distance(a, n) + margin
    return tf.reduce_mean(tf.maximum(violation, 0.0))

# --- Training ---
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
epochs = 200


def _triplet_train_step(a, p, n):
    """Run one Adam update on a batch of triplets; return the batch loss as a NumPy scalar."""
    with tf.GradientTape() as tape:
        loss = triplet_loss(embedding_model(a), embedding_model(p), embedding_model(n))
    variables = embedding_model.trainable_variables
    optimizer.apply_gradients(zip(tape.gradient(loss, variables), variables))
    return loss.numpy()


print("\n--- Entraînement avec Triplet Loss ---")
for epoch in range(epochs):
    epoch_loss = 0
    for step, (a, p, n) in enumerate(triplet_dataset):
        epoch_loss += _triplet_train_step(a, p, n)
    # Reported value is the unweighted mean of the per-batch losses.
    print(f"Epoch {epoch + 1}/{epochs} - Loss: {epoch_loss / (step + 1):.4f}")

# --- Final embeddings ---
# Run every scaled feature row through the trained model and convert the result
# to a NumPy array; row i of `final_embeddings` is the embedding of row i of
# `features_tensor`.
final_embeddings = embedding_model(features_tensor).numpy()

# --- Recommendation (top-k neighbours) ---
def recommend_similar(house_id, top_k=5):
    """Return the houses whose embeddings are most cosine-similar to a given house.

    Reads the module-level ``df`` and ``final_embeddings``; row i of
    ``final_embeddings`` is taken to be the embedding of the i-th row of ``df``.

    Parameters
    ----------
    house_id
        Value to look up in ``df['id']``. If several rows share the id, the
        first one (in row order) is used as the query.
    top_k : int, default 5
        Number of neighbours to return. Values <= 0 give an empty frame;
        values larger than the number of other houses are capped so the query
        house itself is never returned.

    Returns
    -------
    pandas.DataFrame
        Rows of ``df`` ordered from most to least similar.

    Raises
    ------
    IndexError
        If ``house_id`` does not occur in ``df['id']``.
    """
    # Use the row *position*, not the index label: the result indexes both a
    # NumPy array and ``df.iloc``, which are positional.
    matches = np.flatnonzero((df['id'] == house_id).to_numpy())
    if matches.size == 0:
        raise IndexError(f"house_id {house_id!r} not found in df['id']")
    idx = int(matches[0])

    query_embedding = final_embeddings[idx:idx+1]
    sims = cosine_similarity(query_embedding, final_embeddings)[0]
    sims[idx] = -np.inf  # exclude the query house itself

    # Slice from the front of the descending order: a trailing slice
    # ``[-top_k:]`` would return *every* row for top_k == 0.
    k = max(0, min(int(top_k), len(sims) - 1))
    top_k_indices = np.argsort(sims)[::-1][:k]
    return df.iloc[top_k_indices]

# --- Example ---
# Use the first house of the frame as the query and show its 5 nearest
# neighbours in embedding space.
house_id = df['id'].iat[0]
print(f"\nMaisons similaires à la maison avec ID {house_id} :")
similar_houses = recommend_similar(house_id, top_k=5)
print(similar_houses)



--- Entraînement avec Triplet Loss ---
Epoch 1/200 - Loss: 1.2644
Epoch 2/200 - Loss: 1.0147
Epoch 3/200 - Loss: 0.9559
Epoch 4/200 - Loss: 0.9342
Epoch 5/200 - Loss: 0.9250
Epoch 6/200 - Loss: 0.9148
Epoch 7/200 - Loss: 0.9062
Epoch 8/200 - Loss: 0.8971
Epoch 9/200 - Loss: 0.8882
Epoch 10/200 - Loss: 0.8805
Epoch 11/200 - Loss: 0.8708
Epoch 12/200 - Loss: 0.8604
Epoch 13/200 - Loss: 0.8502
Epoch 14/200 - Loss: 0.8413
Epoch 15/200 - Loss: 0.8309
Epoch 16/200 - Loss: 0.8219
Epoch 17/200 - Loss: 0.8132
Epoch 18/200 - Loss: 0.8041
Epoch 19/200 - Loss: 0.7952
Epoch 20/200 - Loss: 0.7879
Epoch 21/200 - Loss: 0.7772
Epoch 22/200 - Loss: 0.7685
Epoch 23/200 - Loss: 0.7603
Epoch 24/200 - Loss: 0.7519
Epoch 25/200 - Loss: 0.7436
Epoch 26/200 - Loss: 0.7325
Epoch 27/200 - Loss: 0.7211
Epoch 28/200 - Loss: 0.7132
Epoch 29/200 - Loss: 0.7033
Epoch 30/200 - Loss: 0.6922
Epoch 31/200 - Loss: 0.6872
Epoch 32/200 - Loss: 0.6769
Epoch 33/200 - Loss: 0.6752
Epoch 34/200 - Loss: 0.6646
Epoch 35/200 - Lo

In [37]:
def evaluate_recommendations(df, final_embeddings, top_k=5, sample_ratio=0.2,
                             reference_features=None):
    """Estimate Top-K accuracy and MRR of ``recommend_similar`` on a random sample.

    For each sampled row, a "true" nearest neighbour is determined by cosine
    similarity (excluding the row itself) and compared with the ids returned
    by ``recommend_similar`` for that row's id.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``'id'`` column, row-aligned with ``final_embeddings``.
    final_embeddings : numpy.ndarray
        One embedding per row of ``df``.
    top_k : int, default 5
        Number of recommendations requested per query.
    sample_ratio : float, default 0.2
        Fraction of rows to evaluate (sampled without replacement using
        NumPy's global random state). Capped at the full frame.
    reference_features : array-like, optional
        Row-aligned matrix (for example the scaled input features) in which
        the ground-truth nearest neighbour is computed. When omitted, the
        ground truth comes from ``final_embeddings`` itself — the same
        similarity the recommender ranks by — so the scores then mostly check
        the id lookup and ranking plumbing rather than embedding quality.

    Returns
    -------
    tuple of (float, float)
        ``(topk_accuracy, mrr)``, both averaged over the rows that were
        evaluated successfully; ``nan`` when no row could be evaluated.

    Notes
    -----
    NOTE(review): ``recommend_similar`` looks the house up by id; if ids are
    not unique the query may resolve to a different row than the sampled one —
    confirm id uniqueness.
    """
    total = len(df)

    # Clamp so that ratios outside [0, 1] neither crash the sampler nor
    # request more rows than exist.
    sample_size = min(total, max(0, int(sample_ratio * total)))
    if sample_size > 0:
        sampled_indices = np.random.choice(total, size=sample_size, replace=False)
    else:
        sampled_indices = np.empty(0, dtype=int)

    ids = df['id'].to_numpy()
    # Space in which the ground-truth neighbour is searched.
    truth_space = final_embeddings if reference_features is None else np.asarray(reference_features)

    reciprocal_ranks = []
    topk_hits = 0
    failures = 0

    print(f"\n--- Évaluation sur un échantillon de {sample_size} maisons ---")
    for count, i in enumerate(sampled_indices):
        try:
            house_id = ids[i]

            # Similarity to every house, ignoring the house itself.
            sims = cosine_similarity(truth_space[i].reshape(1, -1), truth_space)[0]
            sims[i] = -np.inf
            true_closest_id = ids[np.argmax(sims)]

            # Ids recommended by the model, best first.
            rec_ids = recommend_similar(house_id, top_k=top_k)['id'].values

            # A single lookup serves both metrics: a hit counts for Top-K and
            # its 1-based rank gives the reciprocal rank.
            hit_positions = np.flatnonzero(rec_ids == true_closest_id)
            if hit_positions.size:
                topk_hits += 1
                reciprocal_ranks.append(1.0 / (hit_positions[0] + 1))
            else:
                reciprocal_ranks.append(0.0)

            if count % 100 == 0:
                print(f"{count}/{sample_size} maisons traitées...")

        except Exception as e:
            # Best effort: report the row, skip it, and keep it out of both
            # denominators below.
            failures += 1
            print(f"Erreur à l'index {i} : {e}")
            continue

    # Both metrics share one denominator: the rows actually evaluated.
    evaluated = len(reciprocal_ranks)
    if evaluated:
        topk_accuracy = topk_hits / evaluated
        mrr = float(np.mean(reciprocal_ranks))
    else:
        topk_accuracy = float('nan')
        mrr = float('nan')

    print("\n--- Résultats de l'Évaluation ---")
    if failures:
        print(f"{failures} maisons ignorées suite à une erreur")
    print(f"Top-{top_k} Accuracy : {topk_accuracy:.4f}")
    print(f"MRR : {mrr:.4f}")

    return topk_accuracy, mrr


In [38]:
# Evaluate top-5 recommendations on the default 20% random sample of `df`.
evaluate_recommendations(df, final_embeddings, top_k=5)



--- Évaluation sur un échantillon de 4094 maisons ---
0/4094 maisons traitées...
100/4094 maisons traitées...
200/4094 maisons traitées...
300/4094 maisons traitées...
400/4094 maisons traitées...
500/4094 maisons traitées...
600/4094 maisons traitées...
700/4094 maisons traitées...
800/4094 maisons traitées...
900/4094 maisons traitées...
1000/4094 maisons traitées...
1100/4094 maisons traitées...
1200/4094 maisons traitées...
1300/4094 maisons traitées...
1400/4094 maisons traitées...
1500/4094 maisons traitées...
1600/4094 maisons traitées...
1700/4094 maisons traitées...
1800/4094 maisons traitées...
1900/4094 maisons traitées...
2000/4094 maisons traitées...
2100/4094 maisons traitées...
2200/4094 maisons traitées...
2300/4094 maisons traitées...
2400/4094 maisons traitées...
2500/4094 maisons traitées...
2600/4094 maisons traitées...
2700/4094 maisons traitées...
2800/4094 maisons traitées...
2900/4094 maisons traitées...
3000/4094 maisons traitées...
3100/4094 maisons traitées.

In [39]:
def recommend_by_features(input_features_dict, top_k=10):
    """Recommend existing houses for a hypothetical house described by raw features.

    The input is scaled with the module-level ``scaler``, embedded with
    ``embedding_model`` and compared by cosine similarity with
    ``final_embeddings``.

    Parameters
    ----------
    input_features_dict : dict
        Maps feature name to raw (unscaled) value, for example::

            {'bedrooms': 3, 'bathrooms': 2, 'sqft_living': 1500, ...}

        Every column of the module-level ``features`` frame must be present.
        Key order does not matter; keys that are not feature columns are
        ignored.
    top_k : int, default 10
        Number of houses to return. Values <= 0 give an empty frame; values
        above the number of houses are capped.

    Returns
    -------
    pandas.DataFrame
        Rows of ``df`` ordered from most to least similar.

    Raises
    ------
    ValueError
        If one or more feature columns are missing from the input.
    """
    # Put the input into exactly the column order the scaler was fitted on,
    # so the result does not depend on the order of the dict keys.
    expected_columns = list(features.columns)
    missing = [col for col in expected_columns if col not in input_features_dict]
    if missing:
        raise ValueError(
            f"Missing feature(s) {missing}; expected keys: {expected_columns}"
        )
    input_df = pd.DataFrame([input_features_dict])[expected_columns]

    input_scaled = scaler.transform(input_df)
    input_tensor = tf.convert_to_tensor(input_scaled, dtype=tf.float32)

    # Embed the query
    input_embedding = embedding_model(input_tensor).numpy()

    # Rank all houses by cosine similarity, best first. Slicing from the front
    # of the descending order avoids ``[-0:]`` returning every row.
    sims = cosine_similarity(input_embedding, final_embeddings)[0]
    k = max(0, min(int(top_k), len(sims)))
    top_k_indices = np.argsort(sims)[::-1][:k]

    return df.iloc[top_k_indices]


In [51]:
# Example query: raw (unscaled) feature values for a hypothetical house.
# Adapt the keys to the feature columns of your dataset.
input_features = dict(
    price=16500,
    bedrooms=1.5,
    bathrooms=1,
    sqft_living=1000,
    floors=1,
    waterfront=0,
    view=0,
    condition=3,
)

# Fetch and show the 10 most similar existing houses.
top_similar = recommend_by_features(input_features, top_k=10)
print(top_similar)


       Unnamed: 0          id     price  bedrooms  bathrooms  sqft_living  \
15186       15986    87000213  129000.0         2        1.0         1150   
2459         2589  5061300030  134000.0         2        1.5          980   
5528         5816  7568700480  153000.0         2        1.0         1140   
13861       14581  6929602721   95000.0         2        1.0          960   
3229         3397  2172000750  160000.0         2        1.0         1180   
12923       13601  8698600395  150000.0         2        1.0         1250   
15864       16714  1322049150   85000.0         2        1.0          910   
9605        10105  5466310060  139500.0         2        1.5         1230   
17524       18468  7999600180   83000.0         2        1.0          900   
2954         3108  1721801591   89950.0         1        1.0          570   

       floors  waterfront  view  condition  
15186     1.0           0     0          3  
2459      2.0           0     0          3  
5528      1.0    