In [None]:
# Feature Integration Code

import pandas as pd
import numpy as np

# 1. Load Data
# Read the core product table and the two aggregated survey feature tables
# (same files, same order) from the current working directory.
df_combined, df_survey1_agg, df_survey2_agg = (
    pd.read_csv(csv_name)
    for csv_name in (
        "df_combined.csv",
        "aggregated_survey1_features.csv",
        "aggregated_survey2_features.csv",
    )
)

# 2. Standardize df_combined's 'target_audience' into a gender label for joining
def standardize_gender(audience):
    """Map a raw target-audience value to a gender label.

    Args:
        audience: A cell value from the 'target_audience' column. It may be a
            missing value, a string, or a list-like; anything that is not
            missing is matched on its lower-cased ``str()`` form.

    Returns:
        One of 'Female', 'Male', 'Unisex' or 'Unknown':
        * 'Unknown' for missing values and for text naming no known audience.
        * 'Unisex' when the text contains 'unisex', or names both 'female'
          and 'male'.
        * 'Female' / 'Male' when exactly one of them is named.
    """
    # pd.isna() returns an array for list-like input, which cannot be used in
    # a boolean test, so only scalars are checked for missingness.
    if pd.api.types.is_scalar(audience) and pd.isna(audience):
        return 'Unknown'

    audience_str = str(audience).lower()

    # An explicit 'unisex' wins even if the text also spells out the genders.
    if 'unisex' in audience_str:
        return 'Unisex'

    mentions_female = 'female' in audience_str
    # 'female' contains 'male', so strip it before looking for a standalone 'male'.
    mentions_male = 'male' in audience_str.replace('female', '')

    if mentions_female and mentions_male:
        return 'Unisex'
    if mentions_female:
        return 'Female'
    if mentions_male:
        return 'Male'
    return 'Unknown'

# Derive the join key: one standardized gender label per product row.
df_combined['Gender_Label'] = df_combined['target_audience'].map(standardize_gender)

# 3. Integrate Survey 1 (Avg_Int5: Overall Purchase Intent Score)

# Collapse the Survey 1 aggregation to one mean Avg_Int5 per gender label.
# The result is a Series indexed by Gender_Label, so every label is unique.
intent_by_gender = (
    df_survey1_agg.groupby('Gender_Label')['Avg_Int5']
    .mean()
    .rename('Overall_Purchase_Intent_Score')
)

# The 'Unisex' score is derived from the Male and Female scores, so both must
# exist; fail with a clear message instead of an IndexError from .iloc[0].
missing_labels = [label for label in ('Male', 'Female') if label not in intent_by_gender.index]
if missing_labels:
    raise ValueError(
        f"Survey 1 has no Avg_Int5 rows for Gender_Label {missing_labels}; "
        "cannot derive the 'Unisex' purchase intent score."
    )

# Map 'Unisex' in df_combined to the average of the Male and Female scores.
male_score = intent_by_gender['Male']
female_score = intent_by_gender['Female']
unisex_score = (male_score + female_score) / 2

# Assigning by label adds the 'Unisex' entry, or overwrites one the survey
# already had, so the lookup never ends up with two 'Unisex' rows (a duplicate
# key would duplicate product rows in the left join below).
intent_by_gender.loc['Unisex'] = unisex_score
df_int5_lookup = intent_by_gender.rename_axis('Gender_Label').reset_index()

# Join this score to df_combined. Labels absent from the lookup (e.g.
# 'Unknown') get NaN; validate= makes pandas raise if the lookup keys are
# ever not unique, rather than silently multiplying rows.
df_combined = pd.merge(
    df_combined,
    df_int5_lookup[['Gender_Label', 'Overall_Purchase_Intent_Score']],
    on='Gender_Label',
    how='left',
    validate='many_to_one',
)

# 4. Integrate Survey 2 (Avg_Likelihood: Universal Influencer Score)

# Survey 2 column to average; the name is used verbatim as the lookup key.
inf_col_name = 'Avg_Likelihood_buy_product_Infleucer_recommendation'

# One mean over every group in Survey 2, broadcast as a constant column onto
# all rows of df_combined.
universal_influencer_score = df_survey2_agg.loc[:, inf_col_name].mean()
df_combined = df_combined.assign(Universal_Influencer_Score=universal_influencer_score)

# 5. Final Cleaning and Renaming
# Drop the raw audience column and the temporary join key, then expose the
# perfume name column as Product_ID.
df_combined = (
    df_combined
    .drop(columns=['target_audience', 'Gender_Label'])
    .rename(columns={'perfume_clean': 'Product_ID'})
)

# 6. Save Final Table
df_combined.to_csv('df_final_features.csv', index=False)

# Report completion and preview the first rows as a markdown table.
print(
    "Feature integration complete.",
    "The core dataset has been enriched with external survey features and saved as 'df_final_features.csv'.",
    "\n--- Final Feature Table Head ---",
    sep="\n",
)
print(df_combined.head().to_markdown(index=False))

  df_combined = pd.read_csv("df_combined.csv")


Feature integration complete.
The core dataset has been enriched with external survey features and saved as 'df_final_features.csv'.

--- Final Feature Table Head ---
| Product_ID       | brand_clean   | type_clean        | longevity   | Main Accords   |   Rating Value |   Rating Count |   Overall_Purchase_Intent_Score |   Universal_Influencer_Score |
|:-----------------|:--------------|:------------------|:------------|:---------------|---------------:|---------------:|--------------------------------:|-----------------------------:|
| nitro red        | dumont        | extrait de parfum | Strong      | fresh scent    |            nan |            nan |                         3.15217 |                      2.60352 |
| nitro pour homme | dumont        | extrait de parfum | Strong      | fresh scent    |            nan |            nan |                         3.15217 |                      2.60352 |
| nitro white      | dumont        | extrait de parfum | Strong      | fresh scent   