In [1]:
import pickle
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
pickle_name = "sample_merged_full_10k"
base_dir = "/home/user/core/data"

In [3]:
# 保存したDataFrameの読み込み
with open(f"{base_dir}/{pickle_name}.pkl", "rb") as f:
    df = pickle.load(f)

In [4]:
df.head()

Unnamed: 0,user_id,target_id,rating,user_name_target,nickname_target,gender_target,location_target,age_range_target,height_range_target,body_type_target,...,body_type_user,personality_user,appearance_user,job_user,blood_type_user,car_user,interests_user,salary_user,plan_user,account_creation_timestamp_user
0,1,8627,0,原田遥,アオイ,女性,埼玉県伊奈町,45-49,150-154,普通,...,スリム,元気,セクシー系,会社員,O型,有り,技術・プログラミング,8160000,option2,2024-01-14 00:11:34
1,1,18213,0,井上萌,ユイ,女性,福島県玉川村,30-34,150-154,スリム,...,スリム,元気,セクシー系,会社員,O型,有り,技術・プログラミング,8160000,option2,2024-01-14 00:11:34
2,1,18859,1,田中美優,ハルカ,女性,香川県まんのう町,45-49,150-154,細身,...,スリム,元気,セクシー系,会社員,O型,有り,技術・プログラミング,8160000,option2,2024-01-14 00:11:34
3,1,23263,0,森夏子,エミ,女性,愛知県瀬戸市,22-25,155-159,普通,...,スリム,元気,セクシー系,会社員,O型,有り,技術・プログラミング,8160000,option2,2024-01-14 00:11:34
4,1,36543,1,松本愛子,アオイ,女性,栃木県さくら市,26-29,170-174,ちょいポチャ,...,スリム,元気,セクシー系,会社員,O型,有り,技術・プログラミング,8160000,option2,2024-01-14 00:11:34


In [19]:
user_columns = [col for col in df.columns if col == 'user_id' or col.endswith('_user')]
user_df = df[user_columns]

# Remove duplicate rows based on 'user_id'
user_df_unique = user_df.drop_duplicates(subset=['user_id'])

In [20]:
user_df_unique

Unnamed: 0,user_id,user_name_user,nickname_user,gender_user,location_user,age_range_user,height_range_user,body_type_user,personality_user,appearance_user,job_user,blood_type_user,car_user,interests_user,salary_user,plan_user,account_creation_timestamp_user
0,1,山口陽介,ヨウ,男性,岐阜県各務原市,22-25,175-179,スリム,元気,セクシー系,会社員,O型,有り,技術・プログラミング,8160000,option2,2024-01-14 00:11:34
18,2,伊藤花子,アミ,女性,岡山県久米南町,45-49,160-164,普通,クール,癒し系,主婦,A型,無し,"旅行・ドライブ, DIY・クラフト",7080000,option2,2024-01-22 18:21:22
36,3,長谷川莉子,ハルカ,女性,高知県安田町,30-34,170-174,ちょいポチャ,クール,癒し系,主婦,A型,無し,"学習・自己啓発, ファッション・美容, カフェ・ショッピング",4070000,free,2024-02-25 20:34:25
42,4,中村秋子,ハル,女性,兵庫県新温泉町,35-39,150-154,ちょいポチャ,おとなしい,かわいい系,自営業,B型,無し,"音楽・映画, 音楽・映画",7910000,option2,2024-02-13 20:36:45
57,5,中村秋子,ハナ,女性,群馬県吉岡町,40-44,165-169,普通,真面目,セクシー系,主婦,A型,無し,特になし,6550000,free,2024-04-13 19:24:18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108507,9995,石川八郎,カズ,男性,茨城県神栖市,35-39,175-179,普通,やさしい,知的系,エンジニア,A型,有り,フィットネス・健康,11920000,free,2024-05-18 23:09:18
108515,9996,佐藤拓海,ナオ,男性,北海道色丹村,45-49,175-179,普通,元気,癒し系,経営者,A型,無し,"悩み相談, 趣味友達",11190000,option1,2024-04-28 21:31:38
108522,9997,藤田智也,ナオ,男性,京都府和束町,26-29,160-164,普通,元気,かわいい系,教職員,A型,有り,"料理・飲食, その他",7870000,option2,2024-03-04 01:21:57
108540,9998,木村美咲,ミカ,女性,高知県北川村,22-25,180-184,スリム,面白い,かわいい系,医療従事者,A型,無し,ファッション・美容,5820000,free,2024-01-03 01:46:38


In [34]:
def age_range_to_numeric(age_range):
    if '-' in age_range:
        start, end = map(int, age_range.split('-'))
        return (start + end) / 2
    else:
        return int(age_range)

# Initialize the scaler
#scaler = StandardScaler()
# Initialize the MinMaxScaler
min_max_scaler = MinMaxScaler()

#selected_columns = ['user_id','age_range_user', 'salary_user', 'interests_user']
selected_columns = ['user_id','age_range_user', 'salary_user']

selected_df = user_df_unique[selected_columns]

# Fit and transform the numeric data
selected_df['age_numeric'] = selected_df['age_range_user'].apply(age_range_to_numeric)
#user_df_unique['age_numeric'] = user_df_unique['age_range_user'].apply(age_range_to_numeric)


# Normalize the new numeric age column and salary column
selected_df['age_numeric_normalized'] = min_max_scaler.fit_transform(selected_df[['age_numeric']])
selected_df['salary_user_normalized'] = min_max_scaler.fit_transform(selected_df[['salary_user']])
#user_df_unique['age_numeric_normalized'] = min_max_scaler.fit_transform(user_df_unique[['age_numeric']])
#user_df_unique['salary_user_normalized'] = min_max_scaler.fit_transform(user_df_unique[['salary_user']])

selected_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_df['age_numeric'] = selected_df['age_range_user'].apply(age_range_to_numeric)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_df['age_numeric_normalized'] = min_max_scaler.fit_transform(selected_df[['age_numeric']])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_df['salary

Unnamed: 0,user_id,age_range_user,salary_user,age_numeric,age_numeric_normalized,salary_user_normalized
0,1,22-25,8160000,23.5,0.000000,0.515879
18,2,45-49,7080000,47.0,0.824561,0.432223
36,3,30-34,4070000,32.0,0.298246,0.199070
42,4,35-39,7910000,37.0,0.473684,0.496514
57,5,40-44,6550000,42.0,0.649123,0.391170
...,...,...,...,...,...,...
108507,9995,35-39,11920000,37.0,0.473684,0.807126
108515,9996,45-49,11190000,47.0,0.824561,0.750581
108522,9997,26-29,7870000,27.5,0.140351,0.493416
108540,9998,22-25,5820000,23.5,0.000000,0.334624


In [46]:
# Vectorize the 'interests_user' column
vectorizer = TfidfVectorizer()

interests_vectors = vectorizer.fit_transform(user_df_unique['interests_user'])

# Convert to DataFrame for easier handling
interests_df = pd.DataFrame(interests_vectors.toarray(), columns=vectorizer.get_feature_names_out())
interests_df

Unnamed: 0,diy,その他,アウトドア,アニメ,アート,カフェ,クラフト,ゲーム,ショッピング,スポーツ,...,漫画,特になし,社会活動,美容,自己啓発,読書,趣味友達,遊び友達,音楽,飲食
0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.00000,0.000000,0.00000
1,0.541006,0.000000,0.0,0.0,0.0,0.000000,0.541006,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.00000,0.000000,0.00000
2,0.000000,0.000000,0.0,0.0,0.0,0.379119,0.000000,0.0,0.379119,0.0,...,0.0,0.0,0.0,0.415079,0.428926,0.0,0.000000,0.00000,0.000000,0.00000
3,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.00000,0.707107,0.00000
4,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,...,0.0,1.0,0.0,0.000000,0.000000,0.0,0.000000,0.00000,0.000000,0.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9815,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.00000,0.000000,0.00000
9816,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.0,0.696755,0.00000,0.000000,0.00000
9817,0.000000,0.649395,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.00000,0.000000,0.53772
9818,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.707107,0.000000,0.0,0.000000,0.00000,0.000000,0.00000


In [None]:
# interests_dfをselected_dfに結合
selected_df = pd.concat([selected_df.reset_index(drop=True), interests_df.reset_index(drop=True)], axis=1)

In [35]:
# Combine salary and age for similarity calculation
#user_df_unique['combined_features'] = user_df_unique['salary_user_normalized'] + user_df_unique['age_numeric_normalized']
#user_df_unique['combined_features'] = user_df_unique['salary_user_normalized']
#combined_features = np.hstack((user_df_unique[['salary_user_normalized']], user_df_unique[['age_numeric_normalized']]))

# 不要なカラムを削除
_ = selected_columns.pop(0)
selected_df = selected_df.drop(columns=selected_columns)
selected_df

Unnamed: 0,user_id,age_numeric,age_numeric_normalized,salary_user_normalized
0,1,23.5,0.000000,0.515879
18,2,47.0,0.824561,0.432223
36,3,32.0,0.298246,0.199070
42,4,37.0,0.473684,0.496514
57,5,42.0,0.649123,0.391170
...,...,...,...,...
108507,9995,37.0,0.473684,0.807126
108515,9996,47.0,0.824561,0.750581
108522,9997,27.5,0.140351,0.493416
108540,9998,23.5,0.000000,0.334624


In [39]:
# DataFrameのすべてのカラムをnp.hstackで結合
# combined_features = np.hstack([selected_df[col].values.reshape(-1, 1) for col in selected_df.columns])
combined_features = np.hstack((selected_df[['salary_user_normalized']], selected_df[['age_numeric_normalized']]))

# Calculate the similarity matrix using combined features
#combined_similarity_matrix = cosine_similarity(user_df_unique[['combined_features']])
combined_similarity_matrix = cosine_similarity(combined_features)

# Convert to DataFrame for easier handling
combined_similarity_df = pd.DataFrame(combined_similarity_matrix, index=user_df_unique['user_id'], columns=user_df_unique['user_id'])
combined_similarity_df.head(2)

user_id,1,2,3,4,5,6,7,8,9,10,...,9990,9991,9992,9993,9994,9995,9996,9997,9998,9999
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.464268,0.555164,0.723546,0.51614,0.965616,1.0,0.978757,0.808512,0.966565,...,0.974381,0.838712,0.797734,0.841202,0.874695,0.862446,0.673154,0.961845,1.0,0.627124
2,0.464268,1.0,0.994414,0.947293,0.998229,0.678563,0.464268,0.635993,0.89658,0.675855,...,0.651572,0.871715,0.904445,0.869457,0.835366,0.8487,0.967497,0.688875,0.464268,0.981038


In [40]:
target_user_id=1
num_recommendations=5

def recommend_similar_users_combined(target_user_id, num_recommendations=5):
    # Get the gender of the target user
    target_user_gender = user_df_unique.loc[user_df_unique['user_id'] == target_user_id, 'gender_user'].values[0]

    # Determine the opposite gender
    opposite_gender = '男性' if target_user_gender == '女性' else '女性'

    # Filter the similarity matrix to include only users of the opposite gender
    opposite_gender_users = user_df_unique[user_df_unique['gender_user'] == opposite_gender]['user_id']
    #similarity_scores = combined_similarity_df.loc[target_user_id, opposite_gender_users]
    similarity_scores = combined_similarity_df[target_user_id][opposite_gender_users]

    # Sort the users based on similarity scores in descending order
    similar_users_opposite_gender = similarity_scores.sort_values(ascending=False)

    # Get the top N similar users
    top_similar_users_opposite_gender = similar_users_opposite_gender.head(num_recommendations).index.tolist()

    return top_similar_users_opposite_gender

# Example: Recommend similar users of the opposite gender for a specific user_id (e.g., user_id=1)
recommended_users_combined = recommend_similar_users_combined(target_user_id=target_user_id, num_recommendations=num_recommendations)

# Extract the profiles of the recommended users
recommended_profiles = user_df_unique[user_df_unique['user_id'].isin(recommended_users_combined)]
recommended_profiles

Unnamed: 0,user_id,user_name_user,nickname_user,gender_user,location_user,age_range_user,height_range_user,body_type_user,personality_user,appearance_user,job_user,blood_type_user,car_user,interests_user,salary_user,plan_user,account_creation_timestamp_user
36433,3340,原田美香,リコ,女性,鳥取県大山町,22-25,175-179,細身,明るい,おしゃれ系,大学生,A型,無し,"技術・プログラミング, 旅行・ドライブ",4970000,free,2024-02-19 22:26:14
36462,3344,木村美優,アヤノ,女性,鹿児島県日置市,22-25,160-164,普通,明るい,スポーツ系,大学生,A型,無し,特になし,3500000,option1,2024-05-28 19:47:24
41129,3784,加藤葵,ハナ,女性,長崎県島原市,22-25,160-164,普通,元気,さわやか系,家事手伝い,B型,無し,"婚活, ペット・動物, その他",5030000,free,2024-02-12 19:01:26
64390,5905,原田愛子,アヤ,女性,和歌山県岩出市,22-25,165-169,スリム,かわいい,セクシー系,会社員,AB型,無し,遊び友達,7210000,free,2024-02-07 23:51:13
64752,5940,佐藤亜美,カエデ,女性,石川県七尾市,22-25,165-169,スリム,面白い,知的系,会社員,O型,有り,"音楽・映画, スポーツ・アウトドア",8910000,option2,2024-02-02 05:54:38
