In [None]:
import glob
import inspect
import json
import os
import re
import zipfile
from datetime import datetime, timedelta
from io import StringIO
from pathlib import Path

import jquantsapi
import numpy as np
import pandas as pd
import requests
import seaborn as sns
import torch
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

In [None]:
# Load the two workbooks that feed the address matching below: the embedding
# cell reads the `loc` column of the first frame and the `name` column of the
# second one.
lands_df, land_price_df = (
    pd.read_excel(workbook)
    for workbook in ('lands_cleansed.xlsx', 'land_price.xlsx')
)

In [None]:
# Embed both address columns with a Japanese Sentence-BERT model and score
# every (land record, reference address) pair.
#
# The original hard-coded `cuda:1`, which raises on machines without a second
# GPU. Keep that preference when the device exists; otherwise pass `None` so
# sentence-transformers picks an available device itself.
embedding_device = (
    'cuda:1'
    if torch.cuda.is_available() and torch.cuda.device_count() > 1
    else None
)
model = SentenceTransformer(
    'sonoisa/sentence-bert-base-ja-mean-tokens-v2',
    device=embedding_device,
)

# `encode` is documented to take a string or a list of strings. Passing plain
# lists avoids relying on positional label lookups on the Series index.
query_embeddings = model.encode(lands_df['loc'].tolist())
reference_embeddings = model.encode(land_price_df['name'].tolist())

# One row per entry of lands_df['loc'], one column per entry of
# land_price_df['name'].
similarity_matrix = model.similarity(query_embeddings, reference_embeddings)

In [None]:
# For every row of the similarity matrix, pick the reference entry with the
# highest score and collect the matching rows of land_price_df.
#
# Per the sentence-transformers docs `model.similarity` returns a torch.Tensor.
# Convert it to a NumPy array once, so the indices passed to `.iloc` and the
# scores stored in the DataFrame are plain NumPy values rather than tensors.
if hasattr(similarity_matrix, 'detach'):
    similarity_values = similarity_matrix.detach().cpu().numpy()
else:
    similarity_values = np.asarray(similarity_matrix)

best_match_idx = similarity_values.argmax(axis=1)
best_match_score = similarity_values[
    np.arange(similarity_values.shape[0]), best_match_idx
]

# Reset the index so these rows line up positionally with the query rows, and
# rename the reference columns to `matched_*` names.
best_match = (
    land_price_df.iloc[best_match_idx]
    .reset_index(drop=True)
    .rename(columns={'name': 'matched_address', 'price': 'matched_price'})
)
best_match['matched_score'] = best_match_score

In [None]:
# Put each land record next to its best-matching reference row. Concatenation
# along the column axis aligns the two frames on their index.
matched_df = pd.concat(objs=[lands_df, best_match], axis='columns')

In [None]:
# Derive three valuation columns on matched_df:
#   帳簿価額   ("book value")               = price * price_unit
#   推定時価   ("estimated market value")   = matched_price * area * area_unit
#   推定評価益 ("estimated unrealized gain") = 推定時価 - 帳簿価額
# NOTE(review): presumably price_unit / area_unit are scaling multipliers that
# bring both figures to the same currency unit — confirm against the source data.
book_value = matched_df['price'].mul(matched_df['price_unit'])
estimated_market_value = (
    matched_df['matched_price']
    .mul(matched_df['area'])
    .mul(matched_df['area_unit'])
)

matched_df['帳簿価額'] = book_value
matched_df['推定時価'] = estimated_market_value
matched_df['推定評価益'] = estimated_market_value.sub(book_value)

In [None]:
# Save the full matched table (before any score filtering) without the row index.
matched_df.to_excel(excel_writer='land_matched.xlsx', index=False)

In [None]:
# Keep only rows whose best match scored at least MIN_MATCH_SCORE, then total
# the 推定評価益 ("estimated unrealized gain") column per doc_id.
#
# The original grouped an undefined name `df`, which raises NameError on a
# fresh kernel; the aggregation is meant to run on the filtered rows.
MIN_MATCH_SCORE = 0.7  # similarity threshold below which a match is discarded

filtered_df = matched_df[matched_df['matched_score'] >= MIN_MATCH_SCORE]
appreciations = filtered_df.groupby('doc_id')['推定評価益'].sum()

In [None]:
# Load the workbook that is joined with the per-document totals via its
# 書類番号 ("document number") column.
basic_df = pd.read_excel(io='basic_data.xlsx')

In [None]:
# Join the per-doc_id totals onto basic_df, matching basic_df's 書類番号
# ("document number") column against `doc_id`, and export the result without
# the row index.
# NOTE(review): merge defaults to an inner join, so basic_df rows without a
# matching doc_id are dropped — confirm that is intended.
new_df = basic_df.merge(appreciations, left_on='書類番号', right_on='doc_id')
new_df.to_excel(excel_writer='basic_data_and_appreciations.xlsx', index=False)