In [2]:
%pip install -q langchain_huggingface

Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'c:\Users\FPL\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip' command.


### Load text data

In [21]:
from langchain_core.documents import Document

texts = [
    "사과", "바나나", "오이", "옥수수", "레몬", "시금치", "배추"
]

texts_en = ["Apple", "Banana", "Cucumber", "Corn", "Lemon", "Spinach", "Cabbage"]

all_documents = [Document(page_content=x, metadata={'idx':i}) for i, x in enumerate(texts)]

all_documents_en = [Document(page_content=x, metadata={'idx':i}) for i, x in enumerate(texts_en)]

### Embedding model

In [5]:
from langchain_huggingface.embeddings import HuggingFaceEmbeddings

model_name = "intfloat/multilingual-e5-large-instruct"
# model_name = "intfloat/multilingual-e5-large"

hf_embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs={"device": "cpu"},  # cuda, cpu
    encode_kwargs={"normalize_embeddings": True},
)

In [8]:
hf_embeddings_ko = HuggingFaceEmbeddings(model_name="jhgan/ko-sroberta-multitask")



In [6]:
%time
# Document
embedded_documents1 = hf_embeddings.embed_documents(texts)
embedded_documents1

CPU times: total: 0 ns
Wall time: 0 ns


[[0.05119206756353378,
  0.011308941058814526,
  0.012826292775571346,
  -0.03129682317376137,
  0.024386808276176453,
  -0.03183961659669876,
  -0.041372448205947876,
  0.007222211919724941,
  0.04447771608829498,
  -0.023085804656147957,
  0.02879766933619976,
  0.010783975012600422,
  -0.013323920778930187,
  -0.0284623634070158,
  -0.046949226409196854,
  -0.021683238446712494,
  -0.025736132636666298,
  0.0019590319134294987,
  0.0007779198931530118,
  -0.011368496343493462,
  0.026559391990303993,
  0.0023800362832844257,
  -0.006590370554476976,
  -0.050234075635671616,
  -0.030206551775336266,
  -0.003619119990617037,
  -0.050067950040102005,
  -0.01569143310189247,
  -0.018718471750617027,
  -0.026770103722810745,
  0.02732286974787712,
  0.007506833877414465,
  -0.03852766752243042,
  -0.040268849581480026,
  -0.012338870204985142,
  0.023780295625329018,
  0.0235087051987648,
  0.029794204980134964,
  -0.01239998172968626,
  0.06362660229206085,
  -0.015463152900338173,
  0.

In [24]:
# Document
embedded_documents2 = hf_embeddings_ko.embed_documents(texts)
embedded_documents2

embedded_documents_en = hf_embeddings_ko.embed_documents(texts_en)


### Vectorstore

In [27]:
from langchain_community.vectorstores import FAISS
vectorstore = FAISS.from_documents(documents=all_documents, embedding=hf_embeddings_ko)
retriever = vectorstore.as_retriever()

In [25]:
from langchain_community.vectorstores import FAISS
vectorstore_en = FAISS.from_documents(documents=all_documents_en, embedding=hf_embeddings_ko)
retriever_en = vectorstore_en.as_retriever()

In [32]:
res_l2 = vectorstore.similarity_search_with_score('과일')

print(res_l2[0][0])
print(res_l2[0][1])

page_content='사과' metadata={'idx': 0}
85.54238


In [29]:
vectorstore_en.similarity_search_with_score('fruits')

[(Document(metadata={'idx': 2}, page_content='Cucumber'), 148.49947),
 (Document(metadata={'idx': 5}, page_content='Spinach'), 153.03789),
 (Document(metadata={'idx': 4}, page_content='Lemon'), 165.45874),
 (Document(metadata={'idx': 6}, page_content='Cabbage'), 173.9815)]

### PCA를 이용한 차원 축소

In [2]:
from sklearn.decomposition import PCA
import numpy as np

# 데이터 준비 (예: X는 고차원 데이터)
X = np.random.rand(100, 10)  # 100개의 10차원 데이터 예시

# PCA 모델 생성 및 적용
pca = PCA(n_components=3)
X_pca = pca.fit_transform(X)


In [13]:
X

array([[0.69857087, 0.83053181, 0.33661175, 0.5120801 , 0.69667805,
        0.07255907, 0.50370729, 0.04418376, 0.79152127, 0.43647321],
       [0.90555095, 0.86278346, 0.48876774, 0.70191413, 0.78731672,
        0.48182255, 0.53212186, 0.39988813, 0.70787548, 0.45419509],
       [0.66381014, 0.52386129, 0.37034822, 0.10448069, 0.40499923,
        0.16643385, 0.30509287, 0.16405242, 0.99644611, 0.05050746],
       [0.04167535, 0.196921  , 0.99489824, 0.19915537, 0.71025979,
        0.04952797, 0.6491397 , 0.89819845, 0.95168726, 0.96599611],
       [0.26808423, 0.3140957 , 0.61867394, 0.68249926, 0.52016039,
        0.90909582, 0.7788962 , 0.10313274, 0.39008957, 0.96816954],
       [0.456245  , 0.69294284, 0.67836486, 0.52690823, 0.10540833,
        0.5901577 , 0.18622027, 0.2102965 , 0.26253006, 0.52321447],
       [0.7184501 , 0.93481994, 0.06531642, 0.96003777, 0.43780427,
        0.76001433, 0.40813123, 0.44882448, 0.66019723, 0.96189028],
       [0.13238321, 0.60789449, 0.1827201

In [7]:
X_pca

array([[ 4.67978920e-01,  1.87101640e-03, -4.47134334e-01],
       [ 9.36153260e-02, -2.62588680e-01, -9.16295667e-02],
       [ 3.94374113e-01,  1.70844530e-01, -5.69848000e-01],
       [ 5.89709607e-01,  6.17985441e-02,  7.53737992e-01],
       [-2.26299350e-04,  1.95861489e-01,  2.71074379e-01],
       [-1.56685628e-01, -1.63860181e-01, -3.63931722e-01],
       [-1.08376946e-01, -4.74595930e-01, -7.80179281e-02],
       [ 4.88805866e-01,  7.75497416e-01, -1.53194553e-01],
       [-6.72927601e-01,  3.62082135e-01,  2.15867455e-01],
       [ 2.15309552e-01, -5.20016776e-02,  1.53374223e-01],
       [ 4.27791053e-01,  4.84470585e-01,  2.35312892e-01],
       [ 4.61134396e-01,  2.92079786e-01,  2.82792779e-02],
       [-1.67276643e-01,  3.68986816e-01,  2.79888404e-01],
       [ 2.48916203e-01, -7.32632974e-02, -3.07240353e-01],
       [ 4.92827075e-03, -3.67121167e-01, -5.58275639e-01],
       [ 1.93291767e-01,  1.76269664e-01,  1.13294435e-02],
       [ 4.70127619e-01,  6.43997714e-02

### Plotly 3D 시각화

In [37]:
%pip install plotly nbformat>=4.2.0

Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'c:\Users\FPL\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip' command.


In [6]:
import plotly.express as px
import pandas as pd

# 데이터프레임 생성
df = pd.DataFrame(X_pca, columns=['PC1', 'PC2', 'PC3'])
df['idx'] = [x for x in range(len(X_pca))]

# 3D 산점도 생성
fig = px.scatter_3d(df, x='PC1', y='PC2', z='PC3'
                    ,title='3D PCA Visualization'
                    ,symbol='idx')
fig.show()


In [9]:
df.values

array([[ 4.67978920e-01,  1.87101640e-03, -4.47134334e-01,
         0.00000000e+00],
       [ 9.36153260e-02, -2.62588680e-01, -9.16295667e-02,
         1.00000000e+00],
       [ 3.94374113e-01,  1.70844530e-01, -5.69848000e-01,
         2.00000000e+00],
       [ 5.89709607e-01,  6.17985441e-02,  7.53737992e-01,
         3.00000000e+00],
       [-2.26299350e-04,  1.95861489e-01,  2.71074379e-01,
         4.00000000e+00],
       [-1.56685628e-01, -1.63860181e-01, -3.63931722e-01,
         5.00000000e+00],
       [-1.08376946e-01, -4.74595930e-01, -7.80179281e-02,
         6.00000000e+00],
       [ 4.88805866e-01,  7.75497416e-01, -1.53194553e-01,
         7.00000000e+00],
       [-6.72927601e-01,  3.62082135e-01,  2.15867455e-01,
         8.00000000e+00],
       [ 2.15309552e-01, -5.20016776e-02,  1.53374223e-01,
         9.00000000e+00],
       [ 4.27791053e-01,  4.84470585e-01,  2.35312892e-01,
         1.00000000e+01],
       [ 4.61134396e-01,  2.92079786e-01,  2.82792779e-02,
      

In [10]:
from sklearn.decomposition import PCA
import numpy as np

# PCA 모델 생성 및 적용
pca = PCA(n_components=2)
X_pca = pca.fit_transform(df.values)


In [12]:
df_pca = pd.DataFrame(X_pca, columns=['PCA1', 'PCA2'])
df_pca

Unnamed: 0,PCA1,PCA2
0,-49.501713,0.294975
1,-48.499659,-0.013725
2,-47.501946,0.197713
3,-46.500511,0.507463
4,-45.499618,-0.126603
...,...,...
95,45.501430,-0.501374
96,46.500279,0.301621
97,47.501282,-0.228862
98,48.501594,-0.608374
