In [1]:
from transformers import AutoTokenizer, AutoModel
from datasets import load_dataset
import torch
import json
import re
import wikipedia
import pandas as pd
from tqdm import tqdm
import torch.nn.functional as F

device = 'cuda:0'

### Build data

In [2]:
data = pd.read_csv('articles_films.csv', sep=',', on_bad_lines='skip').title.tolist()

In [4]:
for i,d in tqdm(enumerate(data)):
    try:
        text = wikipedia.page(d).content.replace('\n', ' ')
        text = re.sub(' +', ' ', text)
        with open('text_Film_theory.txt', 'a') as file:
            file.write(text + '\n')
    except:
        with open('fails.txt', 'a') as file:
            file.write(str(i) + '\n')




  lis = BeautifulSoup(html).find_all('li')
3406it [43:58,  1.29it/s]


### Build embeddings

In [2]:
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2', device_map=device)

In [3]:
with open('text_Fields_of_mathematics_Mathematical_concepts_Mathematical_theorems.txt', 'r') as file:
    data = file.read()

In [4]:
all_col = torch.zeros(13961, 384)

In [5]:
chunk_size = 1000
overlap = 300

df = pd.DataFrame(columns=['text'] + [f'dim_{d}' for d in range(384)])
i = overlap
row = 0
k = 0
while i-overlap+chunk_size < len(data):
    chunk = data[i-overlap: i-overlap+chunk_size]

    encoded_input = tokenizer(chunk, truncation=True, return_tensors='pt').to(device)

    with torch.no_grad():
        model_output = model(**encoded_input)

    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
    sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)

    #df.loc[row] = [chunk] + sentence_embeddings.squeeze().cpu().tolist()

    all_col[k, :] = sentence_embeddings.squeeze().cpu()

    i += chunk_size
    row += 1
    k += 1

    print(i/len(data), row)

9.311405117533928e-05 1
0.00016474024438713873 2
0.00023636643759893816 3
0.0003079926308107376 4
0.000379618824022537 5
0.0004512450172343365 6
0.0005228712104461359 7
0.0005944974036579353 8
0.0006661235968697348 9
0.0007377497900815343 10
0.0008093759832933337 11
0.0008810021765051331 12
0.0009526283697169326 13
0.001024254562928732 14
0.0010958807561405315 15
0.001167506949352331 16
0.0012391331425641303 17
0.0013107593357759297 18
0.0013823855289877291 19
0.0014540117221995288 20
0.0015256379154113282 21
0.0015972641086231276 22
0.001668890301834927 23
0.0017405164950467264 24
0.0018121426882585258 25
0.0018837688814703254 26
0.0019553950746821246 27
0.002027021267893924 28
0.002098647461105724 29
0.0021702736543175232 30
0.0022418998475293227 31
0.002313526040741122 32
0.0023851522339529215 33
0.002456778427164721 34
0.0025284046203765203 35
0.0026000308135883197 36
0.002671657006800119 37
0.0027432832000119185 38
0.002814909393223718 39
0.0028865355864355173 40
0.002958161779647

In [6]:
for i in [127, 223, 319, 200]:
    print(f'mean_col_{i}=', all_col[:, i].mean().item())
    print(f'max_col_{i}=', all_col[:, i].max().item())
    print(f'std_col_{i}=', all_col[:, i].std().item())

print(f'mean_col_all=', all_col.mean().item())
print(f'max_col_all=', all_col.max().item())
print(f'std_col_all=', all_col.std().item())

mean_col_127= 4.9279375395599335e-33
max_col_127= 1.632949732388289e-32
std_col_127= 2.342806709949875e-33
mean_col_223= -5.2916751646386895e-33
max_col_223= 3.5809363589007665e-33
std_col_223= 1.8979636480842274e-33
mean_col_319= -5.2013259477234897e-08
max_col_319= -2.2664535137550956e-08
std_col_319= 6.7676584336595624e-09
mean_col_200= -0.0026264037005603313
max_col_200= 0.18293367326259613
std_col_200= 0.047588448971509933
mean_col_all= 0.0003472251701168716
max_col_all= 0.26208481192588806
std_col_all= 0.051029860973358154


In [20]:
print('mean: ', torch.tensor(mean) / len(data))
print('max: ', torch.tensor(maximum))

mean:  tensor([ 4.9278e-36, -5.2915e-36, -5.2012e-11, -2.6263e-06,  3.4722e-07])
max:  tensor([ 6.8799e-29, -7.3877e-29, -7.2616e-04, -3.6667e+01])


In [8]:
df.to_csv('embeddings_films.csv')

### to .txt

In [23]:
data = pd.read_csv('embeddings_maths.csv', usecols=[i for i in range(2, 386)])
data

Unnamed: 0,dim_0,dim_1,dim_2,dim_3,dim_4,dim_5,dim_6,dim_7,dim_8,dim_9,...,dim_374,dim_375,dim_376,dim_377,dim_378,dim_379,dim_380,dim_381,dim_382,dim_383
0,-0.051007,-0.012978,-0.004332,-0.011717,-0.017226,0.099527,0.046106,-0.002129,0.096107,0.092475,...,0.011735,0.050349,0.032294,-0.066350,0.010584,0.002566,-0.028208,0.046703,-0.013030,-0.037327
1,-0.124250,-0.026011,-0.018328,-0.008907,-0.012531,0.081328,0.048991,0.071464,0.052862,0.035286,...,0.014260,0.005386,0.029491,-0.113279,-0.078890,-0.044626,-0.066180,0.043084,0.005310,-0.023522
2,0.009710,-0.006745,0.026349,0.013770,-0.030031,0.093438,0.053371,0.022237,0.092181,0.145582,...,0.015877,0.037046,0.044612,-0.011689,0.017332,0.035664,-0.012586,0.068374,-0.049236,-0.009235
3,-0.007143,-0.026577,-0.001916,-0.003471,-0.006410,0.044439,0.045105,0.018795,0.065954,0.075059,...,0.004710,0.035422,0.066500,0.093002,0.037736,0.091792,-0.050400,0.031015,0.012145,-0.026191
4,-0.002157,-0.041576,0.032375,-0.013392,-0.013451,0.026802,0.040362,-0.001480,0.084631,0.032822,...,0.008679,0.062008,0.085315,-0.046990,0.055014,-0.004234,-0.084269,0.010322,-0.044174,-0.078543
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13956,-0.091739,-0.020830,0.000524,-0.056864,0.035235,-0.025001,0.001402,0.119695,0.092264,-0.005016,...,0.001604,-0.045562,0.009117,-0.064556,0.080322,0.061112,-0.021374,0.057491,-0.010599,-0.081102
13957,-0.128038,-0.015816,0.020297,0.031980,0.039334,0.010209,0.084143,0.097361,0.117580,-0.002334,...,-0.051010,0.057692,0.064038,-0.012729,-0.067720,0.033084,-0.017994,0.070195,0.011426,-0.015970
13958,-0.074109,-0.098621,0.058788,-0.000985,0.013306,0.068014,0.118665,0.046229,0.063646,-0.010651,...,-0.051419,0.058549,0.039828,-0.018397,-0.046813,0.004026,0.058502,-0.022283,-0.040176,0.019134
13959,-0.062486,-0.024779,0.041091,-0.021990,0.025752,0.019201,0.073873,0.054727,0.058572,-0.032722,...,-0.039516,-0.035085,-0.006046,0.010948,-0.037108,-0.050757,0.056676,-0.067603,-0.020334,-0.022592


In [24]:
data.to_csv('embeddings_maths.txt', sep=' ', float_format='%.6f', header=False, index=False)

In [9]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import AutoModelForCausalLM, AutoTokenizer
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_core.documents.base import Document
from langchain.vectorstores import FAISS
from gnews import GNews
from tqdm import tqdm
import torch
import json

device = 'cuda:0'

In [10]:
embedding_model = "sentence-transformers/all-MiniLM-l6-v2"

embeddings = HuggingFaceEmbeddings(
                            model_name=embedding_model,
                            model_kwargs={'device': device},
                            )

05/06/2024 09:56:19 AM - Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-l6-v2


In [11]:
emb = embeddings.embed_query("Why was Sam Altman fired from OpenAI in November 2023?")