# Install

In [None]:
%pip install openai==0.28.0 

In [None]:
!pip install s3fs


In [None]:
pip install xgboost

In [None]:
import pandas as pd

s3_path = 's3://ml-translation-input-dev/non_english_sample_100k.csv'

df = pd.read_csv(s3_path)

df.head()


In [None]:
df.shape

# Processing Embeddings in Batches

### Overview
- **Model**: Generates embeddings using OpenAI's `text-embedding-ada-002` model.
- **Batching**: Processes data in batches for efficiency.
- **Output**: Saves embeddings to an S3 bucket.
- **Progress Tracking**: Utilizes a checkpoint file to allow resuming if interrupted.


In [None]:
import openai
import pandas as pd
import numpy as np
import time
from tqdm import tqdm
import json
import s3fs
import os


openai.api_key = ""

batch_size = 1000
EMBEDDING_DIM = 1536
ZERO_EMBEDDING = [0.0] * EMBEDDING_DIM

def create_embeddings_batch(texts, max_retries=3):
    embeddings = []
    non_empty_indices = [i for i, text in enumerate(texts) if isinstance(text, str) and text.strip()]
    non_empty_texts = [text for text in texts if isinstance(text, str) and text.strip()]
    for attempt in range(max_retries):
        try:
            if non_empty_texts:
                response = openai.Embedding.create(
                    input=non_empty_texts,
                    model="text-embedding-ada-002"
                )
                batch_embeddings = [item['embedding'] for item in response['data']]
                if len(batch_embeddings) != len(non_empty_texts):
                    batch_embeddings = [ZERO_EMBEDDING] * len(non_empty_texts)
                break
        except Exception as e:
            print(f"Error en el intento {attempt + 1}: {e}")
            time.sleep(2)
    else:
        batch_embeddings = [ZERO_EMBEDDING] * len(non_empty_texts)

    full_embeddings = [ZERO_EMBEDDING] * len(texts)
    for idx, emb in zip(non_empty_indices, batch_embeddings):
        if len(emb) == EMBEDDING_DIM:
            full_embeddings[idx] = emb
        else:
            full_embeddings[idx] = ZERO_EMBEDDING
    return full_embeddings

fs = s3fs.S3FileSystem()

with fs.open(input_s3_path, 'r') as f:
    df = pd.read_csv(f)

if 'embedding' not in df.columns:
    df['embedding'] = None

if fs.exists(checkpoint_file):
    with fs.open(checkpoint_file, 'r') as f_chk:
        processed_batches = json.load(f_chk)
else:
    processed_batches = []
total_batches = len(df) // batch_size + 1

for i in tqdm(range(0, len(df), batch_size), desc="Processing Batchs"):
    if i in processed_batches:
        continue  

    batch_df = df.iloc[i:i + batch_size]
    batch_texts = batch_df['JOB_TITLE'].astype(str).tolist() 
    batch_embeddings = create_embeddings_batch(batch_texts)

    batch_df['embedding'] = [json.dumps(emb) for emb in batch_embeddings]

    batch_number = i // batch_size
    batch_output_path = os.path.join(output_s3_dir, f'embeddings_batch_{batch_number}.csv')
    with fs.open(batch_output_path, 'w') as f_out:
        batch_df[['JOB_TITLE', 'embedding']].to_csv(f_out, index=False)

    processed_batches.append(i)
    with fs.open(checkpoint_file, 'w') as f_chk:
        json.dump(processed_batches, f_chk)
    time.sleep(1)

print("Processing of embeddings completed.")

## Unifying Batches into a Single CSV

### Overview
- **Consolidation**: Combines all batch files with embeddings into one unified CSV file.
- **Validation**: Verifies the validity of embeddings during the process.
- **Local Merge**: Merges data locally before final upload.
- **Output**: Uploads the consolidated file to an S3 bucket.


In [None]:
import pandas as pd
import s3fs
import json
from tqdm import tqdm
import os

fs = s3fs.S3FileSystem()

output_s3_dir = 's3://ml-translation-input-dev/embeddings_batches/'
final_output_s3_path = 's3://ml-translation-input-dev/job_titles_with_embeddings.csv'
local_final_path = '/tmp/job_titles_with_embeddings.csv'
def verificar_embedding(embedding_str, dim=1536):
    try:
        embedding = json.loads(embedding_str)
        return isinstance(embedding, list) and len(embedding) == dim
    except:
        return False

batch_files = fs.ls(output_s3_dir)

header_written = False

with open(local_final_path, 'w', encoding='utf-8') as f_final:
    for file in tqdm(batch_files, desc="Unificando Batches"):
        if file.endswith('.csv'):
            with fs.open(file, 'r') as f_batch:
                df_batch = pd.read_csv(f_batch)
            
            df_batch['valid_embedding'] = df_batch['embedding'].apply(verificar_embedding)
            
            if not df_batch['valid_embedding'].all():
                num_incorrectos = (~df_batch['valid_embedding']).sum()
                print(f"bath {file} has {num_incorrectos} embeddings wrong.")
            
            df_batch.drop(columns=['valid_embedding'], inplace=True)
            
            df_batch.to_csv(f_final, index=False, header=not header_written, mode='w' if not header_written else 'a', encoding='utf-8')
            
            if not header_written:
                header_written = True

with open(local_final_path, 'rb') as f_final:
    fs.put(local_final_path, final_output_s3_path)

print("done.")



## Verifying Correct Length of Embeddings

### Overview
- **Validation**: Ensures that all embeddings have the expected dimensionality.
- **Error Handling**: Identifies and logs any embeddings with incorrect lengths.


In [6]:
import pandas as pd
import s3fs
import json

fs = s3fs.S3FileSystem()

final_output_s3_path = 's3://ml-translation-input-dev/job_titles_with_embeddings.csv'

chunksize = 10000 

def verificar_embedding(embedding_str, dim=1536):
    try:
        embedding = json.loads(embedding_str)
        return isinstance(embedding, list) and len(embedding) == dim
    except:
        return False

total_embeddings = 0
correct_embeddings = 0
incorrect_embeddings = 0

for chunk in pd.read_csv(fs.open(final_output_s3_path, 'r'), chunksize=chunksize):
chunk = pd.read_csv('data_job_title_industry_embeddings.csv')
chunk['valid_embedding'] = chunk['embedding'].apply(verificar_embedding)
correct = chunk['valid_embedding'].sum()
incorrect = (~chunk['valid_embedding']).sum()
total_embeddings += len(chunk)
correct_embeddings += correct
incorrect_embeddings += incorrect

print(f"Total embeddings processed: {total_embeddings}")
print(f"Embeddings corrects: {correct_embeddings}")
print(f"Embeddings incorrects: {incorrect_embeddings}")


Total embeddings processed: 1000
Embeddings corrects: 1000
Embeddings incorrects: 0


## Adjusting Cluster Model

### Overview
- **Purpose**: Incrementally trains a MiniBatchKMeans clustering model on job title embeddings.
- **Process**:
  - Reads embeddings in chunks from an S3 file.
  - Converts and processes embedding data into a numerical matrix.
  - Updates the clustering model iteratively using `partial_fit`.
- **Output**:
  - Saves the trained clustering model locally.
  - Uploads the model file to an S3 bucket for future use.


In [None]:


fs = s3fs.S3FileSystem()

final_output_s3_path = 's3://ml-translation-input-dev/job_titles_with_embeddings.csv'

chunksize = 5000 

num_clusters = 1000
kmeans = MiniBatchKMeans(n_clusters=num_clusters, random_state=42, batch_size=1000)

print("Adjusting the clustering model with embeddings...")
for chunk in tqdm(pd.read_csv(fs.open(final_output_s3_path, 'r'), chunksize=chunksize)):
    embeddings = chunk['embedding'].apply(lambda x: json.loads(x) if isinstance(x, str) else [0.0]*1536)
    embeddings_matrix = np.vstack(embeddings.values).astype(np.float32)
    kmeans.partial_fit(embeddings_matrix)

model_path = 's3://ml-translation-input-dev/minibatch_kmeans_model.pkl'
with fs.open(model_path, 'wb') as f_model:
    joblib.dump(kmeans, f_model)

print("Clustering model adjusted and stored in S3.")



## Assign Clusters and Sampling

### Overview
- **Purpose**: Assigns cluster labels to each embedding and performs sampling from the clustered data.
- **Process**:
  - Loads the trained MiniBatchKMeans model from S3.
  - Reads embeddings in chunks, assigns cluster labels using the model, and appends results.
  - Samples a specified number of data points (up to 30) from each cluster.
- **Output**:
  - Saves a sampled dataset with job titles, embeddings, and cluster labels.
  - Uploads the final sampled file to an S3 bucket.


In [None]:


fs = s3fs.S3FileSystem()

final_output_s3_path = 's3://ml-translation-input-dev/job_titles_with_embeddings.csv'
model_path = 's3://ml-translation-input-dev/minibatch_kmeans_model.pkl'
s3_output_path = 's3://ml-translation-input-dev/job_titles_embeddings_cluster_15000.csv'

chunksize = 5000  

with fs.open(model_path, 'rb') as f_model:
    kmeans = joblib.load(f_model)

print("Assigning clusters to each embedding...")
sampled_data = []

for chunk in tqdm(pd.read_csv(fs.open(final_output_s3_path, 'r'), chunksize=chunksize)):
    embeddings = chunk['embedding'].apply(lambda x: json.loads(x) if isinstance(x, str) else [0.0]*1536)
    embeddings_matrix = np.vstack(embeddings.values).astype(np.float32)
    
    clusters = kmeans.predict(embeddings_matrix)
    chunk['Cluster'] = clusters
    
    sampled_data.append(chunk)

df_with_clusters = pd.concat(sampled_data, ignore_index=True)

print("Sampling from each cluster...")
samples_per_cluster = 30
sampled_df = df_with_clusters.groupby('Cluster').apply(
    lambda x: x.sample(n=samples_per_cluster, random_state=42) if len(x) >= samples_per_cluster else x
).reset_index(drop=True)

sampled_df = sampled_df[['JOB_TITLE', 'embedding', 'Cluster']].head(15000)

with fs.open(s3_output_path, 'w') as f:
    sampled_df.to_csv(f, index=False)

print("Clustering and sampling process completed. File saved in S3.")


In [None]:
df_with_clusters.shape

## Final Verification of Sampled Embeddings

### Overview
- **Purpose**: Ensures the integrity of the sampled embeddings by verifying their dimensions.
- **Process**:
  - Reads the sampled dataset from S3.
  - Calculates the length of each embedding to check for consistency.
  - Outputs the frequency distribution of embedding lengths.
- **Output**:
  - Displays a summary of embedding length frequencies for validation.


In [None]:

fs = s3fs.S3FileSystem()

sampled_s3_path = 's3://ml-translation-input-dev/job_titles_sampled10000.csv' 

df_sampled = pd.read_csv(fs.open(sampled_s3_path, 'r'))

df_sampled['embedding_length'] = df_sampled['embedding'].apply(lambda x: len(json.loads(x)) if isinstance(x, str) else 0)
length_frequencies = df_sampled['embedding_length'].value_counts()
print(length_frequencies)


## Conclusion

### Summary
- Successfully processed job title data to generate embeddings, cluster them, and sample from the clustered data.
- Utilized OpenAI's `text-embedding-ada-002` model for embedding generation.
- Applied MiniBatchKMeans for clustering and verified the quality of embeddings throughout the process.
- Produced a final sampled dataset with cluster assignments, ensuring data integrity.

### Outcomes
- Generated embeddings stored in S3.
- Trained and saved a clustering model for future use.
- Created a balanced, sampled dataset for downstream applications.

### Next Steps
- Analyze the clusters for insights or patterns.
- Leverage the sampled dataset for model training or other analyses.
- Refine the process further if additional optimizations are needed.
