In [56]:
### Task 1 - prototyping

In [52]:
import csv
import ast
import time
from datetime import datetime

# Helper to print with timestamp
def log(message):
    print(f"[{datetime.now().strftime('%H:%M:%S')}] {message}")

# Load tags and keywords
def load_tags(path):
    log(f"Loading tags from '{path}'...")
    tags = []
    with open(path, newline='', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            try:
                keyword_list = ast.literal_eval(row['keywords'])
                tags.append({
                    'id': row['id'],
                    'name': row['name'],
                    'keywords': set(kw.lower() for kw in keyword_list)  # set for faster lookup
                })
            except Exception as e:
                log(f"Error parsing row {row['id']}: {e}")
    log(f"Loaded {len(tags)} tags.")
    return tags

# Load sentences from file
def load_sentences(path):
    log(f"Loading sentences from '{path}'...")
    with open(path, 'r', encoding='utf-8') as f:
        sentences = [line.strip() for line in f if line.strip()]
    log(f"Loaded {len(sentences)} sentences.")
    return sentences

# Match tags to sentences
def tag_sentences(sentences, tags):
    log("Tagging sentences...")
    start_time = time.time()
    results = []
    for i, sentence in enumerate(sentences, start=1):
        sentence_lower = sentence.lower()
        matched = [
            tag['name'] for tag in tags
            if any(keyword in sentence_lower for keyword in tag['keywords'])
        ]
        results.append({'sentence': sentence, 'tags': ', '.join(matched)})
        if i % 100 == 0:
            log(f"Processed {i} sentences...")
    log(f"Tagging completed. Total: {len(results)}. Time: {time.time() - start_time:.2f} seconds.")
    return results

# Save to TSV
def save_results(results, output_path):
    log(f"Saving results to '{output_path}'...")
    with_tag = 0
    without_tag = 0
    
    for result in results:
        if result['tags']:  # if tags string is not empty
            with_tag += 1
        else:
            without_tag += 1
    
    log(f"Sentences with tags: {with_tag}")
    log(f"Sentences without tags: {without_tag}")
    log(f"Total sentences processed: {len(results)}")
    
    with open(output_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=['sentence', 'tags'], delimiter='\t')
        writer.writerows(results)
    log("Results saved successfully.")

# Main pipeline
def main():
    start = time.time()
    tags = load_tags('data/tags.csv')
    sentences = load_sentences('data/sentences.txt')
    results = tag_sentences(sentences, tags)
    save_results(results, 'output/task_1_output.tsv')
    print(results[:10])
    log(f"Finished entire process in {time.time() - start:.2f} seconds.")

if __name__ == '__main__':
    main()


[10:11:25] Loading tags from 'data/tags.csv'...
[10:11:25] Loaded 27 tags.
[10:11:25] Loading sentences from 'data/sentences.txt'...
[10:11:25] Loaded 2997 sentences.
[10:11:25] Tagging sentences...
[10:11:25] Processed 100 sentences...
[10:11:25] Processed 200 sentences...
[10:11:25] Processed 300 sentences...
[10:11:25] Processed 400 sentences...
[10:11:25] Processed 500 sentences...
[10:11:25] Processed 600 sentences...
[10:11:25] Processed 700 sentences...
[10:11:25] Processed 800 sentences...
[10:11:25] Processed 900 sentences...
[10:11:25] Processed 1000 sentences...
[10:11:25] Processed 1100 sentences...
[10:11:25] Processed 1200 sentences...
[10:11:25] Processed 1300 sentences...
[10:11:25] Processed 1400 sentences...
[10:11:25] Processed 1500 sentences...
[10:11:25] Processed 1600 sentences...
[10:11:25] Processed 1700 sentences...
[10:11:25] Processed 1800 sentences...
[10:11:25] Processed 1900 sentences...
[10:11:25] Processed 2000 sentences...
[10:11:25] Processed 2100 sent

In [None]:
# TASK 2 - Prototyping - Not satisfied with solution !!!

In [49]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from sentence_transformers import SentenceTransformer
import numpy as np
from typing import List
import os

def load_data(tags_path: str, sentences_path: str) -> (pd.DataFrame, List[str]):
    """Loads tag data and test sentences from file."""
    try:
        task_df = pd.read_csv(tags_path)
        task_df['keywords'] = task_df['keywords'].apply(eval)
    except Exception as e:
        raise FileNotFoundError(f"Error loading tags: {e}")

    try:
        with open(sentences_path, 'r', encoding='utf-8') as f:
            sentences = [line.strip() for line in f if line.strip()]
    except Exception as e:
        raise FileNotFoundError(f"Error loading sentences: {e}")

    return task_df, sentences

def build_training_data(task_df: pd.DataFrame) -> pd.DataFrame:
    """Builds training dataframe from keywords and tags."""
    pairs = [(kw.lower(), row['name']) for _, row in task_df.iterrows() for kw in row['keywords']]
    train_df = pd.DataFrame(pairs, columns=['sentence', 'tag'])
    return train_df.groupby('sentence')['tag'].apply(list).reset_index()

def encode_labels(tags: pd.Series) -> (MultiLabelBinarizer, np.ndarray):
    """Encodes multilabel tags."""
    mlb = MultiLabelBinarizer()
    y = mlb.fit_transform(tags)
    return mlb, y

def generate_embeddings(model_name: str, sentences: List[str]) -> np.ndarray:
    """Generates sentence embeddings using a transformer model."""
    print(f"üîÑ Generating embeddings using model: {model_name}...")
    model = SentenceTransformer(model_name)
    embeddings = model.encode(sentences, show_progress_bar=True)
    return embeddings

def train_model(X: np.ndarray, y: np.ndarray) -> OneVsRestClassifier:
    """Trains a One-vs-Rest Random Forest classifier."""
    print("üß† Training classifier...")
    clf = OneVsRestClassifier(RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42))
    clf.fit(X, y)
    return clf

def predict_tags(clf, X_test: np.ndarray, mlb: MultiLabelBinarizer, threshold: float) -> List[List[str]]:
    """Predicts tags for test data using probability thresholding."""
    print(f"üîç Predicting tags with threshold: {threshold}")
    y_pred_prob = clf.predict_proba(X_test)
    y_pred = (y_pred_prob >= threshold).astype(int)
    return mlb.inverse_transform(y_pred)

def save_predictions(sentences: List[str], predictions: List[List[str]], output_path: str):
    """Saves predictions to a TSV file."""
    output_lines = []
    predicted_count = 0

    for sentence, tags in zip(sentences, predictions):
        line = f"{sentence}\t{', '.join(tags)}" if tags else f"{sentence}\t"
        output_lines.append(line)
        if tags:
            predicted_count += 1

    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(output_lines))

    print("\nüîç Preview of first 10 predictions:")
    for line in output_lines[:10]:
        print(line)

    print(f"\n‚úÖ Total sentences with predictions: {predicted_count}")
    print(f"üíæ Predictions saved to: {output_path}")

def main(threshold: float = 0.2):
    tags_path = 'data/tags.csv'
    sentences_path = 'data/sentences.txt'
    output_path = 'output/task_2_output.tsv'
    model_name = 'paraphrase-MiniLM-L6-v2'

    task_df, sentences = load_data(tags_path, sentences_path)
    grouped_train = build_training_data(task_df)
    
    mlb, y_train = encode_labels(grouped_train['tag'])
    X_train = generate_embeddings(model_name, grouped_train['sentence'].tolist())
    X_test = generate_embeddings(model_name, sentences)
    
    clf = train_model(X_train, y_train)
    predicted_tags = predict_tags(clf, X_test, mlb, threshold)
    
    save_predictions(sentences, predicted_tags, output_path)

if __name__ == "__main__":
    main(threshold=0.5)  # Adjust threshold here if needed


üîÑ Generating embeddings using model: paraphrase-MiniLM-L6-v2...


Batches:   0%|          | 0/19 [00:00<?, ?it/s]

üîÑ Generating embeddings using model: paraphrase-MiniLM-L6-v2...


Batches:   0%|          | 0/94 [00:00<?, ?it/s]

üß† Training classifier...
üîç Predicting tags with threshold: 0.5

üîç Preview of first 10 predictions:
Get a loan	
testing	
Pay off car	
Hi there! I am in the process of switching banks because my Husband and I are joining accounts. We are both switching to a join account through SoFi. How should I go about transferring my current checking and savings account?	
bank phone number	
Hi there! I need some help with‚Ä¶login	
i put a application¬† in for a checking account on your website and was wondering the staus on that	
Hi, I just had a question on what would be the best course of action for a large withdraw or payment. I am getting a new car in a few weeks and plan on making a large down payment. Would something like that be best to write a cashier‚Äôs check or can I simply have them take it out of checking? I‚Äôll of course have to move the amount from savings to checking first. Thanks!	
Hi there! I need some help with‚Ä¶ my debit card I forgot the pin	
I‚Äôm not seeing my accoun

In [None]:
# TASK 2. Prototyping

In [51]:
import pandas as pd
import numpy as np
import os
from typing import List, Tuple
from sklearn.preprocessing import MultiLabelBinarizer
from sentence_transformers import SentenceTransformer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

def load_data(tags_path: str, sentences_path: str) -> Tuple[pd.DataFrame, List[str]]:
    task_df = pd.read_csv(tags_path)
    task_df['keywords'] = task_df['keywords'].apply(eval)
    with open(sentences_path, 'r', encoding='utf-8') as f:
        sentences = [line.strip() for line in f if line.strip()]
    return task_df, sentences

def build_training_data(task_df: pd.DataFrame) -> pd.DataFrame:
    pairs = [(kw.lower(), row['name']) for _, row in task_df.iterrows() for kw in row['keywords']]
    train_df = pd.DataFrame(pairs, columns=['sentence', 'tag'])
    return train_df.groupby('sentence')['tag'].apply(list).reset_index()

def encode_labels(tags: pd.Series) -> Tuple[MultiLabelBinarizer, np.ndarray]:
    mlb = MultiLabelBinarizer()
    y = mlb.fit_transform(tags)
    return mlb, y

def generate_embeddings(model_name: str, sentences: List[str]) -> np.ndarray:
    print(f"üîÑ Generating embeddings with: {model_name}...")
    model = SentenceTransformer(model_name)
    return model.encode(sentences, show_progress_bar=True)

def build_nn_model(input_dim: int, output_dim: int) -> Sequential:
    model = Sequential([
        Dense(512, activation='relu', input_shape=(input_dim,)),
        Dropout(0.3),
        Dense(256, activation='relu'),
        Dropout(0.2),
        Dense(output_dim, activation='sigmoid')  # sigmoid for multilabel
    ])
    model.compile(optimizer=Adam(learning_rate=1e-4),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

def train_model_nn(X: np.ndarray, y: np.ndarray) -> Sequential:
    model = build_nn_model(X.shape[1], y.shape[1])
    print("üß† Training neural network...")
    model.fit(X, y, epochs=50, batch_size=32, validation_split=0.1,
              callbacks=[EarlyStopping(monitor='val_loss', patience=5)], verbose=1)
    return model

def predict_tags_nn(model, X_test: np.ndarray, mlb: MultiLabelBinarizer, threshold: float) -> List[List[str]]:
    print(f"üîç Predicting with threshold {threshold}...")
    y_pred_prob = model.predict(X_test)
    y_pred = (y_pred_prob >= threshold).astype(int)
    return mlb.inverse_transform(y_pred)

def save_predictions(sentences: List[str], predictions: List[List[str]], output_path: str):
    output_lines = []
    predicted_count = 0
    for sentence, tags in zip(sentences, predictions):
        line = f"{sentence}\t{', '.join(tags)}" if tags else f"{sentence}\t"
        output_lines.append(line)
        if tags:
            predicted_count += 1
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(output_lines))
    print("\nüîç Preview of first 10 predictions:")
    for line in output_lines[:10]:
        print(line)
    print(f"\n‚úÖ Total sentences with predictions: {predicted_count}")
    print(f"üíæ Predictions saved to: {output_path}")

def main(threshold: float = 0.2):
    tags_path = 'data/tags.csv'
    sentences_path = 'data/sentences.txt'
    output_path = 'output/task_2_output.tsv'
    model_name = 'paraphrase-MiniLM-L6-v2'

    task_df, sentences = load_data(tags_path, sentences_path)
    grouped_train = build_training_data(task_df)

    mlb, y_train = encode_labels(grouped_train['tag'])
    X_train = generate_embeddings(model_name, grouped_train['sentence'].tolist())
    X_test = generate_embeddings(model_name, sentences)

    model = train_model_nn(X_train, y_train)
    predicted_tags = predict_tags_nn(model, X_test, mlb, threshold)

    save_predictions(sentences, predicted_tags, output_path)

if __name__ == "__main__":
    main(threshold=0.3)  # Lower threshold for broader predictions


üîÑ Generating embeddings with: paraphrase-MiniLM-L6-v2...


Batches:   0%|          | 0/19 [00:00<?, ?it/s]

üîÑ Generating embeddings with: paraphrase-MiniLM-L6-v2...


Batches:   0%|          | 0/94 [00:00<?, ?it/s]

üß† Training neural network...
Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m17/17[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 0.0312 - loss: 0.7012 - val_accuracy: 0.0492 - val_loss: 0.5952
Epoch 2/50
[1m17/17[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.0314 - loss: 0.5686 - val_accuracy: 0.0492 - val_loss: 0.4751
Epoch 3/50
[1m17/17[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.0299 - loss: 0.4453 - val_accuracy: 0.0492 - val_loss: 0.3565
Epoch 4/50
[1m17/17[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.0451 - loss: 0.3286 - val_accuracy: 0.0492 - val_loss: 0.2594
Epoch 5/50
[1m17/17[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.0454 - loss: 0.2464 - val_accuracy: 0.0656 - val_loss: 0.2029
Epoch 6/50