In [1]:
from pathlib import Path
import os
from src.modelling import MeanPoolingTextClassifier, TransformerTextClassifier
from src.preprocessing import TextPreprocessor
import pytorch_lightning as pl

import pandas as pd

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
base_path = Path(os.getcwd()).parent
base_path

PosixPath('/workspace')

In [3]:
df = (
    pd.read_csv(base_path / "data" / "2025_data_to_explore.csv")
    .filter(['id', 'company_description', 'source', 'is_edited', 'created_at'])
    .dropna()
    .astype(
        {
            "id": "int64",
            "is_edited": "float64"
        }
    )
    .assign(created_at=lambda d: pd.to_datetime(d["created_at"]))
)
df.head()

Unnamed: 0,id,company_description,source,is_edited,created_at
0,370341,Cegid is a leading provider of cloud business ...,LinkedIn - Reported,0.0,2023-08-05 12:22:59
1,537105,Page 1 Printers is a service oriented printing...,LinkedIn - Reported,0.0,2023-10-31 02:53:58
2,581168,Unifrog is a whole-school careers and destinat...,LinkedIn - Reported,0.0,2023-11-01 12:34:34
3,367853,Créée au lendemain de la seconde guerre mondia...,LinkedIn - Reported,0.0,2023-08-05 09:37:34
4,353476,‘Niche Engineering From Concept To Supply’\ \ ...,LinkedIn - Reported,0.0,2023-08-04 14:30:31


In [4]:
import pandas as pd
from langdetect import detect, DetectorFactory

# Ensure consistent results
DetectorFactory.seed = 0

# Function to detect language safely
def detect_language(text):
    try:
        return detect(text)
    except:
        return "unknown"

# Apply to the column
df['language'] = df['company_description'].apply(detect_language)

print(df)

          id                                company_description  \
0     370341  Cegid is a leading provider of cloud business ...   
1     537105  Page 1 Printers is a service oriented printing...   
2     581168  Unifrog is a whole-school careers and destinat...   
3     367853  Créée au lendemain de la seconde guerre mondia...   
4     353476  ‘Niche Engineering From Concept To Supply’\ \ ...   
...      ...                                                ...   
1095  641607  OnRamp is dynamic customer onboarding software...   
1096  422103  Spaulding Clinical CRO specializes in Phase I ...   
1097  366705  L’AVENIR DES ENFANTS SE JOUE MAINTENANT !\ Cha...   
1098  572679  LUM Transition crée en 2014 est la première en...   
1099  681017  Patent Boutique European and UK Patent Attorne...   

                   source  is_edited          created_at language  
0     LinkedIn - Reported        0.0 2023-08-05 12:22:59       en  
1     LinkedIn - Reported        0.0 2023-10-31 02:53:58   

In [5]:
df_filtered = df[df.language == "en"]
df_filtered.head()

Unnamed: 0,id,company_description,source,is_edited,created_at,language
0,370341,Cegid is a leading provider of cloud business ...,LinkedIn - Reported,0.0,2023-08-05 12:22:59,en
1,537105,Page 1 Printers is a service oriented printing...,LinkedIn - Reported,0.0,2023-10-31 02:53:58,en
2,581168,Unifrog is a whole-school careers and destinat...,LinkedIn - Reported,0.0,2023-11-01 12:34:34,en
4,353476,‘Niche Engineering From Concept To Supply’\ \ ...,LinkedIn - Reported,0.0,2023-08-04 14:30:31,en
5,420991,Control your data. Collect your money. Explora...,website,0.0,2023-10-23 18:55:50,en


In [6]:
# Initialize preprocessor
preprocessor = TextPreprocessor(use_spell_correction=False)

# Optional: build bigram model on all tokenized descriptions
token_lists = [preprocessor.tokenize(preprocessor.clean_text(desc)) for desc in df_filtered['company_description']]
preprocessor.build_bigrams(token_lists)

# Apply preprocessing
df_filtered = df_filtered.copy()
df_filtered['clean'] = df_filtered['company_description'].apply(preprocessor.preprocess)
print(df_filtered['clean'])

0       cegid lead provider cloud business management ...
1       page <NUM> printer service orient print compan...
2       unifrog whole school career destination platfo...
4       niche engineering concept supply found <NUM> b...
5       control datum collect money exploration group ...
                              ...                        
1093    ft pipeline system manufacture coat line steel...
1094    found <NUM> datakwip mission change way world ...
1095    onramp dynamic customer onboarding software he...
1096    spaulde clinical cro specializes phase clinica...
1099    patent boutique european uk patent attorney pr...
Name: clean, Length: 938, dtype: object


In [7]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df_filtered['target'] = le.fit_transform(df_filtered['source'])

In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    df_filtered['clean'], df_filtered['target'], test_size=0.2, random_state=42, stratify=df_filtered['target']
)

In [9]:
from src.preprocessing import encode, create_vocab

vocab = create_vocab(X_train)
vocab_size = len(vocab) + 1

In [None]:
map(lambda x: encode(x, vocab), X_train)

TypeError: 'map' object is not subscriptable

In [None]:
from torch.utils.data import Dataset, DataLoader
import torch

class CompanyDataset(Dataset):
    def __init__(self, documents, labels, max_len=50):
        self.documents = documents
        self.labels = labels
        self.max_len = max_len

    def __len__(self):
        return len(self.documents)

    def __getitem__(self, idx):
        ids = self.documents[idx]
        return {
            'input_ids': torch.tensor(ids, dtype=torch.long),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

train_dataset = CompanyDataset(X_train.tolist(), y_train.tolist())
val_dataset = CompanyDataset(X_test.tolist(), y_test.tolist())
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)

In [None]:
n_classes = len(le.classes_)

trainer = pl.Trainer(
    max_epochs=10,
    accelerator='cpu',
    devices=1  # must be an int > 0 for CPU
)

💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [None]:
# model = MeanPoolingTextClassifier(
#     vocab_size=vocab_size,
#     embed_dim=100,
#     n_classes=n_classes
# )
model = TransformerTextClassifier(
    vocab_size=vocab_size,
    embed_dim=100,
    n_classes=n_classes
)
trainer.fit(model, train_loader, val_loader)


  | Name      | Type      | Params | Mode 
------------------------------------------------
0 | embedding | Embedding | 520 K  | train
1 | fc1       | Linear    | 12.9 K | train
2 | fc2       | Linear    | 258    | train
------------------------------------------------
534 K     Trainable params
0         Non-trainable params
534 K     Total params
2.136     Total estimated model params size (MB)
3         Modules in train mode
0         Modules in eval mode


Epoch 9: 100%|██████████| 24/24 [00:00<00:00, 87.67it/s, v_num=5, val_loss=0.504, val_acc=0.819] 

`Trainer.fit` stopped: `max_epochs=10` reached.


Epoch 9: 100%|██████████| 24/24 [00:00<00:00, 80.10it/s, v_num=5, val_loss=0.504, val_acc=0.819]
