In [8]:
!cd fastText-0.9.2 && ./fasttext


usage: fasttext <command> <args>

The commands supported by fasttext are:

  supervised              train a supervised classifier
  quantize                quantize a model to reduce the memory usage
  test                    evaluate a supervised classifier
  test-label              print labels with precision and recall scores
  predict                 predict most likely labels
  predict-prob            predict most likely labels with probabilities
  skipgram                train a skipgram model
  cbow                    train a cbow model
  print-word-vectors      print word vectors given a trained model
  print-sentence-vectors  print sentence vectors given a trained model
  print-ngrams            print ngrams given a trained model and word
  nn                      query for nearest neighbors
  analogies               query for analogies
  dump                    dump arguments,dictionary,input/output vectors



In [2]:
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import fasttext

[Link to the dataset](https://huggingface.co/datasets/eriktks/conll2003)

In [4]:
ds = load_dataset("tomaarsen/conll2003")

{'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6, 'B-MISC': 7, 'I-MISC': 8}

In [5]:
ds['train'][0]

{'id': '0',
 'document_id': 1,
 'sentence_id': 0,
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}

In [6]:
def extract_ids_tokens_ner_tags(data_dict):
    """
    Extract 'sentence_id', 'tokens', and 'ner_tags' from a dictionary.
    
    Args:
    data_dict (dict): Input dictionary containing various fields including 'id', 'tokens', and 'ner_tags'.
    
    Returns:
    dict: A dictionary with extracted 'sentence_id', 'tokens', and 'ner_tags'.
    """
    return {
        'id': data_dict.get('sentence_id'),
        'tokens': data_dict.get('tokens'),
        'ner_tags': data_dict.get('ner_tags')
    }

In [7]:
def process_dataset(dataset):
    """
    Apply the extract_ids_tokens_ner_tags function to each dictionary in the dataset using the .map() method.
    
    Args:
    dataset (Dataset): A Hugging Face Dataset.
    
    Returns:
    Dataset: A processed Hugging Face Dataset with extracted 'id', 'tokens', and 'ner_tags'.
    """
    return dataset.map(extract_ids_tokens_ner_tags, remove_columns=dataset.column_names)

In [8]:
train_dataset=ds['train']
test_dataset=ds['test']

In [9]:
processed_train_dataset = process_dataset(train_dataset)
processed_test_dataset = process_dataset(test_dataset)

# Show the first processed example
print(processed_train_dataset[0])
print(processed_test_dataset[0])

{'id': 0, 'tokens': ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'], 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}
{'id': 0, 'tokens': ['SOCCER', '-', 'JAPAN', 'GET', 'LUCKY', 'WIN', ',', 'CHINA', 'IN', 'SURPRISE', 'DEFEAT', '.'], 'ner_tags': [0, 0, 5, 0, 0, 0, 0, 1, 0, 0, 0, 0]}


In [26]:
processed_train_dataset.to_csv('train.csv')
processed_train_dataset.to_csv('test.csv')

Creating CSV from Arrow format: 100%|██████████| 15/15 [00:01<00:00, 10.80ba/s]
Creating CSV from Arrow format: 100%|██████████| 15/15 [00:01<00:00, 10.19ba/s]


2036071

In [29]:
def expand_tokens_ner(example):
    tokens = example['tokens']
    ner_tags = example['ner_tags']
    
    # Create a new list of dictionaries with token and corresponding ner_tag
    expanded_data = [{'token': token, 'ner_tag': ner_tag} for token, ner_tag in zip(tokens, ner_tags)]
    
    # Return the expanded data in a format suitable for Dataset
    return {'tokens': [item['token'] for item in expanded_data], 
            'ner_tags': [item['ner_tag'] for item in expanded_data]}

# Apply the map function to expand the dataset
expanded_dataset = processed_train_dataset.map(expand_tokens_ner, batched=False)

Map: 100%|██████████| 14041/14041 [00:01<00:00, 12695.60 examples/s]


In [30]:
expanded_dataset.to_csv('train.csv')

Creating CSV from Arrow format: 100%|██████████| 15/15 [00:01<00:00, 11.70ba/s]


2036071

In [None]:
import pandas as pd

# Load your dataset (assumed to be in CSV format)
data = pd.read_csv("train1.csv")

# Define your multilabel classes
label_columns = ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG',"B-LOC","I-LOC", 'B-MISC']
{'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6, 'B-MISC': 7, 'I-MISC': 8}

# Convert the labels from '0|0|0|...' format to FastText format '__label__toxic __label__obscene'
def convert_labels(row):
    labels = [f"__label__{label}" for label, val in zip(label_columns, row) if val == 1]
    return ' '.join(labels)

# Apply the conversion and create the FastText formatted training data
data['labels'] = data[label_columns].apply(convert_labels, axis=1)

# Prepare the data in FastText format
with open("train.txt", "w") as f:
    for _, row in data.iterrows():
        labels = row['labels']
        text = row['comment_text'].replace("\n", " ")  # Remove newlines from the text
        if labels:
            f.write(f"{labels} {text}\n")


In [None]:
!awk -F"\t" '{print"__label__"$2" "$3}' < train.csv | shuf > all.txt


In [16]:
from fasttext import FastText

In [13]:
import pandas as pd
train_df=pd.read_csv('train.csv')
test_df=pd.read_csv('test.csv')

In [14]:
train_df.head(5)

Unnamed: 0,id,tokens,ner_tags
0,0,['EU' 'rejects' 'German' 'call' 'to' 'boycott'...,[3 0 7 0 0 0 7 0 0]
1,1,['Peter' 'Blackburn'],[1 2]
2,2,['BRUSSELS' '1996-08-22'],[5 0]
3,3,['The' 'European' 'Commission' 'said' 'on' 'Th...,[0 3 4 0 0 0 0 0 0 7 0 0 0 0 0 7 0 0 0 0 0 0 0...
4,4,"['Germany' ""'s"" 'representative' 'to' 'the' 'E...",[5 0 0 0 0 3 4 0 0 0 1 2 0 0 0 0 0 0 0 0 0 0 0...


In [19]:
tokens=[token for token in train_df['tokens']]

In [21]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.3.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting numpy<2.0,>=1.18.5 (from gensim)
  Downloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Collecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
Collecting smart-open>=1.8.1 (from gensim)
  Downloading smart_open-7.0.4-py3-none-any.whl.metadata (23 kB)
Collecting wrapt (from smart-open>=1.8.1->gensim)
  Downloading wrapt-1.16.0-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Downloading gensim-4.3.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.6/26.6 MB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading numpy-1.26.4-cp312-cp312-manylin

In [26]:
! pip install scipy



In [30]:
import scipy
import scipy.stats

ModuleNotFoundError: No module named 'scipy.stats'

In [32]:
# Load pretrained FastText model
fasttext_model = fasttext.load_model('cc.en.300.bin')

ValueError: cc.en.300.bin cannot be opened for loading!

In [33]:
model = fasttext.train_supervised(input="tokens")

ValueError: tokens cannot be opened for training!