# Creating model with embedding-based feature extraction approach using subword embedding technique for OOV cases

Instead of omitting words that do not appear in our word embeddings, this time we are going to create an embedding representation for each word as a sum of the representation of individual characters.

## Loading and Exploring dataset

In [1]:
pip install wget fasttext

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting wget
  Downloading wget-3.2.zip (10 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting fasttext
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.8/68.8 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.10.4-py3-none-any.whl (222 kB)
Building wheels for collected packages: wget, fasttext
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9657 sha256=df14d8a89eb72ea822ce3e44ef155e88bc93a55fe006ecc3e9d37dfe5d43a434
  Stored in directory: /root/.cache/pip/wheels/8b/f1/7f/5c94f0a7a505ca1c81cd1d9208ae2064675d97582078e6c769
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: f

In [2]:
import os
import pandas as pd
import wget
import tarfile
import requests

SEED=42

In [3]:
url = "https://github.com/pyk/dbpedia_csv/raw/master/dbpedia_csv.tar.gz"
filename = "dbpedia_csv.tar.gz"
save_folder = "data"

# Create the folder if it doesn't exist
if not os.path.exists(save_folder):
    os.makedirs(save_folder)

save_path = os.path.join(save_folder, filename)

response = requests.get(url)
with open(save_path, "wb") as file:
    file.write(response.content)

print("File downloaded successfully and saved in the 'data' folder.")

File downloaded successfully and saved in the 'data' folder.


In [4]:
# untaring the required file
tar = tarfile.open(save_path, "r:gz")
tar.extractall(save_folder)     
tar.close()

In [5]:
# loading train data
train_file_path=os.path.join(save_folder,'dbpedia_csv/train.csv')
df = pd.read_csv(train_file_path, header=None, names=['class', 'name', 'description'])

# loading test data
test_file_path=os.path.join(save_folder, 'dbpedia_csv/test.csv')
df_test = pd.read_csv(test_file_path, header=None, names=['class','name','description'])

print("Train:{} Test:{}".format(df.shape,df_test.shape))

Train:(560000, 3) Test:(70000, 3)


In [6]:
# The dataset only include class numeric values, and we do not have any reference about its meanings
# Then, we are going to map those class numbers with class names (real labels not present in dataset)

class_dict={
  1:'Company',
  2:'EducationalInstitution',
  3:'Artist',
  4:'Athlete',
  5:'OfficeHolder',
  6:'MeanOfTransportation',
  7:'Building',
  8:'NaturalPlace',
  9:'Village',
  10:'Animal',
  11:'Plant',
  12:'Album',
  13:'Film',
  14:'WrittenWork'
}

# Mapping the classes
df['class_name'] = df['class'].map(class_dict)

In [7]:
df[:5]

Unnamed: 0,class,name,description,class_name
0,1,E. D. Abbott Ltd,Abbott of Farnham E D Abbott Limited was a Br...,Company
1,1,Schwan-Stabilo,Schwan-STABILO is a German maker of pens for ...,Company
2,1,Q-workshop,Q-workshop is a Polish company located in Poz...,Company
3,1,Marvell Software Solutions Israel,Marvell Software Solutions Israel known as RA...,Company
4,1,Bergan Mercy Medical Center,Bergan Mercy Medical Center is a hospital loc...,Company


In [8]:
df['class_name'].unique()

array(['Company', 'EducationalInstitution', 'Artist', 'Athlete',
       'OfficeHolder', 'MeanOfTransportation', 'Building', 'NaturalPlace',
       'Village', 'Animal', 'Plant', 'Album', 'Film', 'WrittenWork'],
      dtype=object)

In [9]:
df['class_name'].value_counts()

Company                   40000
EducationalInstitution    40000
Artist                    40000
Athlete                   40000
OfficeHolder              40000
MeanOfTransportation      40000
Building                  40000
NaturalPlace              40000
Village                   40000
Animal                    40000
Plant                     40000
Album                     40000
Film                      40000
WrittenWork               40000
Name: class_name, dtype: int64

In [10]:
# exploring description cells
print(df[:5]['description'])
print('\n')
df[:5]['description'][0]

0     Abbott of Farnham E D Abbott Limited was a Br...
1     Schwan-STABILO is a German maker of pens for ...
2     Q-workshop is a Polish company located in Poz...
3     Marvell Software Solutions Israel known as RA...
4     Bergan Mercy Medical Center is a hospital loc...
Name: description, dtype: object




' Abbott of Farnham E D Abbott Limited was a British coachbuilding business based in Farnham Surrey trading under that name from 1929. A major part of their output was under sub-contract to motor vehicle manufacturers. Their business closed in 1972.'

In [11]:
# perform data cleaning

def clean_text(text, normalize=True):
  # the aim of this replacement chain is to isolate special characters from other words,
  # so they can be processed as independent entities
  s = str(text).replace(',',' ').replace('"','').replace('\'',' \' ').\
              replace('.',' . ').replace('(',' ( ').replace(')',' ) ').\
              replace('!',' ! ').replace('?',' ? ').replace(':',' ').\
              replace(';',' ').lower()
  if normalize:
    # NFKD normalize the special characters in the text by separating then into 
    # their base forms. For instance: é = e ´. the encode input parameters are used 
    # to silently ignore any unicode character that cannot be represented. The last part
    # converts the encoded string into a Unicode string by using utf-8 decoder. 
    s = s.normalize("NFKD").str.encode('ascii', 'ignore').str.decode('utf-8')
  return s

def clean_dataset(data, clean=False, shuffle=False, normalize=False, label_prefix='__class__'):
  # deep copy is needed, otherwise copy() will just create a shallow copy referencing to the orginal df
  df=data[['name', 'description']].copy(deep=True)
  df['class'] = label_prefix + data['class'].astype(str) + ' '

  if clean:
    df['name']=df['name'].apply(lambda x: clean_text(x, normalize=normalize))
    df['description']=df['description'].apply(lambda x: clean_text(x, normalize))

  if shuffle:
    df.sample(frac=1).reset_index(drop=True)

  return df

In [12]:
%%time
# pre process the dataset with the functions defined above
df_train_cleaned=clean_dataset(df, clean=True, shuffle=True)
df_test_cleaned=clean_dataset(df_test, clean=True, shuffle=True)

CPU times: user 4.25 s, sys: 255 ms, total: 4.51 s
Wall time: 4.63 s


In [13]:
df[:2]

Unnamed: 0,class,name,description,class_name
0,1,E. D. Abbott Ltd,Abbott of Farnham E D Abbott Limited was a Br...,Company
1,1,Schwan-Stabilo,Schwan-STABILO is a German maker of pens for ...,Company


In [14]:
df_train_cleaned[:2]

Unnamed: 0,name,description,class
0,e . d . abbott ltd,abbott of farnham e d abbott limited was a br...,__class__1
1,schwan-stabilo,schwan-stabilo is a german maker of pens for ...,__class__1


In [15]:
from requests.api import head
# fastText classifier needs to read the data from a file stored in the disk
train_file=save_folder + '/dbpedia_train.csv'
df_train_cleaned.to_csv(train_file, header=None, index=False, columns=['class', 'name', 'description'])

test_file=save_folder + '/dbpedia_test.csv'
df_test_cleaned.to_csv(test_file, header=None, index=False, columns=['class', 'name', 'description'])

In [16]:
# using fastText for feature extraction and training
%%time
from fasttext import train_supervised 

# Input args: training file, model name, label prefix (__class__)
model=train_supervised(input=train_file, label="__class__", lr=1.0, epoch=75, loss='ova', wordNgrams=2, dim=200, thread=2, verbose=100)


CPU times: user 1h 1min 23s, sys: 19.9 s, total: 1h 1min 43s
Wall time: 35min 33s


In [17]:
for k in range(1,6):
# Precision at k measures the proportion of correct labels among the top k 
# predicted labels. It evaluates how well the model's predictions match the true 
# labels.

# Recall at k measures the proportion of relevant labels that are successfully
# predicted among the top k predicted labels. It evaluates how well the model 
# captures all the relevant labels.
  results=model.test(test_file, k=k)
  print(f"Test Samples: {results[0]} Precision@{k} : {results[1]*100:2.4f} Recall@{k} : {results[2]*100:2.4f}")


Test Samples: 70000 Precision@1 : 92.4271 Recall@1 : 92.4271
Test Samples: 70000 Precision@2 : 47.9421 Recall@2 : 95.8843
Test Samples: 70000 Precision@3 : 32.3548 Recall@3 : 97.0643
Test Samples: 70000 Precision@4 : 24.3971 Recall@4 : 97.5886
Test Samples: 70000 Precision@5 : 19.5494 Recall@5 : 97.7471
