## Text Classification
- Create a custom classifier with textblob

In [1]:
!pip install pandas pyarrow -q

In [2]:
import pandas as pd
# dataset
# hf: https://huggingface.co/datasets/nickmuchi/financial-classification
df = pd.read_parquet('financial_classification.parquet', columns=['text', 'labels'])
df.head(5)

Unnamed: 0,text,labels
0,Finnish airline Finnair is starting the tempor...,0
1,The corresponding increase in the share capita...,1
2,In the third quarter of fiscal 2008 Efore swun...,0
3,"ALEXANDRIA , Va. , Oct. 15 -- Aaron Moss of Ha...",1
4,Vaisala Oyj Stock exchange release 26.03.2010 ...,1


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4551 entries, 0 to 4550
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    4551 non-null   object
 1   labels  4551 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 71.2+ KB


In [4]:
import string
df['text'] = df['text'].apply(lambda x:''.join([i for i in x if i not in string.punctuation])) # remove punctuation

In [5]:
df = df[['text','labels']]

In [6]:
df[:5]

Unnamed: 0,text,labels
0,Finnish airline Finnair is starting the tempor...,0
1,The corresponding increase in the share capita...,1
2,In the third quarter of fiscal 2008 Efore swun...,0
3,ALEXANDRIA Va Oct 15 Aaron Moss of Hampshir...,1
4,Vaisala Oyj Stock exchange release 26032010 kl...,1


In [7]:
# Shuffle dataframe
df = df.sample(frac=1)
ratio = 0.80
 
total_rows = df.shape[0]
train_size = int(total_rows*ratio)
 
# Split data into test and train
train = df[0:train_size]
test = df[train_size:]

print('train:', train.shape)
print('test: ', test.shape)

train: (3640, 2)
test:  (911, 2)


In [8]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3640 entries, 1001 to 1357
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    3640 non-null   object
 1   labels  3640 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 85.3+ KB


In [10]:
train['text'][:5]

1001    According to the notification  the holdings of...
2607    Mr Mikko Saavalainen  head of Comptel s Global...
2248    Sponda Plc s Annual General Meeting decided on...
4536    Okmetic s products are based on hightech exper...
3582    Finland s leading metals group Outokumpu said ...
Name: text, dtype: object

In [12]:
# formatting data  
# fitting to textblob classifier
data_to_train = [(row['text'], row['labels']) for i, row in train.iterrows()]
data_to_train[:3]

[('According to the notification  the holdings of Ameriprice Inc and its group companies are now in total 808973 shares  which represent 3582  of Tekla \x93 s shares and voting rights ',
  1),
 ('Mr Mikko Saavalainen  head of Comptel s Global Sales concludes   Gibtelecom provides a perfect illustration of the variety of business  technical and regulatory challenges operators are facing in their OSS today ',
  1),
 ('Sponda Plc s Annual General Meeting decided on 23 March 2005 to establish a Shareholders  Nomination Committee to prepare proposals for the Annual General Meeting in 2006 on the composition of the Board of Directors and their remuneration ',
  1)]

In [13]:
from textblob.classifiers import NaiveBayesClassifier

In [14]:
fin_cl = NaiveBayesClassifier(data_to_train)

In [15]:
fin_cl.classify('At present , the trade mark Pit-Produkt is little-known outside the North-West of Russia .')

1

In [16]:
fin_cl.classify('Operating profit of the Asian plants grew markedly .')

2

In [17]:
# list out labels
fin_cl.labels()

[1, 2, 0]

In [19]:
# test data
data_to_test = [(row['text'], row['labels']) for i, row in test.iterrows()]
data_to_test[:3]

[('The total value of the agreement is USD4 0 m  the company said ', 1),
 ('The effect of the savings will be noticeable as of the beginning of 2010 ',
  2),
 ('One of the opportunities would be to bring all Baltic meat processing to Rakvere  like processing was concentrated at the Vantaa meatpacking unit in Finland  the paper wrote ',
  1)]

In [20]:
# check probabilty distribution
prob_fin = fin_cl.prob_classify([t for (t,l) in data_to_test])
prob_fin

<ProbDist with 3 samples>

In [21]:
prob_fin.max()

1

In [22]:
round(prob_fin.logprob('no compensation for its news opinions or distributions'), 4)

-1e+300

In [23]:
fin_cl.accuracy(data_to_test)

0.7058177826564215

In [24]:
# features info
fin_cl.show_informative_features()

Most Informative Features
          contains(down) = True                0 : 2      =     82.5 : 1.0
          contains(fell) = True                0 : 2      =     46.0 : 1.0
     contains(decreased) = True                0 : 1      =     43.8 : 1.0
           contains(won) = True                2 : 1      =     36.5 : 1.0
          contains(rose) = True                2 : 1      =     31.9 : 1.0
          contains(fall) = True                0 : 1      =     31.7 : 1.0
   contains(temporarily) = True                0 : 1      =     31.7 : 1.0
      contains(compared) = True                0 : 1      =     30.5 : 1.0
          contains(grew) = True                2 : 1      =     29.6 : 1.0
       contains(dropped) = True                0 : 1      =     28.7 : 1.0


#### Use trained classifier in textblob

In [25]:
from textblob import TextBlob
sample_text = TextBlob('After the transaction 30.65 % of the shares of Alpha farmacy goes to 31.12 % .', classifier=fin_cl)
sample_text.classify()

1

In [31]:
sample_text2 = TextBlob('The validation of the savings will be noted by the mid july', classifier=fin_cl)
sample_text2.classify()

1

In [29]:
extracted = fin_cl.extract_features(sample_text)
list(extracted.items())[:4]

[('contains(reference)', False),
 ('contains(validation)', False),
 ('contains(midJuly)', False),
 ('contains(seventh)', False)]