In [3]:
!pip install sklearn_crfsuite

Collecting sklearn_crfsuite
  Downloading sklearn_crfsuite-0.3.6-py2.py3-none-any.whl (12 kB)
Collecting python-crfsuite>=0.8.3
  Downloading python_crfsuite-0.9.8-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (965 kB)
[K     |████████████████████████████████| 965 kB 767 kB/s eta 0:00:01
[?25hInstalling collected packages: python-crfsuite, sklearn-crfsuite
Successfully installed python-crfsuite-0.9.8 sklearn-crfsuite-0.3.6


In [4]:
import pandas as pd

from sklearn_crfsuite import CRF
from sklearn_crfsuite import metrics

In [5]:
def _token_features(prefix, token):
    """Return the four lexical features of ``token``, with keys namespaced by ``prefix``."""
    return {
        prefix + '.lower()': token.lower(),
        prefix + '.isupper()': token.isupper(),
        prefix + '.istitle()': token.istitle(),
        prefix + '.isdigit()': token.isdigit(),
    }


def extract_sentence_features(df):
    """Yield one feature dict per token of a single sentence.

    Args:
        df: DataFrame holding the rows of one sentence, in token order, with a
            ``'Token'`` column of strings. Rows are taken positionally, so the
            frame's index labels do not matter.

    Yields:
        dict: for every token, a ``'bias'`` term plus the lower-cased form and
        the ``isupper``/``istitle``/``isdigit`` flags of the token itself
        (``token.*``), of its left neighbour (``previous_token.*``) and of its
        right neighbour (``posterior_token.*``). The first token carries
        ``'BOS': True`` instead of the left-neighbour features, and the last
        token carries ``'EOS': True`` instead of the right-neighbour features.
    """
    # Read the column once; looking tokens up through df.iloc[i]['Token'] builds
    # a full row Series for every access (up to three per token).
    tokens = df['Token'].tolist()
    last_position = len(tokens) - 1

    for i, token in enumerate(tokens):
        features = {'bias': 1.0}
        features.update(_token_features('token', token))

        if i > 0:
            features.update(_token_features('previous_token', tokens[i - 1]))
        else:
            features['BOS'] = True

        if i < last_position:
            features.update(_token_features('posterior_token', tokens[i + 1]))
        else:
            features['EOS'] = True

        yield features

In [19]:
def prepare(df, include_y=False):
    """Convert a token-level frame into per-sentence CRF inputs.

    Args:
        df: DataFrame with one row per token and the columns
            ``'Sentence_Index'`` and ``'Token'`` (plus ``'Tag'`` when
            ``include_y`` is true).
        include_y: when true, also collect the tag sequence of each sentence.

    Returns:
        ``X`` -- a list with one entry per sentence, each a list of per-token
        feature dicts -- or the tuple ``(X, y)`` when ``include_y`` is true,
        where ``y`` is the parallel list of per-sentence tag lists. Sentences
        appear in ascending ``Sentence_Index`` order (groupby sorts its keys
        by default).
    """
    X, y = [], []
    # Group by a scalar key rather than a one-element list: the list form
    # triggers a FutureWarning on pandas 1.5 and yields tuple keys on 2.x.
    for _, sentence_df in df.groupby('Sentence_Index'):
        X.append(list(extract_sentence_features(sentence_df)))
        if include_y:
            # Store plain lists of labels instead of pandas Series so the
            # labels do not carry the frame's row index along with them.
            y.append(sentence_df['Tag'].tolist())
    return (X, y) if include_y else X

In [8]:
# Read tokens and tags strictly as text. With the default NA handling, literal
# tokens such as "NA", "null" or "nan" would be parsed as missing values, and
# dtype inference could leave non-string objects in the Token column.
train_df = pd.read_csv(
    '/kaggle/input/crftrain200/training.csv',
    dtype={'Token': str, 'Tag': str},
    keep_default_na=False,
)
validation_df = pd.read_csv(
    '/kaggle/input/crftrain200/dev.csv',
    dtype={'Token': str, 'Tag': str},
    keep_default_na=False,
)


In [9]:
train_df.head()

Unnamed: 0,Sentence_Index,Token,Tag
0,0,তার,O
1,0,মৃত্যুর,O
2,0,দশ,O
3,0,দিন,O
4,0,"পর,",O


In [10]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 191897 entries, 0 to 191896
Data columns (total 3 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   Sentence_Index  191897 non-null  int64 
 1   Token           191897 non-null  object
 2   Tag             191897 non-null  object
dtypes: int64(1), object(2)
memory usage: 4.4+ MB


In [13]:
validation_df.head()

Unnamed: 0,Sentence_Index,Token,Tag
0,0,তিনি,O
1,0,যুবক,O
2,0,হিসেবে,O
3,0,শেফিল্ড,B-GRP
4,0,বুধবার,I-GRP


In [14]:
validation_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10333 entries, 0 to 10332
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Sentence_Index  10333 non-null  int64 
 1   Token           10333 non-null  object
 2   Tag             10333 non-null  object
dtypes: int64(1), object(2)
memory usage: 242.3+ KB


In [15]:
# Replace missing tokens with empty strings so the str methods used during
# feature extraction never receive NaN. The result is assigned back to the
# column: calling fillna(inplace=True) on a column selection is chained
# assignment, which warns on pandas 2.x and no longer updates the frame once
# Copy-on-Write is enabled.
train_df['Token'] = train_df['Token'].fillna('')
validation_df['Token'] = validation_df['Token'].fillna('')


In [16]:
train_df.shape

(191897, 3)

In [17]:
validation_df.shape

(10333, 3)

In [20]:
# Build per-sentence feature sequences and tag sequences for both splits.
X_train, y_train = prepare(df=train_df, include_y=True)
X_validation, y_validation = prepare(df=validation_df, include_y=True)


In [21]:
len(X_train)

15300

In [22]:
len(y_train)

15300

In [23]:
len(X_validation)

800

In [25]:
len(y_validation)

800

In [26]:
# Hyperparameters of the CRF, kept apart from its construction so they are
# easy to tweak in one place.
crf_params = {
    'algorithm': 'lbfgs',              # training algorithm
    'c1': 0.1,                         # L1 regularization coefficient
    'c2': 0.1,                         # L2 regularization coefficient
    'max_iterations': 100,             # cap on optimizer iterations
    'all_possible_transitions': True,  # also model tag pairs unseen in training
}
crf = CRF(**crf_params)

In [27]:
# Train the CRF on the per-sentence training features and tags.
crf.fit(X=X_train, y=y_train)



CRF(algorithm='lbfgs', all_possible_transitions=True, c1=0.1, c2=0.1,
    keep_tempfiles=None, max_iterations=100)

In [28]:
# Predict one tag sequence per validation sentence.
y_pred = crf.predict(X=X_validation)

In [29]:
# Token-level precision/recall/F1 per tag on the validation split.
# Note: the majority 'O' tag is included, so accuracy and the weighted
# averages are dominated by it.
validation_report = metrics.flat_classification_report(y_validation, y_pred)
print(validation_report)

              precision    recall  f1-score   support

      B-CORP       0.84      0.54      0.65       127
        B-CW       0.65      0.29      0.40       120
       B-GRP       0.75      0.65      0.70       118
       B-LOC       0.68      0.60      0.64       101
       B-PER       0.79      0.72      0.76       144
      B-PROD       0.79      0.52      0.62       190
      I-CORP       0.84      0.70      0.77       122
        I-CW       0.73      0.35      0.47       161
       I-GRP       0.80      0.76      0.78       226
       I-LOC       0.75      0.59      0.66        61
       I-PER       0.82      0.76      0.79       180
      I-PROD       0.86      0.54      0.67       129
           O       0.95      0.99      0.97      8654

    accuracy                           0.93     10333
   macro avg       0.79      0.62      0.68     10333
weighted avg       0.92      0.93      0.92     10333



