In [1]:
import pandas as pd

from sklearn_crfsuite import CRF
from sklearn_crfsuite import metrics

In [2]:
def extract_sentence_features(df):
    sentence_length = len(df)

    for i in range(sentence_length):
        token = df.iloc[i]['Token']

        features = {
            'bias': 1.0,
            'token.lower()': token.lower(),
            'token.isupper()': token.isupper(),
            'token.istitle()': token.istitle(),
            'token.isdigit()': token.isdigit()
        }

        if i > 0:
            previous_token = df.iloc[i-1]['Token']
            features.update({
                'previous_token.lower()': previous_token.lower(),
                'previous_token.isupper()': previous_token.isupper(),
                'previous_token.istitle()': previous_token.istitle(),
                'previous_token.isdigit()': previous_token.isdigit()
            })
        else:
            features['BOS'] = True

        if i < sentence_length - 1:
            posterior_token = df.iloc[i+1]['Token']
            features.update({
                'posterior_token.lower()': posterior_token.lower(),
                'posterior_token.isupper()': posterior_token.isupper(),
                'posterior_token.istitle()': posterior_token.istitle(),
                'posterior_token.isdigit()': posterior_token.isdigit()
            })
        else:
            features['EOS'] = True

        yield features

In [3]:
def prepare(df, include_y=False):
    X, y = [], []
    for _, group_df in df.groupby(['Sentence_Index']):
        X.append(list(extract_sentence_features(group_df)))
        if include_y:
            y.append(group_df['Tag'])
    if include_y:
        return X, y
    return X

In [4]:
# train_df = pd.read_csv('/kaggle/input/crftrain200/training.csv')
# validation_df = pd.read_csv('/kaggle/input/crftrain200/dev.csv')
train_df = pd.read_csv('training.csv')
validation_df = pd.read_csv('dev.csv')

In [5]:
train_df.head()

Unnamed: 0,Sentence_Index,Token,Tag
0,0,তার,O
1,0,মৃত্যুর,O
2,0,দশ,O
3,0,দিন,O
4,0,"পর,",O


In [6]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 191897 entries, 0 to 191896
Data columns (total 3 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   Sentence_Index  191897 non-null  int64 
 1   Token           191897 non-null  object
 2   Tag             191897 non-null  object
dtypes: int64(1), object(2)
memory usage: 4.4+ MB


In [7]:
validation_df.head()

Unnamed: 0,Sentence_Index,Token,Tag
0,0,তিনি,O
1,0,যুবক,O
2,0,হিসেবে,O
3,0,শেফিল্ড,B-GRP
4,0,বুধবার,I-GRP


In [8]:
validation_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10333 entries, 0 to 10332
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Sentence_Index  10333 non-null  int64 
 1   Token           10333 non-null  object
 2   Tag             10333 non-null  object
dtypes: int64(1), object(2)
memory usage: 242.3+ KB


In [9]:
train_df['Token'].fillna('', inplace=True)
validation_df['Token'].fillna('', inplace=True)


In [10]:
train_df.shape

(191897, 3)

In [11]:
validation_df.shape

(10333, 3)

In [12]:
X_train, y_train = prepare(train_df, include_y=True)
X_validation, y_validation = prepare(validation_df, include_y=True)


  for _, group_df in df.groupby(['Sentence_Index']):
  for _, group_df in df.groupby(['Sentence_Index']):


In [13]:
len(X_train)

15300

In [14]:
len(y_train)

15300

In [15]:
len(X_validation)

800

In [16]:
len(y_validation)

800

In [17]:
crf = CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)

In [18]:
crf.fit(X_train, y_train)

AttributeError: 'CRF' object has no attribute 'keep_tempfiles'

AttributeError: 'CRF' object has no attribute 'keep_tempfiles'

AttributeError: 'CRF' object has no attribute 'keep_tempfiles'

In [None]:
y_pred = crf.predict(X_validation)

In [None]:
print(metrics.flat_classification_report(y_validation, y_pred))