<a href="https://colab.research.google.com/github/pras-4795/AIMLColab/blob/master/CodeMixingPosTagger.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive

# Mount the Drive at a fixed location; the corpus paths used later in the
# notebook are rooted under this directory.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
from itertools import chain
import nltk
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelBinarizer
import sklearn
import pycrfsuite
import random

def load_data(files):
    """Read blank-line-separated, tab-delimited sentences from ``files``.

    Each non-blank line is stripped and split on tabs into a list of fields;
    consecutive non-blank lines form one sentence, and a blank line ends it.

    Args:
        files: iterable of paths to the text files to read, in order.

    Returns:
        A list of sentences, where each sentence is a list of rows and each
        row is the list of tab-separated fields of one line.
    """
    data = []
    for file in files:
        # Start every file with an empty buffer so a sentence can never span
        # two files.
        sent = []
        # Explicit encoding: do not depend on the platform default.
        with open(file, 'r', encoding='utf-8') as rf:
            for line in rf:
                stripped = line.strip()
                if stripped:
                    sent.append(stripped.split('\t'))
                elif sent:
                    data.append(sent)
                    sent = []
        # Flush the last sentence when the file does not end with a blank
        # line; otherwise it would be lost (or merged into the next file).
        if sent:
            data.append(sent)
    return data
  
  

In [0]:
# Every corpus file is named <source>_<language code>_EN_CR.txt and sits in
# the same Drive folder, so the paths are assembled instead of spelled out.
_CORPUS_DIR = '/content/drive/My Drive'
_SOURCE_PREFIXES = ('FB', 'TWT', 'WA')


def _corpus_files(lang_code):
    """Return the list of corpus file paths for one language code."""
    return ['{}/{}_{}_EN_CR.txt'.format(_CORPUS_DIR, prefix, lang_code)
            for prefix in _SOURCE_PREFIXES]


fileB = _corpus_files('BN')
fileH = _corpus_files('HI')
fileT = _corpus_files('TE')

# Load the three corpora in the same order as before: B, H, T.
dataB, dataH, dataT = (load_data(paths) for paths in (fileB, fileH, fileT))

In [0]:
## The whole data set is used to train each model (no held-out split).
# Shuffle a *copy* of each corpus with a private, fixed-seed generator.
# This keeps the cell idempotent: the loaded data is never mutated, so
# re-running the cell always yields the same order, and the global `random`
# state is left alone. random.Random(7) produces the same permutation as
# random.seed(7) followed by random.shuffle(...).
SHUFFLE_SEED = 7

train_sentsB = dataB[:]
random.Random(SHUFFLE_SEED).shuffle(train_sentsB)

train_sentsH = dataH[:]
random.Random(SHUFFLE_SEED).shuffle(train_sentsH)

train_sentsT = dataT[:]
random.Random(SHUFFLE_SEED).shuffle(train_sentsT)

In [0]:
def sent2features(sent):
    """Return the list of ``features(sent, i)`` results for every index of ``sent``."""
    feature_rows = []
    for position in range(len(sent)):
        feature_rows.append(features(sent, position))
    return feature_rows
  
def sent2pos(sent):
    """Return the POS tag (third field) of every row in ``sent``.

    Each row must unpack into exactly three fields:
    ``(token, language_label, pos_tag)``.
    """
    tags = []
    for _token, _language_label, tag in sent:
        tags.append(tag)
    return tags



In [0]:
def features(sentence, index):
    """Build the feature dict for the token at position ``index``.

    Args:
        sentence: sequence whose items are either plain token strings or
            rows whose first field is the token text, e.g.
            ``[token, language_label, pos_tag]`` (the layout ``sent2pos``
            unpacks).
        index: position of the token to describe.

    Returns:
        A dict mapping feature names to string or bool values. Only the
        token text (and, when the row carries one, the language label) is
        used. The POS tag field is never read, because it is the label to be
        predicted and must not leak into the features.
    """
    def token_of(item):
        # Accept bare strings as well as [token, ...] rows.
        return item if isinstance(item, str) else item[0]

    item = sentence[index]
    token = token_of(item)
    last_index = len(sentence) - 1

    feats = {
        'word': token,
        'is_first': index == 0,
        'is_last': index == last_index,
        # Slices instead of [0] / [-1] so an empty token cannot raise.
        'is_capitalized': token[:1].upper() == token[:1],
        'is_all_caps': token.upper() == token,
        'is_all_lower': token.lower() == token,
        'prefix-1': token[:1],
        'prefix-2': token[:2],
        'prefix-3': token[:3],
        'suffix-1': token[-1:],
        'suffix-2': token[-2:],
        'suffix-3': token[-3:],
        # Neighbouring token text only -- never the neighbour's whole row.
        'prev_word': '' if index == 0 else token_of(sentence[index - 1]),
        'next_word': '' if index == last_index else token_of(sentence[index + 1]),
        'has_hyphen': '-' in token,
        'is_numeric': token.isdigit(),
        'capitals_inside': token[1:].lower() != token[1:],
    }

    # Keep the language label (second field) as its own feature when present.
    # NOTE(review): assumes the label is available at prediction time too.
    if not isinstance(item, str) and len(item) > 1:
        feats['lang'] = item[1]
    return feats


In [0]:
# Per corpus: X_* holds one feature sequence per sentence, y_* the matching
# POS tag sequence.
X_trainB = list(map(sent2features, train_sentsB))
y_trainB = list(map(sent2pos, train_sentsB))

X_trainH = list(map(sent2features, train_sentsH))
y_trainH = list(map(sent2pos, train_sentsH))

X_trainT = list(map(sent2features, train_sentsT))
y_trainT = list(map(sent2pos, train_sentsT))

In [8]:
## Telugu Lang Trainer
# Training options for the Telugu model.
crf_paramsT = {
    'c1': 1.0,   # coefficient for L1 penalty
    'c2': 1e-3,  # coefficient for L2 penalty
    'max_iterations': 50,  # stop earlier
    # include transitions that are possible, but not observed
    'feature.possible_transitions': True,
}

trainerT = pycrfsuite.Trainer(verbose=False)

# Feed every (feature sequence, tag sequence) pair to the trainer.
for feature_seq, tag_seq in zip(X_trainT, y_trainT):
    trainerT.append(feature_seq, tag_seq)

trainerT.set_params(crf_paramsT)

# The return value is not used here.
trainerT.params()

# Train and write the model to disk, then open that file with a tagger.
trainerT.train('Telugu.crfsuite')

taggerT = pycrfsuite.Tagger()
taggerT.open('Telugu.crfsuite')

<contextlib.closing at 0x7fda19aad128>

In [9]:
## Hindi Lang Trainer
# Training options for the Hindi model.
crf_paramsH = {
    'c1': 1.0,   # coefficient for L1 penalty
    'c2': 1e-3,  # coefficient for L2 penalty
    'max_iterations': 50,  # stop earlier
    # include transitions that are possible, but not observed
    'feature.possible_transitions': True,
}

trainerH = pycrfsuite.Trainer(verbose=False)

# Feed every (feature sequence, tag sequence) pair to the trainer.
for feature_seq, tag_seq in zip(X_trainH, y_trainH):
    trainerH.append(feature_seq, tag_seq)

trainerH.set_params(crf_paramsH)

# The return value is not used here.
trainerH.params()

# Train and write the model to disk, then open that file with a tagger.
trainerH.train('Hindi.crfsuite')

taggerH = pycrfsuite.Tagger()
taggerH.open('Hindi.crfsuite')

<contextlib.closing at 0x7fda19aad438>

In [10]:
## Bengali Lang Trainer
# Training options for the Bengali model.
crf_paramsB = {
    'c1': 1.0,   # coefficient for L1 penalty
    'c2': 1e-3,  # coefficient for L2 penalty
    'max_iterations': 50,  # stop earlier
    # include transitions that are possible, but not observed
    'feature.possible_transitions': True,
}

trainerB = pycrfsuite.Trainer(verbose=False)

# Feed every (feature sequence, tag sequence) pair to the trainer.
for feature_seq, tag_seq in zip(X_trainB, y_trainB):
    trainerB.append(feature_seq, tag_seq)

trainerB.set_params(crf_paramsB)

# The return value is not used here.
trainerB.params()

# Train and write the model to disk, then open that file with a tagger.
trainerB.train('Bengali.crfsuite')

taggerB = pycrfsuite.Tagger()
taggerB.open('Bengali.crfsuite')

<contextlib.closing at 0x7fda19aad748>

In [0]:
## Use one of the trained taggers (taggerT, taggerH or taggerB) to predict the POS tags of a sentence:
## EX: print(taggerH.tag(sent2features(example_sent)))