In [1]:
import numpy as np
import logging
from gensim.models import Word2Vec
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

In [2]:
def get_data(path):
    texts = []
    labels = []
    read_file = open(path, 'r')
    for line in read_file:
        line = line.replace('\n','')
        items = line.split('\t')
        texts.append(items[0])
        labels.append(items[1])
    read_file.close()

    return texts, labels

In [3]:
path = 'NER_Malayalam.txt'

In [4]:
words, tags = get_data(path)

In [5]:
words = [words]
labels = set(tags)

In [6]:
model_wv = Word2Vec(words, size=300, window=1, min_count=1, workers=4)

In [7]:
m,n = np.shape(words)
X = []
for i in range(n):
    X.append(model_wv.wv[words[0][i]])

In [8]:
le = preprocessing.LabelEncoder()
y = le.fit_transform(tags)

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)

In [10]:
print(np.shape(X_train),np.shape(y_train))
print(np.shape(X_test), np.shape(y_test))

(7059, 300) (7059,)
(3026, 300) (3026,)


In [11]:
#clf = SVC()
#clf = KNeighborsClassifier(3)
#clf = DecisionTreeClassifier(max_depth=5)
#clf = RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)
#clf = MLPClassifier()
clf = GaussianNB()

In [12]:
clf.fit(X_train, y_train) 

GaussianNB(priors=None, var_smoothing=1e-09)

In [13]:
y_pred = clf.predict(X_test)

In [14]:
pred_labels = le.inverse_transform(y_pred)

In [15]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')

In [16]:
print("Accuracy = {0:.2f}".format(accuracy))
print("Precision = {0:.2f}".format(precision))
print("Recall = {0:.2f}".format(recall))

Accuracy = 0.35
Precision = 0.85
Recall = 0.35


In [18]:
text = "ബിഗ് ബോസിൻറെ ആദ്യഭാഗം നവംബർ 2006 മുതൽ ജനുവരി 2007 വരെ ആണ് സംപ്രേഷണം ചെയ്തത് ."

In [19]:
tokens = text.split()

In [20]:
test_data = []
for i in range(len(tokens)):
    test_data.append(model_wv.wv[tokens[i]])

In [21]:
yp = clf.predict(test_data)

In [22]:
le.inverse_transform(yp)

array(['name', 'number', 'name', 'name', 'name', 'location', 'name',
       'location', 'other', 'other', 'name', 'name', 'other'],
      dtype='<U12')

In [23]:
from pyfasttext import FastText

In [24]:
model_ft = FastText()
model_ft.skipgram(input='NER_Malayalam.txt', output='model_ft', epoch=10, lr=0.1, dim=300)

In [25]:
X = []
for i in range(len(words[0])):
    X.append(model_ft[words[0][i]])

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)

In [27]:
print(np.shape(X_train),np.shape(y_train))
print(np.shape(X_test), np.shape(y_test))

(7059, 300) (7059,)
(3026, 300) (3026,)


In [28]:
#clf = SVC()
clf = KNeighborsClassifier(3)
#clf = DecisionTreeClassifier(max_depth=5)
#clf = RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)
#clf = MLPClassifier()
#clf = GaussianNB()

clf.fit(X_train, y_train) 

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=3, p=2,
           weights='uniform')

In [29]:
y_pred = clf.predict(X_test)

In [30]:
pred_labels = le.inverse_transform(y_pred)

In [31]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')

  'precision', 'predicted', average, warn_for)


In [32]:
print("Accuracy = {0:.2f}".format(accuracy))
print("Precision = {0:.2f}".format(precision))
print("Recall = {0:.2f}".format(recall))

Accuracy = 0.87
Precision = 0.85
Recall = 0.87


In [33]:
text = "ബിഗ് ബോസിൻറെ ആദ്യഭാഗം നവംബർ 2006 മുതൽ ജനുവരി 2007 വരെ ആണ് സംപ്രേഷണം ചെയ്തത് ."

In [34]:
tokens = text.split()

In [35]:
test_data = []
for i in range(len(tokens)):
    test_data.append(model_ft[tokens[i]])

In [36]:
yp = clf.predict(test_data)

In [37]:
le.inverse_transform(yp)

array(['other', 'other', 'other', 'other', 'other', 'other', 'other',
       'number', 'other', 'other', 'other', 'other', 'other'],
      dtype='<U12')

In [38]:
import numpy as np
from keras.models import Sequential
from keras import optimizers
from keras.layers.recurrent import LSTM, SimpleRNN, GRU
from keras.layers.core import Dense, Activation, Dropout, Flatten
from keras.preprocessing.sequence import pad_sequences
from keras.layers.embeddings import Embedding
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, precision_recall_fscore_support
from keras import backend as K
import numpy as np
import re
from gensim.models import Word2Vec
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import keras
from keras.utils import to_categorical

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [40]:
path = 'NER_Malayalam.txt'

In [41]:
def get_data(path):
    texts = []
    labels = []
    read_file = open(path, 'r')
    for line in read_file:
        line = line.replace('\n','')
        if line == 'newline':
            pass
        else:
            items = line.split('\t')
            texts.append(items[0])
            labels.append(items[1])
    read_file.close()

    return texts, labels

In [42]:
words, tags = get_data(path)
words = [words]
wvmodel = Word2Vec(words, size=300, window=1, min_count=1)

In [43]:
X = []
for i in range(len(words[0])):
    X.append(wvmodel.wv[words[0][i]])


m, n = np.shape(X)
X = np.reshape(X, (m,n,1))

In [44]:
label_encoder = LabelEncoder()
label_encoder.fit(tags)
y = label_encoder.transform(tags)
y = to_categorical(y, len(set(y)))

In [45]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)

In [46]:
batch_size = 32

In [47]:
model = Sequential()
model.add(LSTM(32, activation='tanh', input_shape = X_train.shape[1:]))
model.add(Dense(9, activation='sigmoid'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 32)                4352      
_________________________________________________________________
dense_1 (Dense)              (None, 9)                 297       
Total params: 4,649
Trainable params: 4,649
Non-trainable params: 0
_________________________________________________________________


In [48]:
model.fit(X_train, y_train,  batch_size = batch_size, epochs = 10, validation_data = (X_test, y_test))

Train on 7059 samples, validate on 3026 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fdbd62b7470>

In [49]:
score = model.evaluate(X_test, y_test)
print("Accuracy = ",score[1]*100)

Accuracy =  83.47653669390706


In [50]:
y_pred = model.predict_classes(X_test)
pred_labels = le.inverse_transform(y_pred)
print(pred_labels)

['other' 'other' 'other' ... 'other' 'other' 'other']


In [51]:
text = "ബിഗ് ബോസിൻറെ ആദ്യഭാഗം നവംബർ 2006 മുതൽ ജനുവരി 2007 വരെ ആണ് സംപ്രേഷണം ചെയ്തത് ."
tokens = text.split()

In [52]:
test_data = []
for i in range(len(tokens)):
    test_data.append(wvmodel[tokens[i]])

  This is separate from the ipykernel package so we can avoid doing imports until


In [53]:
mt, nt = np.shape(test_data)
test_data = np.reshape(test_data, (mt, nt,1))

In [54]:
y_pred = model.predict_classes(test_data)
pred_labels = le.inverse_transform(y_pred)
print(pred_labels)

['other' 'other' 'other' 'other' 'other' 'other' 'other' 'other' 'other'
 'other' 'other' 'other' 'other']
