In [2]:
!pip install allennlp

Collecting allennlp
[?25l  Downloading https://files.pythonhosted.org/packages/64/32/d6d0a93a23763f366df2dbd4e007e45ce4d2ad97e6315506db9da8af7731/allennlp-0.8.2-py3-none-any.whl (5.6MB)
[K    100% |████████████████████████████████| 5.6MB 4.3MB/s 
[?25hCollecting sqlparse==0.2.4 (from allennlp)
  Downloading https://files.pythonhosted.org/packages/65/85/20bdd72f4537cf2c4d5d005368d502b2f464ede22982e724a82c86268eda/sqlparse-0.2.4-py2.py3-none-any.whl
Collecting conllu==0.11 (from allennlp)
  Downloading https://files.pythonhosted.org/packages/d4/2c/856344d9b69baf5b374c395b4286626181a80f0c2b2f704914d18a1cea47/conllu-0.11-py2.py3-none-any.whl
Collecting moto==1.3.4 (from allennlp)
[?25l  Downloading https://files.pythonhosted.org/packages/ee/8f/7b36e81ff067d0e7bf90f7210b351c0cfe6657f79fa4dcb0cb4787462e05/moto-1.3.4-py2.py3-none-any.whl (548kB)
[K    100% |████████████████████████████████| 552kB 17.3MB/s 
[?25hCollecting parsimonious==0.8.0 (from allennlp)
  Downloading https://files.p

In [3]:
import itertools
from allennlp.data.tokenizers import Token
from allennlp.data.dataset_readers import DatasetReader
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer, TokenCharactersIndexer
from allennlp.data.fields import TextField, LabelField
from allennlp.data import Instance
from typing import Iterator, List, Dict
from allennlp.models import Model
from allennlp.modules.text_field_embedders import TextFieldEmbedder, BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding, TokenCharactersEncoder
from allennlp.modules.seq2vec_encoders import Seq2VecEncoder
from allennlp.data.vocabulary import Vocabulary
from allennlp.nn.util import get_text_field_mask
from allennlp.training.metrics import CategoricalAccuracy
from allennlp.modules.seq2vec_encoders import PytorchSeq2VecWrapper, BagOfEmbeddingsEncoder
from allennlp.data.iterators import BasicIterator
from allennlp.training.trainer import Trainer
import torch.optim as optim
from allennlp.predictors.predictor import Predictor

import torch
import pickle

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


### For AllenNLP to work, usually we need to find two things: data reader and model.

In [0]:
class JobReader(DatasetReader):
    def __init__(self, token_indexers: Dict[str, TokenIndexer] = None) -> None:
        super().__init__(lazy=False)
        self.token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
        
    def text_to_instance(self, tokens: List[Token], tag: int = None) -> Instance:
        job_field = TextField(tokens, self.token_indexers)
        fields = {"tokens": job_field}
        
        label_field = LabelField(label=str(tag))
        fields["label"] = label_field

        return Instance(fields)
    
    def _read(self, file_path: str) -> Iterator[Instance]:
        with (open(file_path, "rb")) as openfile:
            basic = pickle.load(openfile)
        for each in basic:
            text = [x.strip().split()+['<eos>'] for x in each['text'] if x!='']
            words = list(itertools.chain(*text))
            yield self.text_to_instance([Token(word) for word in words], each['salary_range'])


In [0]:
class LstmClassifier(Model):
    def __init__(self,
                 word_embeddings: TextFieldEmbedder,
                 encoder: Seq2VecEncoder,
                 vocab: Vocabulary) -> None:
        super().__init__(vocab)
        self.word_embeddings = word_embeddings

        self.encoder = encoder

        self.hidden2tag = torch.nn.Linear(in_features=encoder.get_output_dim(),
                                          out_features=vocab.get_vocab_size('labels'))
        self.metrics = {
            "accuracy": CategoricalAccuracy()
        }

        self.loss_function = torch.nn.CrossEntropyLoss()

    def forward(self,
                tokens: Dict[str, torch.Tensor],
                label: torch.Tensor = None) -> torch.Tensor:
        mask = get_text_field_mask(tokens)

        embeddings = self.word_embeddings(tokens)
        encoder_out = self.encoder(embeddings, mask)
        logits = self.hidden2tag(encoder_out)

        output = {"logits": logits}
        if label is not None:
            for metric in self.metrics.values():
                metric(logits, label)
            output["loss"] = self.loss_function(logits, label)

        return output
    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        return {metric_name: metric.get_metric(reset) for metric_name, metric in self.metrics.items()}

### To access the data in my Google Drive, I need the following code to attach that path to our working environment.

In [6]:
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
# import pickle
# with open('/content/gdrive/My Drive/2.1_data.pkl', 'rb') as handle:
#     d = pickle.load(handle)

# import numpy as np
# a, b, c = np.split(np.random.permutation(len(d)), [int(0.7*len(d)), int(0.9*len(d))])
# with open('/content/gdrive/My Drive/2.1_train.pkl', 'wb') as handle:
#     pickle.dump([d[i] for i in a], handle, protocol=pickle.HIGHEST_PROTOCOL)
# with open('/content/gdrive/My Drive/2.1_dev.pkl', 'wb') as handle:
#     pickle.dump([d[i] for i in b], handle, protocol=pickle.HIGHEST_PROTOCOL)
# with open('/content/gdrive/My Drive/2.1_test.pkl', 'wb') as handle:
#     pickle.dump([d[i] for i in c], handle, protocol=pickle.HIGHEST_PROTOCOL)



### Now we start the flow of training.

In [0]:
reader = JobReader(token_indexers={"tokens": SingleIdTokenIndexer(lowercase_tokens=True),
                                   "token_characters": TokenCharactersIndexer()})

train_dataset = reader.read('/content/gdrive/My Drive/2.1_train.pkl')
dev_dataset = reader.read('/content/gdrive/My Drive/2.1_dev.pkl')

vocab = Vocabulary.from_instances(train_dataset+dev_dataset, min_count={'tokens': 3})

10406it [00:15, 660.72it/s]
2974it [00:04, 726.94it/s] 
100%|██████████| 13380/13380 [01:14<00:00, 180.44it/s]


In [7]:
reader = JobReader(token_indexers={"tokens": SingleIdTokenIndexer(lowercase_tokens=True),
                                   "token_characters": TokenCharactersIndexer()})

test_dataset = reader.read('/content/gdrive/My Drive/tmp/2.1_test.pkl')

vocab = Vocabulary.from_files("/content/gdrive/My Drive/tmp/vocabulary")

1487it [00:03, 427.63it/s]


In [0]:
token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                           embedding_dim=50,
                           pretrained_file="https://s3-us-west-2.amazonaws.com/allennlp/datasets/glove/glove.6B.50d.txt.gz")

temp1 = Embedding(num_embeddings=vocab.get_vocab_size('token_characters'),
                  embedding_dim=10)
temp2 = PytorchSeq2VecWrapper(torch.nn.GRU(10, 25, 2, batch_first=True, 
                                           dropout=0, bidirectional=False))

character_encoding = TokenCharactersEncoder(embedding=temp1, encoder=temp2)
word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding,
                                          "token_characters": character_encoding})
lstm = PytorchSeq2VecWrapper(torch.nn.LSTM(75, 40, 2, batch_first=True, 
                                           dropout=0.25, bidirectional=True))
model = LstmClassifier(word_embeddings, lstm, vocab)

In [0]:
with open("/content/gdrive/My Drive/tmp/model.th", 'rb') as f:
    model.load_state_dict(torch.load(f))

In [0]:
optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5)

iterator = BasicIterator(batch_size=32)
iterator.index_with(vocab)

model = model.cuda(0)
trainer = Trainer(model=model,
                  optimizer=optimizer,
                  iterator=iterator,
                  train_dataset=train_dataset,
                  validation_dataset=dev_dataset,
                  patience=10,
                  num_epochs=20,
                  cuda_device=0)

trainer.train()

In [0]:
with open("/content/gdrive/My Drive/model.th", 'wb') as f:
    torch.save(model.state_dict(), f)
vocab.save_to_files("/content/gdrive/My Drive/vocabulary")

### Here we start to predict for test dataset.

In [11]:
import numpy as np
pred_label = []
predictor = Predictor(model, dataset_reader=reader)
for i in range(len(test_dataset)):
    if i % 1000==0:
        print(i)
    try:
        pred = predictor.predict_instance(test_dataset[i])
        pred_label.append(np.argmax(pred['logits']))
    except KeyError:
        print(i)
        pred_label.append(0)

0
865
1000


In [0]:
pred_salary = [vocab.get_token_from_index(i, 'labels') for i in pred_label]
true_salary = [each['label'].label for each in test_dataset]
np.mean(np.array(pred_salary)==np.array(true_salary))

### Try things out:

In [0]:
import csv
with open('pred.csv', mode='w') as f:
    w = csv.writer(f, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    w.writerow(['pred', 'true'])
    for i in range(len(pred_salary)):
        w.writerow([pred_salary[i], true_salary[i]])

In [54]:
test_dataset = reader.read('/content/gdrive/My Drive/tmp/2.1_test.pkl')

1487it [00:01, 917.24it/s]


In [0]:
with open('/content/gdrive/My Drive/tmp/2.1_test.pkl', 'rb') as f:
    d = pickle.load(f)
true_salary = [str(each['salary_range']) for each in d]

In [32]:

evaluations(np.array(true_salary), np.array(pred_salary))

Normalized confusion matrix
[[0.12 0.44 0.24 0.   0.08 0.04 0.04 0.04 0.   0.   0.   0.   0.  ]
 [0.03 0.21 0.28 0.17 0.14 0.1  0.   0.07 0.   0.   0.   0.   0.  ]
 [0.02 0.1  0.18 0.2  0.22 0.16 0.1  0.   0.   0.   0.02 0.   0.  ]
 [0.   0.07 0.18 0.21 0.17 0.17 0.09 0.09 0.   0.02 0.   0.   0.  ]
 [0.   0.03 0.03 0.08 0.22 0.28 0.21 0.09 0.03 0.03 0.01 0.   0.  ]
 [0.   0.   0.01 0.04 0.15 0.25 0.24 0.11 0.12 0.05 0.01 0.   0.01]
 [0.   0.   0.   0.01 0.08 0.15 0.27 0.22 0.15 0.09 0.02 0.   0.  ]
 [0.   0.   0.01 0.   0.07 0.13 0.22 0.22 0.21 0.1  0.03 0.   0.  ]
 [0.   0.   0.   0.   0.03 0.05 0.11 0.22 0.33 0.17 0.07 0.   0.  ]
 [0.   0.   0.   0.01 0.02 0.04 0.09 0.13 0.35 0.27 0.07 0.01 0.01]
 [0.   0.   0.   0.   0.02 0.02 0.05 0.14 0.24 0.29 0.23 0.02 0.02]
 [0.   0.   0.   0.   0.   0.   0.04 0.08 0.27 0.19 0.23 0.04 0.15]
 [0.   0.   0.   0.   0.   0.   0.   0.08 0.17 0.5  0.08 0.   0.17]]
Precision 0.41
Recall 0.38
F1 Score 0.40
RMSE 18.62


In [15]:
vocab.get_token_from_index(5, 'labels')

'95.0'

In [0]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import itertools

#Evaluation of Model - Confusion Matrix Plot
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()


# # Compute confusion matrix
# cnf_matrix = confusion_matrix(y_test, y_pred)
# np.set_printoptions(precision=2)

# # Plot non-normalized confusion matrix
# plt.figure()
# plot_confusion_matrix(cnf_matrix, classes=['Forged','Authorized'],
#                       title='Confusion matrix, without normalization')

In [0]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import confusion_matrix

def evaluations(y_test, y_pred):

    # Change type to floats
    y_pred = y_pred.astype('float')
    y_test = y_test.astype('float')

    ### Evaluating the Model using the Testing Dataset
    # Plot normalized confusion matrix
    labels = ['50-60', '60-70', '70-80', '80-90', '90-100', '100-110', '110-120', '120-130', '130-140', '140-150', '150-160', '160-170', '170+']

    # Compute confusion matrix
    cnf_matrix = confusion_matrix(y_test, y_pred)
    np.set_printoptions(precision=2)

    # Plot normalized confusion matrix
    # Plot normalized confusion matrix
    fig, ax = plt.subplots( nrows=1, ncols=1 , figsize=(8, 8)) 
    plot_confusion_matrix(cnf_matrix, normalize = True, classes=labels, title='Normalized Confusion Matrix')
    fig.savefig('norm_conf_matrix.png')
    plt.close(fig)
    
    # extracting true positives, false negatives, and false positives
    cm = confusion_matrix(y_test, y_pred)
    cm = np.asmatrix(cm)
    tp = np.trace(cm)
    fn = np.triu(cm).sum()-np.trace(cm)
    fp = np.tril(cm).sum()-np.trace(cm)

    # Precision (if we want to minimize false positives)
    precision = tp / (tp + fp)
    print("Precision {:0.2f}".format(precision))

    # Recall (least false negatives)
    recall = tp / (tp + fn)
    print("Recall {:0.2f}".format(recall))

    # F1 Score
    # Harmonic mean of PR, used to indicate a balance between 
    # PR providing each equal weightage, it ranges from 0 to 1. 
    # F1 Score reaches its best value at 1 (perfect PR) and worst at 0.
    # Relations between data’s positive labels and those given by a classifier based on sums of per-text decisions
    f1 = (2*precision*recall)/(precision + recall)
    print("F1 Score {:0.2f}".format(f1))

    ### Calculate RSME
    from sklearn.metrics import mean_squared_error
    from math import sqrt
    rmse = sqrt(mean_squared_error(y_test.astype(np.float), y_pred.astype(np.float)))
    print("RMSE {:0.2f}".format(rmse))

# ------------------------------------------
# Plot a Confusion Matrix 
# from sklearn.metrics import confusion_matrix
# import seaborn as sn
# cm = confusion_matrix(y_val,y_pred)
# cm_df = pd.DataFrame(cm)
# plt.figure(figsize = (10,7))
# sn.heatmap(cm_df, annot=True)