# Neural network

In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, balanced_accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef
from sklearn.preprocessing import LabelEncoder

from datetime import datetime
import sklearn
import os
import sys
sys.path.append('/usr/lib/python3.7/site-packages/')

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical

In [12]:
def find_data(data_dir):
    """Locate the embedding and label files of one data split.

    Walks ``data_dir`` recursively, printing the root and every visited
    directory, and remembers the path of each ``tensors.tsv`` and
    ``metadata.tsv`` it encounters (the last match wins).

    Returns:
        tuple: ``(tensors_path, labels_path)``.

    Raises:
        RuntimeError: if either of the two files was not found.
    """
    located = {'tensors.tsv': '', 'metadata.tsv': ''}
    print(data_dir)
    for current_dir, _subdirs, file_names in os.walk(data_dir):
        print(current_dir)
        for file_name in file_names:
            if file_name in located:
                located[file_name] = os.path.join(current_dir, file_name)
    tensors_path = located['tensors.tsv']
    labels_path = located['metadata.tsv']
    if not tensors_path or not labels_path:
        raise RuntimeError('Could not find required files!')
    return tensors_path, labels_path
    

def load_part_data(data_dir):
    """Read the features and labels of a single data split.

    The file locations come from ``find_data``; the tensors path is printed.
    Labels are parsed with ``':'`` as the separator and their first two
    columns are renamed to ``'job'`` and ``'name'``; features are parsed as
    tab-separated values. Neither file is expected to have a header row.

    Returns:
        tuple: ``(X, y)`` as two pandas DataFrames.
    """
    tensors_file, labels_file = find_data(data_dir)
    print(tensors_file)
    labels_raw = pd.read_csv(labels_file, sep=':', encoding='utf-8', header=None)
    y = labels_raw.rename(columns={0: 'job', 1: 'name'})
    X = pd.read_csv(tensors_file, sep='\t', encoding='utf-8', header=None)
    return X, y


def load_data(data_dir, validation=True):
    """Load the train, validation and test splits stored under ``data_dir``.

    Each split is read from the sub-directory ``train``, ``validation`` or
    ``test`` via ``load_part_data``.

    Args:
        data_dir: Directory containing the three split sub-directories.
        validation: When True (default) the validation split is returned
            separately. When False it is appended to the training split.

    Returns:
        tuple: ``(X_train, y_train, X_test, y_test, X_valid, y_valid)`` when
        ``validation`` is True, otherwise ``(X_train, y_train, X_test, y_test)``
        with the validation rows merged into the training frames.
    """
    X_train, y_train = load_part_data(os.path.join(data_dir, 'train'))
    X_valid, y_valid = load_part_data(os.path.join(data_dir, 'validation'))
    X_test, y_test = load_part_data(os.path.join(data_dir, 'test'))
    if not validation:
        # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
        # pd.concat with ignore_index=True yields the same merged frames.
        X_train = pd.concat([X_train, X_valid], ignore_index=True)
        y_train = pd.concat([y_train, y_valid], ignore_index=True)
        return X_train, y_train, X_test, y_test

    return X_train, y_train, X_test, y_test, X_valid, y_valid

In [13]:
# Load the train / test / validation splits (validation kept separate).
# NOTE(review): this is an absolute, machine-specific Windows path; the cell
# will not run elsewhere without editing it.
X_train, y_train, X_test, y_test, X_valid, y_valid = load_data(r'C:\Users\Paulina\Downloads\mean_data.tar\mean_data\BERT\ONET', validation=True)
# Row counts of every frame; features and labels of a split should match.
len(X_train), len(y_train), len(X_test), len(y_test), len(X_valid), len(y_valid)

C:\Users\Paulina\Downloads\mean_data.tar\mean_data\BERT\ONET\train
C:\Users\Paulina\Downloads\mean_data.tar\mean_data\BERT\ONET\train
C:\Users\Paulina\Downloads\mean_data.tar\mean_data\BERT\ONET\train\tensors.tsv
C:\Users\Paulina\Downloads\mean_data.tar\mean_data\BERT\ONET\validation
C:\Users\Paulina\Downloads\mean_data.tar\mean_data\BERT\ONET\validation
C:\Users\Paulina\Downloads\mean_data.tar\mean_data\BERT\ONET\validation\tensors.tsv
C:\Users\Paulina\Downloads\mean_data.tar\mean_data\BERT\ONET\test
C:\Users\Paulina\Downloads\mean_data.tar\mean_data\BERT\ONET\test
C:\Users\Paulina\Downloads\mean_data.tar\mean_data\BERT\ONET\test\tensors.tsv


(5033, 5033, 484, 484, 299, 299)

In [14]:
# Keep only the 'job' column of each label frame as the target.
# NOTE(review): the names are overwritten in place, so this cell is not
# idempotent; re-running it without reloading the data will presumably fail
# because the Series no longer has a 'job' key.
y_train=y_train['job']
y_valid=y_valid['job']
y_test=y_test['job']

In [17]:
from sklearn import preprocessing
# NOTE(review): LabelEncoder is already imported directly at the top of the
# notebook, and `le` is rebound to a fresh LabelEncoder() in the training cell
# before it is first fitted, so this instance appears to be unused.
le = preprocessing.LabelEncoder()

In [19]:
classes = y_train.unique()
# Pin the one-hot width once. Without an explicit num_classes,
# to_categorical sizes its output from the largest index present, so a
# validation split missing the highest-index class would produce targets
# narrower than the softmax layer and break model.fit.
n_classes = len(classes)

# Small fully connected classifier on top of the pre-computed embeddings:
# one input feature per column of X_train, one softmax output per class.
model = Sequential([
    Dense(128, activation='relu', input_shape=(len(X_train.columns),)),
    Dense(256, activation='relu'),
    Dense(n_classes, activation='softmax'),
])

model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# The encoder is fitted on the training labels only and reused for the
# validation labels (and by later cells to map indices back to names).
le = LabelEncoder()

model.fit(
    X_train.to_numpy(),
    to_categorical(le.fit_transform(y_train), num_classes=n_classes),
    epochs=30,
    batch_size=64,
    validation_data=(
        X_valid.to_numpy(),
        to_categorical(le.transform(y_valid), num_classes=n_classes),
    ),
    verbose=1,
)

Train on 5033 samples, validate on 299 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x1e3a8267348>

In [20]:
# Index of the highest-scoring output unit for every test row; these are the
# integer codes produced by `le`, not the original label strings.
np.argmax(model.predict(X_test), axis=1)



array([2, 3, 9, 7, 9, 1, 7, 9, 9, 7, 7, 7, 2, 1, 7, 7, 7, 9, 7, 7, 7, 7,
       7, 7, 7, 2, 7, 7, 7, 7, 7, 7, 7, 9, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
       9, 7, 7, 7, 1, 7, 7, 7, 7, 0, 9, 7, 7, 7, 7, 7, 7, 7, 9, 7, 7, 7,
       7, 9, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 5, 7, 5, 7,
       7, 7, 7, 7, 7, 7, 7, 7, 7, 9, 7, 1, 7, 7, 7, 7, 7, 7, 7, 0, 9, 7,
       7, 7, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 9, 7, 9, 7, 9, 7, 9, 9, 7, 9,
       7, 7, 7, 7, 7, 7, 9, 9, 7, 7, 7, 9, 6, 7, 9, 7, 9, 7, 7, 7, 7, 7,
       7, 7, 7, 7, 7, 7, 7, 9, 9, 9, 7, 7, 7, 7, 7, 9, 7, 7, 7, 7, 7, 7,
       7, 7, 8, 7, 7, 7, 1, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 9, 7, 9, 7, 7,
       7, 7, 9, 7, 7, 5, 7, 7, 9, 7, 7, 7, 7, 9, 7, 7, 9, 7, 7, 7, 9, 9,
       7, 9, 7, 7, 7, 7, 9, 7, 9, 1, 7, 9, 7, 7, 9, 7, 7, 7, 7, 7, 7, 9,
       7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 9, 7, 7, 0, 7, 7, 1, 9, 7, 7, 7, 7,
       7, 7, 7, 6, 9, 7, 7, 7, 7, 7, 9, 9, 9, 7, 9, 1, 7, 7, 1, 1, 1, 1,
       8, 3, 7, 7, 7, 7, 3, 3, 3, 3, 7, 7, 7, 7, 7,

# LIME

In [55]:
import lime
import sklearn
import numpy as np
import sklearn
import sklearn.ensemble
import sklearn.metrics
import torch

from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM

import os
import logging
%matplotlib inline

In [586]:
# modified version of the class written by Wojtek and Staszek
class BertBaseMultilingualEmbeddingVectorizer:
    """Turn raw sentences into fixed-size BERT sentence embeddings.

    Every sentence is tokenized, passed through a pre-trained BERT model and
    pooled over its tokens into a single vector. The class exposes
    ``transform`` / ``fit_transform`` so that it can be used as the first
    step of a scikit-learn pipeline; there is nothing to fit.

    Attributes set while transforming:
        tokens_tensor, segments_tensor: model inputs of the last sentence.
        encoded_layers_: per-layer hidden states of the last sentence.
        token_embeddings_: ``[token][layer]`` view of ``encoded_layers_``.
        length: number of sentences in the last call.
        output_emb: ``(length, 768)`` array returned by the last call.
    """

    def __init__(self, model_name="bert-base-multilingual-cased", cuda=True):
        """Load the tokenizer and model.

        Args:
            model_name: Name of the pre-trained BERT model to load.
            cuda: Use the GPU when one is available. Pass False to force CPU.
        """
        # Honour the `cuda` flag (it used to be ignored).
        self.device = 'cuda:0' if cuda and torch.cuda.is_available() else 'cpu'
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.model = BertModel.from_pretrained(model_name).to(self.device)
        self.model.eval()

        self.tokens_tensor = None
        self.segments_tensor = None

        self.encoded_layers_ = None
        self.token_embeddings_ = None

        self.length = None
        self.output_emb = None

    def _tokenize_text(self, text):
        """Tokenize ``text`` and store the token / segment id tensors."""
        marked_text = "[CLS] " + text + " [SEP]"
        tokenized_text = self.tokenizer.tokenize(marked_text)
        indexed_tokens = self.tokenizer.convert_tokens_to_ids(tokenized_text)
        # Single-sentence input: every token gets the same segment id.
        segments_ids = [1] * len(tokenized_text)

        self.tokens_tensor = torch.tensor([indexed_tokens]).to(self.device)
        self.segments_tensor = torch.tensor([segments_ids]).to(self.device)

    def _evaluate(self):
        """Run the model on the stored tensors and keep its encoded layers."""
        with torch.no_grad():
            encoded_layers, _ = self.model(self.tokens_tensor, self.segments_tensor)
        self.encoded_layers_ = encoded_layers

    def _generate_token_embeddings(self, batch_i=0):
        """Regroup the hidden states per token.

        Stores in ``token_embeddings_`` a nested list indexed as
        ``[token][layer]`` whose items are the feature vectors of batch
        element ``batch_i``.
        """
        n_tokens = len(self.encoded_layers_[-1][batch_i])
        n_layers = len(self.encoded_layers_)
        self.token_embeddings_ = [
            [self.encoded_layers_[layer_i][batch_i][token_i] for layer_i in range(n_layers)]
            for token_i in range(n_tokens)
        ]

    def _compress_embeddings(self, counter, how="mean_last_layer"):
        """Pool the encoded layers into one vector and store it in row ``counter``.

        Args:
            counter: Row of ``self.output_emb`` to fill.
            how: ``"mean_last_layer"`` (default) averages the last layer over
                tokens; ``"mean_sum_last_4_layers"`` sums the last four
                layers and then averages over tokens.

        Raises:
            ValueError: if ``how`` is not one of the two supported modes.
        """
        if how == "mean_last_layer":
            pooled = torch.mean(self.encoded_layers_[-1], 1)
        elif how == "mean_sum_last_4_layers":
            pooled = torch.mean(torch.sum(torch.stack(self.encoded_layers_[-4:]), 0), 1)
        else:
            raise ValueError(f"Unknown pooling mode: {how!r}")
        # Write into the instance buffer (not a global) and move the tensor to
        # the CPU first so the conversion also works when the model is on GPU.
        self.output_emb[counter, :] = pooled.squeeze().detach().cpu().numpy().reshape(-1)

    def transform(self, sentence):
        """Embed one sentence or an iterable of sentences.

        Args:
            sentence: A single string or a sequence of strings.

        Returns:
            numpy.ndarray of shape ``(n_sentences, 768)``.
        """
        # A single string is treated as a one-element batch.
        if isinstance(sentence, str):
            sentence = [sentence]
        self.length = len(sentence)
        # 768 is the hidden size of BERT-base models; a model with a different
        # hidden size would need a different width here.
        self.output_emb = np.zeros((self.length, 768))
        for counter, text in enumerate(sentence):
            self._tokenize_text(text)
            self._evaluate()
            self._generate_token_embeddings()
            self._compress_embeddings(counter)
        return self.output_emb

    def fit_transform(self, sentence, y=None):
        """Same as ``transform``; present because pipelines expect it.

        Handles a single sentence (direct predict call) as well as a list of
        sentences (as passed by LIME). ``y`` is accepted and ignored so the
        method matches the signature scikit-learn pipelines call.
        """
        return self.transform(sentence)
            
# transform and fit_transform do the same thing, but make_pipeline requires both to exist as separate methods

In [587]:
# Build the vectorizer with its default model name; the constructor loads the
# pre-trained tokenizer and model, so this cell can take a while.
bert=BertBaseMultilingualEmbeddingVectorizer()

The pre-trained model you are loading is a cased model but you have not set `do_lower_case` to False. We are setting `do_lower_case=False` for you but you may want to check this behavior.


In [588]:
from lime import lime_text
from sklearn.pipeline import make_pipeline

# Chain the BERT vectorizer and the trained Keras model so that raw text can
# be passed in and class probabilities come out (this is what LIME calls).
c = make_pipeline(bert, model)

In [589]:
# quick sanity check that the pipeline works end to end: predict one sentence
# and map the arg-max class index back to its label name
print(le.inverse_transform([np.argmax(c.predict_proba('Ale nie można w tym celu łamać prawa, bo za chwilę w podobnej sytuacji może się znaleźć nie tylko morderca-pedofil - powiedział.'))]))

['politycy']


In [590]:
from lime.lime_text import LimeTextExplainer
# Take the class names straight from the fitted encoder instead of assuming
# there are exactly 10 classes; le.classes_ is ordered by encoded index, which
# is the order of the model's output units.
explainer = LimeTextExplainer(class_names=le.classes_)

In [None]:
# Explain the pipeline's prediction for one sentence, asking for 10 features
# and the 2 top labels, then list the labels an explanation was built for.
exp = explainer.explain_instance("bycie pierwszym na mecie zawodów to nagroda za ciężki trening",c.predict_proba, num_features=10, top_labels=2)
print(exp.available_labels())

In [None]:
# Render the explanation inline; text=False is passed to the LIME renderer.
exp.show_in_notebook(text=False)

In [456]:
# does not work in our setup
# NOTE(review): `clf`, `twenty_test` and `vec` are not defined anywhere in the
# visible part of this notebook, so the second call cannot run as written;
# it looks like it was copied from an eli5 example. Confirm or remove.
import eli5
eli5.show_weights(model, vec=bert, top=10,
                  target_names=y_valid.unique())
eli5.show_prediction(clf, twenty_test.data[0], vec=vec,
                     target_names=twenty_test.target_names)

Using TensorFlow backend.


# Machine learning models (random forest / XGBoost)

In [52]:
import pickle

In [102]:
# Load the pickled, pre-configured model from disk.
# NOTE: pickle.load can execute arbitrary code; only open model files that
# come from a trusted source.
# The context manager closes the file even if unpickling raises.
with open(r'C:\Users\Paulina\Downloads\modele\modele\bert_xg_onet.pickle.dat', 'rb') as file:
    data = pickle.load(file)

In [104]:
# Fit the unpickled estimator on the training embeddings and 'job' labels.
data.fit(X_train, y_train)


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1.0, gamma=1,
              learning_rate=0.01, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=nan, n_estimators=300, n_jobs=-1,
              nthread=None, objective='multi:softprob', random_state=12,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=0.8, verbosity=1)

In [105]:
# Predicted label for every test row.
# NOTE(review): this prints the whole prediction array; consider showing only
# a slice or a value count instead.
data.predict(X_test)

array(['politycy', 'politycy', 'politycy', 'politycy', 'politycy',
       'politycy', 'politycy', 'politycy', 'politycy', 'politycy',
       'politycy', 'politycy', 'politycy', 'politycy', 'politycy',
       'politycy', 'politycy', 'politycy', 'politycy', 'politycy',
       'politycy', 'politycy', 'politycy', 'politycy', 'politycy',
       'politycy', 'politycy', 'politycy', 'politycy', 'politycy',
       'politycy', 'politycy', 'politycy', 'politycy', 'politycy',
       'politycy', 'politycy', 'politycy', 'politycy', 'politycy',
       'politycy', 'politycy', 'politycy', 'politycy', 'politycy',
       'politycy', 'politycy', 'politycy', 'politycy', 'politycy',
       'politycy', 'politycy', 'politycy', 'politycy', 'politycy',
       'politycy', 'politycy', 'politycy', 'politycy', 'politycy',
       'politycy', 'politycy', 'politycy', 'politycy', 'politycy',
       'politycy', 'politycy', 'politycy', 'politycy', 'politycy',
       'politycy', 'politycy', 'politycy', 'politycy', 'polity