In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

import transformers
from transformers import TFAutoModel, AutoTokenizer
from tqdm.notebook import tqdm
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors, BertWordPieceTokenizer

import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
import  matplotlib.pyplot as plt


from sklearn.model_selection import KFold
from tokenizers import BertWordPieceTokenizer

import warnings
warnings.simplefilter('ignore')

In [2]:
# Read the labelled comments and expose the text column under the name `text`.
data = pd.read_excel('/content/data.xlsx').rename(columns={'texts': 'text'})
data


Unnamed: 0,text,intent
0,yar777em weldin weldikk bravooooo,1
1,Bravo amine...j'aime,1
2,صراحة bravo أحمد الرحموني .,1
3,Souut w ziiin lee wlh bch tda5llouna f 7iiit,1
4,ANAA N7bha,1
...,...,...
49886,علم موقع نسمة، أن باخرة إيطالية، وصلت في ساعة ...,0
49887,رجع الهم، زايد بلاد بلا راجل، موش سكرتو الحدود...,0
49888,"""نداء الى رئيس الجمهورية:",0
49889,هبط الجيش واقفل الحدود وأعلن الحالة القصوى وكل...,0


In [3]:
#data.intent = data.intent.map({'positive' :  1 , 'negative' : 0})

In [4]:
# Inspect the rows whose `text` value is missing.
missing_text_mask = data['text'].isna()
data[missing_text_mask]

Unnamed: 0,text,intent
1700,,1
2159,,1
6853,,1
8855,,1
8988,,1
...,...,...
48334,,0
48471,,0
48474,,0
49247,,0


In [5]:
# Keep only the rows that actually have a text, then display the result.
data = data.dropna(subset=['text'])
data

Unnamed: 0,text,intent
0,yar777em weldin weldikk bravooooo,1
1,Bravo amine...j'aime,1
2,صراحة bravo أحمد الرحموني .,1
3,Souut w ziiin lee wlh bch tda5llouna f 7iiit,1
4,ANAA N7bha,1
...,...,...
49886,علم موقع نسمة، أن باخرة إيطالية، وصلت في ساعة ...,0
49887,رجع الهم، زايد بلاد بلا راجل، موش سكرتو الحدود...,0
49888,"""نداء الى رئيس الجمهورية:",0
49889,هبط الجيش واقفل الحدود وأعلن الحالة القصوى وكل...,0


In [6]:
# Number of rows per label value.
intent_counts = data['intent'].value_counts()
print(intent_counts)


intent
1    35644
0    12582
Name: count, dtype: int64


In [7]:
# Draw a reproducible sample from each class and remove those rows from
# `data`, which leaves the two classes at a similar size.
_1_sample = data.loc[data['intent'] == 1].sample(n=25000, random_state=42)
_0_sample = data.loc[data['intent'] == 0].sample(n=2500, random_state=42)
removed_index = _1_sample.index.append(_0_sample.index)
data = data.drop(removed_index)
data

Unnamed: 0,text,intent
1,Bravo amine...j'aime,1
2,صراحة bravo أحمد الرحموني .,1
9,Bravo,1
10,mahleha martek El 7a9 acheb mennek :),1
12,أمنة فاخر أية من الجمال ولكي كل التحية من ليبيا,1
...,...,...
49885,علم موقع نسمة، أن باخرة إيطالية، وصلت في ساعة ...,0
49886,علم موقع نسمة، أن باخرة إيطالية، وصلت في ساعة ...,0
49887,رجع الهم، زايد بلاد بلا راجل، موش سكرتو الحدود...,0
49888,"""نداء الى رئيس الجمهورية:",0


In [8]:
# Number of rows per label value after the sampled rows were removed.
intent_counts = data['intent'].value_counts()
print(intent_counts)

intent
1    10644
0    10082
Name: count, dtype: int64


In [9]:
# Settings shared by the encoding, dataset and training cells below.
MAX_LEN = 192       # length every encoded text is truncated/padded to
BATCH_SIZE = 32     # examples per batch in every tf.data pipeline
EPOCHS = 5          # training epochs per fold
# Passed to `prefetch` so tf.data picks the buffer size itself.
AUTO = tf.data.experimental.AUTOTUNE

In [10]:
def fast_encode(texts, tokenizer, chunk_size=256, maxlen=512):
    """Tokenize `texts` chunk by chunk and return all token ids as one array.

    Args:
        texts: Sliceable container of strings whose slices provide
            ``.tolist()`` (e.g. a NumPy array).
        tokenizer: Object offering ``enable_truncation``, ``enable_padding``
            and ``encode_batch``; each encoding must expose ``.ids``.
        chunk_size: Number of texts handed to ``encode_batch`` per call.
        maxlen: Length used for both truncation and padding.

    Returns:
        ``np.ndarray`` built from the list of per-text id lists.

    Note:
        Truncation and padding stay enabled on `tokenizer` after the call.
    """
    # Configure the tokenizer once; every encoding is then cut/padded to maxlen.
    tokenizer.enable_truncation(max_length=maxlen)
    tokenizer.enable_padding(length=maxlen)

    encoded_ids = []
    chunk_starts = range(0, len(texts), chunk_size)
    for start in tqdm(chunk_starts):
        batch = texts[start:start + chunk_size].tolist()
        encoded_ids += [encoding.ids for encoding in tokenizer.encode_batch(batch)]

    return np.array(encoded_ids)

In [11]:
def regular_encode(texts, tokenizer, maxlen=512):
    """Encode `texts` with a Hugging Face tokenizer into a 2-D id array.

    Every text is explicitly truncated and padded to `maxlen`, so all rows
    have the same length and ``np.array`` yields a rectangular array.

    Args:
        texts: Iterable of strings. Array-likes exposing ``.tolist()`` are
            converted to a plain list before being handed to the tokenizer.
        tokenizer: Object offering ``batch_encode_plus`` that returns a
            mapping with an ``'input_ids'`` entry.
        maxlen: Target length of every encoded sequence.

    Returns:
        ``np.ndarray`` built from the ``'input_ids'`` of the encoding.
    """
    # Hand the tokenizer a plain list rather than an ndarray / Series.
    text_list = texts.tolist() if hasattr(texts, 'tolist') else list(texts)

    enc_di = tokenizer.batch_encode_plus(
        text_list,
        # Singular keyword: the plural spelling used previously is not the
        # current parameter name.
        return_attention_mask=False,
        return_token_type_ids=False,
        # Explicit replacements for the deprecated `pad_to_max_length=True`.
        padding='max_length',
        truncation=True,
        max_length=maxlen
    )

    return np.array(enc_di['input_ids'])

In [12]:
#tokenizer = AutoTokenizer.from_pretrained(MODEL)

# Fetch the pretrained multilingual DistilBERT tokenizer.
pretrained_name = 'distilbert-base-multilingual-cased'
tokenizer = transformers.DistilBertTokenizer.from_pretrained(pretrained_name)

# Write the tokenizer files to the working directory; the `vocab.txt` read
# on the next line is expected to come from this call.
tokenizer.save_pretrained('.')

# Build the `tokenizers`-library WordPiece tokenizer from that vocabulary,
# keeping the original casing.
fast_tokenizer = BertWordPieceTokenizer('vocab.txt', lowercase=False)

In [13]:
def build_model(transformer, max_len=512):
    """Wrap `transformer` in a compiled binary classifier.

    The network takes a single int32 input named ``input_word_ids`` of shape
    ``(max_len,)``; no attention mask is fed to the transformer. A
    one-unit sigmoid layer is applied to position 0 of the transformer's
    first output.

    Args:
        transformer: Callable layer/model applied to the id tensor; element
            ``[0]`` of its result is indexed as ``[:, 0, :]``.
        max_len: Length of the input id sequence.

    Returns:
        A Keras ``Model`` compiled with Adam (learning rate 6e-6), binary
        cross-entropy loss, and the accuracy and AUC metrics.
    """
    token_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")

    # Use the state at sequence position 0 as the feature vector.
    hidden_states = transformer(token_ids)[0]
    first_token_state = hidden_states[:, 0, :]
    probability = Dense(1, activation='sigmoid')(first_token_state)

    classifier = Model(inputs=token_ids, outputs=probability)
    classifier.compile(
        Adam(learning_rate=6e-6),
        loss='binary_crossentropy',
        metrics=['accuracy', 'AUC'],
    )
    return classifier

In [14]:
# Confirm which columns the frame carries.
column_names = data.columns
print(column_names)

Index(['text', 'intent'], dtype='object')


In [15]:
# Labels, and the texts encoded as fixed-length rows of token ids.
ys = data['intent'].values
raw_texts = data['text'].values.astype(str)
texts = fast_encode(raw_texts, fast_tokenizer, maxlen=MAX_LEN)

  0%|          | 0/81 [00:00<?, ?it/s]

In [16]:
def create_train(x_train,y_train) :
    """Build the infinite, shuffled, batched training pipeline.

    The shuffle buffer spans the whole training set and is applied before
    `repeat`, so each pass is a full permutation of the examples. A small
    fixed buffer only mixes neighbouring rows, which yields near
    single-class batches when the input arrives sorted (e.g. by label).

    Args:
        x_train: Encoded inputs; must support ``len``.
        y_train: Labels aligned with `x_train`.

    Returns:
        A ``tf.data.Dataset`` that repeats indefinitely and yields batches
        of ``BATCH_SIZE`` ``(x, y)`` pairs; callers must pass
        ``steps_per_epoch`` to ``fit``.
    """
    # `shuffle` needs a positive buffer size, even for an empty input.
    buffer_size = max(len(x_train), 1)
    train_dataset = (
        tf.data.Dataset
        .from_tensor_slices((x_train, y_train))
        .shuffle(buffer_size)
        .repeat()
        .batch(BATCH_SIZE)
        .prefetch(AUTO)
    )
    return  train_dataset

def create_valid(x_valid, y_valid):
    """Build the validation pipeline: batched, cached and prefetched.

    Args:
        x_valid: Encoded validation inputs.
        y_valid: Labels aligned with `x_valid`.

    Returns:
        A ``tf.data.Dataset`` of ``(x, y)`` batches of size ``BATCH_SIZE``,
        in the original order (no shuffling, no repetition).
    """
    pairs = tf.data.Dataset.from_tensor_slices((x_valid, y_valid))
    batched = pairs.batch(BATCH_SIZE)
    return batched.cache().prefetch(AUTO)

def create_test(x_test):
    """Build the inference pipeline: inputs only, batched in order.

    Args:
        x_test: Encoded inputs to predict on.

    Returns:
        A ``tf.data.Dataset`` yielding batches of ``BATCH_SIZE`` inputs.
    """
    inputs_only = tf.data.Dataset.from_tensor_slices(x_test)
    return inputs_only.batch(BATCH_SIZE)

In [None]:
FOLDS = 5
SEED  = 42


def _history_key(history_dict, base):
    """Return the training-history key under which metric `base` was logged.

    Keras may append a numeric suffix to an auto-generated metric name
    (``auc_1``, ``auc_2``, ...) when several models are built in one
    session, so the key is looked up instead of being hard-coded.
    """
    for key in history_dict:
        if key.startswith('val_'):
            continue
        if key == base or key.startswith(base + '_'):
            return key
    raise KeyError('metric %r not found in history keys %r' % (base, list(history_dict)))


def _plot_fold_history(history, fold):
    """Plot train/validation AUC (left axis) and loss (right axis) for one fold."""
    logs = history.history
    auc_key = _history_key(logs, 'auc')
    train_auc, val_auc = logs[auc_key], logs['val_' + auc_key]
    train_loss, val_loss = logs['loss'], logs['val_loss']
    # Use the number of epochs actually recorded rather than assuming EPOCHS.
    epochs_run = np.arange(len(train_loss))

    fig, ax_auc = plt.subplots(figsize=(15, 5))
    ax_auc.plot(epochs_run, train_auc, '-o', label='Train AUC', color='#ff7f0e')
    ax_auc.plot(epochs_run, val_auc, '-o', label='Val AUC', color='#1f77b4')
    best_epoch, best_auc = np.argmax(val_auc), np.max(val_auc)
    xdist = ax_auc.get_xlim()[1] - ax_auc.get_xlim()[0]
    ydist = ax_auc.get_ylim()[1] - ax_auc.get_ylim()[0]
    ax_auc.scatter(best_epoch, best_auc, s=200, color='#1f77b4')
    ax_auc.text(best_epoch - 0.03 * xdist, best_auc - 0.13 * ydist,
                'max auc\n%.2f' % best_auc, size=14)
    ax_auc.set_ylabel('AUC', size=14)
    ax_auc.set_xlabel('Epoch', size=14)
    ax_auc.legend(loc=2)

    # Loss shares the x axis but gets its own y scale on the right.
    ax_loss = ax_auc.twinx()
    ax_loss.plot(epochs_run, train_loss, '-o', label='Train Loss', color='#2ca02c')
    ax_loss.plot(epochs_run, val_loss, '-o', label='Val Loss', color='#d62728')
    best_epoch, min_loss = np.argmin(val_loss), np.min(val_loss)
    ydist = ax_loss.get_ylim()[1] - ax_loss.get_ylim()[0]
    ax_loss.scatter(best_epoch, min_loss, s=200, color='#d62728')
    ax_loss.text(best_epoch - 0.03 * xdist, min_loss + 0.05 * ydist, 'min loss', size=14)
    ax_loss.set_ylabel('Loss', size=14)
    ax_loss.set_title('FOLD %i Distilbert-base-multilingual-cased' % (fold + 1), size=18)
    ax_loss.legend(loc=3)
    plt.show()
    plt.close(fig)


skf = KFold(n_splits=FOLDS,shuffle=True,random_state=SEED)

for fold,(train_indices,valid_indices) in enumerate(skf.split(texts,ys)) :
    print() ; print('#'*25)
    print('Fold' , fold+1)
    print('#'*25)

    # Drop the previous fold's graph/state before building a new model, so
    # models do not pile up in memory and auto-generated layer/metric names
    # start over for every fold.
    tf.keras.backend.clear_session()

    #transformer_layer = TFAutoModel.from_pretrained(MODEL)
    transformer_layer = (
        transformers.TFDistilBertModel
        .from_pretrained('distilbert-base-multilingual-cased')
    )
    model = build_model(transformer_layer, max_len=MAX_LEN)

    # Keep only the weights of the epoch with the lowest validation loss.
    # Checkpoint files are numbered from 0 (fold-0.h5 ... fold-4.h5).
    sv = tf.keras.callbacks.ModelCheckpoint(
        'fold-%i.h5'%fold, monitor='val_loss', verbose=0, save_best_only=True,
        save_weights_only=True, mode='min', save_freq='epoch')

    # The training dataset repeats forever, so the epoch length is set here.
    n_steps = train_indices.shape[0] // BATCH_SIZE
    history = model.fit(
        create_train(texts[train_indices],ys[train_indices]),
        steps_per_epoch=n_steps,
        validation_data=create_valid(texts[valid_indices],ys[valid_indices]),
        epochs=EPOCHS,
        callbacks=[sv]
    )

    _plot_fold_history(history, fold)


#########################
Fold 1
#########################


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertModel: ['vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight']
- This IS expected if you are initializing TFDistilBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


Epoch 1/5


# Testing :

In [None]:
# Load the test set and encode it with the same tokenizer settings and
# length as the training texts.
test = pd.read_csv('test_data.csv')
test_ys = test['intent'].values
raw_test_texts = test['text'].values.astype(str)
test_texts = fast_encode(raw_test_texts, fast_tokenizer, maxlen=MAX_LEN)

In [None]:
# Score the test set, then overwrite each probability in place with a hard
# label: 1 when strictly above 0.5, otherwise 0.
results = model.predict(create_test(test_texts))
results[:] = np.where(results > 0.5, 1, 0)

In [None]:
# Rows are the true labels, columns the predicted labels.
confusion_matrix(y_true=test_ys, y_pred=results)

In [None]:
# Per-class precision / recall / F1 on the test set.
report = classification_report(test_ys, results)
print(report)

In [None]:
# Spot-check the model on a few hand-written comments.
# NOTE(review): the id rows only share one length if truncation/padding is
# still enabled on `fast_tokenizer` from an earlier `fast_encode` call — confirm.
sample_texts = ['j adore ','khedmtkom behyaa barcha','service khayeb']
encs = fast_tokenizer.encode_batch(sample_texts)
all_ids = [enc.ids for enc in encs]

test_data = create_test(np.array(all_ids))
predictions = model.predict(test_data)

#print(predictions*10)
for prediction in predictions:
    print(prediction)

# Wrong predictions:

In [None]:
# Print every test text whose thresholded prediction differs from its label.
for i, (prediction, expected) in enumerate(zip(results, test_ys)):
    if prediction != expected:
        print(test['text'][i])

# Saving the model :

In [None]:
import pickle

# Save the model object to a file in the current working directory.
# NOTE(review): pickling a Keras model that wraps a transformers layer may
# not round-trip in every TensorFlow version — confirm, or rely on the
# per-fold weight checkpoints instead.
Pkl_Filename = "pickled_model.pkl"

with open(Pkl_Filename, 'wb') as pickle_file:
    pickle.dump(model, pickle_file)

# Semi-supervised pseudo-labelling:

In [None]:
# Additional comments that the trained model will score.
extra_data_path = '../input/vneuron/oreedoo_data.csv'
extra_data = pd.read_csv(extra_data_path)

In [None]:
# String form of every value in column `r`, in row order.
l = [str(extra_data['r'][i]) for i in range(len(extra_data))]

In [None]:
# Load the checkpoint saved for fold index 2 (the third fold, since the
# checkpoint files are numbered from 0) into the current model.
fold_weights = 'fold-2.h5'
model.load_weights(fold_weights)

In [None]:
# Encode the extra comments and score them with the loaded weights.
# NOTE(review): the id rows only share one length if truncation/padding is
# still enabled on `fast_tokenizer` from an earlier `fast_encode` call — confirm.
encs = fast_tokenizer.encode_batch(l)
all_ids = [enc.ids for enc in encs]

test_data = create_test(np.array(all_ids))
predictions = model.predict(test_data)

#print(predictions*10)
for prediction in predictions:
    print(prediction)

In [None]:
# Row positions the model is confident about: score above 0.8 is taken as
# label 1, score below 0.001 as label 0; everything in between is ignored.
to_use_label_1 = [i for i, prediction in enumerate(predictions) if prediction > 0.8]
to_use_label_0 = [i for i, prediction in enumerate(predictions) if prediction < 0.001]

In [None]:
# How many rows were selected for each pseudo-label.
n_positive = len(to_use_label_1)
n_negative = len(to_use_label_0)
print('Added positive labels :', n_positive)
print('Added negative labels :', n_negative)

In [None]:
# List the texts selected as pseudo-labelled positives.
print('Added Positive examples :')
for row in to_use_label_1:
    print(extra_data['r'][row])

In [None]:
# List the texts selected as pseudo-labelled negatives.
print('Added Negative examples :')
for row in to_use_label_0:
    print(extra_data['r'][row])