In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import random
import wordcloud
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import Sequential, Model
from tensorflow.keras.models import load_model
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, Dropout, Dense, Conv2D, MaxPooling2D, Reshape, Lambda, Permute
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.backend import clear_session
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight, compute_sample_weight
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

def clean_text(tweet):
    #remove emoji
    emoji = re.compile("["
                        u'\U0001F600-\U0001F64F'  # emoticons
                        u'\U0001F300-\U0001F5FF'  # symbols & pictographs
                        u'\U0001F680-\U0001F6FF'  # transport & map symbols
                        u'\U0001F1E0-\U0001F1FF'  # flags (iOS)
                        u'\U00002702-\U000027B0'
                        u'\U000024C2-\U0001F251'
                        "]+", flags=re.UNICODE)
    tweet = emoji.sub(r'', tweet)

    #remove punctuation
    punctuation = '''!\(\)-\[]\{};:'"\,<>./?@#$%^&*_~=+…''' + u'\u060C' + u'\u061B' + u'\u061F'
    for c in tweet:
        if c in punctuation:
            tweet = tweet.replace(c, '')

    #remove laughter
    laughter = re.compile(r'\b(ه|خ)\1{2,}\b')
    tweet = laughter.sub('', tweet)

    #remove nonsense
    nonsense = re.compile(r'\b(.)\1{2,}\b')
    tweet = nonsense.sub(r'', tweet)

    #normalize elongated words
    repeated_characters = re.compile(r'(.)\1{2,}')
    tweet = repeated_characters.sub(r'\1', tweet)

    return tweet

def ArabicToArabizi(tweet):
    arabicLetterList = ['ض','ص','ث','ق' ,'ف' ,
                                  'غ' ,'ع' ,'ه','خ' ,'ح' ,
                                  'ج' ,'د' ,'ش' ,'س' ,'ي' ,
                                  'ن','ت' ,'ا' ,'ل','ب',
                                  'ء','ئ','ط' ,'ك','م',
                                  'ؤ','ر','لا','ى','ة',
                                  'و','ز' ,'ظ','لأ','أ' ,
                             'إ', 'آ', ' ','ذ']
    
    arabiziMappingList = [['d'],['s'],['th'],[ '2'],['f'],
                      ['8'], [ '3'], ['h'], ['5'],['7'],
                      ['j'], ['d'], ['sh'], ['s'], ['y'],
                      ['n'], ['t'], ['a'], ['l'], ['b'],
                      ['2'],['2'], ['t'], ['k'], ['m'],
                      ['o2'], ['r'], ['la'], ['a'], ['a'],
                      ['o'], ['z'], ['z'], ['la2'], ['2'],
                      ['2'],['2'], [' '], ['z']]
    arabiziSentence =''
    arabicToArabiziMapping = dict()
    for x in range(0, len(arabicLetterList)):
        arabicToArabiziMapping[arabicLetterList[x]] = arabiziMappingList[x] 
    for x in range(0, len(tweet)):
        if tweet[x] in arabicToArabiziMapping.keys():
            if tweet[x] ==  'و':
                if x == 0:
                    arabiziSentence = arabiziSentence + 'w'
                elif tweet[x-1] ==' ':
                    arabiziSentence = arabiziSentence + 'w'
                else:
                    arabiziSentence = arabiziSentence + 'o'
            else :
                arabiziSentence = arabiziSentence + arabicToArabiziMapping[tweet[x]][random.randint(0,
                                                                 len(arabicToArabiziMapping[tweet[x]])-1)]
    arabiziSentence = arabiziSentence.replace( 'alozyr', 'alwazer')
    arabiziSentence = arabiziSentence.replace( 'hza', 'hayda')
    arabiziSentence = arabiziSentence.replace( 'hzh', 'hayde')
    arabiziSentence = arabiziSentence.replace( ' lao ', ' law ')
    arabiziSentence = arabiziSentence.replace( ' ho ', ' huwe ')
    arabiziSentence = arabiziSentence.replace( 'jbran ', ' gebran ')
    arabiziSentence = arabiziSentence.replace( 'qtr ', ' qatar ')
    arabiziSentence = arabiziSentence.replace( 'wlyd ', ' walid ')
    arabiziSentence = arabiziSentence.replace( 'al7ryry ', ' al hariri ')
    return arabiziSentence

In [0]:
url = 'https://raw.githubusercontent.com/Hala-Mulki/L-HSAB-First-Arabic-Levantine-HateSpeech-Dataset/master/Dataset/L-HSAB'
lhsab = pd.read_csv(url, sep='\t')

newArabiziDataset = dict()
arabiziTweets = []
arabiziClasses = []
count = 0
for tweet in lhsab['Tweet']:
    newTweet = ArabicToArabizi(clean_text(tweet))
    arabiziTweets.append(newTweet)
    arabiziClasses.append(lhsab['Class'][count])
    count = count + 1

newArabiziDataset['Tweet'] = arabiziTweets
newArabiziDataset['Class'] = arabiziClasses
data = pd.DataFrame(newArabiziDataset, columns= ['Tweet', 'Class'])

In [0]:
# ! git clone https://github.com/NVIDIA/apex
# ! cd apex
# ! pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" /kaggle/working/apex/

In [0]:
!pip install http://download.pytorch.org/whl/cu80/torch-0.3.0.post4-cp36-cp36m-linux_x86_64.whl 


Collecting torch==0.3.0.post4
[?25l  Downloading http://download.pytorch.org/whl/cu80/torch-0.3.0.post4-cp36-cp36m-linux_x86_64.whl (592.3MB)
[K     |████████████████████████████████| 592.3MB 1.2MB/s 
[31mERROR: torchvision 0.5.0 has requirement torch==1.4.0, but you'll have torch 0.3.0.post4 which is incompatible.[0m
[31mERROR: fastai 1.0.60 has requirement torch>=1.0.0, but you'll have torch 0.3.0.post4 which is incompatible.[0m
Installing collected packages: torch
  Found existing installation: torch 1.4.0
    Uninstalling torch-1.4.0:
      Successfully uninstalled torch-1.4.0
Successfully installed torch-0.3.0.post4


In [0]:
!pip install torchvision

Collecting torch==1.4.0
[?25l  Downloading https://files.pythonhosted.org/packages/24/19/4804aea17cd136f1705a5e98a00618cb8f6ccc375ad8bfa437408e09d058/torch-1.4.0-cp36-cp36m-manylinux1_x86_64.whl (753.4MB)
[K     |████████████████████████████████| 753.4MB 21kB/s 
Installing collected packages: torch
  Found existing installation: torch 0.3.0.post4
    Uninstalling torch-0.3.0.post4:
      Successfully uninstalled torch-0.3.0.post4
Successfully installed torch-1.4.0


In [0]:
# !pip install http://download.pytorch.org/whl/torch-0.2.0.post1-cp27-none-macosx_10_7_x86_64.whl 

In [0]:
use_cuda=True
import torch
if use_cuda and torch.cuda.is_available():
  # net.cuda()
  torch.cuda.set_device(0)

In [0]:
!pip install simpletransformers


Collecting simpletransformers
[?25l  Downloading https://files.pythonhosted.org/packages/36/fe/63b7df08f0412e3462332145009ddb303903fc3c6a15ec511c365c4c5bce/simpletransformers-0.26.0-py3-none-any.whl (159kB)
[K     |██                              | 10kB 24.3MB/s eta 0:00:01[K     |████▏                           | 20kB 28.3MB/s eta 0:00:01[K     |██████▏                         | 30kB 30.3MB/s eta 0:00:01[K     |████████▎                       | 40kB 34.0MB/s eta 0:00:01[K     |██████████▎                     | 51kB 34.8MB/s eta 0:00:01[K     |████████████▍                   | 61kB 36.3MB/s eta 0:00:01[K     |██████████████▍                 | 71kB 36.6MB/s eta 0:00:01[K     |████████████████▌               | 81kB 37.0MB/s eta 0:00:01[K     |██████████████████▌             | 92kB 37.9MB/s eta 0:00:01[K     |████████████████████▋           | 102kB 38.9MB/s eta 0:00:01[K     |██████████████████████▋         | 112kB 38.9MB/s eta 0:00:01[K     |█████████████████████

In [0]:
from simpletransformers.classification import ClassificationModel


In [0]:
newDataSet=data
train_df=newDataSet[:]

In [0]:
train_df.head()

Unnamed: 0,Tweet,Class
0,alwazer gebran basyl taj rask ya jrban mmno3 ...,abusive
1,sdy2y ant abn jam3h all3bh akbr mn da3sh all3b...,normal
2,w msl7a lbnan tbd2 bast5raj alnft w al8az lo2f...,normal
3,walid jnblat katb al7kma ya 2zr,abusive
4,sho btlb2lk klma 5nzyr btjy mfslh 3la 2yask ws...,abusive


In [0]:

# unique categories
print(train_df.Class.unique())
print("Total categories",len(train_df.Class.unique()))


['abusive' 'normal' 'hate']
Total categories 3


In [0]:
# convert string labels to integers

train_df['labels'] = pd.factorize(train_df.Class)[0]

train_df.head()
# train_df = train_df.drop('Class', axis=1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,Tweet,Class,labels
0,alwazer gebran basyl taj rask ya jrban mmno3 ...,abusive,0
1,sdy2y ant abn jam3h all3bh akbr mn da3sh all3b...,normal,1
2,w msl7a lbnan tbd2 bast5raj alnft w al8az lo2f...,normal,1
3,walid jnblat katb al7kma ya 2zr,abusive,0
4,sho btlb2lk klma 5nzyr btjy mfslh 3la 2yask ws...,abusive,0


In [0]:
train_df.head()

Unnamed: 0,Tweet,Class,labels
0,alwazer gebran basyl taj rask ya jrban mmno3 ...,abusive,0
1,sdy2y ant abn jam3h all3bh akbr mn da3sh all3b...,normal,1
2,w msl7a lbnan tbd2 bast5raj alnft w al8az lo2f...,normal,1
3,walid jnblat katb al7kma ya 2zr,abusive,0
4,sho btlb2lk klma 5nzyr btjy mfslh 3la 2yask ws...,abusive,0


In [0]:
train_df = train_df.drop('Class', axis=1)


In [0]:
# Let's create a train and test set
from sklearn.model_selection import train_test_split


train, test = train_test_split(train_df, test_size=0.2, random_state=42,stratify=train_df['labels'])

In [0]:
train.shape, test.shape


((4676, 2), (1170, 2))

In [0]:
print(df_train['labels'])
x=0
y=0
z=0
for i in df_train['labels']:
  if i==0:
    x+=1
  elif i==1:
    y+=1
  elif i==2:
    z+=1

3484    0
4145    0
1307    1
5000    0
1318    1
       ..
193     0
78      0
4403    1
3117    2
970     0
Name: labels, Length: 4676, dtype: int64


In [0]:
print(x)
print(y)
print(z)
print(x+y+z)

2920
1382
374
4676


In [0]:
# Lets define the model with the parameters (important here is the number of labels and nr of epochs)
#,weight=[2.07,1, 7.95]
model = ClassificationModel('bert', 'bert-base-multilingual-uncased', num_labels=3,weight=[1,2.11, 7.81], args={'reprocess_input_data': True, 'overwrite_output_dir': True, 'num_train_epochs': 10},use_cuda=True)

HBox(children=(IntProgress(value=0, description='Downloading', max=625, style=ProgressStyle(description_width=…




HBox(children=(IntProgress(value=0, description='Downloading', max=672271273, style=ProgressStyle(description_…




HBox(children=(IntProgress(value=0, description='Downloading', max=871891, style=ProgressStyle(description_wid…




In [0]:
help(ClassificationModel)

In [0]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2019 NVIDIA Corporation
Built on Sun_Jul_28_19:07:16_PDT_2019
Cuda compilation tools, release 10.1, V10.1.243


In [0]:
%%writefile setup.sh

export CUDA_HOME=/usr/local/cuda-10.1
git clone https://github.com/NVIDIA/apex
pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./apex

Writing setup.sh


In [0]:
!sh setup.sh


Cloning into 'apex'...
remote: Enumerating objects: 4, done.[K
remote: Counting objects: 100% (4/4), done.[K
remote: Compressing objects: 100% (4/4), done.[K
remote: Total 6593 (delta 0), reused 0 (delta 0), pack-reused 6589[K
Receiving objects: 100% (6593/6593), 13.71 MiB | 22.56 MiB/s, done.
Resolving deltas: 100% (4384/4384), done.
  cmdoptions.check_install_build_global(options)
Created temporary directory: /tmp/pip-ephem-wheel-cache-w_of0oc0
Created temporary directory: /tmp/pip-req-tracker-jq6dc_g6
Created requirements tracker '/tmp/pip-req-tracker-jq6dc_g6'
Created temporary directory: /tmp/pip-install-9aip3_r9
Processing ./apex
  Created temporary directory: /tmp/pip-req-build-d_xptsyr
  Added file:///content/apex to build tracker '/tmp/pip-req-tracker-jq6dc_g6'
    Running setup.py (path:/tmp/pip-req-build-d_xptsyr/setup.py) egg_info for package from file:///content/apex
    Running command python setup.py egg_info
    torch.__version__  =  1.4.0
    running egg_info
    c

In [0]:
# Now lets fine tune bert with the train set
model.train_model(df_train)


  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


HBox(children=(IntProgress(value=0, max=4676), HTML(value='')))


Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic


HBox(children=(IntProgress(value=0, description='Epoch', max=10, style=ProgressStyle(description_width='initia…

HBox(children=(IntProgress(value=0, description='Current iteration', max=585, style=ProgressStyle(description_…

Running loss: 1.078962



Running loss: 1.126312



Running loss: 1.178848Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 32768.0
Running loss: 1.022369Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 16384.0
Running loss: 1.151309Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8192.0
Running loss: 1.345507


HBox(children=(IntProgress(value=0, description='Current iteration', max=585, style=ProgressStyle(description_…

Running loss: 1.309986


HBox(children=(IntProgress(value=0, description='Current iteration', max=585, style=ProgressStyle(description_…

Running loss: 1.045016


HBox(children=(IntProgress(value=0, description='Current iteration', max=585, style=ProgressStyle(description_…

Running loss: 0.967643Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8192.0
Running loss: 0.792973


HBox(children=(IntProgress(value=0, description='Current iteration', max=585, style=ProgressStyle(description_…

Running loss: 1.119496Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 4096.0
Running loss: 0.274427


HBox(children=(IntProgress(value=0, description='Current iteration', max=585, style=ProgressStyle(description_…

Running loss: 1.283328


HBox(children=(IntProgress(value=0, description='Current iteration', max=585, style=ProgressStyle(description_…

Running loss: 1.284270Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 2048.0
Running loss: 0.372649


HBox(children=(IntProgress(value=0, description='Current iteration', max=585, style=ProgressStyle(description_…

Running loss: 1.788269


HBox(children=(IntProgress(value=0, description='Current iteration', max=585, style=ProgressStyle(description_…

Running loss: 0.116696


HBox(children=(IntProgress(value=0, description='Current iteration', max=585, style=ProgressStyle(description_…

Running loss: 1.348869Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 2048.0
Running loss: 0.072950



In [0]:
print(t mmest)skskmskmddddddkndknd

                                                  Tweet  labels
3075                                 ys3d sba7k ya rys        1
800                       hayde hy 25la2 alkyny al72y2y       1
383                           22tr7 3lyk an tsdo lbozk        0
3761                                       br8ot 2oatjy       2
1036                                   kol hoa a7la shy       0
...                                                 ...     ...
30    hyda 7sab f5ama alr2ys myshal 3on 2m 7sab f5am...       1
4920  kol hoa wanz7 la wra hydol mbt3that 3 7sab ald...       0
3981  ant lo btsd ny3k sho bkon a7snlk ya lbnany ba ...       2
78                                   yr7mha wysbr ahlha       1
465   hayda ykrh als3odya wdaym ysbha bs 7alya y7aol...       1

[1170 rows x 2 columns]


In [0]:
# from sklearn.metrics import accuracy_score

result, model_outputs, wrong_predictions = model.eval_model(df)


  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


HBox(children=(IntProgress(value=0, max=1170), HTML(value='')))




HBox(children=(IntProgress(value=0, max=147), HTML(value='')))




In [0]:
print(result)

{'mcc': 0.49160607961527053, 'eval_loss': 1.614432852297109}


In [0]:
predictions = model_outputs.argmax(axis=1)


In [0]:
print(predictions.shape)
predictions

(1170,)


array([0, 0, 0, ..., 0, 1, 1])

In [0]:
predictions[:10]

array([0, 0, 0, 0, 1, 0, 0, 1, 2, 0])

In [0]:
actuals = df.labels.values
actuals[0:10]

array([0, 0, 0, 0, 0, 0, 1, 1, 0, 2])

In [0]:
for i in df['Tweet']:
  print (i)

akyd msh fadya bdk tkby sm byn al3alm
zmn altb3ya walt3tylytjsd fy syasa  gebran basyl
b2s zmn t2ym fyh 8ada 3yd  gebran basyl
2olkm  gebran basyl by2ra alt3ly2at ally btnzl mtl alshty 3lyh kl ma at7fna b2syda 3n sorya soal wjyh
ya 3m allh ya5dk
tyb mashy fy shy 8yro almrj3 alkbyr 2dysh kbyr y3ny fyk t7jm 
bt2sod mmarsat mthl ma 2al al7mar wzyr al5arjya  gebran basyl wallh bhdl 7alo wbhda lbnan 3la al
ya hbylh bt3rf 7alk mny7 btl hbl 
7koma al7mdyn w2df lhm wzyr 5arjya lbnan  gebran basyl klhm kzabyn
znjy 72yr y7sb nzam al3sabat yntb2 3la kl shy2
aljhabza ally 3m ytalbo bt2dym d3oa whyda msh a5bar ya btary2 3thman
lo btraj3 m2ablatk tsry7atk a2oalk ktabatk tra ank 2kthr ansan tthm alkl thajm alkl wtshtm alkl 7sb alt2  
s3b tshr7 loa7d lysh f3lya lazm yhtm bba2y alnas wykon 3ndh w who tys
lbrnamj 7oar alyom kant klma alwazer  gebran basyl wad7a wd2y2a balnsba lmodo3 alnzo7 alsory wntmna 3la aldol al3rbya mshark
tl7s ma a7mrk 
llasf al72y2a tjr7 ys3d sba7k 
ya tra mn mn alnzam al3rbya al

In [30]:
for i in predictions:
  if i==0:
    print("normal")
  elif i==1:
    print("abusive")
  elif i==2:
    print("hate")

normal
normal
normal
normal
abusive
normal
normal
abusive
hate
normal
abusive
normal
normal
normal
normal
normal
normal
abusive
abusive
normal
abusive
abusive
abusive
normal
normal
abusive
normal
abusive
normal
abusive
abusive
abusive
normal
normal
normal
abusive
abusive
normal
abusive
normal
abusive
normal
normal
normal
normal
abusive
normal
abusive
abusive
normal
normal
abusive
normal
abusive
normal
normal
abusive
abusive
abusive
normal
normal
abusive
abusive
normal
normal
normal
normal
normal
abusive
abusive
abusive
abusive
normal
normal
normal
normal
normal
normal
abusive
normal
normal
normal
normal
abusive
normal
normal
normal
normal
hate
normal
hate
normal
normal
abusive
abusive
normal
abusive
normal
normal
normal
normal
abusive
normal
normal
abusive
normal
normal
abusive
abusive
abusive
normal
normal
abusive
normal
normal
normal
normal
normal
abusive
normal
normal
normal
normal
hate
abusive
normal
normal
abusive
normal
normal
abusive
normal
abusive
normal
abusive
abusive
abusive

In [0]:
from sklearn.metrics import accuracy_score
accuracy_score(actuals, predictions)

0.7316239316239316

In [0]:
from sklearn.metrics import classification_report

print(classification_report(actuals, predictions))

              precision    recall  f1-score   support

           0       0.84      0.81      0.82       730
           1       0.59      0.70      0.64       346
           2       0.40      0.22      0.29        94

    accuracy                           0.73      1170
   macro avg       0.61      0.58      0.58      1170
weighted avg       0.73      0.73      0.73      1170



In [0]:
sample_text = test.iloc[5]['Tweet']
print(sample_text)

 gebran basyl zahra sotya yom b2ol sorya a7tlal w yom bdh yshark b23ada a3marha 5tab toyl 3ryd bd3m 3oda sorya lljam3a


In [0]:
# Lets predict the text of sample_text:
model.predict(sample_text)

HBox(children=(IntProgress(value=0, max=118), HTML(value='')))




HBox(children=(IntProgress(value=0, max=15), HTML(value='')))




(array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1]),
 array([[-0.90771484,  2.8359375 , -2.3007812 ],
        [-1.0917969 ,  2.9785156 , -2.2832031 ],
        [-0.8989258 ,  2.8964844 , -2.4355469 ],
        [-0.8120117 ,  2.8066406 , -2.4453125 ],
        [-0.9350586 ,  2.9199219 , -2.3984375 ],
        [-0.83740234,  2.8710938 , -2.4550781 ],
        [-0.88623047,  2.9003906 , -2.4316406 ],
        [-0.90771484,  2.8359375 , -2.3007812 ],
        [-0.8120117 ,  2.8066406 , -2.4453125 ],
        [-0.83740234,  2.8710938 , -2.4550781 ],
        [-0.42895508,  2.6855469 , -2.7246094 ],
        [-1.0634766 ,  2.9609375 , -2.3027344 ],
        

In [0]:
# Lets see what the truth was
test.iloc[:]['labels']

5688    1
3305    1
2318    0
5526    1
681     1
       ..
3259    1
5714    1
3375    0
113     2
4958    1
Name: labels, Length: 1170, dtype: int64

In [0]:
# And this was category: 
# test.iloc[797]['Class']

In [0]:
x=7

In [0]:
sample_text = test.iloc[x]['Tweet']
print(sample_text)
# Lets predict the text of sample_text:
model.predict([sample_text])


anjaz jdyd ll3hd gebran basyl


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1), HTML(value='')))




(array([1]), array([[-0.69433594,  2.8105469 , -2.5722656 ]], dtype=float32))

In [0]:
# Lets see what the truth was
test.iloc[x]['labels']


1

In [0]:
model.save_model('bertmodel.h5')

AttributeError: ignored

In [0]:
import numpy as np
import pandas as pd
import re
import random
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import Sequential, Model
from tensorflow.keras.models import load_model
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, Dropout, Dense, Conv2D, MaxPooling2D, Reshape, Lambda, Permute
from tensorflow.keras.backend import clear_session
from tensorflow.keras import regularizers
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

def clean_text(tweet):
    #remove emoji
    emoji = re.compile("["
                        u'\U0001F600-\U0001F64F'  # emoticons
                        u'\U0001F300-\U0001F5FF'  # symbols & pictographs
                        u'\U0001F680-\U0001F6FF'  # transport & map symbols
                        u'\U0001F1E0-\U0001F1FF'  # flags (iOS)
                        u'\U00002702-\U000027B0'
                        u'\U000024C2-\U0001F251'
                        "]+", flags=re.UNICODE)
    tweet = emoji.sub(r'', tweet)

    #remove punctuation
    punctuation = '''!\(\)-\[]\{};:'"\,<>./?@#$%^&*_~=+…''' + u'\u060C' + u'\u061B' + u'\u061F'
    for c in tweet:
        if c in punctuation:
            tweet = tweet.replace(c, '')

    #remove laughter
    laughter = re.compile(r'\b(ه|خ)\1{2,}\b')
    tweet = laughter.sub('', tweet)

    #remove nonsense
    nonsense = re.compile(r'\b(.)\1{2,}\b')
    tweet = nonsense.sub(r'', tweet)

    #normalize elongated words
    repeated_characters = re.compile(r'(.)\1{2,}')
    tweet = repeated_characters.sub(r'\1', tweet)

    return tweet

def ArabicToArabizi(tweet):
    arabicLetterList = ['ض','ص','ث','ق' ,'ف' ,
                                  'غ' ,'ع' ,'ه','خ' ,'ح' ,
                                  'ج' ,'د' ,'ش' ,'س' ,'ي' ,
                                  'ن','ت' ,'ا' ,'ل','ب',
                                  'ء','ئ','ط' ,'ك','م',
                                  'ؤ','ر','لا','ى','ة',
                                  'و','ز' ,'ظ','لأ','أ' ,
                             'إ', 'آ', ' ','ذ']
    
    arabiziMappingList = [['d'],['s'],['th'],[ '2'],['f'],
                      ['8'], [ '3'], ['h'], ['5'],['7'],
                      ['j'], ['d'], ['sh'], ['s'], ['y'],
                      ['n'], ['t'], ['a'], ['l'], ['b'],
                      ['2'],['2'], ['t'], ['k'], ['m'],
                      ['o2'], ['r'], ['la'], ['a'], ['a'],
                      ['o'], ['z'], ['z'], ['la2'], ['2'],
                      ['2'],['2'], [' '], ['z']]
    arabiziSentence =''
    arabicToArabiziMapping = dict()
    for x in range(0, len(arabicLetterList)):
        arabicToArabiziMapping[arabicLetterList[x]] = arabiziMappingList[x] 
    for x in range(0, len(tweet)):
        if tweet[x] in arabicToArabiziMapping.keys():
            if tweet[x] ==  'و':
                if x == 0:
                    arabiziSentence = arabiziSentence + 'w'
                elif tweet[x-1] ==' ':
                    arabiziSentence = arabiziSentence + 'w'
                else:
                    arabiziSentence = arabiziSentence + 'o'
            else :
                arabiziSentence = arabiziSentence + arabicToArabiziMapping[tweet[x]][random.randint(0,
                                                                 len(arabicToArabiziMapping[tweet[x]])-1)]
    arabiziSentence = arabiziSentence.replace( 'alozyr', 'alwazer')
    arabiziSentence = arabiziSentence.replace( 'hza', 'hayda')
    arabiziSentence = arabiziSentence.replace( 'hzh', 'hayde')
    arabiziSentence = arabiziSentence.replace( ' lao ', ' law ')
    arabiziSentence = arabiziSentence.replace( ' ho ', ' huwe ')
    arabiziSentence = arabiziSentence.replace( 'jbran ', ' gebran ')
    arabiziSentence = arabiziSentence.replace( 'qtr ', ' qatar ')
    arabiziSentence = arabiziSentence.replace( 'wlyd ', ' walid ')
    arabiziSentence = arabiziSentence.replace( 'al7ryry ', ' al hariri ')
    return arabiziSentence

In [0]:
url = 'https://raw.githubusercontent.com/Hala-Mulki/L-HSAB-First-Arabic-Levantine-HateSpeech-Dataset/master/Dataset/L-HSAB'
lhsab = pd.read_csv(url, sep='\t')

newArabiziDataset = dict()
arabiziTweets = []
arabiziClasses = []
count = 0
for tweet in lhsab['Tweet']:
    newTweet = ArabicToArabizi(clean_text(tweet))
    arabiziTweets.append(newTweet)
    arabiziClasses.append(lhsab['Class'][count])
    count = count + 1

newArabiziDataset['Tweet'] = arabiziTweets
newArabiziDataset['Class'] = arabiziClasses
data = pd.DataFrame(newArabiziDataset, columns= ['Tweet', 'Class'])

MAX_NB_WORDS = 50000
MAX_SEQUENCE_LENGTH = 280

tokenizer = Tokenizer(num_words=MAX_NB_WORDS, lower=True)
tokenizer.fit_on_texts(data['Tweet'].values)
VOCAB_SIZE = len(tokenizer.word_index) + 1
word_list = list(tokenizer.word_index.keys())

X = tokenizer.texts_to_sequences(data['Tweet'].values)
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)

y = pd.get_dummies(data['Class'].values)
label_names = y.columns

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y.values)

In [0]:
df = data.iloc[y_test.index]

In [0]:
df_train=data.iloc[y_train.index]

In [0]:
print(df_train)

                                                  Tweet    Class
3484  ho ma 2al asra2yl lha 72 bal2man myn 3m ytrjml...   normal
4145                                             2dyma    normal
1307     a7snt 7sarh balzaoya kf2r jrban 3zra mn alf2r   abusive
5000  3ml toyt 3m ys2l 3n jdoa shytna  gebran basyl ...   normal
1318                              w sayr mhzlh halm8rd   abusive
...                                                 ...      ...
193   anh ant bt7ky lt7kymyn jab syra alhbhkl alsh3b...   normal
78                                   yr7mha wysbr ahlha   normal
4403            m3aly alwazer a3ml blok lhayda al7yoan   abusive
3117                        tyar almsthbl y7kmh al7myry     hate
970                 wd7 myn hny alshrkat wmyn 2s7abha     normal

[4676 rows x 2 columns]


In [0]:
df_train['labels'] = pd.factorize(df_train.Class)[0]
df_train = df_train.drop('Class', axis=1)
df_train.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,Tweet,labels
3484,ho ma 2al asra2yl lha 72 bal2man myn 3m ytrjml...,0
4145,2dyma,0
1307,a7snt 7sarh balzaoya kf2r jrban 3zra mn alf2r,1
5000,3ml toyt 3m ys2l 3n jdoa shytna gebran basyl ...,0
1318,w sayr mhzlh halm8rd,1


In [0]:
print(df.Class)

1918     normal
5802     normal
4412     normal
1228     normal
4058     normal
         ...   
1283     normal
4192     normal
687      normal
1441    abusive
4197       hate
Name: Class, Length: 1170, dtype: object


In [0]:


df['labels1'] = pd.factorize(df.Class)[0]




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [0]:
help(pd.factorize)

In [0]:
df['labels']=

In [0]:
print(df)


                                                  Tweet    Class
1918              akyd msh fadya bdk tkby sm byn al3alm   normal
5802   zmn altb3ya walt3tylytjsd fy syasa  gebran basyl   normal
4412            b2s zmn t2ym fyh 8ada 3yd  gebran basyl   normal
1228  2olkm  gebran basyl by2ra alt3ly2at ally btnzl...   normal
4058                                   ya 3m allh ya5dk   normal
...                                                 ...      ...
1283  w 7drtk trddy nfs al2d3a2at alnashtat alm3t2la...   normal
4192  25 ya  gebran basyl sho 2ahr 3alm ant m3lm mna...   normal
687   5ryta btod7 toz3 al2oa3d al3skrya al2jnbya dmn...   normal
1441  2d7 wjm tfh 3lyk w3la brnamjk w3la alm7ta ly 3...  abusive
4197                  ashm ry7t zbalh ymkn sny 72yr hna     hate

[1170 rows x 2 columns]


In [0]:
print(df['labels'])

1918    0
5802    0
4412    0
1228    0
4058    0
       ..
1283    0
4192    0
687     0
1441    1
4197    2
Name: labels, Length: 1170, dtype: int64


In [0]:
myarray=[]
for label in df['labels1']:
  if label==0:
    myarray.append(1)
  elif label==1:
    myarray.append(0)
  elif label==2:
    myarray.append(2)


In [0]:
print(myarray)

[1, 1, 1, 1, 1, 1, 0, 0, 1, 2, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 2, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 2, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 2, 1, 1, 1, 1, 1, 1, 0, 1, 2, 2, 1, 0, 1, 0, 2, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 2, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 2, 0, 2, 0, 2, 0, 1, 1, 1, 2, 1, 0, 2, 0, 2, 2, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 2, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 2, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 2, 0, 1, 1, 0, 1, 1, 1, 2, 0, 2, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 2, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 2, 1, 1, 2, 0, 0, 1, 1, 1, 0, 1, 2, 1, 1, 1, 2, 1, 1, 1, 2, 1, 0, 0, 2, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 2, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 

In [0]:
df['labels']=myarray

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [0]:
print(df)

                                                  Tweet  labels
1918              akyd msh fadya bdk tkby sm byn al3alm       1
5802   zmn altb3ya walt3tylytjsd fy syasa  gebran basyl       1
4412            b2s zmn t2ym fyh 8ada 3yd  gebran basyl       1
1228  2olkm  gebran basyl by2ra alt3ly2at ally btnzl...       1
4058                                   ya 3m allh ya5dk       1
...                                                 ...     ...
1283  w 7drtk trddy nfs al2d3a2at alnashtat alm3t2la...       1
4192  25 ya  gebran basyl sho 2ahr 3alm ant m3lm mna...       1
687   5ryta btod7 toz3 al2oa3d al3skrya al2jnbya dmn...       1
1441  2d7 wjm tfh 3lyk w3la brnamjk w3la alm7ta ly 3...       0
4197                  ashm ry7t zbalh ymkn sny 72yr hna       2

[1170 rows x 2 columns]


In [0]:
df = df.drop('labels1', axis=1)

In [0]:
print(df)

                                                  Tweet  labels
1918              akyd msh fadya bdk tkby sm byn al3alm       0
5802   zmn altb3ya walt3tylytjsd fy syasa  gebran basyl       0
4412            b2s zmn t2ym fyh 8ada 3yd  gebran basyl       0
1228  2olkm  gebran basyl by2ra alt3ly2at ally btnzl...       0
4058                                   ya 3m allh ya5dk       0
...                                                 ...     ...
1283  w 7drtk trddy nfs al2d3a2at alnashtat alm3t2la...       0
4192  25 ya  gebran basyl sho 2ahr 3alm ant m3lm mna...       0
687   5ryta btod7 toz3 al2oa3d al3skrya al2jnbya dmn...       0
1441  2d7 wjm tfh 3lyk w3la brnamjk w3la alm7ta ly 3...       1
4197                  ashm ry7t zbalh ymkn sny 72yr hna       2

[1170 rows x 2 columns]


In [0]:

df['labels'] = pd.factorize(df.Class)[0]
df = df.drop('Class', axis=1)
df.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,Tweet,labels
1918,akyd msh fadya bdk tkby sm byn al3alm,0
5802,zmn altb3ya walt3tylytjsd fy syasa gebran basyl,0
4412,b2s zmn t2ym fyh 8ada 3yd gebran basyl,0
1228,2olkm gebran basyl by2ra alt3ly2at ally btnzl...,0
4058,ya 3m allh ya5dk,0


In [0]:
df.head(:15)

Unnamed: 0,Tweet,labels
1918,akyd msh fadya bdk tkby sm byn al3alm,0
5802,zmn altb3ya walt3tylytjsd fy syasa gebran basyl,0
4412,b2s zmn t2ym fyh 8ada 3yd gebran basyl,0
1228,2olkm gebran basyl by2ra alt3ly2at ally btnzl...,0
4058,ya 3m allh ya5dk,0
3986,tyb mashy fy shy 8yro almrj3 alkbyr 2dysh kbyr...,0
3102,bt2sod mmarsat mthl ma 2al al7mar wzyr al5arjy...,1
4062,ya hbylh bt3rf 7alk mny7 btl hbl,1
4414,7koma al7mdyn w2df lhm wzyr 5arjya lbnan gebr...,0
5560,znjy 72yr y7sb nzam al3sabat yntb2 3la kl shy2,2
