In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
from nltk.corpus import stopwords
from nltk.stem.isri import ISRIStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
import seaborn as sns
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import re

In [2]:
train_pos=pd.read_csv('train_Arabic_tweets_positive_20190413.tsv',delimiter='\t',names=['label','tweet'])
train_neg=pd.read_csv('train_Arabic_tweets_negative_20190413.tsv',delimiter='\t',names=['label','tweet'])

test_pos=pd.read_csv('test_Arabic_tweets_positive_20190413.tsv',delimiter='\t',names=['label','tweet'])
test_neg=pd.read_csv('test_Arabic_tweets_negative_20190413.tsv',delimiter='\t',names=['label','tweet'])

In [4]:
import nltk

In [5]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [6]:
stopwords_list = stopwords.words('arabic')
st = ISRIStemmer()
stop=[]
for w in stopwords_list:
    rootWord=st.stem(w)
    stop.append(rootWord)

In [7]:
train=pd.concat([train_neg,train_pos],axis=0,ignore_index=True)

test=pd.concat([test_neg,test_pos],axis=0)
s=train['tweet'].str.len()
train_neg=[]
train_pos=[]
test_pos=[]
test_neg=[]

In [8]:
def process_text(text):
    stemmer = nltk.ISRIStemmer()
    word_list = nltk.word_tokenize(text)
    #remove arabic stopwords
    word_list = [ w for w in word_list if not w in stopwords_list ]
    #remove digits
    word_list = [ w for w in word_list ]
    #stemming
    word_list = [stemmer.stem(w) for w in  word_list]
    return ' '.join(word_list) 


def clean_text(text):  

    search = ["أ","إ","آ","ة","_","-","/",".","،"," و "," يا ",'"',"ـ","'","ى",
              "\\",'\n', '\t','&quot;','?','؟','!']
    replace = ["ا","ا","ا","ه"," "," ","","",""," و"," يا",
               "","","","ي","",' ', ' ',' ',' ? ',' ؟ ', ' ! ']
    #remove tashkeel
    tashkeel = re.compile(r'[\u0617-\u061A\u064B-\u0652]')
    text = re.sub(tashkeel,"", text)
  
    longation = re.compile(r'(.)\1+')
    subst = r"\1\1"
    text = re.sub(longation, subst, text)
    
    text = re.sub(r"[^\w\s]", '', text)
    #remove english words
    text = re.sub(r"[a-zA-Z]", '', text)
    #remove spaces
    text = re.sub(r"\d+", ' ', text)
    text = re.sub(r"\n+", ' ', text)
    text = re.sub(r"\t+", ' ', text)
    text = re.sub(r"\r+", ' ', text)
    text = re.sub(r"\s+", ' ', text)
    #remove repetetions
    text = text.replace('وو', 'و')
    text = text.replace('يي', 'ي')
    text = text.replace('اا', 'ا')
    
    for i in range(0, len(search)):
        text = text.replace(search[i], replace[i])
    
    text = text.strip()
    
    return process_text(text) 

In [10]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [11]:
train['tweet2']=train.tweet.apply(clean_text)
test['tweet2']=test.tweet.apply(clean_text)

In [12]:
train

Unnamed: 0,label,tweet,tweet2
0,neg,اعترف ان بتس كانو شوي شوي يجيبو راسي لكن اليوم...,عرف ان بتس كنو شوي شوي جيبو رسي اليوم زيد
1,neg,توقعت اذا جات داريا بشوفهم كاملين بس لي للحين ...,وقع اذا جات دار بشف كمل حين احس احد نقص
2,neg,#الاهلي_الهلال اكتب توقعك لنتيجة لقاء الهلال و...,اهل هلل كتب وقع نتج لقء هلل اهل تاق تحد سرع رو...
3,neg,نعمة المضادات الحيوية . تضع قطرة💧مضاد بنسلين ع...,نعم ضاد حيي تضع قطرهمضاد نسل علي كتر فجر تمو ا...
4,neg,الدودو جايه تكمل علي 💔,دودو جيه كمل علي
...,...,...,...
45270,pos,السحب الليلة على الايفون .. رتويت للمرفقة وطبق...,سحب ليل علي ايف رتي رفق طبق شرط
45271,pos,😂 لابسة احمر ليه يا ست انتي ايه المناسبة 😂,لبس حمر ليه يست انت ايه نسب
45272,pos,كلاام جمييل تستاهل(من احبه الله جعل محبته ف قل...,كلم جمل تستاهلمن احب الل حبت قلب بشر
45273,pos,- ألطف صورة ممكن تعبر عن رمضان 💙,لطف صور مكن عبر رمض


In [13]:
X_train=train.tweet
X_test=test.tweet
y_train=train['label']
y_test=test['label']
train=[]

X_train.drop_duplicates()

enc=LabelEncoder()
y_train=enc.fit_transform(y_train)
y_test=enc.transform(y_test)

In [17]:
from keras.models import Sequential
from keras import layers
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer


tokenizer = Tokenizer(num_words=20000)
tokenizer.fit_on_texts(X_train)
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

print("vocab size:",len(tokenizer.word_index))

X_train = pad_sequences(X_train, padding='post', maxlen=300)
X_test = pad_sequences(X_test, padding='post', maxlen=300)

vocab size: 78651


In [18]:
#create LSTM model with keras
embedding_dim = 100
dropout = 0.5
opt = 'adam'
model = Sequential()
model.add(layers.Embedding(input_dim=20000, 
                           output_dim=100, 
                           input_length=300))
model.add(layers.Bidirectional(layers.LSTM(100, dropout=0.5, 
                                           recurrent_dropout=0.5, 
                                           return_sequences=True)))
model.add(layers.GlobalMaxPool1D())
model.add(layers.Dense(128, activation='relu'))
model.add(layers.Dropout(dropout))
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dropout(dropout))
model.add(layers.Dense(32, activation='relu'))
model.add(layers.Dropout(dropout))
model.add(layers.Dense(1, activation='sigmoid'))

model.compile(optimizer=opt, 
              loss='binary_crossentropy', 
              metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 300, 100)          2000000   
                                                                 
 bidirectional (Bidirectiona  (None, 300, 200)         160800    
 l)                                                              
                                                                 
 global_max_pooling1d (Globa  (None, 200)              0         
 lMaxPooling1D)                                                  
                                                                 
 dense (Dense)               (None, 128)               25728     
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense_1 (Dense)             (None, 64)                8

In [None]:
history = model.fit(X_train, y_train,
                    epochs=4,
                    verbose=True,
                    validation_freq=0.2,
                    batch_size=256)

Epoch 1/4
Epoch 2/4
 29/177 [===>..........................] - ETA: 1:03:59 - loss: 0.1701 - accuracy: 0.9440