# Table of content 
* Data Cleansing
  * Removing urls, emojis, and @ABCD.
  * Data Normalization
  * Removing (الهمزة والنقاط من فوق التاء المربوطة).
* Data Preprocessing 
  * Tokenization.
  * Tfidftransformer
* Modeling and training 
  * Define Models.
  * Training 
  * Evaluation 
  * Loading Model,and Transformer for future use.
  * Test model's predictability 


In [11]:
import pandas as pd 
import re
import pickle 
import numpy as np
from helper import text_normalize,tokenizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
print("All SET")

All SET


In [12]:
%pwd

'C:\\Users\\Eslam\\OneDrive\\Desktop\\AIM Technologies\\Machine learning'

In [13]:
df=pd.read_csv('C:\\Users\\Eslam\\OneDrive\\Desktop\\AIM Technologies\\Data Fetching\\Data Fetching\\dataset.csv',encoding='UTF-32')
print("Data imported")
df.head()

Data imported


Unnamed: 0,id,dialect,text
0,1175358310087892992,IQ,@Nw8ieJUwaCAAreT لكن بالنهاية .. ينتفض .. يغير .
1,1175416117793349632,IQ,@7zNqXP0yrODdRjK يعني هذا محسوب على البشر .. ح...
2,1175450108898565888,IQ,@KanaanRema مبين من كلامه خليجي
3,1175471073770573824,IQ,@HAIDER76128900 يسلملي مرورك وروحك الحلوه💐
4,1175496913145217024,IQ,@hmo2406 وين هل الغيبه اخ محمد 🌸🌺


# Data Cleansing
* Removing urls, emoji, and twitter usernames


In [14]:
def preprocessing(text):
    """
    The data preprocessing fuction takes string, extracts only arabic text out of it,
    removes الهمزة والتاء المربوطة
    Input >>> text
    Output >>> clean text
    """
    text=re.sub(r'[^\s\u0627-\u064a]','', text)
    text=re.sub(r'(.)\1+', r'\1', text)
    text=text_normalize(text)
    return text


In [15]:
df['text']=df['text'].apply(preprocessing)
df.head()

Unnamed: 0,id,dialect,text
0,1175358310087892992,IQ,لكن بالنهايه ينتفض يغير
1,1175416117793349632,IQ,يعني هذا محسوب علي البشر حيونه وحشيه وتطلبون ...
2,1175450108898565888,IQ,مبين من كلامه خليجي
3,1175471073770573824,IQ,يسلملي مرورك وروحك الحلوه
4,1175496913145217024,IQ,وين هل الغيبه اخ محمد


Sounds good the data is now clean and normalized

# Data Preprocessing

**Tfidftransformer**

In [None]:
Tfidf = TfidfVectorizer(tokenizer=tokenizer)
Tfidf_transformer = Tfidf.fit_transform(df['text'])

In [None]:
len(Tfidf.get_feature_names())

In [None]:
# get the first vector out (for the first document) 
first_vector_tfidfvectorizer=Tfidf_transformer[0]
# place tf-idf values in a pandas data frame 
dt = pd.DataFrame(first_vector_tfidfvectorizer.T.todense(), index=Tfidf.get_feature_names(), columns=["tfidf"])
print(df['text'][0])
dt[dt['tfidf']>0.0]

In [None]:
df['dialect'].unique()

In [None]:
dialect_mapping={'IQ':0, 'LY':1, 'QA':2, 'PL':3, 'SY':4, 'TN':5, 'JO':6, 'MA':7, 'SA':8, 'YE':9,
                 'DZ':10,'EG':11, 'LB':11, 'KW':13, 'OM':14, 'SD':15, 'AE':16, 'BH':17}
# reverse the dict to return the label as text not number 
y_id_to_word = {value: key for key, value in dialect_mapping.items()}

In [None]:
labels=df['dialect'].map(dialect_mapping)
labels

In [None]:
train_x, test_x, train_y, test_y = train_test_split(Tfidf_transformer, labels, test_size=0.3)

In [None]:
clf = MultinomialNB().fit(train_x, train_y)
y_score = clf.predict(test_x)

In [None]:

print(classification_report(test_y, y_score,target_names=['IQ', 'LY', 'QA', 'PL', 'SY', 'TN', 'JO', 'MA', 'SA', 'YE', 'DZ',
       'EG', 'LB', 'KW', 'OM', 'SD', 'AE', 'BH']))

In [None]:
lr=LogisticRegression(max_iter=10000)
lr.fit(train_x, train_y)
yhat=lr.predict(test_x)

In [None]:
print(classification_report(test_y, yhat,target_names=['IQ', 'LY', 'QA', 'PL', 'SY', 'TN', 'JO', 'MA', 'SA', 'YE', 'DZ',
       'EG', 'LB', 'KW', 'OM', 'SD', 'AE', 'BH']))

The logistic regression performance is way better than the Naive Bayes 

In [None]:
with open('logistic_regression.pickle','wb') as model:
    pickle.dump(lr,model)



In [3]:
#testing the performance 
model=pickle.load(open('./logistic_regression.pickle','rb'))
transformer=pickle.load(open('./Tfidf.pickle','rb'))





In [4]:
def predcitor(text):
    """
    this function takes text and returns the dialect
    it normalizes the entered text and use it to predict the dialect.
    """
    y_id_to_word={1:'IQ',2: 'LY',3: 'QA',4: 'PL',5: 'SY',6: 'TN',7: 'JO',8: 'MA',9: 'SA',10: 'YE',11: 'DZ',12: 'EG',13: 'LB',14: 'KW',15: 'OM',16: 'SD',17: 'AE',18: 'BH'}
    text=preprocessing(text)
    text=text.split(" ")
    return y_id_to_word[np.argmax(model.predict(transformer.transform(text)))]

In [None]:
pred=predcitor('صباح العسل')
pred

Good! 

In [10]:
np.argmax(model.predict(transformer.transform('ابو صلاح العالمي'.split())))

1