In [2]:
# import Tools
import pandas as pd
import numpy as np
import os
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from joblib import dump, load
from scipy.sparse import save_npz, load_npz
from sklearn.linear_model import SGDClassifier, LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split
from scipy.sparse import csc_matrix
from scipy.stats import uniform
from sklearn.ensemble import AdaBoostClassifier , RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

## Load & Preprocessing

In [4]:
# load Englih A Data 
engA = pd.read_csv('/content/English-A.txt', sep='\t', names=['id', 'label', 'Text', 'non'])
print(engA['label'].unique())
print(engA.info())
engA.head()

['neutral' 'negative' 'positive']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20632 entries, 0 to 20631
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      20632 non-null  int64 
 1   label   20632 non-null  object
 2   Text    20632 non-null  object
 3   non     10 non-null     object
dtypes: int64(1), object(3)
memory usage: 644.9+ KB
None


Unnamed: 0,id,label,Text,non
0,619950566786113536,neutral,"Picturehouse's, Pink Floyd's, 'Roger Waters: T...",
1,619969366986235905,neutral,Order Go Set a Watchman in store or through ou...,
2,619971047195045888,negative,If these runway renovations at the airport pre...,
3,619974445185302528,neutral,If you could ask an onstage interview question...,
4,619987808317407232,positive,A portion of book sales from our Harper Lee/Go...,


In [5]:
# Convert Labels to int (0 for neutral, 1 for positive, -1 for negative)
for i in engA.index:
  if engA.loc[i, 'label'] == 'neutral':
    engA.loc[i, 'label'] = 0

  if engA.loc[i, 'label'] == 'positive':
    engA.loc[i, 'label'] = 1

  if engA.loc[i, 'label'] == 'negative':
    engA.loc[i, 'label'] = -1  

engA.head()

Unnamed: 0,id,label,Text,non
0,619950566786113536,0,"Picturehouse's, Pink Floyd's, 'Roger Waters: T...",
1,619969366986235905,0,Order Go Set a Watchman in store or through ou...,
2,619971047195045888,-1,If these runway renovations at the airport pre...,
3,619974445185302528,0,If you could ask an onstage interview question...,
4,619987808317407232,1,A portion of book sales from our Harper Lee/Go...,


In [6]:
# load Englih B Data 
engB = pd.read_csv('/content/English-B.txt', sep='\t', names=['id', 'Subject', 'label', 'Text', 'non'])
print(engB['label'].unique())
print(engB.info())
engB.head()

['negative' 'positive']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10551 entries, 0 to 10550
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   id       10551 non-null  int64  
 1   Subject  10551 non-null  object 
 2   label    10551 non-null  object 
 3   Text     10551 non-null  object 
 4   non      0 non-null      float64
dtypes: float64(1), int64(1), object(3)
memory usage: 412.3+ KB
None


Unnamed: 0,id,Subject,label,Text,non
0,681563394940473347,amy schumer,negative,@MargaretsBelly Amy Schumer is the stereotypic...,
1,675847244747177984,amy schumer,negative,@dani_pitter I mean I get the hype around JLaw...,
2,672827854279843840,amy schumer,negative,Amy Schumer at the #GQmenoftheyear2015 party i...,
3,662755012129529858,amy schumer,negative,Amy Schumer is on Sky Atlantic doing one of th...,
4,671502639671042048,amy schumer,negative,"Amy Schumer may have brought us Trainwreck, bu...",


In [7]:
# Convert Labels to int (0 for neutral, 1 for positive, -1 for negative)
for i in engB.index:
  if engB.loc[i, 'label'] == 'neutral':
    engB.loc[i, 'label'] = 0

  if engB.loc[i, 'label'] == 'positive':
    engB.loc[i, 'label'] = 1

  if engB.loc[i, 'label'] == 'negative':
    engB.loc[i, 'label'] = -1  

engB.head()

Unnamed: 0,id,Subject,label,Text,non
0,681563394940473347,amy schumer,-1,@MargaretsBelly Amy Schumer is the stereotypic...,
1,675847244747177984,amy schumer,-1,@dani_pitter I mean I get the hype around JLaw...,
2,672827854279843840,amy schumer,-1,Amy Schumer at the #GQmenoftheyear2015 party i...,
3,662755012129529858,amy schumer,-1,Amy Schumer is on Sky Atlantic doing one of th...,
4,671502639671042048,amy schumer,-1,"Amy Schumer may have brought us Trainwreck, bu...",


In [8]:
# load Englih C Data
engC = pd.read_csv('/content/English-C.txt', sep='\t', names=['id', 'Subject', 'label', 'Text', 'non'])
print(engC['label'].unique())
print(engC.info())
engC.head()

[-1 -2  0  1  2]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20632 entries, 0 to 20631
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   id       20632 non-null  int64  
 1   Subject  20632 non-null  object 
 2   label    20632 non-null  int64  
 3   Text     20632 non-null  object 
 4   non      0 non-null      float64
dtypes: float64(1), int64(2), object(2)
memory usage: 806.1+ KB
None


Unnamed: 0,id,Subject,label,Text,non
0,681563394940473347,amy schumer,-1,@MargaretsBelly Amy Schumer is the stereotypic...,
1,675847244747177984,amy schumer,-1,@dani_pitter I mean I get the hype around JLaw...,
2,672827854279843840,amy schumer,-1,Amy Schumer at the #GQmenoftheyear2015 party i...,
3,671502639671042048,amy schumer,-1,"Amy Schumer may have brought us Trainwreck, bu...",
4,677359143108214784,amy schumer,-1,I just think that sports are stupid &amp;anyon...,


In [9]:
# Convert Labels to int (0 for neutral, 1 for positive, -1 for negative)
for i in engC.index:
  if engC.loc[i, 'label'] == 0:
    engC.loc[i, 'label'] = 0

  if engC.loc[i, 'label'] > 0:
    engC.loc[i, 'label'] = 1

  if engC.loc[i, 'label'] < 0:
    engC.loc[i, 'label'] = -1  

engC.head()

Unnamed: 0,id,Subject,label,Text,non
0,681563394940473347,amy schumer,-1,@MargaretsBelly Amy Schumer is the stereotypic...,
1,675847244747177984,amy schumer,-1,@dani_pitter I mean I get the hype around JLaw...,
2,672827854279843840,amy schumer,-1,Amy Schumer at the #GQmenoftheyear2015 party i...,
3,671502639671042048,amy schumer,-1,"Amy Schumer may have brought us Trainwreck, bu...",
4,677359143108214784,amy schumer,-1,I just think that sports are stupid &amp;anyon...,


In [10]:
# Load Arabic A Data
ArbA = pd.read_csv('/content/Arabic-A.txt', sep='\t', names=['id', 'label', 'Text'])
print(ArbA['label'].unique())
print(ArbA.info())
ArbA.head()

['positive' 'neutral' 'negative']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 671 entries, 0 to 670
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      671 non-null    int64 
 1   label   671 non-null    object
 2   Text    671 non-null    object
dtypes: int64(1), object(2)
memory usage: 15.9+ KB
None


Unnamed: 0,id,label,Text
0,781400143144726529,positive,إطلالة أنيقة وانثوية من مجموعة غوتشي لخريف/شتا...
1,781476002463711232,neutral,أخر صيحة من حقائب اليد من فندي وغوتشي https://...
2,781485165000396800,positive,تتشرف #باري_غاليري بدعوتكم لحضور ومقابلة #نجوى...
3,781542717218254848,neutral,RT @ArabRapMonster: [ستايل]بنطال #نامجون الجين...
4,781730607919460352,neutral,RT @wufan_styleee: 160930 كريس في The Amazing ...


In [11]:
# Convert Labels to int (0 for neutral, 1 for positive, -1 for negative)
for i in ArbA.index:
  if ArbA.loc[i, 'label'] == 'neutral':
    ArbA.loc[i, 'label'] = 0

  if ArbA.loc[i, 'label'] == 'positive':
    ArbA.loc[i, 'label'] = 1

  if ArbA.loc[i, 'label'] == 'negative':
    ArbA.loc[i, 'label'] = -1  

ArbA.head()

Unnamed: 0,id,label,Text
0,781400143144726529,1,إطلالة أنيقة وانثوية من مجموعة غوتشي لخريف/شتا...
1,781476002463711232,0,أخر صيحة من حقائب اليد من فندي وغوتشي https://...
2,781485165000396800,1,تتشرف #باري_غاليري بدعوتكم لحضور ومقابلة #نجوى...
3,781542717218254848,0,RT @ArabRapMonster: [ستايل]بنطال #نامجون الجين...
4,781730607919460352,0,RT @wufan_styleee: 160930 كريس في The Amazing ...


In [12]:
# load Arabic B Data
ArbB = pd.read_csv('/content/Arabic-B.txt', sep='\t', names=['id', 'Subject', 'label', 'Text'])
print(ArbB['label'].unique())
print(ArbB.info())
ArbB.head()

['negative' 'positive']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 332 entries, 0 to 331
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       332 non-null    int64 
 1   Subject  332 non-null    object
 2   label    332 non-null    object
 3   Text     332 non-null    object
dtypes: int64(1), object(3)
memory usage: 10.5+ KB
None


Unnamed: 0,id,Subject,label,Text
0,784707690522181632,غوتشي,negative,ليون و واكا شكلهم عادي ساتوري يمكن تكون شخصيته...
1,784771284974600193,غوتشي,positive,قبل أن يسدل #أسبوع_الموضة ستاره، انتقت النجمة ...
2,784776165814829057,غوتشي,positive,شوفو كيف شكله كيوت اغوتشي غوتشي يااااا
3,784816868041326592,غوتشي,positive,RT @MarkaVIP: ادخل عالمًا من الأناقة والجودة ا...
4,784997849289416704,غوتشي,positive,“كيت بلانشيت” أول من ترتدي “غوتشي” 2017https:/...


In [13]:
# Convert Labels to int (0 for neutral, 1 for positive, -1 for negative)
for i in ArbB.index:
  if ArbB.loc[i, 'label'] == 'neutral':
    ArbB.loc[i, 'label'] = 0

  if ArbB.loc[i, 'label'] == 'positive':
    ArbB.loc[i, 'label'] = 1

  if ArbB.loc[i, 'label'] == 'negative':
    ArbB.loc[i, 'label'] = -1  

ArbB.head()

Unnamed: 0,id,Subject,label,Text
0,784707690522181632,غوتشي,-1,ليون و واكا شكلهم عادي ساتوري يمكن تكون شخصيته...
1,784771284974600193,غوتشي,1,قبل أن يسدل #أسبوع_الموضة ستاره، انتقت النجمة ...
2,784776165814829057,غوتشي,1,شوفو كيف شكله كيوت اغوتشي غوتشي يااااا
3,784816868041326592,غوتشي,1,RT @MarkaVIP: ادخل عالمًا من الأناقة والجودة ا...
4,784997849289416704,غوتشي,1,“كيت بلانشيت” أول من ترتدي “غوتشي” 2017https:/...


In [14]:
# Load Arabic C Data
ArbC = pd.read_csv('/content/Arabic-C.txt', sep='\t', names=['id', 'Subject', 'label', 'Text'])
print(ArbC['label'].unique())
print(ArbC.info())
ArbC.head()

[ 1  0 -1]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 671 entries, 0 to 670
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       671 non-null    int64 
 1   Subject  671 non-null    object
 2   label    671 non-null    int64 
 3   Text     671 non-null    object
dtypes: int64(2), object(2)
memory usage: 21.1+ KB
None


Unnamed: 0,id,Subject,label,Text
0,781400143144726529,غوتشي,1,إطلالة أنيقة وانثوية من مجموعة غوتشي لخريف/شتا...
1,781476002463711232,غوتشي,0,أخر صيحة من حقائب اليد من فندي وغوتشي https://...
2,781485165000396800,غوتشي,0,تتشرف #باري_غاليري بدعوتكم لحضور ومقابلة #نجوى...
3,781542717218254848,غوتشي,0,RT @ArabRapMonster: [ستايل]بنطال #نامجون الجين...
4,781730607919460352,غوتشي,1,RT @wufan_styleee: 160930 كريس في The Amazing ...


In [15]:
# Concat Arabic Data
ArabicData = pd.concat([ArbA[['Text','label']], ArbB[['Text','label']], ArbC[['Text','label']]])
print(ArabicData.info())
ArabicData.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1674 entries, 0 to 670
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Text    1674 non-null   object
 1   label   1674 non-null   object
dtypes: object(2)
memory usage: 39.2+ KB
None


Unnamed: 0,Text,label
0,إطلالة أنيقة وانثوية من مجموعة غوتشي لخريف/شتا...,1
1,أخر صيحة من حقائب اليد من فندي وغوتشي https://...,0
2,تتشرف #باري_غاليري بدعوتكم لحضور ومقابلة #نجوى...,1
3,RT @ArabRapMonster: [ستايل]بنطال #نامجون الجين...,0
4,RT @wufan_styleee: 160930 كريس في The Amazing ...,0


In [16]:
# Concat English Data
EnglishData = pd.concat([engA[['Text','label']], engB[['Text','label']], engC[['Text','label']]])
print(EnglishData.info())
EnglishData.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 51815 entries, 0 to 20631
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Text    51815 non-null  object
 1   label   51815 non-null  object
dtypes: object(2)
memory usage: 1.2+ MB
None


Unnamed: 0,Text,label
0,"Picturehouse's, Pink Floyd's, 'Roger Waters: T...",0
1,Order Go Set a Watchman in store or through ou...,0
2,If these runway renovations at the airport pre...,-1
3,If you could ask an onstage interview question...,0
4,A portion of book sales from our Harper Lee/Go...,1


## Vectorization

In [17]:
# Make Folder To Save Preprocesses & Vectorization
if os.path.isdir('Preprocessors') == False:
    os.system("mkdir Preprocessors")

if os.path.isdir('Vectorized') == False:
    os.system("mkdir Vectorized")

In [18]:
# Vectorization Function
def Vectorizer(*, Data, DataName):
  if os.path.isfile(f'Preprocessors/BigramVectorizer{DataName}.joblib') == False:
      BigramVectorizer = CountVectorizer(ngram_range=(1, 2))  # Unigram & Bigram
      BigramVectorizer.fit(Data['Text'].values)
      dump(BigramVectorizer, f'Preprocessors/BigramVectorizer{DataName}.joblib')

  print(f'Loading BigramVectorizer{DataName}.joblib...')     
  BigramVectorizer = load(f'Preprocessors/BigramVectorizer{DataName}.joblib')
  print('Done!')

  if os.path.isfile(f'Vectorized/BigramTrainText{DataName}.npz') == False:
      BigramTrainText = BigramVectorizer.transform(Data['Text'].values)
      save_npz(f'Vectorized/BigramTrainText{DataName}.npz', BigramTrainText)

  print(f'Loading BigramTrainText{DataName}.npz...')     
  BigramTrainText = load_npz(f'Vectorized/BigramTrainText{DataName}.npz')
  print('Done!')
  return BigramTrainText

EnglishBigramTrainText = Vectorizer(Data=EnglishData, DataName='English')
ArabicBigramTrainText = Vectorizer(Data=ArabicData, DataName='Arabic')

Loading BigramVectorizerEnglish.joblib...
Done!
Loading BigramTrainTextEnglish.npz...
Done!
Loading BigramVectorizerArabic.joblib...
Done!
Loading BigramTrainTextArabic.npz...
Done!


In [19]:
# Make TDIDF(term frequency-inverse document frequency)
def BigramTDIDF(*, BigramTrainText, Name):
  if os.path.isfile(f'Preprocessors/BigramTFIDFTransformer{Name}.joblib') == False:
      BigramTFIDFTransformer = TfidfTransformer()
      BigramTFIDFTransformer.fit(BigramTrainText)
      dump(BigramTFIDFTransformer, f'Preprocessors/BigramTFIDFTransformer{Name}.joblib')

  print(f'Loading BigramTFIDFTransformer{Name}.joblib...')      
  BigramTFIDFTransformer = load(f'Preprocessors/BigramTFIDFTransformer{Name}.joblib')
  print('Done!')

  if os.path.isfile(f'Vectorized/BigramTFIDFTrainText{Name}.npz') == False:
      BigramTFIDFTrainText = BigramTFIDFTransformer.transform(BigramTrainText)
      save_npz(f'Vectorized/BigramTFIDFTrainText{Name}.npz', BigramTFIDFTrainText)

  print(f'Loading BigramTFIDFTrainText{Name}.npz...')      
  BigramTFIDFTrainText = load_npz(f'Vectorized/BigramTFIDFTrainText{Name}.npz')
  print('Done!')
  return BigramTFIDFTrainText

EnglishBigramTFIDFTrainText = BigramTDIDF(BigramTrainText=EnglishBigramTrainText, Name='English')  
ArabicBigramTFIDFTrainText = BigramTDIDF(BigramTrainText=ArabicBigramTrainText, Name='Arabic')  

Loading BigramTFIDFTransformerEnglish.joblib...
Done!
Loading BigramTFIDFTrainTextEnglish.npz...
Done!
Loading BigramTFIDFTransformerArabic.joblib...
Done!
Loading BigramTFIDFTrainTextArabic.npz...
Done!


## Training Models

In [20]:
# Train Model And Test Calculate Model Score 
def TrainAndTest(Model, modelName,  X: csc_matrix,  y: np.array, title: str):
    TextTrain, TextValid, LabelTrain, LabelValid = train_test_split(
        X, y, train_size=0.75, stratify=y)
    Model.fit(TextTrain, LabelTrain)
    TrainScore = Model.score(TextTrain, LabelTrain)
    TestScore = Model.score(TextValid, LabelValid)
    print(f'{title} with {modelName}\nTrain score: {round(TrainScore, 2)}')
    print(f'Validation score: {round(TestScore, 2)}\n ')           

In [21]:
model = SGDClassifier()

TrainAndTest(Model=model, modelName='SGD Classifier', X=EnglishBigramTFIDFTrainText,
             y=EnglishData['label'].values.astype('int'), title='English Data')   
 
TrainAndTest(Model=model, modelName='SGD Classifier', X=ArabicBigramTFIDFTrainText, 
             y=ArabicData['label'].values.astype('int'), title='Arabic Data')

English Data with SGD Classifier
Train score: 0.85
Validation score: 0.76
 
Arabic Data with SGD Classifier
Train score: 0.89
Validation score: 0.72
 


In [22]:
model = LinearRegression()

TrainAndTest(Model=model, modelName='Linear Reression', X=EnglishBigramTFIDFTrainText,
             y=EnglishData['label'].values.astype('int'), title='English Data')   
 
TrainAndTest(Model=model, modelName='Linear Reression', X=ArabicBigramTFIDFTrainText, 
             y=ArabicData['label'].values.astype('int'), title='Arabic Data')

English Data with Linear Reression
Train score: 0.91
Validation score: 0.67
 
Arabic Data with Linear Reression
Train score: 0.83
Validation score: 0.36
 


In [23]:
model = LogisticRegression()

TrainAndTest(Model=model, modelName='Logistic Reression', X=EnglishBigramTFIDFTrainText,
             y=EnglishData['label'].values.astype('int'), title='English Data')   
 
TrainAndTest(Model=model, modelName='Logistic Reression', X=ArabicBigramTFIDFTrainText, 
             y=ArabicData['label'].values.astype('int'), title='Arabic Data')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


English Data with Logistic Reression
Train score: 0.9
Validation score: 0.79
 
Arabic Data with Logistic Reression
Train score: 0.9
Validation score: 0.7
 


In [24]:
model = AdaBoostClassifier()

TrainAndTest(Model=model, modelName='Adaptive Boosting', X=EnglishBigramTFIDFTrainText,
             y=EnglishData['label'].values.astype('int'), title='English Data')   
 
TrainAndTest(Model=model, modelName='Adaptive Boosting', X=ArabicBigramTFIDFTrainText, 
             y=ArabicData['label'].values.astype('int'), title='Arabic Data')

English Data with Adaptive Boosting
Train score: 0.56
Validation score: 0.56
 
Arabic Data with Adaptive Boosting
Train score: 0.59
Validation score: 0.59
 


In [25]:
model = RandomForestClassifier(max_depth=10 , random_state=0)

TrainAndTest(Model=model, modelName='Random Forest', X=EnglishBigramTFIDFTrainText,
             y=EnglishData['label'].values.astype('int'), title='English Data')   
 
TrainAndTest(Model=model, modelName='Random Forest', X=ArabicBigramTFIDFTrainText, 
             y=ArabicData['label'].values.astype('int'), title='Arabic Data')

English Data with Random Forest
Train score: 0.47
Validation score: 0.46
 
Arabic Data with Random Forest
Train score: 0.57
Validation score: 0.52
 


In [26]:
model = DecisionTreeClassifier()

TrainAndTest(Model=model, modelName='Decision Tree', X=EnglishBigramTFIDFTrainText,
             y=EnglishData['label'].values.astype('int'), title='English Data')   
 
TrainAndTest(Model=model, modelName='Decision Tree', X=ArabicBigramTFIDFTrainText, 
             y=ArabicData['label'].values.astype('int'), title='Arabic Data')

English Data with Decision Tree
Train score: 0.93
Validation score: 0.79
 
Arabic Data with Decision Tree
Train score: 0.9
Validation score: 0.68
 
