<a href="https://colab.research.google.com/github/safa-abidi/NLP-sentimentAnalysis-AraBert/blob/main/co_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Libraries

In [2]:
import pandas as pd

import plotly.express as px

import xml.etree.ElementTree as ET
import pandas as pd
import os

from sklearn.model_selection import train_test_split 
from sklearn.svm import SVC
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score

In [3]:
from typing import Any, Dict, List, Callable, Optional, Tuple, Union
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
import torch

In [4]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [5]:
import torch
import transformers

In [6]:
from google.colab import drive
drive.mount('/content/drive') 

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Dataset: labelled and unlabled

In [7]:
DATASETS_PATH = '/content/drive/MyDrive/'

%cd /content/drive/MyDrive/datasets

# Labeled Dataset
labeled_dataset_path = 'dataset_arabic_translated/Merged_Labelled_Arabic_fake_news_Dataset.xlsx'
labeled_dataset = pd.read_excel(labeled_dataset_path)
print("labeled dataset", labeled_dataset.shape)

# Unlabeled Dataset
unlabeled_dataset_path = 'SemEval2016_arabic/AR_Hotels_Train_SB1.xml'
tree = ET.parse(unlabeled_dataset_path)
root = tree.getroot()

###################################
# New unlabelled dataset to add the the old one
new_unlabeled_dataset_path = 'Merged_Unlabelled_Arabic_scraped_Hotel reviews.xlsx'
new_unlabeled_dataset = pd.read_excel(new_unlabeled_dataset_path, sheet_name=1)
print("new unlabeled dataset ", new_unlabeled_dataset.shape)
###################################

# Extract reviews from XML and add to list of tuples
reviews = []
for review in root.iter('Review'):
    for sentence in review.iter('sentence'):
        text = sentence.find('text').text.strip()
        label = -1
        reviews.append((text, label))

# Create a pandas DataFrame from the list of tuples
unlabeled_dataset1 = pd.DataFrame(reviews, columns=['Review', 'Class'])
unlabeled_dataset = pd.concat([unlabeled_dataset1, new_unlabeled_dataset], axis=0)

print("Unlabeled Dataset 1",unlabeled_dataset1.shape)
print("all unlabeled ", unlabeled_dataset.shape)


/content/drive/MyDrive/datasets
labeled dataset (1600, 6)
new unlabeled dataset  (799, 3)
Unlabeled Dataset 1 (4802, 2)
all unlabeled  (5601, 4)


In [8]:
## Encode the dataset labels
labeled_dataset['Label'] = labeled_dataset.apply(lambda x: 0 if x['Class'] == 'خادع' else 1 if x['Class'] == 'صادقة' else -1, axis=1)
print(labeled_dataset.shape)
print(labeled_dataset)

(1600, 7)
     Hotel name                                             Review Source  \
0       Affinia  فندق Affinia Chicago هو أحد أسوأ الفنادق التي ...  Mturk   
1       Affinia  مكثت في Affina Chicago للاحتفال بالذكرى السنوي...  Mturk   
2       Affinia  من الواضح أن فندق Affinia في شيكاغو يخدم ضيوف ...  Mturk   
3       Affinia  محبط للغاية في إقامتنا. هذا الفندق لا يشبه ما ...  Mturk   
4       Affinia  كان فندق Affinia في شيكاغو أحد أكثر الفنادق إث...  Mturk   
...         ...                                                ...    ...   
1595    Talbolt  الموقع والموقع والموقع. يا له من اكتشاف رائع! ...  الويب   
1596    Talbolt  أقمنا هنا بسبب كل التقييمات الرائعة ، وكلها صح...  الويب   
1597    Talbolt  مكثت في Talbott مرتين في الأسابيع القليلة الما...  الويب   
1598    Talbolt  أقمنا أنا وزوجي هنا لمدة ثلاث ليال أثناء زيارة...  الويب   
1599    Talbolt  مكثت 5 ليال في هذا الفندق الرائع. الغرف المحدث...  الويب   

     Opinion  Class  Unnamed: 5  Label  
0        نفي   خادع     

In [9]:
# Define the tokenizer and the Arabert model
from transformers import AutoTokenizer, AutoModel
tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabertv2")
model = AutoModel.from_pretrained("aubmindlab/bert-base-arabertv2")

Some weights of the model checkpoint at aubmindlab/bert-base-arabertv2 were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
X_labeled = labeled_dataset["Review"].values
y_labeled = labeled_dataset["Label"].values
X_labeled_train, X_labeled_test, y_labeled_train, y_labeled_test = train_test_split(X_labeled, y_labeled, test_size=0.2, random_state=42)

In [11]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

def extract_features1(text):
    # Tokenize the text
    inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
    
    # Encode the input features with the model
    outputs = model(**inputs)
    
    # Extract the embeddings for the [CLS] token
    embeddings = outputs.last_hidden_state[:, 0, :]
    
    return embeddings.detach().numpy()


def extract_features2(text):
    # Tokenize the text with a different tokenizer
    inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
    
    # Encode the input features with a different model
    outputs = model(**inputs)
    
    # Extract the embeddings for the [CLS] token
    embeddings = outputs.last_hidden_state[:, 0, :]
    
    return embeddings.detach().numpy()



In [11]:
print(unlabeled_dataset)

                                                Review  Class    Hotel Name  \
0    أنصح بالنوم وليس تناول الطعام  موقع مثالي للإق...   -1.0           NaN   
1    كانت الغرفة ممتازة وكذلك الموظفون وبوفيه الإفط...   -1.0           NaN   
2    فندق يتميز بمرافق نوعيّة وخلّاقة وساخنة. قم بش...   -1.0           NaN   
3    لمسة بحرية  جميل وظريفة، فندق تقليدي في المدين...   -1.0           NaN   
4    سأوصي بالتأكيد بموقع المدينة القديمة إلا إنه ع...   -1.0           NaN   
..                                                 ...    ...           ...   
794  Lemya تجعل إقامتي دائمًا مميزة. مفيد للغاية وم...    NaN  Radisson Blu   
795  تجربة لا تنسى حقا. فندق من الدرجة الأولى يقع ف...    NaN  Radisson Blu   
796  أنا أحب هذا الفندق الكثير من الرسوم المتحركة ا...    NaN  Radisson Blu   
797  لإعطاء بعض السياق ، لقد حجزت 3 غرف:  الغرفة 1 ...    NaN  Radisson Blu   
798  أردنا البقاء في الفندق (لأننا مسلمون) سألنا عن...    NaN  Radisson Blu   

                           Review Title  
0        

In [13]:
# Define two different feature sets
X_feat1_unlabeled = extract_features1(unlabeled_dataset)
X_feat2_unlabeled = extract_features2(unlabeled_dataset)

TypeError: ignored

In [None]:
from sklearn.linear_model import LogisticRegression

# Train two classifiers on different feature sets
clf1 = LogisticRegression()
clf2 = LogisticRegression()

X_feat1_train = extract_features1(X_labeled_train)
X_feat2_train = extract_features2(X_labeled_train)

clf1.fit(X_feat1_train, y_labeled_train)
clf2.fit(X_feat2_train, y_labeled_train)

In [14]:
num_iterations = 3
# Iterate over the co-training process
for i in range(num_iterations):
    # Generate pseudo-labeled data using the two classifiers
    X_feat1_unlabeled = extract_features1(unlabeled_dataset)
    X_feat2_unlabeled = extract_features2(unlabeled_dataset)

    y_pseudo1 = clf1.predict(X_feat1_unlabeled)
    y_pseudo2 = clf2.predict(X_feat2_unlabeled)

    # Select the most confident examples from each classifier
    conf1 = np.max(clf1.predict_proba(X_feat1_unlabeled), axis=1)
    conf2 = np.max(clf2.predict_proba(X_feat2_unlabeled), axis=1)

    ind1 = np.argsort(-conf1)[:num_select]
    ind2 = np.argsort(-conf2)[:num_select]

    X_selected1 = X_feat1_unlabeled[ind1]
    y_selected1 = y_pseudo1[ind1]

    X_selected2 = X_feat2_unlabeled[ind2]
    y_selected2 = y_pseudo2[ind2]

    # Retrain the classifiers on the labeled data and selected data
    X_feat1_labeled = np.concatenate([X_feat1_train, X_selected1])
    y_labeled1 = np.concatenate([y_labeled_train, y_selected1])
    clf1.fit(X_feat1_labeled, y_labeled1)

    X_feat2_labeled = np.concatenate([X_feat2_train, X_selected2])
    y_labeled2 = np.concatenate([y_labeled_train, y_selected2])
    clf2.fit(X_feat2_labeled, y_labeled2)

    # Evaluate the performance of the co-trained classifiers
    X_feat1_test = extract_features1(X_labeled_test)
    X_feat2_test = extract_features2(X_labeled_test)

    y_pred1 = clf1.predict(X_feat1_test)
    y_pred2 = clf2.predict(X_feat2_test)

    y_ensemble = np.where((y_pred1 == y_pred2), y_pred1, np.nan)

    acc_ensemble = accuracy_score(y_labeled_test, y_ensemble)
    print(f"Iteration {i+1}, ensemble accuracy: {acc_ensemble:.3f}")

ValueError: ignored