In [None]:
!pip install farasapy
!pip install tqdm
!pip install gensim
!pip install spacy
!pip install scipy
!pip install wandb
!pip install nltk


In [None]:
import pandas as pd 
import re
import string
import numpy as np
import matplotlib.pyplot as plt
import pickle
from time import time
from pathlib import Path
from sklearn.metrics import roc_auc_score, roc_curve
import matplotlib.pyplot as plt




import spacy
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from farasa.stemmer import FarasaStemmer
from farasa.segmenter import FarasaSegmenter
import unicodedata
from nltk.stem import WordNetLemmatizer
import joblib
from multiprocessing import Pool
from concurrent.futures import ThreadPoolExecutor

from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Fetch the NLTK corpora/models used by this notebook before any of them is read.
for nltk_resource in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(nltk_resource)

# Module-level language resources (TextPreprocessor builds its own copies).
stop_words = stopwords.words('english')
arabic_stopwords = set(stopwords.words("arabic"))
# interactive=True keeps a Farasa process open for repeated calls.
farasa_stemmer = FarasaStemmer(interactive=True)

In [None]:
# Shared spaCy English pipeline, read as a global by TextPreprocessor.tokenize_text.
# The parser and NER components are disabled; only tokens are consumed downstream.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

In [None]:
class TextPreprocessor:
    """Clean, normalize, tokenize, stopword-filter and stem English or Arabic text.

    English tokenization goes through the module-level spaCy pipeline ``nlp`` and
    stemming through NLTK's Snowball stemmer. Arabic tokenization and stemming are
    delegated to Farasa (segmenter and stemmer).
    """

    # Every accepted spelling of a supported language, mapped to its internal code.
    _LANGUAGE_ALIASES = {
        'en': 'en', 'english': 'en',
        'ar': 'ar', 'arabic': 'ar',
    }

    def __init__(self, language='en'):
        """Load the resources for the requested language.

        Args:
            language: 'en'/'english' or 'ar'/'arabic' (case-insensitive).

        Raises:
            ValueError: if ``language`` is not one of the supported values.
        """
        requested = language.lower()
        if requested not in self._LANGUAGE_ALIASES:
            raise ValueError(
                "Language not supported. Choose 'en'/'english' or 'ar'/'arabic'."
            )
        # Always store the short code so the rest of the class can compare against it.
        self.language = self._LANGUAGE_ALIASES[requested]

        self.pattern_punctuation = re.compile(r'[^\w\s]')
        self.pattern_digits = re.compile(r'\d+')
        self.pattern_spaces = re.compile(r'\s+')

        if self.language == 'en':
            self.stopwords = set(stopwords.words('english'))
            self.stemmer = SnowballStemmer('english')
        else:
            self.stopwords = self._load_arabic_stopwords()
            self.farasa_segmenter = FarasaSegmenter(interactive=True)
            self.farasa_stemmer = FarasaStemmer(interactive=True)

    def _load_arabic_stopwords(self):
        """Return the NLTK Arabic stopwords (empty set if the corpus lacks them).

        The set also contains the normalized spelling of every stopword, because
        tokens reach ``remove_stopwords`` only after character normalization.
        """
        if 'arabic' not in stopwords.fileids():
            return set()
        return self._with_normalized_forms(stopwords.words('arabic'))

    def _with_normalized_forms(self, words):
        """Return ``words`` as a set, extended with their normalized Arabic spellings."""
        raw_forms = set(words)
        normalized_forms = {
            self.remove_diacritics(self._normalize_arabic(word.lower()))
            for word in raw_forms
        }
        normalized_forms.discard('')
        return raw_forms | normalized_forms

    def clean_text(self, text):
        """Strip punctuation and digits, then collapse whitespace runs to one space.

        Punctuation here means every character that is neither a word character
        nor whitespace, so underscores are kept.
        """
        text = self.pattern_punctuation.sub('', text)
        text = self.pattern_digits.sub('', text)
        text = self.pattern_spaces.sub(' ', text).strip()
        return text

    def normalize_text(self, text):
        """Lowercase the text and, for Arabic, unify similar letter forms."""
        text = text.lower()
        if self.language == 'ar':
            text = self._normalize_arabic(text)
        return text

    def _normalize_arabic(self, text):
        """Replace Arabic letter variants with a single canonical character each."""
        arabic_norm_map = {
            'أ': 'ا', 'إ': 'ا', 'آ': 'ا',
            'ة': 'ه',
            'ي': 'ى',
        }
        return ''.join(arabic_norm_map.get(c, c) for c in text)

    def remove_diacritics(self, text):
        """Drop all nonspacing marks after NFD decomposition.

        This removes Arabic tashkeel, and also any other combining mark that
        decomposition exposes (for example Latin accents).
        """
        return ''.join(c for c in unicodedata.normalize('NFD', text) if unicodedata.category(c) != 'Mn')

    def tokenize_text(self, text):
        """Split text into tokens: spaCy alphabetic tokens for English, Farasa segments for Arabic."""
        if self.language == 'en':
            doc = nlp(text)
            return [token.text for token in doc if token.is_alpha]  # Only alphabetic tokens
        elif self.language == 'ar':
            return self.farasa_segmenter.segment(text).split()

    def remove_stopwords(self, tokens):
        """Return the tokens that are not in this language's stopword set."""
        return [word for word in tokens if word not in self.stopwords]

    def stem_text(self, tokens):
        """Stem each token with Snowball (English) or Farasa (Arabic)."""
        if self.language == 'en':
            return [self.stemmer.stem(word) for word in tokens]
        elif self.language == 'ar':
            return [self.farasa_stemmer.stem(word) for word in tokens]

    def preprocess_text(self, text):
        """Run the full pipeline on one document and return the tokens joined by spaces.

        Non-string input (``None``, NaN from a missing cell, numbers) yields an
        empty string instead of raising inside the regex cleaning step.
        """
        if not isinstance(text, str):
            return ''
        text = self.clean_text(text)
        text = self.normalize_text(text)
        if self.language == 'ar':
            text = self.remove_diacritics(text)
        tokens = self.tokenize_text(text)
        tokens = self.remove_stopwords(tokens)
        tokens = self.stem_text(tokens)
        return ' '.join(tokens)

    def parallel_preprocess(self, df, column_name, num_workers=4):
        """Preprocess ``df[column_name]`` with a thread pool; overwrites the column and returns ``df``.

        NOTE(review): the tokenizer/stemmer objects are shared by all worker
        threads - confirm they are safe for concurrent use before relying on this.
        """
        with ThreadPoolExecutor(max_workers=num_workers) as executor:
            # executor.map preserves input order, so results line up with the rows.
            result = list(tqdm(executor.map(self.preprocess_text, df[column_name]),
                               total=len(df),
                               desc="Processing Text Data"))

        df[column_name] = result
        return df

    def preprocess(self, df, column_name):
        """Preprocess ``df[column_name]`` row by row with a progress bar; overwrites the column and returns ``df``."""
        tqdm.pandas(desc="Processing Text Data")
        df[column_name] = df[column_name].progress_apply(self.preprocess_text)
        return df

    def export_to_csv(self, df, filename):
        """Write ``df`` to ``filename`` as UTF-8 CSV without the index column.

        When ``filename`` is a path, its parent directory is created first so the
        export does not fail on a fresh checkout.
        """
        if isinstance(filename, (str, Path)):
            Path(filename).parent.mkdir(parents=True, exist_ok=True)
        df.to_csv(filename, index=False, encoding='utf-8')
        print(f"Data exported successfully to {filename}")

English Version

In [None]:
# Build the relative path with pathlib so it works on every OS and contains no
# backslashes (which Python would try to read as string escape sequences).
welfake_csv = Path('Master_Data') / 'English Data' / 'WELFake_Dataset.csv'
df_en = pd.read_csv(welfake_csv)
df_en.head()

In [None]:
eng_preprocessor = TextPreprocessor(language='en')
# Drop rows with no text before preprocessing: the pipeline works on strings,
# and a missing cell is a float NaN rather than a string.
df_en = df_en.dropna(subset=['text']).copy()
df_en = eng_preprocessor.preprocess(df_en, 'text')

In [None]:
# Invert the label column (0 <-> 1, assuming it is binary).
# NOTE(review): presumably this aligns the English data with the 1 = fake / 0 = real
# convention of the Arabic label_map - confirm against the dataset description.
# Not idempotent: running this cell a second time flips the labels back.
df_en['label'] = 1 - df_en['label']

In [None]:
# Keep only rows in which every column has a value.
df_en = df_en.dropna()

In [None]:
# Store the label as a plain integer column, then show how many rows each class has.
df_en = df_en.astype({'label': int})

df_en['label'].value_counts()

In [None]:
# Character count of each processed text, summarized to help choose length bounds.
df_en = df_en.assign(text_length=df_en['text'].str.len())
print(df_en['text_length'].describe())

In [None]:
# Keep texts whose length lies in [50, 10000] characters (both bounds inclusive).
df_en = df_en[df_en['text_length'].between(50, 10000)]

In [None]:
# Remove any remaining incomplete rows, shuffle reproducibly, and renumber the index.
df_en = (
    df_en
    .dropna()
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [None]:
# Persist the cleaned and shuffled English dataset as a CSV file.
eng_preprocessor.export_to_csv(df_en , './Cleaned_Data/processed_en_data.csv')

Arabic Version

In [None]:


# Load the Arabic fake-news CSV directly from the Hugging Face Hub.
# NOTE: pandas resolves the hf:// scheme through fsspec, which needs the
# huggingface_hub package to be installed.
df_ar = pd.read_csv("hf://datasets/Nahla-yasmine/arabic_fake_news/final_data.csv")

In [None]:
arb_preprocessor = TextPreprocessor(language='ar')
# Drop rows with no text before preprocessing: the pipeline works on strings,
# and a missing cell is a float NaN rather than a string.
df_ar = df_ar.dropna(subset=['text']).copy()
df_ar = arb_preprocessor.preprocess(df_ar, 'text')

In [None]:
# Keep only rows in which every column has a value.
df_ar = df_ar.dropna()


In [None]:

# Textual labels -> binary target: 1 = fake, 0 = real.
label_map = {
    'fake': 1,
    'false': 1,
    'real': 0,
    'true': 0
}

# Compare on a trimmed, lower-cased string form so values such as "Fake " or a
# boolean True still resolve through label_map.
mapped_labels = df_ar['label'].astype(str).str.strip().str.lower().map(label_map)

# Fail with a readable message instead of an opaque NaN-to-int cast error when a
# label is not covered by label_map (this also triggers if the cell is re-run
# after the labels have already been converted).
unmapped_labels = df_ar.loc[mapped_labels.isna(), 'label'].unique()
if len(unmapped_labels):
    raise ValueError(f"Unrecognised label values: {list(unmapped_labels)}")

df_ar['label'] = mapped_labels.astype(int)

In [None]:
# Number of rows per class after mapping, to see how imbalanced the data is.
label_distribution = df_ar['label'].value_counts()
print(label_distribution)

In [None]:
# label_map encodes fake as 1 and real as 0, so the subsets are named to match.
df_fake = df_ar[df_ar['label'] == 1]
df_real = df_ar[df_ar['label'] == 0]

# Resample the fake rows to the size of the real class. Rows are drawn with
# replacement only when the fake class is the smaller one; when it is larger,
# sampling without replacement avoids duplicating rows while discarding others.
df_fake_sampled = df_fake.sample(
    len(df_real),
    replace=len(df_fake) < len(df_real),
    random_state=42,
)

df_balanced = pd.concat([df_real, df_fake_sampled])

# Shuffle so the two classes are interleaved, then renumber the index.
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

print(df_balanced['label'].value_counts())


In [None]:
# Continue the pipeline with the class-balanced frame (same object, no copy is made).
df_ar = df_balanced

In [None]:
# Character count of each processed text, summarized to help choose length bounds.
df_ar = df_ar.assign(text_length=df_ar['text'].str.len())
print(df_ar['text_length'].describe())

In [None]:
# Keep texts whose length lies in [20, 10000] characters (both bounds inclusive).
df_ar = df_ar[df_ar['text_length'].between(20, 10000)]

In [None]:
# Remove any remaining incomplete rows, shuffle reproducibly, and renumber the index.
df_ar = (
    df_ar
    .dropna()
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [None]:
# Persist the cleaned Arabic dataset to its own file. Exporting df_en to the
# English path here would discard the Arabic result and rewrite the English export.
arb_preprocessor.export_to_csv(df_ar, './Cleaned_Data/processed_ar_data.csv')