In [21]:
from flask import Flask, request, jsonify
from googletrans import Translator
import joblib
from nltk.stem import PorterStemmer
from spellchecker import SpellChecker
from ar_corrector.corrector import Corrector

# Flask application object; the /predict route defined below is registered on it.
app = Flask(__name__)

# Load the trained model and vectorizer
# NOTE(review): joblib.load unpickles these files, which can execute arbitrary
# code -- only load model.pkl / vectorizer.pkl from a trusted source. The paths
# are relative, so they resolve against the process's working directory.
model = joblib.load('model.pkl')
vectorizer = joblib.load('vectorizer.pkl')

# Initialize Google Translate, the SpellChecker used on the English path, and
# the ar_corrector Corrector used on the Arabic path
translator = Translator()
spell_checker = SpellChecker()
corr = Corrector()

@app.route('/predict', methods=['POST'])
def predict():
    """Predict the subcategory for the job title posted as JSON.

    Request body: a JSON object with a ``jobtitle`` string.

    Arabic titles (as decided by ``is_arabic``) are spell-corrected with the
    Arabic corrector and translated to English; other titles are
    spell-corrected as English. The resulting English text is stemmed,
    vectorized, and passed to the model.

    Returns:
        ``{'subcategory': <prediction>}`` on success, or
        ``{'error': <message>}`` with HTTP 400 when the body is not a JSON
        object or ``jobtitle`` is missing, not a string, or blank.
    """
    # silent=True makes a missing/malformed JSON body come back as None so
    # every bad-input case is answered by the explicit 400 responses below
    # instead of surfacing as an unhandled TypeError/KeyError (HTTP 500).
    request_data = request.get_json(silent=True)
    if not isinstance(request_data, dict):
        return jsonify({'error': 'Request body must be a JSON object'}), 400

    job_title = request_data.get('jobtitle')
    if not isinstance(job_title, str) or not job_title.strip():
        return jsonify({'error': "'jobtitle' must be a non-empty string"}), 400

    # Check if the job title is in Arabic
    if is_arabic(job_title):
        # Correct spelling in Arabic
        job_title_corrected = corr.contextual_correct(job_title)
        print(f"Corrected Arabic Job Title: {job_title_corrected}")
        # Translate Arabic to English
        translation = translator.translate(job_title_corrected, src='ar', dest='en')
        job_title_en = translation.text
    else:
        # Correct spelling in English
        job_title_corrected = correct_spelling(job_title)
        print(f"Corrected Job Title: {job_title_corrected}")
        job_title_en = job_title_corrected

    # Preprocess job title
    job_title_processed = preprocess_text(job_title_en)
    print(f"Processed Job Title: {job_title_processed}")

    # Predict subcategory
    job_title_vect = vectorizer.transform([job_title_processed])
    prediction = model.predict(job_title_vect)[0]

    # The label type depends on how the model was trained (not visible here).
    # If it is a numpy scalar, convert it to the matching built-in type so
    # jsonify can always serialize it.
    if hasattr(prediction, 'item'):
        prediction = prediction.item()

    return jsonify({'subcategory': prediction})

def correct_spelling(text):
    """Spell-correct each whitespace-separated word of ``text``.

    Args:
        text: The English text to correct.

    Returns:
        The corrected words joined by single spaces. A word for which the
        spell checker offers no correction is kept unchanged.
    """
    # SpellChecker.correction() returns None when it has no candidate for a
    # word (pyspellchecker >= 0.7). Fall back to the original word so that
    # ' '.join never receives None and raises TypeError.
    corrected_words = [spell_checker.correction(word) or word for word in text.split()]
    return ' '.join(corrected_words)


def preprocess_text(text):
    """Stem every whitespace-separated token of ``text`` with NLTK's PorterStemmer.

    Returns the stemmed tokens joined by single spaces.
    """
    porter = PorterStemmer()
    stems = map(porter.stem, text.split())
    return ' '.join(stems)

def is_arabic(text):
    """Return True if ``text`` contains at least one Arabic-script character.

    A single matching character is enough, so mixed Arabic/Latin text counts
    as Arabic. An empty string returns False.

    Args:
        text: The string to inspect.

    Returns:
        bool: Whether any code point falls in one of the Arabic Unicode
        blocks listed below.
    """
    # Inclusive (first, last) code point ranges.
    arabic_blocks = (
        (0x0600, 0x06FF),    # Arabic
        (0x0750, 0x077F),    # Arabic Supplement
        (0x0870, 0x08FF),    # Arabic Extended-B and Arabic Extended-A
        (0xFB50, 0xFDFF),    # Arabic Presentation Forms-A
        # Arabic Presentation Forms-B, stopping before U+FEFF: that code point
        # is the byte-order mark / zero-width no-break space, and a stray BOM
        # in front of Latin text must not make the text count as Arabic.
        (0xFE70, 0xFEFE),
        (0x1EE00, 0x1EEFF),  # Arabic Mathematical Alphabetic Symbols
    )
    return any(
        first <= ord(char) <= last
        for char in text
        for first, last in arabic_blocks
    )

# Start Flask's built-in server on port 5000. The guard is also true when this
# cell runs in the notebook (the log below shows "Serving Flask app
# '__main__'"), and app.run() blocks until it is interrupted.
if __name__ == '__main__':
    app.run(port=5000)


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit


Corrected Arabic Job Title: بات طعام في المطبخ


127.0.0.1 - - [03/Jul/2024 15:51:34] "POST /predict HTTP/1.1" 200 -


Processed Job Title: food in the kitchen
Corrected Arabic Job Title: طعام


127.0.0.1 - - [03/Jul/2024 15:51:53] "POST /predict HTTP/1.1" 200 -


Processed Job Title: food
Corrected Arabic Job Title: مطعم


127.0.0.1 - - [03/Jul/2024 15:52:07] "POST /predict HTTP/1.1" 200 -


Processed Job Title: restaur
Corrected Arabic Job Title: تصليح


127.0.0.1 - - [03/Jul/2024 15:52:33] "POST /predict HTTP/1.1" 200 -


Processed Job Title: to repair
Corrected Arabic Job Title: تصوير


127.0.0.1 - - [03/Jul/2024 15:52:49] "POST /predict HTTP/1.1" 200 -


Processed Job Title: film
Corrected Arabic Job Title: تصوير


127.0.0.1 - - [03/Jul/2024 15:52:59] "POST /predict HTTP/1.1" 200 -


Processed Job Title: film
Corrected Arabic Job Title: تصوير أجهزة كهربائية


127.0.0.1 - - [03/Jul/2024 15:54:26] "POST /predict HTTP/1.1" 200 -


Processed Job Title: photographi of electr applianc
Corrected Arabic Job Title: الا أجهزة كهربائية


127.0.0.1 - - [03/Jul/2024 15:55:10] "POST /predict HTTP/1.1" 200 -


Processed Job Title: onli electr applianc
Corrected Arabic Job Title: الا أجهزة كهربائية


127.0.0.1 - - [03/Jul/2024 15:55:32] "POST /predict HTTP/1.1" 200 -


Processed Job Title: onli electr applianc


In [26]:
from flask import Flask, request, jsonify
from googletrans import Translator
import joblib
from nltk.stem import PorterStemmer
from ar_corrector.corrector import Corrector
from pyarabic.araby import strip_tashkeel, normalize_hamza
import enchant

# Flask application object; the /predict route defined below is registered on it.
app = Flask(__name__)

# Load the trained model and vectorizer
# NOTE(review): joblib.load unpickles these files, which can execute arbitrary
# code -- only load model.pkl / vectorizer.pkl from a trusted source. The paths
# are relative, so they resolve against the process's working directory.
model = joblib.load('model.pkl')
vectorizer = joblib.load('vectorizer.pkl')

# Initialize Google Translate, Enchant SpellChecker, and Corrector
translator = Translator()
spell_checker_en = enchant.Dict("en_US")  # English spell checker
corr = Corrector()  # Arabic corrector

@app.route('/predict', methods=['POST'])
def predict():
    """Predict the subcategory for the job title posted as JSON.

    Request body: a JSON object with a ``jobtitle`` string.

    Arabic titles (as decided by ``is_arabic``) are normalized,
    spell-corrected with the Arabic corrector, and translated to English;
    other titles are spell-corrected as English. The resulting English text
    is stemmed, vectorized, and passed to the model.

    Returns:
        ``{'subcategory': <prediction>}`` on success, or
        ``{'error': <message>}`` with HTTP 400 when the body is not a JSON
        object or ``jobtitle`` is missing, not a string, or blank.
    """
    # silent=True makes a missing/malformed JSON body come back as None so
    # every bad-input case is answered by the explicit 400 responses below
    # instead of surfacing as an unhandled TypeError/KeyError (HTTP 500).
    request_data = request.get_json(silent=True)
    if not isinstance(request_data, dict):
        return jsonify({'error': 'Request body must be a JSON object'}), 400

    job_title = request_data.get('jobtitle')
    if not isinstance(job_title, str) or not job_title.strip():
        return jsonify({'error': "'jobtitle' must be a non-empty string"}), 400

    # Check if the job title is in Arabic
    if is_arabic(job_title):
        # Correct spelling and normalization in Arabic
        job_title_normalized = normalize_arabic_text(job_title)
        job_title_corrected = corr.contextual_correct(job_title_normalized)
        print(f"Corrected Arabic Job Title: {job_title_corrected}")

        # Translate Arabic to English
        translation = translator.translate(job_title_corrected, src='ar', dest='en')
        job_title_en = translation.text
    else:
        # Correct spelling in English
        job_title_corrected = correct_spelling_en(job_title)
        print(f"Corrected Job Title: {job_title_corrected}")
        job_title_en = job_title_corrected

    # Preprocess job title
    job_title_processed = preprocess_text(job_title_en)
    print(f"Processed Job Title: {job_title_processed}")

    # Predict subcategory
    job_title_vect = vectorizer.transform([job_title_processed])
    prediction = model.predict(job_title_vect)[0]

    # The label type depends on how the model was trained (not visible here).
    # If it is a numpy scalar, convert it to the matching built-in type so
    # jsonify can always serialize it.
    if hasattr(prediction, 'item'):
        prediction = prediction.item()

    return jsonify({'subcategory': prediction})

def correct_spelling_en(text):
    """Spell-correct each whitespace-separated word of ``text`` with Enchant.

    A word the dictionary accepts is kept as-is. A rejected word is replaced
    by the dictionary's first suggestion, or kept unchanged when there are no
    suggestions. The resulting words are joined by single spaces.
    """
    def best_spelling(token):
        # Only ask for suggestions when the dictionary rejects the token.
        if spell_checker_en.check(token):
            return token
        candidates = spell_checker_en.suggest(token)
        return candidates[0] if candidates else token

    return ' '.join(best_spelling(token) for token in text.split())

def normalize_arabic_text(text):
    """Normalize Arabic ``text`` before spell correction.

    Applies pyarabic's ``strip_tashkeel`` first and then ``normalize_hamza``
    to its result, returning the final string.
    """
    without_tashkeel = strip_tashkeel(text)
    return normalize_hamza(without_tashkeel)

def preprocess_text(text):
    """Reduce each whitespace-separated word of ``text`` to its Porter stem.

    A new NLTK PorterStemmer is created per call; the stems are returned
    joined by single spaces.
    """
    stem = PorterStemmer().stem
    tokens = text.split()
    return ' '.join(stem(token) for token in tokens)

def is_arabic(text):
    """Return True if ``text`` contains at least one Arabic-script character.

    A single matching character is enough, so mixed Arabic/Latin text counts
    as Arabic. An empty string returns False.

    Args:
        text: The string to inspect.

    Returns:
        bool: Whether any code point falls in one of the Arabic Unicode
        blocks listed below.
    """
    # Inclusive (first, last) code point ranges.
    arabic_blocks = (
        (0x0600, 0x06FF),    # Arabic
        (0x0750, 0x077F),    # Arabic Supplement
        (0x0870, 0x08FF),    # Arabic Extended-B and Arabic Extended-A
        (0xFB50, 0xFDFF),    # Arabic Presentation Forms-A
        # Arabic Presentation Forms-B, stopping before U+FEFF: that code point
        # is the byte-order mark / zero-width no-break space, and a stray BOM
        # in front of Latin text must not make the text count as Arabic.
        (0xFE70, 0xFEFE),
        (0x1EE00, 0x1EEFF),  # Arabic Mathematical Alphabetic Symbols
    )
    for char in text:
        code_point = ord(char)
        if any(first <= code_point <= last for first, last in arabic_blocks):
            return True
    return False

# Start Flask's built-in server on port 5000. The guard is also true when this
# cell runs in the notebook (the log below shows "Serving Flask app
# '__main__'"), and app.run() blocks until it is interrupted.
if __name__ == '__main__':
    app.run(port=5000)


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
127.0.0.1 - - [03/Jul/2024 16:50:12] "POST /predict HTTP/1.1" 200 -


Corrected Job Title: cleansing
Processed Job Title: cleans


127.0.0.1 - - [03/Jul/2024 16:50:26] "POST /predict HTTP/1.1" 200 -


Corrected Job Title: cleanser
Processed Job Title: cleanser
