Import libraries and modules

In [28]:
import nltk
from collections import Counter
import plotly.graph_objects as go
import plotly.offline
import psycopg2
import logging

# Configure logging once for the notebook: INFO and above, with each record
# rendered as "<timestamp> - <level> - <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Module-level logger shared by the functions defined in the cells below.
logger = logging.getLogger(__name__)

Download NLTK Resources

In [29]:
def download_nltk_resources():
    """Download the NLTK resources required for text analysis.

    Requests the ``punkt_tab`` tokenizer data and the ``stopwords`` corpus.

    ``nltk.download`` reports most failures (for example, no network access)
    by returning ``False`` rather than raising, and ``quiet=True`` hides its
    console output, so each return value is checked here. A failed download
    is logged as a warning instead of raising because the resource may
    already be installed locally from an earlier run.

    Raises:
        Exception: Re-raised unchanged if ``nltk.download`` itself raises.
    """
    resources = ("punkt_tab", "stopwords")
    try:
        # Collect the names whose download did not report success.
        failed = [name for name in resources if not nltk.download(name, quiet=True)]
    except Exception as e:
        logger.error(f"Failed to download NLTK resources: {str(e)}")
        raise

    if failed:
        logger.warning(f"Could not download NLTK resources: {', '.join(failed)}")
    else:
        logger.info("NLTK resources downloaded successfully")


# Make the resources available before any cell tokenizes text.
download_nltk_resources()

2025-04-07 21:48:34,819 - INFO - NLTK resources downloaded successfully


Fetch Job Descriptions from Database

In [30]:
def fetch_job_descriptions():
    """Fetch job description text from the PostgreSQL database.

    Selects the ``text`` column of every row in ``job_descriptions``. The
    connection and cursor are always released, including when the query
    fails.

    Returns:
        List containing the first column of each fetched row (an entry is
        ``None`` when the column is NULL). An empty list is returned when the
        table has no rows or when any error occurs; errors are logged rather
        than raised.
    """
    try:
        # NOTE(review): connection settings, including the password, are
        # hard-coded; consider loading them from the environment or a config
        # file that is kept out of version control.
        conn = psycopg2.connect(
            dbname="api_development_db",
            user="postgres",
            password="semah",
            host="localhost",
            port="5432"
        )
        try:
            # The cursor context manager closes the cursor on exit.
            with conn.cursor() as cursor:
                cursor.execute("SELECT text FROM job_descriptions")
                job_texts = [row[0] for row in cursor.fetchall()]
        finally:
            # Close explicitly so the connection is released even when the
            # query raises (psycopg2's `with conn:` would not close it).
            conn.close()

        if not job_texts:
            logger.warning("No job descriptions found in the database")
        else:
            logger.info(f"Fetched {len(job_texts)} job descriptions from database")
        return job_texts
    except Exception as e:
        logger.error(f"Error fetching job descriptions: {str(e)}")
        return []


# Load the corpus once; later cells read `job_texts`.
job_texts = fetch_job_descriptions()

2025-04-07 21:48:37,592 - INFO - Fetched 14 job descriptions from database


Analyze Text Function

In [None]:
def analyze_text(text_data, top_n=20):
    """Analyze textual data to extract word frequency and basic statistics, excluding numbers.

    The texts are joined, lower-cased and tokenized with
    ``nltk.word_tokenize``. Only purely alphabetic tokens are treated as
    words, so punctuation and numbers are excluded from both the frequency
    table and the statistics. English stop words are additionally excluded
    from the frequency table, but still count towards the word statistics.

    Args:
        text_data: List of text strings to analyze. ``None`` entries (for
            example, NULL database values) are skipped.
        top_n: Number of most frequent words to return. Defaults to 20.

    Returns:
        Dictionary with two keys:
            "top_words": list of (word, count) tuples, most frequent first.
            "stats": dictionary with "total_documents", "total_words",
                "unique_words" and "average_words_per_document".
        If any error occurs it is logged and
        ``{"top_words": [], "stats": {}}`` is returned instead.
    """
    try:
        # A None entry would make str.join raise and discard the whole analysis.
        documents = [text for text in text_data if text is not None]

        stop_words = set(nltk.corpus.stopwords.words("english"))
        tokens = nltk.word_tokenize(" ".join(documents).lower())

        # Tokens such as "," or "2024" are not words; keep alphabetic ones only.
        words = [token for token in tokens if token.isalpha()]
        word_freq = Counter(word for word in words if word not in stop_words)
        top_words = word_freq.most_common(top_n)

        total_docs = len(documents)
        total_words = len(words)
        unique_words = len(set(words))
        # Guard the division so an empty corpus yields 0 instead of raising.
        avg_words_per_doc = total_words / total_docs if total_docs > 0 else 0

        stats = {
            "total_documents": total_docs,
            "total_words": total_words,
            "unique_words": unique_words,
            "average_words_per_document": avg_words_per_doc
        }

        logger.info("Text analysis completed with statistics")
        return {"top_words": top_words, "stats": stats}
    except Exception as e:
        logger.error(f"Error during text analysis: {str(e)}")
        return {"top_words": [], "stats": {}}


# Run the analysis on the fetched descriptions and expose the results to
# the following cells.
analysis_result = analyze_text(job_texts)
top_words = analysis_result["top_words"]
stats = analysis_result["stats"]
logger.info(f"Top words: {top_words}")
logger.info(f"Statistics: {stats}")

2025-04-07 21:24:14,118 - INFO - Text analysis completed, top 20 alphabetic words extracted
2025-04-07 21:24:14,120 - INFO - Top words: [('job', 14), ('years', 13), ('legal', 8), ('position', 6), ('experience', 6), ('assistant', 5), ('offering', 5), ('case', 5), ('preparation', 5), ('analysis', 5), ('law', 4), ('attorney', 3), ('practice', 3), ('intellectual', 3), ('property', 3), ('client', 3), ('counseling', 3), ('role', 3), ('paralegal', 2), ('expertise', 2)]


Generate and Display Plot

In [31]:
def display_word_frequency_plot(top_words):
    """Generate and display a bar plot of word frequencies using Plotly in the notebook.

    The chart title reflects the number of words actually supplied. When
    ``top_words`` is empty nothing is drawn and a warning is logged. Any
    error raised while building or showing the figure is logged, not raised.

    Args:
        top_words: List of tuples with (word, frequency) pairs.
    """
    try:
        if not top_words:
            # An empty bar chart would be reported as a success; say so instead.
            logger.warning("No word frequencies to plot")
            return

        words = [word for word, _ in top_words]
        frequencies = [freq for _, freq in top_words]

        fig = go.Figure(data=[go.Bar(x=words, y=frequencies)])
        fig.update_layout(
            # Derive the count from the data so the title stays accurate
            # when fewer (or more) than 20 words are passed in.
            title=f"Top {len(words)} Most Frequent Words in Job Descriptions",
            xaxis_title="Words",
            yaxis_title="Frequency"
        )

        plotly.offline.init_notebook_mode(connected=True)
        fig.show()
        logger.info("Word frequency plot displayed in notebook")
    except Exception as e:
        logger.error(f"Error generating plot: {str(e)}")


# Render the chart for the words computed by the analysis cell.
display_word_frequency_plot(top_words)

2025-04-07 21:49:21,470 - INFO - Word frequency plot displayed in notebook
