<a href="https://colab.research.google.com/github/schwarer2006/CV_Job_Match/blob/main/Komplexes_JOB_CV_Script_pdf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install -r requirements.txt

In [None]:
import pdfplumber
import re
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import seaborn as sns
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
from matplotlib.backends.backend_pdf import PdfPages
from datetime import datetime

# Download the NLTK resources used below. Newer NLTK releases load the
# 'punkt_tab' tables inside word_tokenize, while older ones load the 'punkt'
# pickles, so both are fetched to keep the notebook runnable on either.
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)

def load_text(file_path):
    """Load a document and return its text normalised for matching.

    Files whose name ends in ``.pdf`` (any letter case) are read with
    pdfplumber, page by page; pages that yield no text are skipped and the
    rest are joined with single spaces. Every other file is read as UTF-8
    text.

    The text is then lower-cased, stripped of every character that is
    neither a word character nor whitespace, and finally has each run of
    whitespace collapsed to one space. Punctuation is removed *before*
    whitespace is collapsed so that a free-standing punctuation mark
    (e.g. ``"a , b"``) does not leave a double space behind.

    Args:
        file_path: Path to a PDF or UTF-8 text file (``str`` or path-like).

    Returns:
        The cleaned, lower-cased text as a single string.
    """
    # Compare the extension case-insensitively so 'CV.PDF' is not mistaken
    # for a plain-text file and decoded as UTF-8.
    if str(file_path).lower().endswith('.pdf'):
        with pdfplumber.open(file_path) as pdf:
            page_texts = [page.extract_text() for page in pdf.pages]
        text = ' '.join(chunk for chunk in page_texts if chunk)
    else:
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text

def tokenize_and_remove_stopwords(text):
    """Split *text* into word tokens and drop German stopwords.

    Tokenisation uses NLTK's ``word_tokenize`` with the German Punkt model
    (the default would be English). Stopwords are matched
    case-insensitively; surviving tokens keep their original spelling.

    Args:
        text: The text to tokenize.

    Returns:
        A list of tokens in their original order, without German stopwords.
    """
    stop_words = set(stopwords.words('german'))
    tokens = word_tokenize(text, language='german')
    # Compare in lower case so capitalised stopwords such as "Und" or "Der"
    # are removed even when the caller has not lower-cased the text.
    return [word for word in tokens if word.lower() not in stop_words]

def calculate_cosine_similarity(text1, text2):
    """Return the cosine similarity of two texts' bag-of-words counts.

    Both texts are vectorised together with a default ``CountVectorizer`` so
    they share one vocabulary, then compared with scikit-learn's
    ``cosine_similarity``.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        The similarity as a ``float``. ``0.0`` is returned when the
        vectoriser finds no usable token in either text.
    """
    try:
        # The sparse matrix is passed on as-is; cosine_similarity accepts it,
        # so no dense copy is needed.
        vectors = CountVectorizer().fit_transform([text1, text2])
    except ValueError:
        # CountVectorizer raises ValueError ("empty vocabulary") when neither
        # text contains a token; treat that as "no overlap" instead of
        # aborting the whole report.
        return 0.0
    return float(cosine_similarity(vectors)[0][1])

def _finish_page(pdf, fig):
    """Append *fig* to the open PDF and close the figure even if saving fails."""
    try:
        pdf.savefig(fig)
    finally:
        plt.close(fig)


def _draw_placeholder(ax, message):
    """Show a centred notice on *ax* when there is nothing to plot."""
    ax.text(0.5, 0.5, message, fontsize=12, ha='center', va='center', transform=ax.transAxes)
    ax.axis('off')


def save_visualizations_to_pdf(similarity_score, name, vorname, stellenname, firma, resume_text, job_description_text, resume_tokens, job_description_tokens, output_dir='/content'):
    """Write the CV/job match report to a timestamped multi-page PDF.

    Pages, in order: a summary page (personal data, date, match percentage),
    one word cloud per text, one top-20 word-frequency bar chart per token
    list, and a radar plot comparing the frequencies of the tokens that occur
    in both lists. A page that has nothing to plot (no words, no tokens, no
    shared tokens) shows a short notice instead.

    Args:
        similarity_score: Match score; it is multiplied by 100 for display,
            and values up to and including 0.50 are printed in red, higher
            values in green.
        name: Last name; shown on the summary page and used in the file name.
        vorname: First name; shown on the summary page and used in the file
            name.
        stellenname: Job title shown on the summary page.
        firma: Company name shown on the summary page.
        resume_text: Resume text used for its word cloud.
        job_description_text: Job description text used for its word cloud.
        resume_tokens: Resume tokens used for the bar chart and radar plot.
        job_description_tokens: Job description tokens used for the bar chart
            and radar plot.
        output_dir: Directory the PDF is written to. Defaults to ``/content``.

    Returns:
        None. The path of the written file is printed.
    """
    # Sample the clock once so the file name and the printed date agree.
    now = datetime.now()
    timestamp = now.strftime("%Y-%m-%d_%H-%M-%S")
    pdf_path = f'{output_dir}/visualizations_{name}_{vorname}_{timestamp}.pdf'
    score_color = 'red' if similarity_score <= 0.50 else 'green'

    # Count each token list once; the bar charts and the radar plot share these.
    resume_freq = Counter(resume_tokens)
    job_desc_freq = Counter(job_description_tokens)

    with PdfPages(pdf_path) as pdf:
        # Personal and match data on one page
        fig, ax = plt.subplots(figsize=(8, 6))
        ax.text(0.5, 0.6, f"Name: {name}\nVorname: {vorname}\nStellenname: {stellenname}\nFirma: {firma}\nDatum: {now.strftime('%d.%m.%Y')}",
                fontsize=12, ha='center', va='center', transform=ax.transAxes)
        ax.text(0.5, 0.5, f"Übereinstimmung: {similarity_score * 100:.2f}%", fontsize=12, ha='center', va='center', color=score_color, transform=ax.transAxes)
        ax.axis('off')
        _finish_page(pdf, fig)

        # Wordclouds for both texts
        for text, title in [(resume_text, 'Lebenslauf Wordcloud'), (job_description_text, 'Stellenbeschreibung Wordcloud')]:
            try:
                wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)
            except ValueError:
                # WordCloud raises ValueError when the text yields no words.
                wordcloud = None
            fig, ax = plt.subplots(figsize=(8, 4))
            ax.set_title(title)
            if wordcloud is None:
                _draw_placeholder(ax, 'Keine Wörter vorhanden')
            else:
                ax.imshow(wordcloud, interpolation='bilinear')
                ax.axis('off')
            _finish_page(pdf, fig)

        # Word frequency bar charts (20 most common tokens)
        for freq, title in [(resume_freq, 'Lebenslauf Wortfrequenz'), (job_desc_freq, 'Stellenbeschreibung Wortfrequenz')]:
            fig, ax = plt.subplots(figsize=(8, 6))
            if freq:
                df = pd.DataFrame(freq.most_common(20), columns=['Word', 'Frequency'])
                sns.barplot(x='Frequency', y='Word', data=df, ax=ax, palette='viridis')
            else:
                _draw_placeholder(ax, 'Keine Wörter vorhanden')
            ax.set_title(title)
            _finish_page(pdf, fig)

        # Radar plot over the tokens shared by both lists. Sorting fixes the
        # axis order, which would otherwise follow set iteration order and
        # could differ between runs.
        common_tokens = sorted(resume_freq.keys() & job_desc_freq.keys())
        fig, ax = plt.subplots(figsize=(6, 6), subplot_kw=dict(polar=True))
        if common_tokens:
            angles = np.linspace(0, 2 * np.pi, len(common_tokens), endpoint=False).tolist()
            ax.fill(angles, [resume_freq[token] for token in common_tokens], color='red', alpha=0.25)
            ax.fill(angles, [job_desc_freq[token] for token in common_tokens], color='green', alpha=0.25)
            ax.set_yticklabels([])
            ax.set_xticks(angles)
            ax.set_xticklabels(common_tokens, rotation=45, fontsize=8)
        else:
            _draw_placeholder(ax, 'Keine gemeinsamen Begriffe')
        _finish_page(pdf, fig)

    print(f"PDF saved at: {pdf_path}")

# Interactive driver: ask for the job description and resume, score their
# similarity and write the PDF report.
# All answers are stripped so stray spaces from typing or pasting do not end
# up in file paths or in the report's file name.
input_type = input("Enter the type of input for job description (1 - Text File, 2 - PDF File): ").strip()
if input_type == '1':
    job_description_path = input("Enter the path to the text file for job description: ").strip()
elif input_type == '2':
    job_description_path = input("Enter the path to the PDF file for job description: ").strip()
else:
    print("Invalid input. Please restart the script and enter 1 or 2.")
    # exit() is an interactive helper provided by the site module and is not
    # meant for programs; raise SystemExit directly and report failure with a
    # non-zero status.
    raise SystemExit(1)

job_description_text = load_text(job_description_path)
job_description_tokens = tokenize_and_remove_stopwords(job_description_text)

# Input fields for resume and personal details
resume_path = input("Enter the path to your resume (text or PDF): ").strip()
resume_text = load_text(resume_path)
resume_tokens = tokenize_and_remove_stopwords(resume_text)
# The score is computed on the stopword-free token streams, re-joined into
# strings because the similarity helper takes texts.
similarity_score = calculate_cosine_similarity(' '.join(resume_tokens), ' '.join(job_description_tokens))

name = input("Enter your last name: ").strip()
vorname = input("Enter your first name: ").strip()
stellenname = input("Enter the job title: ").strip()
firma = input("Enter the company name: ").strip()

save_visualizations_to_pdf(similarity_score, name, vorname, stellenname, firma, resume_text, job_description_text, resume_tokens, job_description_tokens)



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Enter the type of input for job description (1 - Text File, 2 - PDF File): 1
Enter the path to the text file for job description: /content/stellenbeschreibung.txt
Enter the path to your resume (text or PDF): /content/kenntnisse.txt
Enter your last name: Schwarz
Enter your first name: Erik
Enter the job title: BI Developer
Enter the company name: PostFinance
PDF saved at: /content/visualizations_Schwarz_Erik_2024-05-22_08-33-00.pdf
