In [None]:
import streamlit as st
import re
import joblib
import torch
import pandas as pd
from transformers import BertTokenizer, BertModel
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import fitz  # PyMuPDF

# Load the fitted artifacts this app needs. File names are resolved relative
# to the current working directory.
# NOTE(review): these loads sit at module level, so they repeat on every
# execution of the script - consider caching if that turns out to be slow.
_BERT_CHECKPOINT = 'bert-base-uncased'

# joblib-serialized artifacts: the TF-IDF vectorizer and the two classifiers.
tfidf_vectorizer = joblib.load('tfidf_vectorizer.pkl')
logistic_regression_model_tfidf = joblib.load('logistic_regression_model_tfidf.pkl')
logistic_regression_model_bert = joblib.load('logistic_regression_model_bert.pkl')

# Tokenizer and encoder are created from the same checkpoint name.
bert_tokenizer = BertTokenizer.from_pretrained(_BERT_CHECKPOINT)
bert_model = BertModel.from_pretrained(_BERT_CHECKPOINT)

# Function to get BERT embeddings
def get_bert_embeddings(text):
    """Embed text with BERT and return the first-token vector per sequence.

    Args:
        text: A string (or batch of strings) accepted by ``bert_tokenizer``.
            Input longer than 512 tokens is truncated.

    Returns:
        A NumPy array holding position 0 of the last hidden state for every
        sequence in the batch, i.e. shape ``(batch, hidden_size)``.
    """
    inputs = bert_tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    # Inference only: disabling autograd avoids building a computation graph
    # that is never used, which saves memory and time per document.
    with torch.no_grad():
        outputs = bert_model(**inputs)
    # Position 0 of each sequence is the token the tokenizer puts first.
    return outputs.last_hidden_state[:, 0, :].numpy()

# Function to extract text from PDF
def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page of an uploaded PDF.

    Args:
        pdf_file: Binary file-like object exposing ``read()``; its whole
            content is parsed as a PDF.

    Returns:
        The text of all pages joined in page order. The result is an empty
        string when no page yields any text.
    """
    # Opening the document as a context manager guarantees it is closed,
    # even if extracting one of the pages raises.
    with fitz.open(stream=pdf_file.read(), filetype="pdf") as pdf_document:
        return "".join(page.get_text() for page in pdf_document)

# Streamlit app: upload PDFs, pick a feature type, show one prediction per file.
st.title('Document Classification')
st.write('Upload multiple PDF documents from different labels to classify their type.')

uploaded_files = st.file_uploader("Choose documents...", type=["pdf"], accept_multiple_files=True)

embedding_choice = st.selectbox(
    'Select embedding method',
    ('TF-IDF', 'BERT')
)

# An empty upload list is falsy, so no separate length check is needed.
if uploaded_files:
    st.write("Processing PDFs...")

    results = []

    for uploaded_file in uploaded_files:
        text = extract_text_from_pdf(uploaded_file)

        # A PDF without a text layer (e.g. scanned pages) yields no text.
        # Predicting on an empty document would be meaningless, so skip it
        # and tell the user instead of showing a bogus label.
        if not text.strip():
            st.warning(f"No extractable text found in {uploaded_file.name}; skipping.")
            continue

        # Choose features and classifier together so the predict calls below
        # are written only once.
        if embedding_choice == 'TF-IDF':
            features = tfidf_vectorizer.transform([text])
            classifier = logistic_regression_model_tfidf
        else:
            features = get_bert_embeddings(text)
            classifier = logistic_regression_model_bert

        prediction = classifier.predict(features)
        prediction_proba = classifier.predict_proba(features)

        # Format confidence scores as a string for display, pairing every
        # probability with its class label so the numbers are interpretable.
        confidence_scores = ', '.join(
            f'{label}: {score:.2f}'
            for label, score in zip(classifier.classes_, prediction_proba[0])
        )

        results.append({
            'File Name': uploaded_file.name,
            'Predicted Label': prediction[0],
            'Sample Text': text[:100],  # Displaying only the first 100 characters for brevity
            'Confidence Scores': confidence_scores
        })

    results_df = pd.DataFrame(results)

    st.write("Classification Results")
    st.dataframe(results_df)

In [None]:
import streamlit as st
import re
import joblib
import torch
import pandas as pd
from transformers import BertTokenizer, BertModel
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import fitz  # PyMuPDF

# Load the fitted artifacts this app needs. File names are resolved relative
# to the current working directory.
# NOTE(review): these loads sit at module level, so they repeat on every
# execution of the script - consider caching if that turns out to be slow.
_BERT_CHECKPOINT = 'bert-base-uncased'

# joblib-serialized artifacts: the TF-IDF vectorizer and the two classifiers.
tfidf_vectorizer = joblib.load('tfidf_vectorizer.pkl')
logistic_regression_model_tfidf = joblib.load('logistic_regression_model_tfidf.pkl')
logistic_regression_model_bert = joblib.load('logistic_regression_model_bert.pkl')

# Tokenizer and encoder are created from the same checkpoint name.
bert_tokenizer = BertTokenizer.from_pretrained(_BERT_CHECKPOINT)
bert_model = BertModel.from_pretrained(_BERT_CHECKPOINT)

# Function to clean text
def clean_text(text):
    """Drop every character that is not an ASCII letter or whitespace, then lowercase.

    Whitespace is preserved as-is, so runs of spaces and newlines survive.
    """
    letters_and_spaces = re.sub(r'[^a-zA-Z\s]', '', text)
    return letters_and_spaces.lower()

# Function to get TF-IDF features
def get_tfidf_features(text):
    """Vectorize one document with the loaded TF-IDF vectorizer.

    The text is first passed through ``clean_text`` and then wrapped in a
    one-element list, so the returned matrix has a single row.
    """
    return tfidf_vectorizer.transform([clean_text(text)])

# Function to get BERT embeddings
def get_bert_embeddings(text):
    """Embed text with BERT and return the first-token vector per sequence.

    Args:
        text: A string (or batch of strings) accepted by ``bert_tokenizer``.
            Input longer than 512 tokens is truncated.

    Returns:
        A NumPy array holding position 0 of the last hidden state for every
        sequence in the batch, i.e. shape ``(batch, hidden_size)``.
    """
    inputs = bert_tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    # Inference only: disabling autograd avoids building a computation graph
    # that is never used, which saves memory and time per document.
    with torch.no_grad():
        outputs = bert_model(**inputs)
    # Position 0 of each sequence is the token the tokenizer puts first.
    return outputs.last_hidden_state[:, 0, :].numpy()

# Function to extract text from PDF
def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page of an uploaded PDF.

    Args:
        pdf_file: Binary file-like object exposing ``read()``; its whole
            content is parsed as a PDF.

    Returns:
        The text of all pages joined in page order. The result is an empty
        string when no page yields any text.
    """
    # Opening the document as a context manager guarantees it is closed,
    # even if extracting one of the pages raises.
    with fitz.open(stream=pdf_file.read(), filetype="pdf") as pdf_document:
        return "".join(page.get_text() for page in pdf_document)

# Streamlit app: upload PDFs, pick a feature type, show one prediction per file.
st.title('Document Classification')
st.write('Upload multiple PDF documents from different labels to classify their type.')

uploaded_files = st.file_uploader("Choose documents...", type=["pdf"], accept_multiple_files=True)

embedding_choice = st.selectbox(
    'Select embedding method',
    ('TF-IDF', 'BERT')
)

# An empty upload list is falsy, so no separate length check is needed.
if uploaded_files:
    st.write("Processing PDFs...")

    results = []

    for uploaded_file in uploaded_files:
        text = extract_text_from_pdf(uploaded_file)

        # A PDF without a text layer (e.g. scanned pages) yields no text.
        # Predicting on an empty document would be meaningless, so skip it
        # and tell the user instead of showing a bogus label.
        if not text.strip():
            st.warning(f"No extractable text found in {uploaded_file.name}; skipping.")
            continue

        # Choose features and classifier together so the predict calls below
        # are written only once.
        if embedding_choice == 'TF-IDF':
            features = get_tfidf_features(text)
            classifier = logistic_regression_model_tfidf
        else:
            features = get_bert_embeddings(text)
            classifier = logistic_regression_model_bert

        prediction = classifier.predict(features)
        # Highest class probability for this single document.
        confidence_score = classifier.predict_proba(features).max()

        results.append({
            'File Name': uploaded_file.name,
            'Predicted Label': prediction[0],
            'Sample Text': text[:100],  # Displaying only the first 100 characters for brevity
            'Confidence Score': confidence_score
        })

    results_df = pd.DataFrame(results)

    st.write("Classification Results")
    st.dataframe(results_df)

In [None]:
import streamlit as st
import re
import joblib
import torch
import pandas as pd
from transformers import BertTokenizer, BertModel
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import fitz  # PyMuPDF

# Load the fitted artifacts this app needs. File names are resolved relative
# to the current working directory.
# NOTE(review): these loads sit at module level, so they repeat on every
# execution of the script - consider caching if that turns out to be slow.
_BERT_CHECKPOINT = 'bert-base-uncased'

# joblib-serialized artifacts: the TF-IDF vectorizer and the two classifiers.
tfidf_vectorizer = joblib.load('tfidf_vectorizer.pkl')
logistic_regression_model_tfidf = joblib.load('logistic_regression_model_tfidf.pkl')
logistic_regression_model_bert = joblib.load('logistic_regression_model_bert.pkl')

# Tokenizer and encoder are created from the same checkpoint name.
bert_tokenizer = BertTokenizer.from_pretrained(_BERT_CHECKPOINT)
bert_model = BertModel.from_pretrained(_BERT_CHECKPOINT)

# Function to get BERT embeddings
def get_bert_embeddings(text):
    """Embed text with BERT and return the first-token vector per sequence.

    Args:
        text: A string (or batch of strings) accepted by ``bert_tokenizer``.
            Input longer than 512 tokens is truncated.

    Returns:
        A NumPy array holding position 0 of the last hidden state for every
        sequence in the batch, i.e. shape ``(batch, hidden_size)``.
    """
    inputs = bert_tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    # Inference only: disabling autograd avoids building a computation graph
    # that is never used, which saves memory and time per document.
    with torch.no_grad():
        outputs = bert_model(**inputs)
    # Position 0 of each sequence is the token the tokenizer puts first.
    return outputs.last_hidden_state[:, 0, :].numpy()

# Function to extract text from PDF
def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page of an uploaded PDF.

    Args:
        pdf_file: Binary file-like object exposing ``read()``; its whole
            content is parsed as a PDF.

    Returns:
        The text of all pages joined in page order. The result is an empty
        string when no page yields any text.
    """
    # Opening the document as a context manager guarantees it is closed,
    # even if extracting one of the pages raises.
    with fitz.open(stream=pdf_file.read(), filetype="pdf") as pdf_document:
        return "".join(page.get_text() for page in pdf_document)

# Streamlit app: upload PDFs, pick a feature type, show one prediction per file.
st.title('Document Classification')
st.write('Upload multiple PDF documents from different labels to classify their type.')

uploaded_files = st.file_uploader("Choose documents...", type=["pdf"], accept_multiple_files=True)

embedding_choice = st.selectbox(
    'Select embedding method',
    ('TF-IDF', 'BERT')
)

# An empty upload list is falsy, so no separate length check is needed.
if uploaded_files:
    st.write("Processing PDFs...")

    results = []

    for uploaded_file in uploaded_files:
        text = extract_text_from_pdf(uploaded_file)

        # A PDF without a text layer (e.g. scanned pages) yields no text.
        # Predicting on an empty document would be meaningless, so skip it
        # and tell the user instead of showing a bogus label.
        if not text.strip():
            st.warning(f"No extractable text found in {uploaded_file.name}; skipping.")
            continue

        # Choose features and classifier together so the predict calls below
        # are written only once.
        if embedding_choice == 'TF-IDF':
            features = tfidf_vectorizer.transform([text])
            classifier = logistic_regression_model_tfidf
        else:
            features = get_bert_embeddings(text)
            classifier = logistic_regression_model_bert

        prediction = classifier.predict(features)
        # Highest class probability for this single document.
        confidence_score = classifier.predict_proba(features).max()

        results.append({
            'File Name': uploaded_file.name,
            'Predicted Label': prediction[0],
            'Sample Text': text[:100],  # Displaying only the first 100 characters for brevity
            'Confidence Score': confidence_score
        })

    results_df = pd.DataFrame(results)

    st.write("Classification Results")
    st.dataframe(results_df)

In [None]:
import os
import fitz  # PyMuPDF
import pytesseract
from PIL import Image
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import MultiLabelBinarizer
import joblib

# Download stopwords
nltk.download('stopwords')

# Path to Tesseract OCR executable. The default is the original Windows
# install location; set the TESSERACT_CMD environment variable to point at a
# different binary without editing this file.
pytesseract.pytesseract.tesseract_cmd = os.environ.get(
    'TESSERACT_CMD',
    'C:\\Users\\hp\\AppData\\Local\\Programs\\Tesseract-OCR\\tesseract.exe',
)

# Define the folders containing the PDF files (relative to the working
# directory).
pdf_folder_paths = ['Eyewear', 'Jewellery']

# Function to clean text
def clean_text(text):
    """Normalize raw text: ASCII letters only, lowercase, English stopwords removed.

    Args:
        text: Raw text to clean.

    Returns:
        The remaining words joined by single spaces; ``""`` when no word
        survives.
    """
    # Fetch the stopword list once and keep it in a set. Looking it up inside
    # the per-word filter would reload the list for every single word.
    english_stopwords = set(stopwords.words('english'))
    # Remove everything except ASCII letters and whitespace, then lowercase.
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text).lower()
    return ' '.join(word for word in letters_only.split() if word not in english_stopwords)

# Function to extract text from PDF using OCR
def pdf_to_text(pdf_path):
    """OCR every page of a PDF and return the recognized text.

    Each page is rendered to a pixmap, converted to a PIL image and passed
    to Tesseract.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The OCR output of all pages concatenated in page order, or ``""``
        when anything fails (the error is printed, not raised).
    """
    try:
        page_texts = []
        # The context manager closes the document even if OCR fails midway.
        with fitz.open(pdf_path) as doc:
            for page in doc:
                pix = page.get_pixmap()
                # assumes the default pixmap is 3-channel RGB without alpha - TODO confirm
                img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                page_texts.append(pytesseract.image_to_string(img))
        return "".join(page_texts)
    except Exception as e:
        # Deliberately best-effort: one unreadable file must not abort the
        # whole batch, so report it and return no text for this document.
        print(f"Error processing {pdf_path}: {e}")
        return ""

# Function to process PDFs in a folder
def process_pdfs(folder_path):
    """Run OCR on every PDF directly inside a folder.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        A list with one dict per PDF, holding the keys ``'Filename'``,
        ``'Extracted Text'`` and ``'Folder'``. The list is empty when the
        folder contains no PDF.
    """
    data = []
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # record order - and anything derived from it, such as a seeded
    # train/test split - reproducible across machines.
    for filename in sorted(os.listdir(folder_path)):
        # Compare case-insensitively so files named "*.PDF" are not skipped.
        if not filename.lower().endswith(".pdf"):
            continue
        pdf_path = os.path.join(folder_path, filename)
        text = pdf_to_text(pdf_path)
        data.append({'Filename': filename, 'Extracted Text': text, 'Folder': folder_path})
        print(f"OCR completed for {filename} in folder {folder_path}")
    return data

if __name__ == "__main__":
    # Run OCR over every configured folder and collect one record per PDF.
    all_extracted_data = []
    for folder_path in pdf_folder_paths:
        all_extracted_data.extend(process_pdfs(folder_path))

    # Fail early with a clear message; an empty DataFrame would otherwise
    # raise an opaque KeyError on the missing 'Extracted Text' column.
    if not all_extracted_data:
        raise SystemExit(f"No PDF files found in {pdf_folder_paths}; nothing to train on.")

    df = pd.DataFrame(all_extracted_data)

    # Clean the extracted text
    df['Cleaned Text'] = df['Extracted Text'].apply(clean_text)

    # Prepare labels: the source folder is the class label.
    df['Label'] = df['Folder']

    # Transform labels into binary indicator columns (one column per class).
    mlb = MultiLabelBinarizer()
    y = mlb.fit_transform(df['Label'].apply(lambda x: [x]))

    # Split the documents BEFORE fitting TF-IDF so the vocabulary and IDF
    # weights are learned from training documents only. Fitting on the full
    # corpus leaks test-set statistics into the features and inflates the
    # evaluation below.
    text_train, text_test, y_train, y_test = train_test_split(
        df['Cleaned Text'], y, test_size=0.3, random_state=42
    )

    # TF-IDF feature extraction (fit on train, apply to test)
    tfidf_vectorizer = TfidfVectorizer()
    X_train = tfidf_vectorizer.fit_transform(text_train)
    X_test = tfidf_vectorizer.transform(text_test)

    # Logistic Regression Model training: one L1-regularized binary
    # classifier per indicator column.
    logistic_regressor = LogisticRegression(solver='liblinear', penalty='l1', C=0.1)
    multi_target_classifier = MultiOutputClassifier(logistic_regressor, n_jobs=None)
    multi_target_classifier.fit(X_train, y_train)

    # Evaluate model
    y_pred = multi_target_classifier.predict(X_test)
    print("Classification Report:")
    print(classification_report(y_test, y_pred, target_names=mlb.classes_))

    # Save the TF-IDF vectorizer and model
    # NOTE(review): the saved model predicts indicator rows and `mlb` is not
    # persisted, so code loading this file cannot map predictions back to
    # label names - confirm that consumers expect this format.
    joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')
    joblib.dump(multi_target_classifier, 'logistic_regression_model_tfidf.pkl')

    print("Model and vectorizer saved successfully.")

In [None]:
import os
import fitz  # PyMuPDF
import pytesseract
from PIL import Image
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
import joblib

# Download stopwords
nltk.download('stopwords')

# Path to Tesseract OCR executable. The default is the original Windows
# install location; set the TESSERACT_CMD environment variable to point at a
# different binary without editing this file.
pytesseract.pytesseract.tesseract_cmd = os.environ.get(
    'TESSERACT_CMD',
    'C:\\Users\\hp\\AppData\\Local\\Programs\\Tesseract-OCR\\tesseract.exe',
)

# Define the folders containing the PDF files (relative to the working
# directory).
pdf_folder_paths = ['Eyewear', 'Jewellery']

# Function to clean text
def clean_text(text):
    """Normalize raw text: ASCII letters only, lowercase, English stopwords removed.

    Args:
        text: Raw text to clean.

    Returns:
        The remaining words joined by single spaces; ``""`` when no word
        survives.
    """
    # Fetch the stopword list once and keep it in a set. Looking it up inside
    # the per-word filter would reload the list for every single word.
    english_stopwords = set(stopwords.words('english'))
    # Remove everything except ASCII letters and whitespace, then lowercase.
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text).lower()
    return ' '.join(word for word in letters_only.split() if word not in english_stopwords)

# Function to extract text from PDF using OCR
def pdf_to_text(pdf_path):
    """OCR every page of a PDF and return the recognized text.

    Each page is rendered to a pixmap, converted to a PIL image and passed
    to Tesseract.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The OCR output of all pages concatenated in page order, or ``""``
        when anything fails (the error is printed, not raised).
    """
    try:
        page_texts = []
        # The context manager closes the document even if OCR fails midway.
        with fitz.open(pdf_path) as doc:
            for page in doc:
                pix = page.get_pixmap()
                # assumes the default pixmap is 3-channel RGB without alpha - TODO confirm
                img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                page_texts.append(pytesseract.image_to_string(img))
        return "".join(page_texts)
    except Exception as e:
        # Deliberately best-effort: one unreadable file must not abort the
        # whole batch, so report it and return no text for this document.
        print(f"Error processing {pdf_path}: {e}")
        return ""

# Function to process PDFs in a folder
def process_pdfs(folder_path):
    """Run OCR on every PDF directly inside a folder.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        A list with one dict per PDF, holding the keys ``'Filename'``,
        ``'Extracted Text'`` and ``'Folder'``. The list is empty when the
        folder contains no PDF.
    """
    data = []
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # record order - and anything derived from it, such as a seeded
    # train/test split - reproducible across machines.
    for filename in sorted(os.listdir(folder_path)):
        # Compare case-insensitively so files named "*.PDF" are not skipped.
        if not filename.lower().endswith(".pdf"):
            continue
        pdf_path = os.path.join(folder_path, filename)
        text = pdf_to_text(pdf_path)
        data.append({'Filename': filename, 'Extracted Text': text, 'Folder': folder_path})
        print(f"OCR completed for {filename} in folder {folder_path}")
    return data

if __name__ == "__main__":
    # Run OCR over every configured folder and collect one record per PDF.
    all_extracted_data = []
    for folder_path in pdf_folder_paths:
        all_extracted_data.extend(process_pdfs(folder_path))

    # Fail early with a clear message; an empty DataFrame would otherwise
    # raise an opaque KeyError on the missing 'Extracted Text' column.
    if not all_extracted_data:
        raise SystemExit(f"No PDF files found in {pdf_folder_paths}; nothing to train on.")

    df = pd.DataFrame(all_extracted_data)

    # Clean the extracted text
    df['Cleaned Text'] = df['Extracted Text'].apply(clean_text)

    # Prepare labels: the source folder is the class label.
    df['Label'] = df['Folder']

    # Split the documents BEFORE fitting TF-IDF so the vocabulary and IDF
    # weights are learned from training documents only. Fitting on the full
    # corpus leaks test-set statistics into the features and inflates the
    # evaluation below.
    text_train, text_test, y_train, y_test = train_test_split(
        df['Cleaned Text'], df['Label'], test_size=0.2, random_state=42
    )

    # TF-IDF feature extraction (fit on train, apply to test)
    tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2), min_df=5, max_df=0.7)
    X_train = tfidf_vectorizer.fit_transform(text_train)
    X_test = tfidf_vectorizer.transform(text_test)

    # Logistic Regression Model with hyperparameter tuning
    pipeline = Pipeline([
        ('svd', TruncatedSVD(n_components=100)),  # Dimensionality reduction
        ('log_reg', LogisticRegression())
    ])

    # The grid holds a single candidate, so the search amounts to a 5-fold
    # cross-validation followed by a refit on the full training split.
    param_grid = {
        'log_reg__C': [0.09],
        'log_reg__solver': ['liblinear']
    }

    grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_

    # Evaluate model
    y_pred = best_model.predict(X_test)
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

    # Save the TF-IDF vectorizer and model
    joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')
    joblib.dump(best_model, 'logistic_regression_model_tfidf.pkl')

    print("Model and vectorizer saved successfully.")

In [None]:
import os
import fitz  # PyMuPDF
import pytesseract
from PIL import Image
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import joblib
 
# Download stopwords
nltk.download('stopwords')

# Path to Tesseract OCR executable. The default is the original Windows
# install location; set the TESSERACT_CMD environment variable to point at a
# different binary without editing this file.
pytesseract.pytesseract.tesseract_cmd = os.environ.get(
    'TESSERACT_CMD',
    'C:\\Users\\hp\\AppData\\Local\\Programs\\Tesseract-OCR\\tesseract.exe',
)

# Define the folders containing the PDF files (relative to the working
# directory).
pdf_folder_paths = ['Eyewear','FMCG','Footwear','Hardware','IT','Jewellery','Pharma']
 
# Function to clean text
def clean_text(text):
    """Normalize raw text: ASCII letters only, lowercase, English stopwords removed.

    Args:
        text: Raw text to clean.

    Returns:
        The remaining words joined by single spaces; ``""`` when no word
        survives.
    """
    # Fetch the stopword list once and keep it in a set. Looking it up inside
    # the per-word filter would reload the list for every single word.
    english_stopwords = set(stopwords.words('english'))
    # Remove everything except ASCII letters and whitespace, then lowercase.
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text).lower()
    return ' '.join(word for word in letters_only.split() if word not in english_stopwords)
 
# Function to extract text from PDF using OCR
def pdf_to_text(pdf_path):
    """OCR every page of a PDF and return the recognized text.

    Each page is rendered to a pixmap, converted to a PIL image and passed
    to Tesseract.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The OCR output of all pages concatenated in page order, or ``""``
        when anything fails (the error is printed, not raised).
    """
    try:
        page_texts = []
        # The context manager closes the document even if OCR fails midway.
        with fitz.open(pdf_path) as doc:
            for page in doc:
                pix = page.get_pixmap()
                # assumes the default pixmap is 3-channel RGB without alpha - TODO confirm
                img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                page_texts.append(pytesseract.image_to_string(img))
        return "".join(page_texts)
    except Exception as e:
        # Deliberately best-effort: one unreadable file must not abort the
        # whole batch, so report it and return no text for this document.
        print(f"Error processing {pdf_path}: {e}")
        return ""
 
# Function to process PDFs in a folder
def process_pdfs(folder_path):
    """Run OCR on every PDF directly inside a folder.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        A list with one dict per PDF, holding the keys ``'Filename'``,
        ``'Extracted Text'`` and ``'Folder'``. The list is empty when the
        folder contains no PDF.
    """
    data = []
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # record order - and anything derived from it, such as a seeded
    # train/test split - reproducible across machines.
    for filename in sorted(os.listdir(folder_path)):
        # Compare case-insensitively so files named "*.PDF" are not skipped.
        if not filename.lower().endswith(".pdf"):
            continue
        pdf_path = os.path.join(folder_path, filename)
        text = pdf_to_text(pdf_path)
        data.append({'Filename': filename, 'Extracted Text': text, 'Folder': folder_path})
        print(f"OCR completed for {filename} in folder {folder_path}")
    return data
 
if __name__ == "__main__":
    # Run OCR over every configured folder and collect one record per PDF.
    all_extracted_data = []
    for folder_path in pdf_folder_paths:
        all_extracted_data.extend(process_pdfs(folder_path))

    # Fail early with a clear message; an empty DataFrame would otherwise
    # raise an opaque KeyError on the missing 'Extracted Text' column.
    if not all_extracted_data:
        raise SystemExit(f"No PDF files found in {pdf_folder_paths}; nothing to train on.")

    df = pd.DataFrame(all_extracted_data)

    # Clean the extracted text
    df['Cleaned Text'] = df['Extracted Text'].apply(clean_text)

    # Prepare labels: the source folder is the class label.
    df['Label'] = df['Folder']

    # Split the documents BEFORE fitting TF-IDF so the vocabulary and IDF
    # weights are learned from training documents only. Fitting on the full
    # corpus leaks test-set statistics into the features and inflates the
    # evaluation below.
    text_train, text_test, y_train, y_test = train_test_split(
        df['Cleaned Text'], df['Label'], test_size=0.2, random_state=42
    )

    # TF-IDF feature extraction (fit on train, apply to test)
    tfidf_vectorizer = TfidfVectorizer()
    X_train = tfidf_vectorizer.fit_transform(text_train)
    X_test = tfidf_vectorizer.transform(text_test)

    # Logistic Regression Model training with L2 regularization
    model_tfidf = LogisticRegression(penalty='l2', C=0.5)
    model_tfidf.fit(X_train, y_train)

    # Evaluate model
    y_pred = model_tfidf.predict(X_test)
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

    # Save the TF-IDF vectorizer and model
    joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')
    joblib.dump(model_tfidf, 'logistic_regression_model_tfidf.pkl')

    print("Model and vectorizer saved successfully.")

In [None]:
import streamlit as st
import re
import joblib
import torch
import pandas as pd
from transformers import BertTokenizer, BertModel
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import fitz  # PyMuPDF

# Load the fitted artifacts this app needs. File names are resolved relative
# to the current working directory.
# NOTE(review): these loads sit at module level, so they repeat on every
# execution of the script - consider caching if that turns out to be slow.
_BERT_CHECKPOINT = 'bert-base-uncased'

# joblib-serialized artifacts: the TF-IDF vectorizer and the two classifiers.
tfidf_vectorizer = joblib.load('tfidf_vectorizer.pkl')
logistic_regression_model_tfidf = joblib.load('logistic_regression_model_tfidf.pkl')
logistic_regression_model_bert = joblib.load('logistic_regression_model_bert.pkl')

# Tokenizer and encoder are created from the same checkpoint name.
bert_tokenizer = BertTokenizer.from_pretrained(_BERT_CHECKPOINT)
bert_model = BertModel.from_pretrained(_BERT_CHECKPOINT)

# Function to get BERT embeddings
def get_bert_embeddings(text):
    """Embed text with BERT and return the first-token vector per sequence.

    Args:
        text: A string (or batch of strings) accepted by ``bert_tokenizer``.
            Input longer than 512 tokens is truncated.

    Returns:
        A NumPy array holding position 0 of the last hidden state for every
        sequence in the batch, i.e. shape ``(batch, hidden_size)``.
    """
    inputs = bert_tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    # Inference only: disabling autograd avoids building a computation graph
    # that is never used, which saves memory and time per document.
    with torch.no_grad():
        outputs = bert_model(**inputs)
    # Position 0 of each sequence is the token the tokenizer puts first.
    return outputs.last_hidden_state[:, 0, :].numpy()

# Function to extract text from PDF
def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page of an uploaded PDF.

    Args:
        pdf_file: Binary file-like object exposing ``read()``; its whole
            content is parsed as a PDF.

    Returns:
        The text of all pages joined in page order. The result is an empty
        string when no page yields any text.
    """
    # Opening the document as a context manager guarantees it is closed,
    # even if extracting one of the pages raises.
    with fitz.open(stream=pdf_file.read(), filetype="pdf") as pdf_document:
        return "".join(page.get_text() for page in pdf_document)

# Streamlit app: upload PDFs, pick a feature type, show one prediction per file.
st.title('Document Classification')
st.write('Upload multiple PDF documents from different labels to classify their type.')

uploaded_files = st.file_uploader("Choose documents...", type=["pdf"], accept_multiple_files=True)

embedding_choice = st.selectbox(
    'Select embedding method',
    ('TF-IDF', 'BERT')
)

# An empty upload list is falsy, so no separate length check is needed.
if uploaded_files:
    st.write("Processing PDFs...")

    results = []

    for uploaded_file in uploaded_files:
        text = extract_text_from_pdf(uploaded_file)

        # A PDF without a text layer (e.g. scanned pages) yields no text.
        # Predicting on an empty document would be meaningless, so skip it
        # and tell the user instead of showing a bogus label.
        if not text.strip():
            st.warning(f"No extractable text found in {uploaded_file.name}; skipping.")
            continue

        # Choose features and classifier together so the predict calls below
        # are written only once.
        if embedding_choice == 'TF-IDF':
            features = tfidf_vectorizer.transform([text])
            classifier = logistic_regression_model_tfidf
        else:
            features = get_bert_embeddings(text)
            classifier = logistic_regression_model_bert

        prediction = classifier.predict(features)
        # Highest class probability for this single document.
        confidence_score = classifier.predict_proba(features).max()

        results.append({
            'File Name': uploaded_file.name,
            'Predicted Label': prediction[0],
            'Sample Text': text[:100],  # Displaying only the first 100 characters for brevity
            'Confidence Score': confidence_score
        })

    results_df = pd.DataFrame(results)

    st.write("Classification Results")
    st.dataframe(results_df)
