In [4]:
# Import necessary libraries
import pandas as pd
import requests
from PIL import Image
import pytesseract
import cv2
import numpy as np
from io import BytesIO
import re
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Define constants
ALLOWED_UNITS = {'gram', 'centimetre', 'millilitre', 'kilogram', 'millimetre', 'ounce', 'litre'}

# Load datasets
train_data = pd.read_csv('/home/rguktrkvalley/Desktop/train1.csv')
test_data = pd.read_csv('/home/rguktrkvalley/Desktop/sample_test.csv')

# Function to download images
def download_image(url):
    try:
        response = requests.get(url)
        response.raise_for_status()  # Ensure we notice bad responses
        img = Image.open(BytesIO(response.content))
        return img
    except requests.RequestException as e:
        print(f"Error downloading image: {e}")
        return None
    except IOError as e:
        print(f"Error opening image: {e}")
        return None

# Function to extract text from image
def extract_text_from_image(pil_image):
    try:
        # Convert PIL image to a format OpenCV can work with
        open_cv_image = cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR)
        
        # Convert the image to grayscale
        gray_image = cv2.cvtColor(open_cv_image, cv2.COLOR_BGR2GRAY)
        
        # Apply Gaussian Blur to reduce noise
        blurred_image = cv2.GaussianBlur(gray_image, (5, 5), 0)
        
        # Use adaptive thresholding to binarize the image
        binary_image = cv2.adaptiveThreshold(blurred_image, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                             cv2.THRESH_BINARY, 11, 2)
        
        # Use Tesseract to extract text from the processed image
        text = pytesseract.image_to_string(binary_image)
        
        return text
    except Exception as e:
        print(f"Error extracting text: {e}")
        return ""

# Function to clean and process extracted text
def clean_extracted_text(text):
    # Regex to find numbers followed by units
    pattern = r'(\d+\.?\d*)\s*(gram|g|cm|centimeter|ml|ounce|kg|kilogram|litre|mm)'
    matches = re.findall(pattern, text.lower())  # Find all matches
    
    cleaned_results = []
    
    for match in matches:
        number = match[0]
        unit = match[1]
        # Map shorthand to allowed unit
        unit_mapping = {
            'g': 'gram',
            'cm': 'centimetre',
            'ml': 'millilitre',
            'kg': 'kilogram',
            'mm': 'millimetre',
            'ounce': 'ounce',
            'litre': 'litre'
        }
        unit = unit_mapping.get(unit, unit)
        if unit in ALLOWED_UNITS:
            cleaned_results.append(f"{number} {unit}")
    
    return cleaned_results[0] if cleaned_results else ""

# Feature extraction and preparation of training data
features = []
labels = []

for idx, row in train_data.iterrows():
    image_url = row['image_link']
    img = download_image(image_url)
    
    if img:
        extracted_text = extract_text_from_image(img)
        cleaned_text = clean_extracted_text(extracted_text)
        features.append(cleaned_text)  # Store extracted text as feature
        labels.append(row['entity_value'])  # Store the actual entity value

# Convert features and labels to DataFrame
features_df = pd.DataFrame(features, columns=['extracted_text'])
labels_df = pd.DataFrame(labels, columns=['entity_value'])

# Convert text labels to numerical values for classification
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
labels_encoded = le.fit_transform(labels_df['entity_value'])

# Split the dataset into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(features_df['extracted_text'], labels_encoded, test_size=0.2, random_state=42)

# Vectorize the text data
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_val_vectorized = vectorizer.transform(X_val)

# Train the Decision Tree Classifier
model = DecisionTreeClassifier()
model.fit(X_train_vectorized, y_train)

# Evaluate the model
y_pred = model.predict(X_val_vectorized)
print(classification_report(y_val, y_pred, target_names=le.classes_))

# Main processing loop for the test dataset
predictions = []
for idx, row in test_data.iterrows():
    image_url = row['image_link']
    img = download_image(image_url)
    
    if img:
        extracted_text = extract_text_from_image(img)
        cleaned_text = clean_extracted_text(extracted_text)
        
        # Vectorize the cleaned text for prediction
        cleaned_text_vectorized = vectorizer.transform([cleaned_text])
        prediction_encoded = model.predict(cleaned_text_vectorized)
        
        # Decode the prediction back to the original label
        prediction = le.inverse_transform(prediction_encoded)[0]
        
        # Append the prediction to the list
        predictions.append({"index": row['index'], "prediction": prediction})
    else:
        predictions.append({"index": row['index'], "prediction": ""})

# Convert predictions to DataFrame
predictions_df = pd.DataFrame(predictions)

# Save the predictions to CSV in the required format
output_file = '/home/rguktrkvalley/Desktop/test_out2.csv'
predictions_df.to_csv(output_file, index=False)

print(f'Predictions saved to {output_file}')


ValueError: Number of classes, 21, does not match size of target_names, 65. Try specifying the labels parameter

In [5]:
# Import necessary libraries
import pandas as pd
import requests
from PIL import Image
import pytesseract
import cv2
import numpy as np
import re
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from io import BytesIO

# Define constants
ALLOWED_UNITS = {'gram', 'centimetre', 'millilitre', 'kilogram', 'millimetre', 'ounce', 'litre'}

# Load datasets
train_data = pd.read_csv('/home/rguktrkvalley/Desktop/train1.csv') 
test_data = pd.read_csv('/home/rguktrkvalley/Desktop/sample_test.csv')   

# Function to download images
def download_image(url):
    try:
        response = requests.get(url)
        response.raise_for_status()  # Ensure we notice bad responses
        img = Image.open(BytesIO(response.content))
        return img
    except requests.RequestException as e:
        print(f"Error downloading image: {e}")
        return None
    except IOError as e:
        print(f"Error opening image: {e}")
        return None

# Function to extract text from image
def extract_text_from_image(pil_image):
    try:
        # Convert PIL image to a format OpenCV can work with
        open_cv_image = cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR)
        
        # Convert the image to grayscale
        gray_image = cv2.cvtColor(open_cv_image, cv2.COLOR_BGR2GRAY)
        
        # Apply Gaussian Blur to reduce noise
        blurred_image = cv2.GaussianBlur(gray_image, (5, 5), 0)
        
        # Use adaptive thresholding to binarize the image
        binary_image = cv2.adaptiveThreshold(blurred_image, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                             cv2.THRESH_BINARY, 11, 2)
        
        # Use Tesseract to extract text from the processed image
        text = pytesseract.image_to_string(binary_image)
        
        return text
    except Exception as e:
        print(f"Error extracting text: {e}")
        return ""

# Function to clean and process extracted text
def clean_extracted_text(text):
    # Regex to find numbers followed by units
    pattern = r'(\d+\.?\d*)\s*(gram|g|cm|centimeter|ml|ounce|kg|kilogram|litre|mm)'
    matches = re.findall(pattern, text.lower())  # Find all matches
    
    cleaned_results = []
    
    for match in matches:
        number = match[0]
        unit = match[1]
        # Map shorthand to allowed unit
        unit_mapping = {
            'g': 'gram',
            'cm': 'centimetre',
            'ml': 'millilitre',
            'kg': 'kilogram',
            'mm': 'millimetre',
            'ounce': 'ounce',
            'litre': 'litre'
        }
        unit = unit_mapping.get(unit, unit)
        if unit in ALLOWED_UNITS:
            cleaned_results.append(f"{number} {unit}")
    
    return cleaned_results[0] if cleaned_results else ""

# Feature extraction and preparation of training data
features = []
labels = []

for idx, row in train_data.iterrows():
    image_url = row['image_link']
    img = download_image(image_url)
    
    if img:
        extracted_text = extract_text_from_image(img)
        cleaned_text = clean_extracted_text(extracted_text)
        features.append(cleaned_text)  # Store extracted text as feature
        labels.append(row['entity_value'])  # Store the actual entity value

# Convert features and labels to DataFrame
features_df = pd.DataFrame(features, columns=['extracted_text'])
labels_df = pd.DataFrame(labels, columns=['entity_value'])

# Convert text labels to numerical values for classification
le = LabelEncoder()
labels_encoded = le.fit_transform(labels_df['entity_value'])

# Split the dataset into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(features_df['extracted_text'], labels_encoded, test_size=0.2, random_state=42)

# Vectorize the text data
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_val_vectorized = vectorizer.transform(X_val)

# Train the Decision Tree Classifier
model = DecisionTreeClassifier()
model.fit(X_train_vectorized, y_train)

# Evaluate the model
y_pred = model.predict(X_val_vectorized)
print(classification_report(y_val, y_pred, target_names=le.classes_))

# Main processing loop for the test dataset
predictions = []
for idx, row in test_data.iterrows():
    image_url = row['image_link']
    img = download_image(image_url)
    
    if img:
        extracted_text = extract_text_from_image(img)
        cleaned_text = clean_extracted_text(extracted_text)
        
        # Vectorize the cleaned text for prediction
        cleaned_text_vectorized = vectorizer.transform([cleaned_text])
        prediction_encoded = model.predict(cleaned_text_vectorized)
        
        # Decode the prediction back to the original label
        prediction = le.inverse_transform(prediction_encoded)[0]
        
        # Append the prediction to the list
        predictions.append({"index": row['index'], "prediction": prediction})
    else:
        predictions.append({"index": row['index'], "prediction": ""})

# Convert predictions to DataFrame
predictions_df = pd.DataFrame(predictions)

# Save the predictions to CSV in the required format
output_file = 'test_out.csv'  # Update with the correct path
predictions_df.to_csv(output_file, index=False)

print(f'Predictions saved to {output_file}')


ValueError: Number of classes, 21, does not match size of target_names, 65. Try specifying the labels parameter

In [1]:
# Import necessary libraries
import pandas as pd
import requests
from PIL import Image
import pytesseract
import cv2
import numpy as np
import re
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from io import BytesIO

# Define constants
ALLOWED_UNITS = {'gram', 'centimetre', 'millilitre', 'kilogram', 'millimetre', 'ounce', 'litre'}

# Load datasets
train_data = pd.read_csv('/home/rguktrkvalley/Desktop/train1.csv')
test_data = pd.read_csv('/home/rguktrkvalley/Desktop/sample_test.csv')

# Function to download images
def download_image(url):
    try:
        response = requests.get(url)
        response.raise_for_status()  # Ensure we notice bad responses
        img = Image.open(BytesIO(response.content))
        return img
    except requests.RequestException as e:
        print(f"Error downloading image: {e}")
        return None
    except IOError as e:
        print(f"Error opening image: {e}")
        return None

# Function to extract text from image
def extract_text_from_image(pil_image):
    try:
        # Convert PIL image to a format OpenCV can work with
        open_cv_image = cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR)
        
        # Convert the image to grayscale
        gray_image = cv2.cvtColor(open_cv_image, cv2.COLOR_BGR2GRAY)
        
        # Apply Gaussian Blur to reduce noise
        blurred_image = cv2.GaussianBlur(gray_image, (5, 5), 0)
        
        # Use adaptive thresholding to binarize the image
        binary_image = cv2.adaptiveThreshold(blurred_image, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                             cv2.THRESH_BINARY, 11, 2)
        
        # Use Tesseract to extract text from the processed image
        text = pytesseract.image_to_string(binary_image)
        
        return text
    except Exception as e:
        print(f"Error extracting text: {e}")
        return ""

# Function to clean and process extracted text
def clean_extracted_text(text):
    # Regex to find numbers followed by units
    pattern = r'(\d+\.?\d*)\s*(gram|g|cm|centimeter|ml|ounce|kg|kilogram|litre|mm)'
    matches = re.findall(pattern, text.lower())  # Find all matches
    
    cleaned_results = []
    
    for match in matches:
        number = match[0]
        unit = match[1]
        # Map shorthand to allowed unit
        unit_mapping = {
            'g': 'gram',
            'cm': 'centimetre',
            'ml': 'millilitre',
            'kg': 'kilogram',
            'mm': 'millimetre',
            'ounce': 'ounce',
            'litre': 'litre'
        }
        unit = unit_mapping.get(unit, unit)
        if unit in ALLOWED_UNITS:
            cleaned_results.append(f"{number} {unit}")
    
    return cleaned_results[0] if cleaned_results else ""

# Feature extraction and preparation of training data
features = []
labels = []

for idx, row in train_data.iterrows():
    image_url = row['image_link']
    img = download_image(image_url)
    
    if img:
        extracted_text = extract_text_from_image(img)
        cleaned_text = clean_extracted_text(extracted_text)
        features.append(cleaned_text)  # Store extracted text as feature
        labels.append(row['entity_value'])  # Store the actual entity value

# Convert features and labels to DataFrame
features_df = pd.DataFrame(features, columns=['extracted_text'])
labels_df = pd.DataFrame(labels, columns=['entity_value'])

# Convert text labels to numerical values for classification
le = LabelEncoder()
labels_encoded = le.fit_transform(labels_df['entity_value'])

# Split the dataset into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(features_df['extracted_text'], labels_encoded, test_size=0.2, random_state=42)

# Vectorize the text data
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_val_vectorized = vectorizer.transform(X_val)

# Train the Decision Tree Classifier
model = DecisionTreeClassifier()
model.fit(X_train_vectorized, y_train)

# Evaluate the model
y_pred = model.predict(X_val_vectorized)

# Ensure y_val and y_pred are not empty and have matching classes
if len(y_val) > 0 and len(y_pred) > 0:
    print(classification_report(y_val, y_pred, target_names=le.classes_))
else:
    print("No predictions or no validation labels available.")

# Main processing loop for the test dataset
predictions = []
for idx, row in test_data.iterrows():
    image_url = row['image_link']
    img = download_image(image_url)
    
    if img:
        extracted_text = extract_text_from_image(img)
        cleaned_text = clean_extracted_text(extracted_text)
        
        # Vectorize the cleaned text for prediction
        cleaned_text_vectorized = vectorizer.transform([cleaned_text])
        prediction_encoded = model.predict(cleaned_text_vectorized)
        
        # Decode the prediction back to the original label
        prediction = le.inverse_transform(prediction_encoded)[0]
        
        # Append the prediction to the list
        predictions.append({"index": row['index'], "prediction": prediction})
    else:
        predictions.append({"index": row['index'], "prediction": ""})

# Convert predictions to DataFrame
predictions_df = pd.DataFrame(predictions)

# Save the predictions to CSV in the required format
output_file = 'test_out.csv'  # Update with the correct path
predictions_df.to_csv(output_file, index=False)

print(f'Predictions saved to {output_file}')


ValueError: Number of classes, 21, does not match size of target_names, 65. Try specifying the labels parameter

In [None]:
import pytesseract
from PIL import Image
import pandas as pd
import re
from transformers import pipeline
import easyocr
import requests
from io import BytesIO

# Function to download the image
def download_image(image_url):
    response = requests.get(image_url)
    img = Image.open(BytesIO(response.content))
    return img

# Function to extract text from image using EasyOCR
def extract_text_from_image(image):
    reader = easyocr.Reader(['en'])  # Load with English language
    result = reader.readtext(image, detail=0)
    return " ".join(result)

# Pre-trained BERT-based NER pipeline
ner_model = pipeline("ner", grouped_entities=True)

# Function to extract entities using the NER model
def extract_entities(text):
    ner_results = ner_model(text)
    entities = {}
    for entity in ner_results:
        entity_text = entity['word']
        entity_group = entity['entity_group']
        if entity_group in ["MISC", "QUANTITY"]:
            entities[entity_text] = entity_group
    return entities

# Function to preprocess text and append correct units
def preprocess_text(text, entity_name):
    value = re.findall(r"[-+]?\d*\.\d+|\d+", text)
    if not value:
        return ""
    value = float(value[0])
    if "weight" in entity_name.lower():
        return f"{value} gram"
    elif "dimension" in entity_name.lower():
        return f"{value} cm"
    elif "volume" in entity_name.lower():
        return f"{value} litre"
    elif "voltage" in entity_name.lower():
        return f"{value} volt"
    elif "wattage" in entity_name.lower():
        return f"{value} watt"
    return ""

# Function to predict entity value for each image
def predict_entity_value(entity_name, image_url):
    image = download_image(image_url)
    extracted_text = extract_text_from_image(image)
    entities = extract_entities(extracted_text)
    predicted_value = preprocess_text(extracted_text, entity_name)
    return predicted_value

# Load the test data
test_data = pd.read_csv('dataset/test.csv')

# List to store predictions
predictions = []

# Iterate through test samples and make predictions
for index, row in test_data.iterrows():
    image_url = row['image_link']
    entity_name = row['entity_name']
    predicted_value = predict_entity_value(entity_name, image_url)
    predictions.append((row['index'], predicted_value))

# Create output DataFrame and save to CSV
output_df = pd.DataFrame(predictions, columns=["index", "prediction"])
output_df.to_csv('test_out.csv', index=False)
print("complete")

2024-09-15 18:11:17.453807: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-09-15 18:11:19.754773: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
!pip install easyocr


Collecting easyocr
  Using cached easyocr-1.7.1-py3-none-any.whl.metadata (11 kB)
Collecting opencv-python-headless (from easyocr)
  Using cached opencv_python_headless-4.10.0.84-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (20 kB)
Collecting scikit-image (from easyocr)
  Using cached scikit_image-0.21.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (14 kB)
Collecting python-bidi (from easyocr)
  Using cached python_bidi-0.6.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.6 kB)
Collecting Shapely (from easyocr)
  Using cached shapely-2.0.6-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.0 kB)
Collecting pyclipper (from easyocr)
  Using cached pyclipper-1.3.0.post5-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl.metadata (9.0 kB)
Collecting ninja (from easyocr)
  Using cached ninja-1.11.1.1-py2.py3-none-manylinux1_x86_64.manylinux_2_5_x86_64.whl.metadata (5.3 kB)
Collecting typing-extensions>=4.8.0

In [6]:
!pip install transformers


