In [1]:
import pandas as pd
import pytesseract
import cv2
from tensorflow.keras.applications import EfficientNetB0
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import os
import re
from utils import download_images
from constants import entity_unit_map, allowed_units

# Step 1: Downloading Images
def download_all_images(train_csv, download_folder):
    """Downloads all images listed in the CSV to a specific folder."""
    train_df = pd.read_csv(train_csv)
    image_links = train_df['image_link'].tolist()
    download_images(image_links, download_folder)
    return train_df

# Step 2: Preprocessing the Images
def preprocess_image(image_path):
    """Load and preprocess image for input into the CNN model."""
    image = cv2.imread(image_path)
    image = cv2.resize(image, (224, 224))  # Resize to input size of EfficientNet
    image = image / 255.0  # Normalize pixel values
    return image

# Step 3: OCR Text Extraction
def extract_text_from_image(image_path):
    """Use OCR to extract text from images."""
    image = cv2.imread(image_path)
    text = pytesseract.image_to_string(image)
    return text

def extract_entity_value(text):
    """Extract numeric entity values and units using regex."""
    pattern = re.compile(r'(\d+(?:\.\d+)?)\s*([a-zA-Z]+)')
    matches = pattern.findall(text)
    
    if matches:
        value, unit = matches[0]
        # Validate the unit against the allowed units
        if unit.lower() in allowed_units:
            return value, unit.lower()  # Return the value and unit in lowercase
    return None, None

# Step 4: CNN Feature Extraction
def extract_image_features(image_path):
    """Extract CNN features from the image."""
    cnn_model = EfficientNetB0(weights='imagenet', include_top=False, pooling='avg')
    image = preprocess_image(image_path)
    image = image.reshape((1, 224, 224, 3))
    features = cnn_model.predict(image)
    return features

# Step 5: Train a Regression Model (for Numeric Predictions)
def train_model(train_df, download_folder):
    """Train a regression model using extracted features from images and text."""
    X_train = []
    y_train = []

    for idx, row in train_df.iterrows():
        image_path = os.path.join(download_folder, os.path.basename(row['image_link']))
        
        # Extract features using CNN
        image_features = extract_image_features(image_path)
        
        # Extract text using OCR
        text = extract_text_from_image(image_path)
        
        # Extract entity value from the text
        value, unit = extract_entity_value(text)
        
        if value and unit:
            # Combine CNN features and OCR features
            combined_features = list(image_features[0]) + [float(value)]
            X_train.append(combined_features)
            y_train.append(float(row['entity_value'].split()[0]))  # Extract ground truth value
    
    # Train a simple regression model
    X_train = pd.DataFrame(X_train)
    reg_model = LinearRegression()
    reg_model.fit(X_train, y_train)
    return reg_model

# Step 6: Predict and Generate Output
def predict_and_generate_output(test_csv, reg_model, download_folder):
    """Generate predictions and save to CSV in the correct format."""
    test_df = pd.read_csv(test_csv)
    X_test = []
    test_indices = []
    predicted_units = []

    for idx, row in test_df.iterrows():
        image_path = os.path.join(download_folder, os.path.basename(row['image_link']))
        
        # Extract features using CNN
        image_features = extract_image_features(image_path)
        
        # Extract text using OCR
        text = extract_text_from_image(image_path)
        
        # Extract entity value from the text
        value, unit = extract_entity_value(text)
        
        if value and unit:
            # Combine CNN features and OCR features
            combined_features = list(image_features[0]) + [float(value)]
            X_test.append(combined_features)
            test_indices.append(row['index'])
            predicted_units.append(unit)
    
    X_test = pd.DataFrame(X_test)
    predictions = reg_model.predict(X_test)

    # Format the predictions
    output = pd.DataFrame({
        'index': test_indices, 
        'prediction': [f"{pred:.2f} {unit}" for pred, unit in zip(predictions, predicted_units)]
    })
    
    output.to_csv('test_out.csv', index=False)
    print("Predictions saved to test_out.csv")

# Step 7: Main Function to Run the Entire Process
def main():
    # Define paths
    train_csv = '/home/rguktrkvalley/Desktop/train1.csv'
    test_csv = '/home/rguktrkvalley/Desktop/sample_test.csv'
    download_folder = '/home/rguktrkvalley/Desktop/downloaded_images'
    
    # Step 1: Download images
    train_df = download_all_images(train_csv, download_folder)

    # Step 2: Train the model
    reg_model = train_model(train_df, download_folder)

    # Step 3: Generate predictions for the test set
    predict_and_generate_output(test_csv, reg_model, download_folder)

# Run the pipeline
if __name__ == '__main__':
    main()


2024-09-15 23:23:16.374823: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-09-15 23:23:18.642931: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-09-15 23:23:18.646590: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
100%|███████████████████████████████████████████| 99/99 [01:13<00:00,  1.35it/s]


Downloading data from https://storage.googleapis.com/keras-applications/efficientnetb0_notop.h5


Premature end of JPEG file




Premature end of JPEG file




Premature end of JPEG file




Premature end of JPEG file




Premature end of JPEG file




Premature end of JPEG file




Premature end of JPEG file




Premature end of JPEG file
Premature end of JPEG file




Premature end of JPEG file
Premature end of JPEG file




Premature end of JPEG file
Premature end of JPEG file




Premature end of JPEG file




Premature end of JPEG file




Premature end of JPEG file
Premature end of JPEG file




Premature end of JPEG file




Premature end of JPEG file




Premature end of JPEG file




Premature end of JPEG file




Premature end of JPEG file
Premature end of JPEG file




Premature end of JPEG file
Premature end of JPEG file




Premature end of JPEG file




Premature end of JPEG file




Premature end of JPEG file
Premature end of JPEG file




Premature end of JPEG file




Premature end of JPEG file




Premature end of JPEG file


error: OpenCV(4.10.0) /io/opencv/modules/imgproc/src/resize.cpp:4152: error: (-215:Assertion failed) !ssize.empty() in function 'resize'


In [16]:
import pandas as pd
import pytesseract
import cv2
from tensorflow.keras.applications import EfficientNetB0
from sklearn.linear_model import LinearRegression
import os
import re
import logging
from utils import download_images
from constants import allowed_units

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Step 1: Downloading Images
def download_all_images(train_csv, download_folder):
    """Downloads all images listed in the CSV to a specific folder."""
    train_df = pd.read_csv(train_csv)
    image_links = train_df['image_link'].tolist()
    download_images(image_links, download_folder)
    return train_df

# Step 2: Preprocessing the Images


# Step 4: Extract Entity Value Using Regex
def extract_entity_value(text):
    """Extract numeric entity values and units using regex."""
    pattern = re.compile(r'(\d+(?:\.\d+)?)\s*([a-zA-Z]+)')
    matches = pattern.findall(text)
    
    if matches:
        value, unit = matches[0]
        # Validate the unit against the allowed units
        if unit.lower() in allowed_units:
            return value, unit.lower()  # Return the value and unit in lowercase
    return None, None

# Step 5: CNN Feature Extraction
def extract_image_features(image_path, cnn_model):
    """Extract CNN features from the image."""
    image = preprocess_image(image_path)
    if image is None:
        return None  # Skip this image if it's invalid
    image = image.reshape((1, 224, 224, 3))
    try:
        features = cnn_model.predict(image)
    except Exception as e:
        logging.warning(f"Error extracting features from image {image_path}: {e}")
        return None
    return features

# Step 6: Train a Regression Model (for Numeric Predictions)
def train_model(train_df, download_folder, cnn_model):
    """Train a regression model using extracted features from images and text."""
    X_train = []
    y_train = []

    for idx, row in train_df.iterrows():
        image_path = os.path.join(download_folder, os.path.basename(row['image_link']))
        
        # Extract features using CNN
        image_features = extract_image_features(image_path, cnn_model)
        if image_features is None:
            logging.warning(f"Skipping image {image_path} due to feature extraction failure.")
            continue  # Skip if image couldn't be processed
        
        # Extract text using OCR
        text = extract_text_from_image(image_path)
        
        # Extract entity value from the text
        value, unit = extract_entity_value(text)
        if not value or not unit:
            logging.warning(f"Skipping image {image_path} due to text extraction failure.")
            continue  # Skip if text extraction fails
        
        # Combine CNN features and OCR features
        combined_features = list(image_features[0]) + [float(value)]
        X_train.append(combined_features)
        y_train.append(float(row['entity_value'].split()[0]))  # Extract ground truth value
    
    # Train a simple regression model
    if not X_train:
        logging.error("No valid data for training.")
        return None
    X_train = pd.DataFrame(X_train)
    reg_model = LinearRegression()
    reg_model.fit(X_train, y_train)
    return reg_model

# Step 7: Predict and Generate Output
def predict_and_generate_output(test_csv, reg_model, download_folder, cnn_model):
    """Generate predictions and save to CSV in the correct format."""
    test_df = pd.read_csv(test_csv)
    X_test = []
    test_indices = []
    predicted_units = []

    for idx, row in test_df.iterrows():
        image_path = os.path.join(download_folder, os.path.basename(row['image_link']))
        
        # Extract features using CNN
        image_features = extract_image_features(image_path, cnn_model)
        if image_features is None:
            logging.warning(f"Skipping image {image_path} due to feature extraction failure.")
            continue  # Skip if image couldn't be processed
        
        # Extract text using OCR
        text = extract_text_from_image(image_path)
        
        # Extract entity value from the text
        value, unit = extract_entity_value(text)
        if not value or not unit:
            logging.warning(f"Skipping image {image_path} due to text extraction failure.")
            continue  # Skip if text extraction fails
        
        # Combine CNN features and OCR features
        combined_features = list(image_features[0]) + [float(value)]
        X_test.append(combined_features)
        test_indices.append(row['index'])
        predicted_units.append(unit)
    
    if not X_test:
        logging.error("No valid data for predictions.")
        return
    X_test = pd.DataFrame(X_test)
    predictions = reg_model.predict(X_test)

    # Format the predictions
    output = pd.DataFrame({
        'index': test_indices, 
        'prediction': [f"{pred:.2f} {unit}" for pred, unit in zip(predictions, predicted_units)]
    })
    
    output.to_csv('test_out.csv', index=False)
    logging.info("Predictions saved to test_out.csv")

# Step 8: Main Function to Run the Entire Process
def main():
    # Define paths
    train_csv = '/home/rguktrkvalley/Desktop/train1.csv'
    test_csv = '/home/rguktrkvalley/Desktop/sample_test.csv'
    download_folder = '/home/rguktrkvalley/Desktop/downloaded_images'
    
    # Load CNN model only once
    cnn_model = EfficientNetB0(weights='imagenet', include_top=False, pooling='avg')
    
    # Step 1: Download images
    train_df = download_all_images(train_csv, download_folder)

    # Step 2: Train the model
    reg_model = train_model(train_df, download_folder, cnn_model)

    # Step 3: Generate predictions for the test set
    if reg_model:
        predict_and_generate_output(test_csv, reg_model, download_folder, cnn_model)

# Run the pipeline
if __name__ == '__main__':
    main()


100%|█████████████████████████████████████████| 99/99 [00:00<00:00, 4242.64it/s]




Premature end of JPEG file




















Premature end of JPEG file








Premature end of JPEG file
















Premature end of JPEG file




Premature end of JPEG file




Premature end of JPEG file




Premature end of JPEG file












Premature end of JPEG file
Premature end of JPEG file
















Premature end of JPEG file








Premature end of JPEG file




Premature end of JPEG file








Premature end of JPEG file








Premature end of JPEG file




Premature end of JPEG file












Premature end of JPEG file




Premature end of JPEG file




Premature end of JPEG file








Premature end of JPEG file




Premature end of JPEG file




Premature end of JPEG file




















Premature end of JPEG file
Premature end of JPEG file








Premature end of JPEG file




Premature end of JPEG file




















Premature end of JPEG file




Premature end of JPEG file








Premature end of JPEG file




Premature end of JPEG file








Premature end of JPEG file




































Premature end of JPEG file








Premature end of JPEG file








Premature end of JPEG file












Premature end of JPEG file








Premature end of JPEG file




Premature end of JPEG file




Premature end of JPEG file
































































[ WARN:0@3240.586] global loadsave.cpp:241 findDecoder imread_('/home/rguktrkvalley/Desktop/downloaded_images/41-NCxNuBxL.jpg'): can't open/read file: check file path/integrity
[ WARN:0@3240.588] global loadsave.cpp:241 findDecoder imread_('/home/rguktrkvalley/Desktop/downloaded_images/41-NCxNuBxL.jpg'): can't open/read file: check file path/integrity
[ WARN:0@3240.593] global loadsave.cpp:241 findDecoder imread_('/home/rguktrkvalley/Desktop/downloaded_images/417NJrPEk+L.jpg'): can't open/read file: check file path/integrity
[ WARN:0@3240.599] global loadsave.cpp:241 findDecoder imread_('/home/rguktrkvalley/Desktop/downloaded_images/417SThj+SrL.jpg'): can't open/read file: check file path/integrity
[ WARN:0@3240.605] global loadsave.cpp:241 findDecoder imread_('/home/rguktrkvalley/Desktop/downloaded_images/417SThj+SrL.jpg'): can't open/read file: check file path/integrity
[ WARN:0@3240.611] global loadsave.cpp:241 findDecoder imread_('/home/rguktrkvalley/Desktop/downloaded_images/41ADV