In [1]:
import pandas as pd
import pytesseract
import cv2
from tensorflow.keras.applications import EfficientNetB0
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import os
import re
from utils import download_images
from constants import entity_unit_map, allowed_units

# Step 1: Downloading Images
def download_all_images(train_csv, download_folder):
    """Downloads all images listed in the CSV to a specific folder."""
    train_df = pd.read_csv(train_csv)
    image_links = train_df['image_link'].tolist()
    download_images(image_links, download_folder)
    return train_df

# Step 2: Preprocessing the Images
def preprocess_image(image_path):
    """Load and preprocess image for input into the CNN model."""
    image = cv2.imread(image_path)
    image = cv2.resize(image, (224, 224))  # Resize to input size of EfficientNet
    image = image / 255.0  # Normalize pixel values
    return image

# Step 3: OCR Text Extraction
def extract_text_from_image(image_path):
    """Use OCR to extract text from images."""
    image = cv2.imread(image_path)
    text = pytesseract.image_to_string(image)
    return text

def extract_entity_value(text):
    """Extract numeric entity values and units using regex."""
    pattern = re.compile(r'(\d+(?:\.\d+)?)\s*([a-zA-Z]+)')
    matches = pattern.findall(text)
    
    if matches:
        value, unit = matches[0]
        # Validate the unit against the allowed units
        if unit.lower() in allowed_units:
            return value, unit.lower()  # Return the value and unit in lowercase
    return None, None

# Step 4: CNN Feature Extraction
def extract_image_features(image_path):
    """Extract CNN features from the image."""
    cnn_model = EfficientNetB0(weights='imagenet', include_top=False, pooling='avg')
    image = preprocess_image(image_path)
    image = image.reshape((1, 224, 224, 3))
    features = cnn_model.predict(image)
    return features

# Step 5: Train a Regression Model (for Numeric Predictions)
def train_model(train_df, download_folder):
    """Train a regression model using extracted features from images and text."""
    X_train = []
    y_train = []

    for idx, row in train_df.iterrows():
        image_path = os.path.join(download_folder, os.path.basename(row['image_link']))
        
        # Extract features using CNN
        image_features = extract_image_features(image_path)
        
        # Extract text using OCR
        text = extract_text_from_image(image_path)
        
        # Extract entity value from the text
        value, unit = extract_entity_value(text)
        
        if value and unit:
            # Combine CNN features and OCR features
            combined_features = list(image_features[0]) + [float(value)]
            X_train.append(combined_features)
            y_train.append(float(row['entity_value'].split()[0]))  # Extract ground truth value
    
    # Train a simple regression model
    X_train = pd.DataFrame(X_train)
    reg_model = LinearRegression()
    reg_model.fit(X_train, y_train)
    return reg_model

# Step 6: Predict and Generate Output
def predict_and_generate_output(test_csv, reg_model, download_folder):
    """Generate predictions and save to CSV in the correct format."""
    test_df = pd.read_csv(test_csv)
    X_test = []
    test_indices = []
    predicted_units = []

    for idx, row in test_df.iterrows():
        image_path = os.path.join(download_folder, os.path.basename(row['image_link']))
        
        # Extract features using CNN
        image_features = extract_image_features(image_path)
        
        # Extract text using OCR
        text = extract_text_from_image(image_path)
        
        # Extract entity value from the text
        value, unit = extract_entity_value(text)
        
        if value and unit:
            # Combine CNN features and OCR features
            combined_features = list(image_features[0]) + [float(value)]
            X_test.append(combined_features)
            test_indices.append(row['index'])
            predicted_units.append(unit)
    
    X_test = pd.DataFrame(X_test)
    predictions = reg_model.predict(X_test)

    # Format the predictions
    output = pd.DataFrame({
        'index': test_indices, 
        'prediction': [f"{pred:.2f} {unit}" for pred, unit in zip(predictions, predicted_units)]
    })
    
    output.to_csv('test_out.csv', index=False)
    print("Predictions saved to test_out.csv")

# Step 7: Main Function to Run the Entire Process
def main():
    # Define paths
    train_csv = '/home/rguktrkvalley/Desktop/train1.csv'
    test_csv = '/home/rguktrkvalley/Desktop/sample_test.csv'
    download_folder = '/home/rguktrkvalley/Desktop/downloaded_images'
    
    # Step 1: Download images
    train_df = download_all_images(train_csv, download_folder)

    # Step 2: Train the model
    reg_model = train_model(train_df, download_folder)

    # Step 3: Generate predictions for the test set
    predict_and_generate_output(test_csv, reg_model, download_folder)

# Run the pipeline
if __name__ == '__main__':
    main()


2024-09-15 23:23:16.374823: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-09-15 23:23:18.642931: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-09-15 23:23:18.646590: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
100%|███████████████████████████████████████████| 99/99 [01:13<00:00,  1.35it/s]


Downloading data from https://storage.googleapis.com/keras-applications/efficientnetb0_notop.h5


Premature end of JPEG file




Premature end of JPEG file




Premature end of JPEG file




Premature end of JPEG file




Premature end of JPEG file




Premature end of JPEG file




Premature end of JPEG file




Premature end of JPEG file
Premature end of JPEG file




Premature end of JPEG file
Premature end of JPEG file




Premature end of JPEG file
Premature end of JPEG file




Premature end of JPEG file




Premature end of JPEG file




Premature end of JPEG file
Premature end of JPEG file




Premature end of JPEG file




Premature end of JPEG file




Premature end of JPEG file




Premature end of JPEG file




Premature end of JPEG file
Premature end of JPEG file




Premature end of JPEG file
Premature end of JPEG file




Premature end of JPEG file




Premature end of JPEG file




Premature end of JPEG file
Premature end of JPEG file




Premature end of JPEG file




Premature end of JPEG file




Premature end of JPEG file


error: OpenCV(4.10.0) /io/opencv/modules/imgproc/src/resize.cpp:4152: error: (-215:Assertion failed) !ssize.empty() in function 'resize'


In [3]:
import pandas as pd
import pytesseract
import cv2
from tensorflow.keras.applications import EfficientNetB0
from sklearn.linear_model import LinearRegression
import os
import re
import logging
from utils import download_images
from constants import allowed_units

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Step 1: Downloading Images
def download_all_images(train_csv, download_folder):
    """Downloads all images listed in the CSV to a specific folder."""
    train_df = pd.read_csv(train_csv)
    image_links = train_df['image_link'].tolist()
    download_images(image_links, download_folder)
    return train_df

# Step 2: Preprocessing the Images
def preprocess_image(image_path):
    """Load and preprocess image for input into the CNN model."""
    try:
        image = cv2.imread(image_path)
        if image is None:
            raise ValueError(f"Image at {image_path} could not be loaded.")
        image = cv2.resize(image, (224, 224))  # Resize to input size of EfficientNet
        image = image / 255.0  # Normalize pixel values
    except Exception as e:
        logging.warning(f"Error processing image {image_path}: {e}")
        return None
    return image

# Step 3: OCR Text Extraction
def extract_text_from_image(image_path):
    """Use OCR to extract text from images."""
    try:
        image = cv2.imread(image_path)
        if image is None:
            raise ValueError(f"Image at {image_path} could not be loaded.")
        text = pytesseract.image_to_string(image)
    except Exception as e:
        logging.warning(f"Error performing OCR on image {image_path}: {e}")
        return ""
    return text

# Step 4: Extract Entity Value Using Regex
def extract_entity_value(text):
    """Extract numeric entity values and units using regex."""
    pattern = re.compile(r'(\d+(?:\.\d+)?)\s*([a-zA-Z]+)')
    matches = pattern.findall(text)
    
    if matches:
        value, unit = matches[0]
        # Validate the unit against the allowed units
        if unit.lower() in allowed_units:
            return value, unit.lower()  # Return the value and unit in lowercase
    return None, None

# Step 5: CNN Feature Extraction
def extract_image_features(image_path, cnn_model):
    """Extract CNN features from the image."""
    image = preprocess_image(image_path)
    if image is None:
        return None  # Skip this image if it's invalid
    image = image.reshape((1, 224, 224, 3))
    try:
        features = cnn_model.predict(image)
    except Exception as e:
        logging.warning(f"Error extracting features from image {image_path}: {e}")
        return None
    return features

# Step 6: Train a Regression Model (for Numeric Predictions)
def train_model(train_df, download_folder, cnn_model):
    """Train a regression model using extracted features from images and text."""
    X_train = []
    y_train = []

    for idx, row in train_df.iterrows():
        image_path = os.path.join(download_folder, os.path.basename(row['image_link']))
        
        # Extract features using CNN
        image_features = extract_image_features(image_path, cnn_model)
        if image_features is None:
            continue  # Skip if image couldn't be processed
        
        # Extract text using OCR
        text = extract_text_from_image(image_path)
        
        # Extract entity value from the text
        value, unit = extract_entity_value(text)
        if not value or not unit:
            continue  # Skip if text extraction fails
        
        # Combine CNN features and OCR features
        combined_features = list(image_features[0]) + [float(value)]
        X_train.append(combined_features)
        y_train.append(float(row['entity_value'].split()[0]))  # Extract ground truth value
    
    # Train a simple regression model
    if not X_train:
        logging.error("No valid data for training.")
        return None
    X_train = pd.DataFrame(X_train)
    reg_model = LinearRegression()
    reg_model.fit(X_train, y_train)
    return reg_model

# Step 7: Predict and Generate Output
def predict_and_generate_output(test_csv, reg_model, download_folder, cnn_model):
    """Generate predictions and save to CSV in the correct format."""
    test_df = pd.read_csv(test_csv)
    X_test = []
    test_indices = []
    predicted_units = []

    for idx, row in test_df.iterrows():
        image_path = os.path.join(download_folder, os.path.basename(row['image_link']))
        
        # Extract features using CNN
        image_features = extract_image_features(image_path, cnn_model)
        if image_features is None:
            continue  # Skip if image couldn't be processed
        
        # Extract text using OCR
        text = extract_text_from_image(image_path)
        
        # Extract entity value from the text
        value, unit = extract_entity_value(text)
        if not value or not unit:
            continue  # Skip if text extraction fails
        
        # Combine CNN features and OCR features
        combined_features = list(image_features[0]) + [float(value)]
        X_test.append(combined_features)
        test_indices.append(row['index'])
        predicted_units.append(unit)
    
    if not X_test:
        logging.error("No valid data for predictions.")
        return
    X_test = pd.DataFrame(X_test)
    predictions = reg_model.predict(X_test)

    # Format the predictions
    output = pd.DataFrame({
        'index': test_indices, 
        'prediction': [f"{pred:.2f} {unit}" for pred, unit in zip(predictions, predicted_units)]
    })
    
    output.to_csv('/home/rguktrkvalley/Desktop/ptest_out.csv', index=False)
    logging.info("Predictions saved to test_out.csv")

# Step 8: Main Function to Run the Entire Process
def main():
    # Define paths
    train_csv = '/home/rguktrkvalley/Desktop/train1.csv'
    test_csv = '/home/rguktrkvalley/Desktop/sample_test.csv'
    download_folder = '/home/rguktrkvalley/Desktop/downloaded_images'
    
    # Load CNN model only once
    cnn_model = EfficientNetB0(weights='imagenet', include_top=False, pooling='avg')
    
    # Step 1: Download images
    train_df = download_all_images(train_csv, download_folder)

    # Step 2: Train the model
    reg_model = train_model(train_df, download_folder, cnn_model)

    # Step 3: Generate predictions for the test set
    if reg_model:
        predict_and_generate_output(test_csv, reg_model, download_folder, cnn_model)

# Run the pipeline
if __name__ == '__main__':
    main()


100%|█████████████████████████████████████████| 99/99 [00:00<00:00, 7076.40it/s]




2024-09-16 00:58:49,347 - DEBUG - ['tesseract', '/tmp/tess_jk4e5u_x_input.PNG', '/tmp/tess_jk4e5u_x', 'txt']




Premature end of JPEG file
Premature end of JPEG file
2024-09-16 00:58:50,958 - DEBUG - ['tesseract', '/tmp/tess_5dlt5fz3_input.PNG', '/tmp/tess_5dlt5fz3', 'txt']




2024-09-16 00:58:51,949 - DEBUG - ['tesseract', '/tmp/tess_qlyyz08g_input.PNG', '/tmp/tess_qlyyz08g', 'txt']




2024-09-16 00:58:53,672 - DEBUG - ['tesseract', '/tmp/tess_6ohd2dhn_input.PNG', '/tmp/tess_6ohd2dhn', 'txt']




2024-09-16 00:58:54,635 - DEBUG - ['tesseract', '/tmp/tess_qhbc25vf_input.PNG', '/tmp/tess_qhbc25vf', 'txt']




Premature end of JPEG file
Premature end of JPEG file
2024-09-16 00:58:55,761 - DEBUG - ['tesseract', '/tmp/tess_8w2bt9iz_input.PNG', '/tmp/tess_8w2bt9iz', 'txt']




2024-09-16 00:58:57,821 - DEBUG - ['tesseract', '/tmp/tess_l2vt4dul_input.PNG', '/tmp/tess_l2vt4dul', 'txt']




2024-09-16 00:59:01,332 - DEBUG - ['tesseract', '/tmp/tess_v_td46k0_input.PNG', '/tmp/tess_v_td46k0', 'txt']
Premature end of JPEG file




Premature end of JPEG file
2024-09-16 00:59:03,926 - DEBUG - ['tesseract', '/tmp/tess_siab8tdc_input.PNG', '/tmp/tess_siab8tdc', 'txt']




2024-09-16 00:59:05,736 - DEBUG - ['tesseract', '/tmp/tess_uzn59n2w_input.PNG', '/tmp/tess_uzn59n2w', 'txt']




2024-09-16 00:59:07,841 - DEBUG - ['tesseract', '/tmp/tess_xnn3yb74_input.PNG', '/tmp/tess_xnn3yb74', 'txt']




2024-09-16 00:59:09,291 - DEBUG - ['tesseract', '/tmp/tess_pymoebzc_input.PNG', '/tmp/tess_pymoebzc', 'txt']
Premature end of JPEG file




Premature end of JPEG file
2024-09-16 00:59:10,475 - DEBUG - ['tesseract', '/tmp/tess_q25477b2_input.PNG', '/tmp/tess_q25477b2', 'txt']
Premature end of JPEG file




Premature end of JPEG file
2024-09-16 00:59:12,475 - DEBUG - ['tesseract', '/tmp/tess_176n_3r4_input.PNG', '/tmp/tess_176n_3r4', 'txt']
Premature end of JPEG file




Premature end of JPEG file
2024-09-16 00:59:15,355 - DEBUG - ['tesseract', '/tmp/tess_r9q9af6e_input.PNG', '/tmp/tess_r9q9af6e', 'txt']
Premature end of JPEG file




Premature end of JPEG file
2024-09-16 00:59:21,271 - DEBUG - ['tesseract', '/tmp/tess_dmoald17_input.PNG', '/tmp/tess_dmoald17', 'txt']




2024-09-16 00:59:24,453 - DEBUG - ['tesseract', '/tmp/tess_cqd6a_eg_input.PNG', '/tmp/tess_cqd6a_eg', 'txt']




Premature end of JPEG file
Premature end of JPEG file
2024-09-16 00:59:25,939 - DEBUG - ['tesseract', '/tmp/tess_h643i05j_input.PNG', '/tmp/tess_h643i05j', 'txt']




Premature end of JPEG file
Premature end of JPEG file
2024-09-16 00:59:27,064 - DEBUG - ['tesseract', '/tmp/tess_nvl8egty_input.PNG', '/tmp/tess_nvl8egty', 'txt']




2024-09-16 00:59:29,591 - DEBUG - ['tesseract', '/tmp/tess_nvv9k2wr_input.PNG', '/tmp/tess_nvv9k2wr', 'txt']




2024-09-16 00:59:30,911 - DEBUG - ['tesseract', '/tmp/tess_csfhcole_input.PNG', '/tmp/tess_csfhcole', 'txt']




Premature end of JPEG file
Premature end of JPEG file
2024-09-16 00:59:31,657 - DEBUG - ['tesseract', '/tmp/tess_5_r0t1wn_input.PNG', '/tmp/tess_5_r0t1wn', 'txt']




2024-09-16 00:59:33,637 - DEBUG - ['tesseract', '/tmp/tess_gqa1izeo_input.PNG', '/tmp/tess_gqa1izeo', 'txt']
Premature end of JPEG file




Premature end of JPEG file
2024-09-16 00:59:35,787 - DEBUG - ['tesseract', '/tmp/tess_mitvx2gn_input.PNG', '/tmp/tess_mitvx2gn', 'txt']
Premature end of JPEG file




Premature end of JPEG file
2024-09-16 00:59:37,859 - DEBUG - ['tesseract', '/tmp/tess_741mlwhu_input.PNG', '/tmp/tess_741mlwhu', 'txt']




Premature end of JPEG file
Premature end of JPEG file
2024-09-16 00:59:39,566 - DEBUG - ['tesseract', '/tmp/tess_4hqucslp_input.PNG', '/tmp/tess_4hqucslp', 'txt']




2024-09-16 00:59:41,135 - DEBUG - ['tesseract', '/tmp/tess_x7a4st7u_input.PNG', '/tmp/tess_x7a4st7u', 'txt']
Premature end of JPEG file




Premature end of JPEG file
2024-09-16 00:59:42,321 - DEBUG - ['tesseract', '/tmp/tess_coa97cbp_input.PNG', '/tmp/tess_coa97cbp', 'txt']
Premature end of JPEG file




Premature end of JPEG file
2024-09-16 00:59:43,130 - DEBUG - ['tesseract', '/tmp/tess_7mnjonr6_input.PNG', '/tmp/tess_7mnjonr6', 'txt']




2024-09-16 00:59:44,714 - DEBUG - ['tesseract', '/tmp/tess_6szgkgpg_input.PNG', '/tmp/tess_6szgkgpg', 'txt']




2024-09-16 00:59:51,585 - DEBUG - ['tesseract', '/tmp/tess_ofas4i5n_input.PNG', '/tmp/tess_ofas4i5n', 'txt']




2024-09-16 00:59:55,381 - DEBUG - ['tesseract', '/tmp/tess_wwmqjfsm_input.PNG', '/tmp/tess_wwmqjfsm', 'txt']




Premature end of JPEG file
Premature end of JPEG file
2024-09-16 00:59:58,560 - DEBUG - ['tesseract', '/tmp/tess_bwf2z0k8_input.PNG', '/tmp/tess_bwf2z0k8', 'txt']
Premature end of JPEG file




Premature end of JPEG file
2024-09-16 01:00:00,709 - DEBUG - ['tesseract', '/tmp/tess_x75cw7t1_input.PNG', '/tmp/tess_x75cw7t1', 'txt']
Premature end of JPEG file




Premature end of JPEG file
2024-09-16 01:00:01,701 - DEBUG - ['tesseract', '/tmp/tess_a7vi8bss_input.PNG', '/tmp/tess_a7vi8bss', 'txt']
Premature end of JPEG file




Premature end of JPEG file
2024-09-16 01:00:02,515 - DEBUG - ['tesseract', '/tmp/tess_jvrlejta_input.PNG', '/tmp/tess_jvrlejta', 'txt']




2024-09-16 01:00:03,389 - DEBUG - ['tesseract', '/tmp/tess_yjiqm3w3_input.PNG', '/tmp/tess_yjiqm3w3', 'txt']




Premature end of JPEG file
Premature end of JPEG file
2024-09-16 01:00:05,970 - DEBUG - ['tesseract', '/tmp/tess_umno380p_input.PNG', '/tmp/tess_umno380p', 'txt']
Premature end of JPEG file




Premature end of JPEG file
2024-09-16 01:00:07,536 - DEBUG - ['tesseract', '/tmp/tess_2w4spkiy_input.PNG', '/tmp/tess_2w4spkiy', 'txt']




2024-09-16 01:00:09,710 - DEBUG - ['tesseract', '/tmp/tess_4wuorihb_input.PNG', '/tmp/tess_4wuorihb', 'txt']




2024-09-16 01:00:13,540 - DEBUG - ['tesseract', '/tmp/tess_x5amdpq__input.PNG', '/tmp/tess_x5amdpq_', 'txt']




2024-09-16 01:00:14,598 - DEBUG - ['tesseract', '/tmp/tess_yi_pb1s0_input.PNG', '/tmp/tess_yi_pb1s0', 'txt']
Premature end of JPEG file




Premature end of JPEG file
2024-09-16 01:00:17,745 - DEBUG - ['tesseract', '/tmp/tess_ic0rg1it_input.PNG', '/tmp/tess_ic0rg1it', 'txt']




Premature end of JPEG file
Premature end of JPEG file
2024-09-16 01:00:19,496 - DEBUG - ['tesseract', '/tmp/tess_qf0r3znm_input.PNG', '/tmp/tess_qf0r3znm', 'txt']




2024-09-16 01:00:20,855 - DEBUG - ['tesseract', '/tmp/tess_ibbd7x7t_input.PNG', '/tmp/tess_ibbd7x7t', 'txt']




Premature end of JPEG file




Premature end of JPEG file
2024-09-16 01:00:22,888 - DEBUG - ['tesseract', '/tmp/tess_bk1x2rdp_input.PNG', '/tmp/tess_bk1x2rdp', 'txt']
Premature end of JPEG file




Premature end of JPEG file
2024-09-16 01:00:25,496 - DEBUG - ['tesseract', '/tmp/tess_i_bm638d_input.PNG', '/tmp/tess_i_bm638d', 'txt']




2024-09-16 01:00:28,199 - DEBUG - ['tesseract', '/tmp/tess_8ycitnm2_input.PNG', '/tmp/tess_8ycitnm2', 'txt']




2024-09-16 01:00:29,339 - DEBUG - ['tesseract', '/tmp/tess_fu4__8ad_input.PNG', '/tmp/tess_fu4__8ad', 'txt']




2024-09-16 01:00:30,923 - DEBUG - ['tesseract', '/tmp/tess_jn485vmo_input.PNG', '/tmp/tess_jn485vmo', 'txt']




Premature end of JPEG file
Premature end of JPEG file
2024-09-16 01:00:32,837 - DEBUG - ['tesseract', '/tmp/tess_7o9acy6r_input.PNG', '/tmp/tess_7o9acy6r', 'txt']
Premature end of JPEG file




Premature end of JPEG file
2024-09-16 01:00:34,159 - DEBUG - ['tesseract', '/tmp/tess_gcm38p9m_input.PNG', '/tmp/tess_gcm38p9m', 'txt']




2024-09-16 01:00:35,568 - DEBUG - ['tesseract', '/tmp/tess_n8oislc3_input.PNG', '/tmp/tess_n8oislc3', 'txt']




Premature end of JPEG file
Premature end of JPEG file
2024-09-16 01:00:37,159 - DEBUG - ['tesseract', '/tmp/tess_ay2uph35_input.PNG', '/tmp/tess_ay2uph35', 'txt']




Premature end of JPEG file
Premature end of JPEG file
2024-09-16 01:00:38,538 - DEBUG - ['tesseract', '/tmp/tess_6lmoeajn_input.PNG', '/tmp/tess_6lmoeajn', 'txt']




2024-09-16 01:00:40,017 - DEBUG - ['tesseract', '/tmp/tess_i4ojax4s_input.PNG', '/tmp/tess_i4ojax4s', 'txt']




2024-09-16 01:00:42,329 - DEBUG - ['tesseract', '/tmp/tess_eokqcpyb_input.PNG', '/tmp/tess_eokqcpyb', 'txt']




Premature end of JPEG file
Premature end of JPEG file
2024-09-16 01:00:44,311 - DEBUG - ['tesseract', '/tmp/tess_gjwspdla_input.PNG', '/tmp/tess_gjwspdla', 'txt']




2024-09-16 01:00:47,371 - DEBUG - ['tesseract', '/tmp/tess_t949wk0j_input.PNG', '/tmp/tess_t949wk0j', 'txt']




2024-09-16 01:00:48,520 - DEBUG - ['tesseract', '/tmp/tess_iv73or2a_input.PNG', '/tmp/tess_iv73or2a', 'txt']




2024-09-16 01:00:50,435 - DEBUG - ['tesseract', '/tmp/tess_pvxhjbfq_input.PNG', '/tmp/tess_pvxhjbfq', 'txt']




2024-09-16 01:00:51,733 - DEBUG - ['tesseract', '/tmp/tess_ypcn3ct2_input.PNG', '/tmp/tess_ypcn3ct2', 'txt']




2024-09-16 01:00:53,924 - DEBUG - ['tesseract', '/tmp/tess_zpsvilbh_input.PNG', '/tmp/tess_zpsvilbh', 'txt']




2024-09-16 01:00:55,618 - DEBUG - ['tesseract', '/tmp/tess_bymzdjg8_input.PNG', '/tmp/tess_bymzdjg8', 'txt']




2024-09-16 01:00:56,830 - DEBUG - ['tesseract', '/tmp/tess_s83or_ua_input.PNG', '/tmp/tess_s83or_ua', 'txt']




Premature end of JPEG file
Premature end of JPEG file
2024-09-16 01:00:57,923 - DEBUG - ['tesseract', '/tmp/tess_sndig836_input.PNG', '/tmp/tess_sndig836', 'txt']




2024-09-16 01:00:59,083 - DEBUG - ['tesseract', '/tmp/tess_5xhzw431_input.PNG', '/tmp/tess_5xhzw431', 'txt']




2024-09-16 01:00:59,564 - DEBUG - ['tesseract', '/tmp/tess_7lt1s9kv_input.PNG', '/tmp/tess_7lt1s9kv', 'txt']




Premature end of JPEG file
Premature end of JPEG file
2024-09-16 01:01:00,486 - DEBUG - ['tesseract', '/tmp/tess_qd61kxaj_input.PNG', '/tmp/tess_qd61kxaj', 'txt']




2024-09-16 01:01:01,386 - DEBUG - ['tesseract', '/tmp/tess_lpva4ghp_input.PNG', '/tmp/tess_lpva4ghp', 'txt']




Premature end of JPEG file




Premature end of JPEG file
2024-09-16 01:01:02,562 - DEBUG - ['tesseract', '/tmp/tess_v5ijuq_1_input.PNG', '/tmp/tess_v5ijuq_1', 'txt']




2024-09-16 01:01:03,958 - DEBUG - ['tesseract', '/tmp/tess_4olshy9a_input.PNG', '/tmp/tess_4olshy9a', 'txt']




2024-09-16 01:01:05,813 - DEBUG - ['tesseract', '/tmp/tess_a818vemb_input.PNG', '/tmp/tess_a818vemb', 'txt']




Premature end of JPEG file
Premature end of JPEG file
2024-09-16 01:01:07,292 - DEBUG - ['tesseract', '/tmp/tess_cml1ih8s_input.PNG', '/tmp/tess_cml1ih8s', 'txt']




2024-09-16 01:01:08,734 - DEBUG - ['tesseract', '/tmp/tess_edc4nmuu_input.PNG', '/tmp/tess_edc4nmuu', 'txt']




Premature end of JPEG file
Premature end of JPEG file
2024-09-16 01:01:09,590 - DEBUG - ['tesseract', '/tmp/tess_e8updy1j_input.PNG', '/tmp/tess_e8updy1j', 'txt']




Premature end of JPEG file
Premature end of JPEG file
2024-09-16 01:01:10,341 - DEBUG - ['tesseract', '/tmp/tess_jlzpern0_input.PNG', '/tmp/tess_jlzpern0', 'txt']




Premature end of JPEG file
Premature end of JPEG file
2024-09-16 01:01:11,151 - DEBUG - ['tesseract', '/tmp/tess_r0w1mtpp_input.PNG', '/tmp/tess_r0w1mtpp', 'txt']




2024-09-16 01:01:12,271 - DEBUG - ['tesseract', '/tmp/tess_xtz5ls6i_input.PNG', '/tmp/tess_xtz5ls6i', 'txt']




2024-09-16 01:01:15,136 - DEBUG - ['tesseract', '/tmp/tess_2i0sskdv_input.PNG', '/tmp/tess_2i0sskdv', 'txt']




2024-09-16 01:01:17,586 - DEBUG - ['tesseract', '/tmp/tess_6789vq3b_input.PNG', '/tmp/tess_6789vq3b', 'txt']




2024-09-16 01:01:19,160 - DEBUG - ['tesseract', '/tmp/tess_ipy123m7_input.PNG', '/tmp/tess_ipy123m7', 'txt']




2024-09-16 01:01:22,965 - DEBUG - ['tesseract', '/tmp/tess_u104ud4j_input.PNG', '/tmp/tess_u104ud4j', 'txt']




2024-09-16 01:01:24,740 - DEBUG - ['tesseract', '/tmp/tess_chu5y8u2_input.PNG', '/tmp/tess_chu5y8u2', 'txt']




2024-09-16 01:01:26,516 - DEBUG - ['tesseract', '/tmp/tess_725y05mf_input.PNG', '/tmp/tess_725y05mf', 'txt']




2024-09-16 01:01:31,525 - DEBUG - ['tesseract', '/tmp/tess_53uyk69f_input.PNG', '/tmp/tess_53uyk69f', 'txt']




2024-09-16 01:01:35,113 - DEBUG - ['tesseract', '/tmp/tess_a2o2suhk_input.PNG', '/tmp/tess_a2o2suhk', 'txt']




2024-09-16 01:01:37,085 - DEBUG - ['tesseract', '/tmp/tess_e3x5ronq_input.PNG', '/tmp/tess_e3x5ronq', 'txt']




2024-09-16 01:01:40,984 - DEBUG - ['tesseract', '/tmp/tess_k9s5i13i_input.PNG', '/tmp/tess_k9s5i13i', 'txt']




2024-09-16 01:01:43,521 - DEBUG - ['tesseract', '/tmp/tess_f4or21hd_input.PNG', '/tmp/tess_f4or21hd', 'txt']




2024-09-16 01:01:45,324 - DEBUG - ['tesseract', '/tmp/tess_1xuvjkkl_input.PNG', '/tmp/tess_1xuvjkkl', 'txt']




2024-09-16 01:01:46,581 - DEBUG - ['tesseract', '/tmp/tess_iivxah7s_input.PNG', '/tmp/tess_iivxah7s', 'txt']




2024-09-16 01:01:47,641 - DEBUG - ['tesseract', '/tmp/tess_zisa_6wr_input.PNG', '/tmp/tess_zisa_6wr', 'txt']




2024-09-16 01:01:50,112 - DEBUG - ['tesseract', '/tmp/tess_3xsmkmxt_input.PNG', '/tmp/tess_3xsmkmxt', 'txt']
[ WARN:0@1566.415] global loadsave.cpp:241 findDecoder imread_('/home/rguktrkvalley/Desktop/downloaded_images/41-NCxNuBxL.jpg'): can't open/read file: check file path/integrity
[ WARN:0@1566.418] global loadsave.cpp:241 findDecoder imread_('/home/rguktrkvalley/Desktop/downloaded_images/41-NCxNuBxL.jpg'): can't open/read file: check file path/integrity
[ WARN:0@1566.420] global loadsave.cpp:241 findDecoder imread_('/home/rguktrkvalley/Desktop/downloaded_images/417NJrPEk+L.jpg'): can't open/read file: check file path/integrity
[ WARN:0@1566.424] global loadsave.cpp:241 findDecoder imread_('/home/rguktrkvalley/Desktop/downloaded_images/417SThj+SrL.jpg'): can't open/read file: check file path/integrity
[ WARN:0@1566.427] global loadsave.cpp:241 findDecoder imread_('/home/rguktrkvalley/Desktop/downloaded_images/417SThj+SrL.jpg'): can't open/read file: check file path/integrity
[ WARN

In [17]:
import pandas as pd
import pytesseract
import cv2
from tensorflow.keras.applications import EfficientNetB0
from sklearn.linear_model import LinearRegression
import os
import re
import logging
from PIL import Image
from utils import download_images
from constants import allowed_units

# Set up logging
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')

def check_and_repair_images(image_dir):
    """
    Check and repair corrupted images in the specified directory.
    """
    for filename in os.listdir(image_dir):
        file_path = os.path.join(image_dir, filename)
        try:
            with Image.open(file_path) as img:
                img.verify()  # Verify integrity
        except (IOError, SyntaxError) as e:
            logging.warning(f"Corrupted image detected: {file_path} - {e}")
            # Attempt to repair the image
            try:
                with Image.open(file_path) as img:
                    img = img.convert('RGB')  # Convert to RGB to fix possible issues
                    repaired_path = file_path.replace('.jpg', '_repaired.jpg')
                    img.save(repaired_path)
                    logging.info(f"Repaired image saved to {repaired_path}")
            except Exception as repair_exception:
                logging.error(f"Failed to repair image: {file_path} - {repair_exception}")

def preprocess_image(image_path):
    """Load and preprocess image for input into the CNN model."""
    try:
        image = cv2.imread(image_path)
        if image is None:
            raise ValueError(f"Image at {image_path} could not be loaded.")
        image = cv2.resize(image, (224, 224))  # Resize to input size of EfficientNet
        image = image / 255.0  # Normalize pixel values
        return image
    except Exception as e:
        logging.warning(f"Error processing image {image_path}: {e}")
        return None

def extract_text_from_image(image_path):
    """Use OCR to extract text from images."""
    try:
        image = cv2.imread(image_path)
        if image is None:
            raise ValueError(f"Image at {image_path} could not be loaded.")
        text = pytesseract.image_to_string(image)
        return text
    except Exception as e:
        logging.warning(f"Error performing OCR on image {image_path}: {e}")
        return ""

def extract_entity_value(text):
    """Extract numeric entity values and units using regex."""
    try:
        pattern = re.compile(r'(\d+(?:\.\d+)?)\s*([a-zA-Z]+)')
        matches = pattern.findall(text)
        
        if matches:
            value, unit = matches[0]
            # Validate the unit against the allowed units
            if unit.lower() in allowed_units:
                return value, unit.lower()  # Return the value and unit in lowercase
        return None, None
    except Exception as e:
        logging.warning(f"Error extracting entity value from text: {e}")
        return None, None

def extract_image_features(image_path, cnn_model):
    """Extract CNN features from the image."""
    image = preprocess_image(image_path)
    if image is None:
        return None  # Skip this image if it's invalid
    image = image.reshape((1, 224, 224, 3))
    try:
        features = cnn_model.predict(image)
        return features
    except Exception as e:
        logging.warning(f"Error extracting features from image {image_path}: {e}")
        return None

def train_model(train_df, download_folder, cnn_model):
    """Train a regression model using extracted features from images and text."""
    X_train = []
    y_train = []

    for idx, row in train_df.iterrows():
        image_path = os.path.join(download_folder, os.path.basename(row['image_link']))
        
        # Extract features using CNN
        image_features = extract_image_features(image_path, cnn_model)
        if image_features is None:
            logging.warning(f"Skipping image {image_path} due to feature extraction failure.")
            continue
        
        # Extract text using OCR
        text = extract_text_from_image(image_path)
        
        # Extract entity value from the text
        value, unit = extract_entity_value(text)
        if not value or not unit:
            logging.warning(f"Skipping image {image_path} due to text extraction failure.")
            continue
        
        # Combine CNN features and OCR features
        combined_features = list(image_features[0]) + [float(value)]
        X_train.append(combined_features)
        y_train.append(float(row['entity_value'].split()[0]))  # Extract ground truth value
    
    # Train a simple regression model
    if not X_train:
        logging.error("No valid data for training.")
        return None
    X_train = pd.DataFrame(X_train)
    reg_model = LinearRegression()
    reg_model.fit(X_train, y_train)
    return reg_model

def predict_and_generate_output(test_csv, reg_model, download_folder, cnn_model):
    """Generate predictions and save to CSV in the correct format."""
    try:
        test_df = pd.read_csv(test_csv)
        X_test = []
        test_indices = []
        predicted_units = []

        for idx, row in test_df.iterrows():
            image_path = os.path.join(download_folder, os.path.basename(row['image_link']))
            
            # Extract features using CNN
            image_features = extract_image_features(image_path, cnn_model)
            if image_features is None:
                logging.warning(f"Skipping image {image_path} due to feature extraction failure.")
                continue
            
            # Extract text using OCR
            text = extract_text_from_image(image_path)
            
            # Extract entity value from the text
            value, unit = extract_entity_value(text)
            if not value or not unit:
                logging.warning(f"Skipping image {image_path} due to text extraction failure.")
                continue
            
            # Combine CNN features and OCR features
            combined_features = list(image_features[0]) + [float(value)]
            X_test.append(combined_features)
            test_indices.append(row['index'])
            predicted_units.append(unit)
        
        if not X_test:
            logging.error("No valid data for predictions.")
            return
        
        X_test = pd.DataFrame(X_test)
        predictions = reg_model.predict(X_test)

        # Format the predictions
        output = pd.DataFrame({
            'index': test_indices, 
            'prediction': [f"{pred:.2f} {unit}" for pred, unit in zip(predictions, predicted_units)]
        })
        
        output.to_csv('test_out.csv', index=False)
        logging.info("Predictions saved to test_out.csv")
    except Exception as e:
        logging.error(f"Error during prediction and output generation: {e}")

def process_images(image_dir):
    """
    Process images in the specified directory.
    """
    for filename in os.listdir(image_dir):
        file_path = os.path.join(image_dir, filename)
        try:
            image = preprocess_image(file_path)
            if image is None:
                logging.warning(f"Image preprocessing failed for {file_path}")
                continue
            text = extract_text_from_image(file_path)
            if text is not None:
                logging.info(f"Text extracted from {file_path}: {text}")
            else:
                logging.warning(f"Text extraction failed for {file_path}")
        except Exception as e:
            logging.error(f"Error processing image {file_path}: {e}")

def main():
    # Define paths
    train_csv = '/home/rguktrkvalley/Desktop/train1.csv'
    test_csv = '/home/rguktrkvalley/Desktop/sample_test.csv'
    download_folder = '/home/rguktrkvalley/Desktop/downloaded_images'
    
    # Load CNN model only once
    cnn_model = EfficientNetB0(weights='imagenet', include_top=False, pooling='avg')
    
    try:
        # Step 1: Download images
        train_df = download_all_images(train_csv, download_folder)

        # Step 2: Check and repair images
        check_and_repair_images(download_folder)
        
        # Step 3: Process images (optional, depending on what needs to be done before training)
        process_images(download_folder)

        # Step 4: Train the model
        reg_model = train_model(train_df, download_folder, cnn_model)

        # Step 5: Generate predictions for the test set
        if reg_model:
            predict_and_generate_output(test_csv, reg_model, download_folder, cnn_model)
    except Exception as e:
        logging.error(f"Error during the main execution: {e}")

# Run the pipeline
if __name__ == '__main__':
    main()


100%|█████████████████████████████████████████| 99/99 [00:00<00:00, 4281.36it/s]
2024-09-16 00:26:58,673 - ERROR - Failed to repair image: /home/rguktrkvalley/Desktop/downloaded_images/71bvOuz9w1L.jpg - cannot identify image file '/home/rguktrkvalley/Desktop/downloaded_images/71bvOuz9w1L.jpg'
2024-09-16 00:26:58,705 - ERROR - Failed to repair image: /home/rguktrkvalley/Desktop/downloaded_images/61hWZdkq6WL.jpg - cannot identify image file '/home/rguktrkvalley/Desktop/downloaded_images/61hWZdkq6WL.jpg'
2024-09-16 00:26:58,731 - ERROR - Failed to repair image: /home/rguktrkvalley/Desktop/downloaded_images/71D824lbRvL.jpg - cannot identify image file '/home/rguktrkvalley/Desktop/downloaded_images/71D824lbRvL.jpg'
2024-09-16 00:26:58,744 - ERROR - Failed to repair image: /home/rguktrkvalley/Desktop/downloaded_images/51bQPPtMqYL.jpg - cannot identify image file '/home/rguktrkvalley/Desktop/downloaded_images/51bQPPtMqYL.jpg'
2024-09-16 00:26:58,752 - ERROR - Failed to repair image: /home/rgu







Premature end of JPEG file
Premature end of JPEG file












Premature end of JPEG file




Premature end of JPEG file








Premature end of JPEG file




Premature end of JPEG file












Premature end of JPEG file




Premature end of JPEG file
Premature end of JPEG file




Premature end of JPEG file
Premature end of JPEG file




Premature end of JPEG file
Premature end of JPEG file




Premature end of JPEG file




Premature end of JPEG file




Premature end of JPEG file




Premature end of JPEG file
Premature end of JPEG file












Premature end of JPEG file
Premature end of JPEG file




Premature end of JPEG file




Premature end of JPEG file
Premature end of JPEG file




Premature end of JPEG file
Premature end of JPEG file




Premature end of JPEG file








Premature end of JPEG file
Premature end of JPEG file




Premature end of JPEG file
Premature end of JPEG file












Premature end of JPEG file




Premature end of JPEG file




Premature end of JPEG file
Premature end of JPEG file




Premature end of JPEG file
Premature end of JPEG file




Premature end of JPEG file
Premature end of JPEG file




Premature end of JPEG file




Premature end of JPEG file
Premature end of JPEG file




Premature end of JPEG file












Premature end of JPEG file




Premature end of JPEG file
Premature end of JPEG file




Premature end of JPEG file




Premature end of JPEG file




Premature end of JPEG file
Premature end of JPEG file




Premature end of JPEG file












Premature end of JPEG file




Premature end of JPEG file
Premature end of JPEG file




Premature end of JPEG file








Premature end of JPEG file
Premature end of JPEG file
Premature end of JPEG file




Premature end of JPEG file








Premature end of JPEG file




Premature end of JPEG file




























Premature end of JPEG file




Premature end of JPEG file








Premature end of JPEG file




Premature end of JPEG file




Premature end of JPEG file




Premature end of JPEG file








Premature end of JPEG file




Premature end of JPEG file




Premature end of JPEG file




Premature end of JPEG file
Premature end of JPEG file




Premature end of JPEG file
Premature end of JPEG file




Premature end of JPEG file




























































[ WARN:0@4160.777] global loadsave.cpp:241 findDecoder imread_('/home/rguktrkvalley/Desktop/downloaded_images/41-NCxNuBxL.jpg'): can't open/read file: check file path/integrity
[ WARN:0@4160.780] global loadsave.cpp:241 findDecoder imread_('/home/rguktrkvalley/Desktop/downloaded_images/41-NCxNuBxL.jpg'): can't open/read file: check file path/integrity
[ WARN:0@4160.785] global loadsave.cpp:241 findDecoder imread_('/home/rguktrkvalley/Desktop/downloaded_images/417NJrPEk+L.jpg'): can't open/read file: check file path/integrity
[ WARN:0@4160.792] global loadsave.cpp:241 findDecoder imread_('/home/rguktrkvalley/Desktop/downloaded_images/417SThj+SrL.jpg'): can't open/read file: check file path/integrity
[ WARN:0@4160.796] global loadsave.cpp:241 findDecoder imread_('/home/rguktrkvalley/Desktop/downloaded_images/417SThj+SrL.jpg'): can't open/read file: check file path/integrity
[ WARN:0@4160.801] global loadsave.cpp:241 findDecoder imread_('/home/rguktrkvalley/Desktop/downloaded_images/41ADV

In [None]:
# Downloading images

import csv
import urllib.request
import os

# Define the CSV file name
csv_file_name = ('/home/apiiit123/ml/student_resource_3/dataset/train.csv')

# Create a directory to save downloaded images
download_directory = 'downloaded_images'
os.makedirs(download_directory, exist_ok=True)

# Open the CSV file and read the image URLs
with open('/home/apiiit123/ml/student_resource_3/dataset/train.csv', 'r') as csvfile:
    reader = csv.DictReader(csvfile)
    
    for row in reader:
        # Assuming the URL is in a column named 'image_link'
        url = row['image_link']
        
        # Create a filename based on another column (e.g., 'filename')
        filename = row.get('filename', 'default_image')  # Use 'default_image' if 'filename' is not provided
        file_path = os.path.join(download_directory, f"{filename}.jpg")
        
        try:
            # Download the image
            urllib.request.urlretrieve(url, file_path)
            print(f"Downloaded: {file_path}")
        except Exception as e:
            print(f"Failed to download {url}. Reason: {e}")

print("Image download process completed.")

In [None]:
# Load an image
from PIL import Image

# Load the image
img = Image.open('/home/apiiit123/ml/downloaded_images/default_image.jpg')

# Display the image
img.show()

In [None]:
# Image Preprocessing
import os
import cv2
import numpy as np
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.preprocessing.image import img_to_array, load_img

# Define the directory containing images
image_dir = '/home/apiiit123/ml/downloaded_images'  # Change this to your image directory
processed_images_dir = 'processed_images'
os.makedirs(processed_images_dir, exist_ok=True)

# Parameters
target_size = (150, 150)  # Desired size for resizing
batch_size = 32  # Batch size for data augmentation

# Function to preprocess and save images
def preprocess_images(image_dir):
    for filename in os.listdir(image_dir):
        if filename.endswith('.jpg') or filename.endswith('.png'):
            img_path = os.path.join(image_dir, filename)
            # Load image
            img = load_img(img_path, target_size=target_size)
            img_array = img_to_array(img)
            img_array = img_array / 255.0  # Normalize to [0, 1]
            
            # Save processed image
            processed_img_path = os.path.join(processed_images_dir, filename)
            cv2.imwrite(processed_img_path, img_array * 255)  # Convert back to [0, 255] for saving

# Preprocess images
preprocess_images(image_dir)

# Data Augmentation
datagen = ImageDataGenerator(
    rotation_range=40,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest'
)

# Example of using the data generator
sample_image = load_img(os.path.join(processed_images_dir, 'default_image.jpg'), target_size=target_size)
sample_image_array = img_to_array(sample_image)
sample_image_array = np.expand_dims(sample_image_array, axis=0)

# Generate augmented images
augmented_images = datagen.flow(sample_image_array, batch_size=1)

# Save augmented images
for i in range(5):  # Generate and save 5 augmented images
    augmented_image = next(augmented_images)[0]
    cv2.imwrite(os.path.join(processed_images_dir, f'augmented_image_{i}.jpg'), augmented_image)

print("Image preprocessing and augmentation complete.")

In [None]:
import pytesseract
from PIL import Image, ImageFilter
import cv2
import numpy as np

# Set the path to the Tesseract executable
#pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
# Load the image using OpenCV
image_path = '/home/apiiit123/ml/downloaded_images/default_image.jpg'  # Replace with your image path
image = cv2.imread(image_path)
    
# Convert the image to grayscale
gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    
# Apply Gaussian Blur to reduce noise
blurred_image = cv2.GaussianBlur(gray_image, (5, 5), 0)
    
# Use adaptive thresholding to binarize the image
binary_image = cv2.adaptiveThreshold(blurred_image, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                         cv2.THRESH_BINARY, 11, 2)
    
# Save the processed image (optional, for debugging)
cv2.imwrite('processed_image.jpg', binary_image)
    
# Use Tesseract to extract text from the processed image
text = pytesseract.image_to_string(binary_image)
    
# Print the extracted text
print("Extracted Text:")
print(text)

In [2]:
import cv2
import pytesseract
from tensorflow.keras.applications import EfficientNetB0
from sklearn.linear_model import LinearRegression
import os
import re
import logging
from PIL import Image
from utils import download_images
from constants import allowed_units

# Set up logging
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')

def check_and_repair_images(image_dir):
    """
    Check and repair corrupted images in the specified directory.
    """
    for filename in os.listdir(image_dir):
        file_path = os.path.join(image_dir, filename)
        try:
            with Image.open(file_path) as img:
                img.verify()  # Verify integrity
        except (IOError, SyntaxError) as e:
            logging.warning(f"Corrupted image detected: {file_path} - {e}")
            # Attempt to repair the image
            try:
                with Image.open(file_path) as img:
                    img = img.convert('RGB')  # Convert to RGB to fix possible issues
                    repaired_path = file_path.replace('.jpg', '_repaired.jpg')
                    img.save(repaired_path)
                    logging.info(f"Repaired image saved to {repaired_path}")
            except Exception as repair_exception:
                logging.error(f"Failed to repair image: {file_path} - {repair_exception}")

def preprocess_image(image_path):
    """Load and preprocess image for input into the CNN model."""
    try:
        image = cv2.imread(image_path)
        if image is None:
            raise ValueError(f"Image at {image_path} could not be loaded.")
        # Check image dimensions
        if image.shape[0] == 0 or image.shape[1] == 0:
            raise ValueError(f"Image at {image_path} has invalid dimensions.")
        image = cv2.resize(image, (224, 224))  # Resize to input size of EfficientNet
        image = image / 255.0  # Normalize pixel values
        return image
    except Exception as e:
        logging.warning(f"Error processing image {image_path}: {e}")
        return None

def extract_text_from_image(image_path):
    """Use OCR to extract text from images."""
    try:
        image = cv2.imread(image_path)
        if image is None:
            raise ValueError(f"Image at {image_path} could not be loaded.")
        text = pytesseract.image_to_string(image)
        return text
    except Exception as e:
        logging.warning(f"Error performing OCR on image {image_path}: {e}")
        return ""

def extract_entity_value(text):
    """Extract numeric entity values and units using regex."""
    try:
        pattern = re.compile(r'(\d+(?:\.\d+)?)\s*([a-zA-Z]+)')
        matches = pattern.findall(text)

        if matches:
            value, unit = matches[0]
            # Validate the unit against the allowed units
            if unit.lower() in allowed_units:
                return value, unit.lower()  # Return the value and unit in lowercase
        return None, None
    except Exception as e:
        logging.warning(f"Error extracting entity value from text: {e}")
        return None, None

def extract_image_features(image_path, cnn_model):
    """Extract CNN features from the image."""
    image = preprocess_image(image_path)
    if image is None:
        return None  # Skip this image if it's invalid
    image = image.reshape((1, 224, 224, 3))
    try:
        features = cnn_model.predict(image)
        return features
    except Exception as e:
        logging.warning(f"Error extracting features from image {image_path}: {e}")
        return None

def train_model(train_df, download_folder, cnn_model):
    """Train a regression model using extracted features from images and text."""
    X_train = []
    y_train = []

    for idx, row in train_df.iterrows():
        image_path = os.path.join(download_folder, os.path.basename(row['image_link']))

        # Extract features using CNN
        image_features = extract_image_features(image_path, cnn_model)
        if image_features is None:
            logging.warning(f"Skipping image {image_path} due to feature extraction failure.")
            continue

        # Extract text using OCR
        text = extract_text_from_image(image_path)

        # Extract entity value from the text
        value, unit = extract_entity_value(text)
        if not value or not unit:
            logging.warning(f"Skipping image {image_path} due to text extraction failure.")
            continue

        # Combine CNN features and OCR features
        combined_features = list(image_features[0]) + [float(value)]
        X_train.append(combined_features)
        y_train.append(float(row['entity_value'].split()[0]))  # Extract ground truth value

    # Train a simple regression model
    if not X_train:
        logging.error("No valid data for training.")
        return None
    X_train = pd.DataFrame(X_train)
    reg_model = LinearRegression()
    reg_model.fit(X_train, y_train)
    return reg_model

def predict_and_generate_output(test_csv, reg_model, download_folder, cnn_model):
    """Generate predictions and save to CSV in the correct format."""
    test_df = pd.read_csv(test_csv)
    X_test = []
    test_indices = []
    predicted_units = []

    for idx, row in test_df.iterrows():
        image_path = os.path.join(download_folder, os.path.basename(row['image_link']))

        # Extract features using CNN
        image_features = extract_image_features(image_path, cnn_model)
        if image_features is None:
            logging.warning(f"Skipping image {image_path} due to feature extraction failure.")
            continue

        # Extract text using OCR
        text = extract_text_from_image(image_path)

        # Extract entity value from the text
        value, unit = extract_entity_value(text)
        if not value or not unit:
            logging.warning(f"Skipping image {image_path} due to text extraction failure.")
            continue

        # Combine CNN features and OCR features
        combined_features = list(image_features[0]) + [float(value)]
        X_test.append(combined_features)
        test_indices.append(row['index'])
        predicted_units.append(unit)

    if not X_test:
        logging.error("No valid data for predictions.")
        return
    X_test = pd.DataFrame(X_test)
    predictions = reg_model.predict(X_test)

    # Format the predictions
    output = pd.DataFrame({
        'index': test_indices,
        'prediction': [f"{pred:.2f} {unit}" for pred, unit in zip(predictions, predicted_units)]
    })

    output.to_csv('test_out.csv', index=False)
    logging.info("Predictions saved to test_out.csv")

# Step 8: Main Function to Run the Entire Process
def main():
    # Define paths
    train_csv = '/home/rguktrkvalley/Desktop/train1.csv'
    test_csv = '/home/rguktrkvalley/Desktop/sample_test.csv'
    download_folder = '/home/rguktrkvalley/Desktop/downloaded_images'

    # Load CNN model only once
    cnn_model = EfficientNetB0(weights='imagenet', include_top=False, pooling='avg')

    # Step 1: Download images
    train_df = download_all_images(train_csv, download_folder)

    # Step 2: Train the model
    reg_model = train_model(train_df, download_folder, cnn_model)

    # Step 3: Generate predictions for the test set
    if reg_model:
        predict_and_generate_output(test_csv, reg_model, download_folder, cnn_model)

# Run the pipeline
if __name__ == '__main__':
    main()

NameError: name 'download_all_images' is not defined

In [3]:
import os
import pandas as pd
from PIL import Image
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.preprocessing.image import img_to_array
from utils import download_images, parse_string
from constants import allowed_units
from pathlib import Path  # Importing Path

# Download images
train_df = pd.read_csv('/home/rguktrkvalley/Desktop/train1.csv')
test_df = pd.read_csv('/home/rguktrkvalley/Desktop/sample_test.csv')

download_folder = '/home/rguktrkvalley/Desktop/images'
download_images(train_df['image_link'].tolist() + test_df['image_link'].tolist(), download_folder)

# Load model
model = ResNet50(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
model.trainable = False

inputs = keras.layers.Input(shape=(224, 224, 3))
x = model(inputs, training=False)
x = keras.layers.GlobalAveragePooling2D()(x)
outputs = keras.layers.Dense(len(allowed_units), activation='softmax')(x)

model = keras.Model(inputs, outputs)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Preprocess images
def preprocess_image(image_path):
    image = Image.open(image_path)
    image = image.resize((224, 224))
    image_array = img_to_array(image)
    image_array = np.expand_dims(image_array, axis=0)
    image_array = tf.keras.applications.resnet50.preprocess_input(image_array)
    return image_array

# Train model
train_images = [os.path.join(download_folder, Path(link).name) for link in train_df['image_link']]
train_labels = train_df['entity_value'].apply(lambda x: parse_string(x)[1]).tolist()
train_labels = tf.keras.utils.to_categorical(train_labels, num_classes=len(allowed_units))

model.fit(np.concatenate([preprocess_image(path) for path in train_images]),
          train_labels,
          epochs=10,
          batch_size=32,
          validation_split=0.2)

# Generate predictions
test_images = [os.path.join(download_folder, Path(link).name) for link in test_df['image_link']]
test_preds = model.predict(np.concatenate([preprocess_image(path) for path in test_images]))

# Format output
output_df = pd.DataFrame({'index': test_df['index']})
output_df['prediction'] = test_preds.argmax(axis=1).apply(lambda x: ' '.join([str(1.0), list(allowed_units.keys())[x]]))
output_df.to_csv('/home/rguktrkvalley/Desktop/pertest_out.csv', index=False)


100%|█████████████████████████████████████████| 187/187 [00:05<00:00, 32.77it/s]


ValueError: Invalid format in 10 kilogram to 15 kilogram

In [None]:
import os
import pandas as pd
from PIL import Image
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.preprocessing.image import img_to_array
from utils import download_images, parse_string
from constants import allowed_units
from pathlib import Path

# Download images
train_df = pd.read_csv('/home/rguktrkvalley/Desktop/train1.csv')
test_df = pd.read_csv('/home/rguktrkvalley/Desktop/sample_test.csv')

download_folder = '/home/rguktrkvalley/Desktop/images'
download_images(train_df['image_link'].tolist() + test_df['image_link'].tolist(), download_folder)

# Load model
model = ResNet50(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
model.trainable = False

inputs = keras.layers.Input(shape=(224, 224, 3))
x = model(inputs, training=False)
x = keras.layers.GlobalAveragePooling2D()(x)
outputs = keras.layers.Dense(len(allowed_units), activation='softmax')(x)

model = keras.Model(inputs, outputs)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Preprocess images
def preprocess_image(image_path):
    try:
        image = Image.open(image_path)
        image = image.resize((224, 224))
        image_array = img_to_array(image)
        image_array = np.expand_dims(image_array, axis=0)
        image_array = tf.keras.applications.resnet50.preprocess_input(image_array)
        return image_array
    except Exception as e:
        print(f"Error processing image {image_path}: {e}")
        return None

# Prepare training data
train_images = [os.path.join(download_folder, Path(link).name) for link in train_df['image_link']]
train_labels = train_df['entity_value'].tolist()

# Filter out invalid labels
valid_labels = []
for label in train_labels:
    try:
        # Skip ranges or invalid formats
        if "to" in label:
            print(f"Skipping invalid label: {label} - Range not supported")
            continue
        number, unit = parse_string(label)
        valid_labels.append(unit)  # Only store the unit
    except ValueError as e:
        print(f"Skipping invalid label: {label} - {e}")

# Only keep valid images and labels
processed_images = []
valid_labels_filtered = []

for img_path, lbl in zip(train_images, valid_labels):
    processed_image = preprocess_image(img_path)
    if processed_image is not None:
        processed_images.append(processed_image)
        valid_labels_filtered.append(lbl)

# Convert labels to categorical format
train_labels_categorical = tf.keras.utils.to_categorical(
    [list(allowed_units).index(lbl) for lbl in valid_labels_filtered if lbl in allowed_units],
    num_classes=len(allowed_units)
)

# Train model
if processed_images and len(processed_images) == len(train_labels_categorical):
    model.fit(np.concatenate(processed_images),
              train_labels_categorical,
              epochs=10,
              batch_size=32,
              validation_split=0.2)
else:
    print("Mismatch in number of processed images and labels. Training aborted.")

# Generate predictions
test_images = [os.path.join(download_folder, Path(link).name) for link in test_df['image_link']]
test_preds = []

for path in test_images:
    processed_image = preprocess_image(path)
    if processed_image is not None:
        test_preds.append(processed_image)

# Format output
if test_preds:
    test_preds_array = model.predict(np.concatenate(test_preds))
    output_df = pd.DataFrame({'index': test_df['index']})
    output_df['prediction'] = [f"{1.0} {list(allowed_units)[x]}" for x in test_preds_array.argmax(axis=1)]
    output_df.to_csv('/home/rguktrkvalley/Desktop/pertest_out.csv', index=False)
else:
    print("No valid test images processed. Predictions cannot be generated.")


2024-09-16 09:59:25.048197: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-09-16 09:59:31.584457: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-09-16 09:59:32.088365: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
