In [1]:
import ast
import json

import cv2
import numpy as np
import pandas as pd
import requests
from keras.applications.vgg16 import VGG16, preprocess_input
from keras.models import Model
from keras.preprocessing.image import img_to_array
from tqdm import tqdm

# Build the feature extractor: instantiate VGG16 with ImageNet weights, then
# wrap it in a second Model that shares the same input but stops at the 'fc1'
# layer, so `model.predict` yields that layer's activations instead of class
# scores.
# NOTE: VGG16(weights='imagenet') may download the weight file when it is not
# already cached locally, so the first run needs network access.
base_model = VGG16(weights='imagenet')
model = Model(inputs=base_model.input, outputs=base_model.get_layer('fc1').output)

def load_image_from_url(url, target_size=(224, 224), timeout=10):
    """Download an image and return it decoded and resized, or None on failure.

    Args:
        url: Address of the image. Empty or non-string values (``None``,
            ``NaN``) are treated as "no image".
        target_size: Output size handed to ``cv2.resize``.
        timeout: Seconds to wait for the server; without a timeout a single
            unresponsive host would stall the whole run.

    Returns:
        The image as decoded by ``cv2.imdecode`` with ``cv2.IMREAD_COLOR``
        (OpenCV's BGR channel order) and resized to ``target_size``, or
        ``None`` when the URL is missing, the request fails, the server
        answers with an error status, or the payload is not a decodable image.
    """
    if not url or not isinstance(url, (str, bytes)):
        return None

    try:
        resp = requests.get(url, timeout=timeout)
        # Error pages (404, 500, ...) carry HTML rather than image bytes.
        resp.raise_for_status()
    except requests.RequestException as e:
        print(f"Error loading image from {url}: {e}")
        return None

    if not resp.content:
        print(f"Error loading image from {url}: empty response body")
        return None

    try:
        buffer = np.frombuffer(resp.content, dtype=np.uint8)
        image = cv2.imdecode(buffer, cv2.IMREAD_COLOR)
        # imdecode signals an undecodable payload by returning None.
        if image is None:
            print(f"Error loading image from {url}: response is not a decodable image")
            return None
        return cv2.resize(image, target_size)
    except cv2.error as e:
        print(f"Error loading image from {url}: {e}")
        return None

def _parse_image_list(text):
    """Parse *text* as a JSON or Python-literal value; return None on failure."""
    attempts = (
        (json.loads, text),
        # Lists written with single quotes (Python repr) are not valid JSON.
        (ast.literal_eval, text),
        # Last resort: the quote swap this function historically relied on,
        # kept so inputs such as "['a', null]" still parse.
        (json.loads, text.replace("'", '"')),
    )
    for parser, payload in attempts:
        try:
            return parser(payload)
        except (ValueError, SyntaxError, TypeError, RecursionError):
            continue
    return None


def extract_first_image_url(image_list_str):
    """Return the first URL from a stringified list of image URLs.

    Args:
        image_list_str: Text holding a list of URLs, written either as JSON
            (``'["a", "b"]'``) or as a Python literal (``"['a', 'b']"``).
            Non-string values such as ``None`` or the ``NaN`` that pandas
            uses for empty CSV cells are treated as "no images".

    Returns:
        The first element when it is a non-empty string, otherwise ``None``
        (missing input, empty list, unparsable text, or a parsed value that
        is not a list).
    """
    if not isinstance(image_list_str, str):
        return None

    text = image_list_str.strip()
    if not text or text == '[]':
        return None

    image_list = _parse_image_list(text)
    if not isinstance(image_list, (list, tuple)) or not image_list:
        return None

    first = image_list[0]
    if isinstance(first, str) and first:
        return first
    return None

def extract_features(image):
    """Run one image through the module-level ``model`` and return a 1-D vector.

    Args:
        image: A single image accepted by ``img_to_array``.
            NOTE(review): per the Keras docs, VGG16's ``preprocess_input``
            expects RGB channel order and the network expects 224x224 input;
            confirm that callers passing OpenCV (BGR) images convert first.

    Returns:
        The model output for the image, flattened to one dimension.
    """
    batch = np.expand_dims(img_to_array(image), axis=0)  # add the batch axis
    batch = preprocess_input(batch)
    # verbose=0: predict() otherwise prints a progress line on every call,
    # which buries the caller's progress bar when run once per image.
    features = model.predict(batch, verbose=0)
    return features.flatten()

def calculate_feature_similarity(row):
    """Return the cosine similarity between the row's two images.

    Args:
        row: Mapping or ``pandas.Series`` with the keys ``'cgi_image_url'``
            and ``'fgi_image_url'``.

    Returns:
        The cosine similarity of the two feature vectors produced by
        ``extract_features``, or ``None`` when either URL is missing
        (``None``, ``NaN`` or empty), either image cannot be loaded, or a
        feature vector has zero norm.
    """
    cgi_url = row['cgi_image_url']
    fgi_url = row['fgi_image_url']
    # Missing cells can arrive as None or as float NaN; only real strings are URLs.
    if any(not isinstance(url, str) or not url for url in (cgi_url, fgi_url)):
        return None

    image_cgi = load_image_from_url(cgi_url)
    if image_cgi is None:
        return None  # no point downloading the second image
    image_fgi = load_image_from_url(fgi_url)
    if image_fgi is None:
        return None

    # OpenCV decodes to BGR, while VGG16's preprocess_input expects RGB (it
    # performs the RGB->BGR swap itself), so convert before extracting.
    features_cgi = extract_features(cv2.cvtColor(image_cgi, cv2.COLOR_BGR2RGB))
    features_fgi = extract_features(cv2.cvtColor(image_fgi, cv2.COLOR_BGR2RGB))

    denominator = np.linalg.norm(features_cgi) * np.linalg.norm(features_fgi)
    if denominator == 0:
        # Cosine similarity is undefined for an all-zero vector.
        return None
    return np.dot(features_cgi, features_fgi) / denominator

def main():
    """Score every row of 'df_review.csv' and export the results.

    Reads the review table, derives one image URL per row from each of the
    'fgi_images' and 'cgi_images' columns, stores the per-row result of
    ``calculate_feature_similarity`` in 'cnn_similarity', then writes the
    full table to CSV and a trimmed copy to a Stata DTA file.
    """
    reviews = pd.read_csv('df_review.csv')

    # Derive the URL columns from their list-valued source columns.
    for url_column, list_column in (
        ('fgi_image_url', 'fgi_images'),
        ('cgi_image_url', 'cgi_images'),
    ):
        reviews[url_column] = reviews[list_column].apply(extract_first_image_url)

    # Register progress_apply on pandas objects, then score row by row.
    tqdm.pandas(desc="Calculating Feature Similarities")
    similarities = reviews.progress_apply(calculate_feature_similarity, axis=1)
    reviews['cnn_similarity'] = similarities

    # Full table goes to CSV.
    reviews.to_csv('df_review_cnn.csv', index=False)
    print("Complete DataFrame saved to 'df_review_cnn.csv'.")

    # The Stata export omits these columns; errors='ignore' tolerates any
    # that are absent from the table.
    excluded = ['review_text', 'cgi_images', 'fgi_images', 'features']
    trimmed = reviews.drop(columns=excluded, errors='ignore')
    trimmed.to_stata('df_review_cnn.dta', write_index=False)
    print("Modified DataFrame saved to 'df_review_cnn.dta' without specified columns.")

# Run the pipeline only when this code is executed directly, not when it is
# imported as a module.
if __name__ == "__main__":
    main()

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels.h5
553467096/553467096 ━━━━━━━━━━━━━━━━━━━━ 62s 0us/step


Calculating Feature Similarities:   0%|          | 0/9484 [00:00<?, ?it/s]