In [5]:
import cv2
import numpy as np
import pandas as pd
import requests
import json
from tqdm import tqdm
from PIL import Image

def load_image_from_url(url, target_size=(128, 128)):
    if not url:
        return None
    try:
        response = requests.get(url, stream=True)
        img = Image.open(response.raw).convert('RGB')
        img = img.resize(target_size)
        img = np.array(img)
        return img
    except requests.RequestException as e:
        print(f"Error loading image from {url}: {e}")
        return None

def extract_first_image_url(image_list_str):
    if not image_list_str or image_list_str == '[]':
        return None
    try:
        image_list = json.loads(image_list_str.replace("'", '"'))
        if image_list:
            return image_list[0]
        else:
            return None
    except json.JSONDecodeError:
        return None

def calculate_orb_similarity(row, target_size=(128, 128)):
    if row['cgi_image_url'] is None or row['fgi_image_url'] is None:
        return None  # Skip calculation if any URL is missing
    image_cgi = load_image_from_url(row['cgi_image_url'], target_size)
    image_fgi = load_image_from_url(row['fgi_image_url'], target_size)
    if image_cgi is None or image_fgi is None:
        return None

    orb = cv2.ORB_create()
    keypoints_cgi, descriptors_cgi = orb.detectAndCompute(image_cgi, None)
    keypoints_fgi, descriptors_fgi = orb.detectAndCompute(image_fgi, None)

    if descriptors_cgi is None or descriptors_fgi is None:
        return None

    bf = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=True)
    matches = bf.match(descriptors_cgi, descriptors_fgi)
    matches = sorted(matches, key=lambda x: x.distance)

    # Calculate similarity score based on matched keypoints
    similarity = sum(1 - match.distance / 100 for match in matches) / len(matches)
    return similarity

def calculate_dsift_similarity(row, target_size=(128, 128)):
    if row['cgi_image_url'] is None or row['fgi_image_url'] is None:
        return None  # Skip calculation if any URL is missing
    image_cgi = load_image_from_url(row['cgi_image_url'], target_size)
    image_fgi = load_image_from_url(row['fgi_image_url'], target_size)
    if image_cgi is None or image_fgi is None:
        return None

    sift = cv2.SIFT_create()
    step_size = 8
    kp_cgi = [cv2.KeyPoint(x, y, step_size) for y in range(0, image_cgi.shape[0], step_size) for x in range(0, image_cgi.shape[1], step_size)]
    kp_fgi = [cv2.KeyPoint(x, y, step_size) for y in range(0, image_fgi.shape[0], step_size) for x in range(0, image_fgi.shape[1], step_size)]

    _, dsift_cgi = sift.compute(image_cgi, kp_cgi)
    _, dsift_fgi = sift.compute(image_fgi, kp_fgi)

    if dsift_cgi is None or dsift_fgi is None:
        return None

    similarity = np.dot(dsift_cgi.flatten(), dsift_fgi.flatten()) / (np.linalg.norm(dsift_cgi) * np.linalg.norm(dsift_fgi))
    return similarity

# def calculate_gist_similarity(row, target_size=(128, 128)):
#     if row['cgi_image_url'] is None or row['fgi_image_url'] is None:
#         return None  # Skip calculation if any URL is missing
#     image_cgi = load_image_from_url(row['cgi_image_url'], target_size)
#     image_fgi = load_image_from_url(row['fgi_image_url'], target_size)
#     if image_cgi is None or image_fgi is None:
#         return None

#     gist_cgi = color_gist(image_cgi)
#     gist_fgi = color_gist(image_fgi)

#     similarity = np.dot(gist_cgi, gist_fgi) / (np.linalg.norm(gist_cgi) * np.linalg.norm(gist_fgi))
#     return similarity

def calculate_brief_similarity(row, target_size=(128, 128)):
    if row['cgi_image_url'] is None or row['fgi_image_url'] is None:
        return None  # Skip calculation if any URL is missing
    image_cgi = load_image_from_url(row['cgi_image_url'], target_size)
    image_fgi = load_image_from_url(row['fgi_image_url'], target_size)
    if image_cgi is None or image_fgi is None:
        return None

    gray_cgi = cv2.cvtColor(image_cgi, cv2.COLOR_BGR2GRAY)
    gray_fgi = cv2.cvtColor(image_fgi, cv2.COLOR_BGR2GRAY)

    star = cv2.xfeatures2d.StarDetector_create()
    brief = cv2.xfeatures2d.BriefDescriptorExtractor_create()

    keypoints_cgi = star.detect(gray_cgi, None)
    keypoints_fgi = star.detect(gray_fgi, None)

    keypoints_cgi, descriptors_cgi = brief.compute(gray_cgi, keypoints_cgi)
    keypoints_fgi, descriptors_fgi = brief.compute(gray_fgi, keypoints_fgi)

    if descriptors_cgi is None or descriptors_fgi is None:
        return None

    bf = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=True)
    matches = bf.match(descriptors_cgi, descriptors_fgi)
    matches = sorted(matches, key=lambda x: x.distance)

    similarity = sum(1 - match.distance / 256 for match in matches) / len(matches)
    return similarity

def main():
    df = pd.read_csv('df_review.csv')
    df['fgi_image_url'] = df['fgi_images'].apply(extract_first_image_url)
    df['cgi_image_url'] = df['cgi_images'].apply(extract_first_image_url)

    tqdm.pandas(desc="Calculating ORB Similarities")
    df['orb_similarity'] = df.progress_apply(calculate_orb_similarity, axis=1)

    tqdm.pandas(desc="Calculating DSIFT Similarities")
    df['dsift_similarity'] = df.progress_apply(calculate_dsift_similarity, axis=1)

    # tqdm.pandas(desc="Calculating GIST Similarities")
    # df['gist_similarity'] = df.progress_apply(calculate_gist_similarity, axis=1)

    tqdm.pandas(desc="Calculating BRIEF Similarities")
    df['brief_similarity'] = df.progress_apply(calculate_brief_similarity, axis=1)

    # Save the complete DataFrame to CSV
    df.to_csv('df_review_orb_dsift_brief.csv', index=False)
    print("Complete DataFrame saved to 'df_review_orb.csv'.")

    # Drop the specified columns and save as a Stata DTA file
    # columns_to_drop = ['review_text', 'cgi_images', 'fgi_images', 'features']
    # df.drop(columns=columns_to_drop, errors='ignore').to_stata('df_review_orb.dta', write_index=False)
    print("Modified DataFrame saved to 'df_review_orb.dta' without specified columns.")

if __name__ == "__main__":
    main()

Calculating ORB Similarities: 100%|██████████| 9484/9484 [00:44<00:00, 213.01it/s]
Calculating DSIFT Similarities: 100%|██████████| 9484/9484 [00:46<00:00, 203.69it/s]
Calculating BRIEF Similarities: 100%|██████████| 9484/9484 [00:46<00:00, 202.11it/s]


Complete DataFrame saved to 'df_review_orb.csv'.
Modified DataFrame saved to 'df_review_orb.dta' without specified columns.


In [6]:
import pandas as pd

# Load the first CSV file
csv1 = pd.read_csv('df_review_orb_dsift_brief.csv')

# Load the second CSV file
csv2 = pd.read_csv('df_review.csv')

# Set 'Unnamed: 0' as the index for the first DataFrame
csv1.set_index('Unnamed: 0', inplace=True)

# Select the specific columns to merge from csv1
columns_to_merge = ['orb_similarity', 'dsift_similarity', 'brief_similarity']

# Merge the specified columns into csv2 based on the 'Unnamed: 0' index
csv2 = csv2.merge(csv1[columns_to_merge], how='left', left_index=True, right_index=True)

# Function to fill values only for rows below the first occurrence of non-missing values
def fill_below_first_non_na(df, columns):
    for col in columns:
        mask = df[col].notna()
        first_non_na_idx = mask.idxmax() if mask.any() else None
        if first_non_na_idx:
            df.loc[first_non_na_idx+1:, col] = df.loc[first_non_na_idx, col]
    return df

# Apply the function to each group of 'product_id'
csv2 = csv2.groupby('product_id', group_keys=False).apply(fill_below_first_non_na, columns=columns_to_merge)

# Fill NaN values with 0
csv2[columns_to_merge] = csv2[columns_to_merge].fillna(0)

# Save the merged DataFrame to a new CSV file
csv2.to_csv('df_review_orb_dsift_brief.csv', index=False)

print("The files have been successfully merged and saved to 'merged_file.csv'.")

The files have been successfully merged and saved to 'merged_file.csv'.


  csv2 = csv2.groupby('product_id', group_keys=False).apply(fill_below_first_non_na, columns=columns_to_merge)


In [7]:
# Import necessary libraries
import pandas as pd
from datetime import datetime

input_file = 'df_review_orb_dsift_brief'
input_format = '.csv'

# Function to load data
def load_data(filename):
    return pd.read_csv(filename)

# Function to process data
def process_data(df):
    # Identify duplicates based on product_id and review_date
    duplicates = df[df.duplicated(subset=['product_id', 'review_date'], keep=False)]

    # Print duplicates if they exist
    if not duplicates.empty:
        print("Duplicates found:")
        print(duplicates)
    else:
        print("No duplicates found.")

    # Drop duplicates based on product_id and review_date
    df = df.drop_duplicates(subset=['product_id', 'review_date'])
    
    # Convert review_date to the specified datetime format
    df['datetime'] = pd.to_datetime(df['review_date']).dt.strftime('%d%b%Y %H:%M:%S')
    
    # Extract month and year from the datetime
    df['mon'] = pd.to_datetime(df['review_date']).dt.month
    df['year'] = pd.to_datetime(df['review_date']).dt.year
    
    return df

# Main function to run the process
def main():
    # Load data
    df = load_data(input_file + input_format)
    
    # Process data
    df_processed = process_data(df)
    
    # Save processed data
    df_processed.to_csv(input_file + '.csv', index=False)
    # df_processed.to_csv(input_file + '_processed' + '.csv')
    # df_processed.to_stata(input_file + '_processed' + '.dta', write_index=False)
    
    print("Data processing complete. Files saved.")

# Run the script
if __name__ == "__main__":
    main()

No duplicates found.
Data processing complete. Files saved.
