In [13]:
from imagehash import phash
from PIL import Image
import cv2
import numpy as np
import pandas as pd
import requests
import json
from tqdm import tqdm
from skimage.feature import hog

def load_image_from_url_target_size(url, target_size=(128, 128)):
    if not url:
        return None
    try:
        resp = requests.get(url)
        image = np.asarray(bytearray(resp.content), dtype="uint8")
        image = cv2.imdecode(image, cv2.IMREAD_COLOR)
        # Resize image to target size
        image = cv2.resize(image, target_size)
        return image
    except requests.RequestException as e:
        print(f"Error loading image from {url}: {e}")
        return None

def extract_first_image_url(image_list_str):
    if not image_list_str or image_list_str == '[]':
        return None
    try:
        image_list = json.loads(image_list_str.replace("'", '"'))
        if image_list:
            return image_list[0]
        else:
            return None
    except json.JSONDecodeError:
        return None
    
def load_image_from_url(url):
    if not url:
        return None
    try:
        response = requests.get(url, stream=True)
        image = Image.open(response.raw).convert('RGB')
        return image
    except requests.RequestException as e:
        print(f"Error loading image from {url}: {e}")
        return None

def calculate_hog_similarity(row, target_size=(128, 128)):
    if row['cgi_image_url'] is None or row['fgi_image_url'] is None:
        return None  # Skip calculation if any URL is missing
    image_cgi = load_image_from_url_target_size(row['cgi_image_url'], target_size)
    image_fgi = load_image_from_url_target_size(row['fgi_image_url'], target_size)
    if image_cgi is None or image_fgi is None:
        return None

    image_cgi_gray = cv2.cvtColor(image_cgi, cv2.COLOR_BGR2GRAY)
    image_fgi_gray = cv2.cvtColor(image_fgi, cv2.COLOR_BGR2GRAY)

    # Calculate HOG features
    hog_cgi, _ = hog(image_cgi_gray, pixels_per_cell=(8, 8), cells_per_block=(2, 2), visualize=True)
    hog_fgi, _ = hog(image_fgi_gray, pixels_per_cell=(8, 8), cells_per_block=(2, 2), visualize=True)

    # Calculate similarity (cosine similarity)
    similarity = np.dot(hog_cgi, hog_fgi) / (np.linalg.norm(hog_cgi) * np.linalg.norm(hog_fgi))
    return similarity

def calculate_phash_similarity(row):
    if row['cgi_image_url'] is None or row['fgi_image_url'] is None:
        return None  # Skip calculation if any URL is missing
    image_cgi = load_image_from_url(row['cgi_image_url'])
    image_fgi = load_image_from_url(row['fgi_image_url'])
    if image_cgi is None or image_fgi is None:
        return None

    hash_cgi = phash(image_cgi)
    hash_fgi = phash(image_fgi)

    similarity = 1 - (hash_cgi - hash_fgi) / len(hash_cgi.hash) ** 2
    return similarity

def calculate_cld(image, num_y_coeffs=8, num_cb_coeffs=4, num_cr_coeffs=4):
    image = image.resize((64, 64))
    ycbcr_image = image.convert('YCbCr')
    y, cb, cr = ycbcr_image.split()
    y = np.asarray(y, dtype=np.float32)
    cb = np.asarray(cb, dtype=np.float32)
    cr = np.asarray(cr, dtype=np.float32)

    y_dct = cv2.dct(y)
    cb_dct = cv2.dct(cb)
    cr_dct = cv2.dct(cr)

    cld = np.hstack((
        y_dct[:num_y_coeffs, :num_y_coeffs].flatten(),
        cb_dct[:num_cb_coeffs, :num_cb_coeffs].flatten(),
        cr_dct[:num_cr_coeffs, :num_cr_coeffs].flatten()
    ))

    return cld

def calculate_cld_similarity(row):
    if row['cgi_image_url'] is None or row['fgi_image_url'] is None:
        return None  # Skip calculation if any URL is missing
    image_cgi = load_image_from_url(row['cgi_image_url'])
    image_fgi = load_image_from_url(row['fgi_image_url'])
    if image_cgi is None or image_fgi is None:
        return None

    cld_cgi = calculate_cld(image_cgi)
    cld_fgi = calculate_cld(image_fgi)

    similarity = np.dot(cld_cgi, cld_fgi) / (np.linalg.norm(cld_cgi) * np.linalg.norm(cld_fgi))
    return similarity

def main():
    df = pd.read_csv('df_review.csv')
    df['fgi_image_url'] = df['fgi_images'].apply(extract_first_image_url)
    df['cgi_image_url'] = df['cgi_images'].apply(extract_first_image_url)

    tqdm.pandas(desc="Calculating HOG Similarities")
    df['hog_similarity'] = df.progress_apply(calculate_hog_similarity, axis=1)

    tqdm.pandas(desc="Calculating pHash Similarities")
    df['phash_similarity'] = df.progress_apply(calculate_phash_similarity, axis=1)

    tqdm.pandas(desc="Calculating CLD Similarities")
    df['cld_similarity'] = df.progress_apply(calculate_cld_similarity, axis=1)

    # Save the complete DataFrame to CSV
    df.to_csv('df_review_hog_phash_cld.csv', index=False)
    print("Complete DataFrame saved to 'df_review_hog_phash.csv'.")

    # Drop the specified columns and save as a Stata DTA file
    # columns_to_drop = ['review_text', 'cgi_images', 'fgi_images', 'features']
    # df.drop(columns=columns_to_drop, errors='ignore').to_stata('df_review_hog.dta', write_index=False)
    print("Modified DataFrame saved to 'df_review_hog.dta' without specified columns.")

if __name__ == "__main__":
    main()


Calculating HOG Similarities: 100%|██████████| 9484/9484 [02:02<00:00, 77.67it/s] 
Calculating pHash Similarities: 100%|██████████| 9484/9484 [00:31<00:00, 303.36it/s]
Calculating CLD Similarities: 100%|██████████| 9484/9484 [01:31<00:00, 103.35it/s]


Complete DataFrame saved to 'df_review_hog_phash.csv'.
Modified DataFrame saved to 'df_review_hog.dta' without specified columns.


In [15]:
import pandas as pd

# Load the first CSV file
csv1 = pd.read_csv('df_review_hog_phash_cld.csv')

# Load the second CSV file
csv2 = pd.read_csv('df_review.csv')

# Set 'Unnamed: 0' as the index for the first DataFrame
csv1.set_index('Unnamed: 0', inplace=True)

# Select the specific columns to merge from csv1
columns_to_merge = ['hog_similarity', 'phash_similarity', 'cld_similarity']

# Merge the specified columns into csv2 based on the 'Unnamed: 0' index
csv2 = csv2.merge(csv1[columns_to_merge], how='left', left_index=True, right_index=True)

# Function to fill values only for rows below the first occurrence of non-missing values
def fill_below_first_non_na(df, columns):
    for col in columns:
        mask = df[col].notna()
        first_non_na_idx = mask.idxmax() if mask.any() else None
        if first_non_na_idx:
            df.loc[first_non_na_idx+1:, col] = df.loc[first_non_na_idx, col]
    return df

# Apply the function to each group of 'product_id'
csv2 = csv2.groupby('product_id', group_keys=False).apply(fill_below_first_non_na, columns=columns_to_merge)

# Fill NaN values with 0
csv2[columns_to_merge] = csv2[columns_to_merge].fillna(0)

# Save the merged DataFrame to a new CSV file
csv2.to_csv('df_review_hog_phash_cld.csv', index=False)

print("The files have been successfully merged and saved to 'merged_file.csv'.")

The files have been successfully merged and saved to 'merged_file.csv'.


  csv2 = csv2.groupby('product_id', group_keys=False).apply(fill_below_first_non_na, columns=columns_to_merge)


In [16]:
# Import necessary libraries
import pandas as pd
from datetime import datetime

input_file = 'df_review_hog_phash_cld'
input_format = '.csv'

# Function to load data
def load_data(filename):
    return pd.read_csv(filename)

# Function to process data
def process_data(df):
    # Identify duplicates based on product_id and review_date
    duplicates = df[df.duplicated(subset=['product_id', 'review_date'], keep=False)]

    # Print duplicates if they exist
    if not duplicates.empty:
        print("Duplicates found:")
        print(duplicates)
    else:
        print("No duplicates found.")

    # Drop duplicates based on product_id and review_date
    df = df.drop_duplicates(subset=['product_id', 'review_date'])
    
    # Convert review_date to the specified datetime format
    df['datetime'] = pd.to_datetime(df['review_date']).dt.strftime('%d%b%Y %H:%M:%S')
    
    # Extract month and year from the datetime
    df['mon'] = pd.to_datetime(df['review_date']).dt.month
    df['year'] = pd.to_datetime(df['review_date']).dt.year
    
    return df

# Main function to run the process
def main():
    # Load data
    df = load_data(input_file + input_format)
    
    # Process data
    df_processed = process_data(df)
    
    # Save processed data
    df_processed.to_csv(input_file + '.csv', index=False)
    # df_processed.to_csv(input_file + '_processed' + '.csv')
    # df_processed.to_stata(input_file + '_processed' + '.dta', write_index=False)
    
    print("Data processing complete. Files saved.")

# Run the script
if __name__ == "__main__":
    main()

No duplicates found.
Data processing complete. Files saved.
