In [2]:
import cv2
import numpy as np
from skimage import feature
import pandas as pd
import requests
import json
from tqdm import tqdm

def load_image_from_url(url):
    if not url:
        return None
    try:
        resp = requests.get(url)
        image = np.asarray(bytearray(resp.content), dtype="uint8")
        image = cv2.imdecode(image, cv2.IMREAD_COLOR)
        return image
    except requests.RequestException as e:
        print(f"Error loading image from {url}: {e}")
        return None

def extract_first_image_url(image_list_str):
    if not image_list_str or image_list_str == '[]':
        return None
    try:
        image_list = json.loads(image_list_str.replace("'", '"'))
        if image_list:
            return image_list[0]
        else:
            return None
    except json.JSONDecodeError:
        return None

def calculate_sift_similarity(row):
    if row['cgi_image_url'] is None or row['fgi_image_url'] is None:
        return None  # Skip calculation if any URL is missing
    image_cgi = load_image_from_url(row['cgi_image_url'])
    image_fgi = load_image_from_url(row['fgi_image_url'])
    if image_cgi is None or image_fgi is None:
        return None
    sift = cv2.SIFT_create()
    kp1, des1 = sift.detectAndCompute(image_cgi, None)
    kp2, des2 = sift.detectAndCompute(image_fgi, None)
    if des1 is None or des2 is None:
        return 0  # No keypoints detected
    bf = cv2.BFMatcher()
    matches = bf.knnMatch(des1, des2, k=2)
    good_matches = [m for m, n in matches if m.distance < 0.75 * n.distance]
    return len(good_matches)


def calculate_lbp_similarity(row):
    if row['cgi_image_url'] is None or row['fgi_image_url'] is None:
        return None  # Skip calculation if any URL is missing
    image_cgi = load_image_from_url(row['cgi_image_url'])
    image_fgi = load_image_from_url(row['fgi_image_url'])
    if image_cgi is None or image_fgi is None:
        return None
    image_cgi_gray = cv2.cvtColor(image_cgi, cv2.COLOR_BGR2GRAY)
    image_fgi_gray = cv2.cvtColor(image_fgi, cv2.COLOR_BGR2GRAY)
    lbp_cgi = feature.local_binary_pattern(image_cgi_gray, P=24, R=3, method='uniform')
    lbp_fgi = feature.local_binary_pattern(image_fgi_gray, P=24, R=3, method='uniform')
    (hist_cgi, _) = np.histogram(lbp_cgi, bins=np.arange(0, 27), range=(0, 26))
    (hist_fgi, _) = np.histogram(lbp_fgi, bins=np.arange(0, 27), range=(0, 26))
    hist_cgi = hist_cgi.astype("float32")
    hist_fgi = hist_fgi.astype("float32")
    hist_cgi /= (hist_cgi.sum() + 1e-6)
    hist_fgi /= (hist_fgi.sum() + 1e-6)
    similarity = cv2.compareHist(hist_cgi, hist_fgi, cv2.HISTCMP_CORREL)
    return similarity

def main():
    df = pd.read_csv('df_review.csv')
    df['fgi_image_url'] = df['fgi_images'].apply(extract_first_image_url)
    df['cgi_image_url'] = df['cgi_images'].apply(extract_first_image_url)

    tqdm.pandas(desc="Calculating LBP Similarities")
    df['lbp_similarity'] = df.progress_apply(calculate_lbp_similarity, axis=1)

    tqdm.pandas(desc="Calculating SIFT Similarities")
    df['sift_similarity'] = df.progress_apply(calculate_sift_similarity, axis=1)

    # Save the complete DataFrame to CSV
    df.to_csv('df_review_sift_lbp.csv', index=False)
    print("Complete DataFrame saved to 'df_review_lbp.csv'.")

    # Drop the specified columns and save as a Stata DTA file
    # columns_to_drop = ['review_text', 'cgi_images', 'fgi_images', 'features']
    # df.drop(columns=columns_to_drop, errors='ignore').to_stata('df_review_lbp.dta', write_index=False)
    print("Modified DataFrame saved to 'df_review_lbp.dta' without specified columns.")

if __name__ == "__main__":
    main()


Calculating LBP Similarities: 100%|██████████| 9484/9484 [03:06<00:00, 50.83it/s] 
Calculating SIFT Similarities: 100%|██████████| 9484/9484 [01:34<00:00, 99.91it/s] 


Complete DataFrame saved to 'df_review_lbp.csv'.
Modified DataFrame saved to 'df_review_lbp.dta' without specified columns.


In [3]:
import pandas as pd

# Load the first CSV file
csv1 = pd.read_csv('df_review_sift_lbp.csv')

# Load the second CSV file
csv2 = pd.read_csv('df_review.csv')

# Set 'Unnamed: 0' as the index for the first DataFrame
csv1.set_index('Unnamed: 0', inplace=True)

# Select the specific columns to merge from csv1
columns_to_merge = ['lbp_similarity', 'sift_similarity']

# Merge the specified columns into csv2 based on the 'Unnamed: 0' index
csv2 = csv2.merge(csv1[columns_to_merge], how='left', left_index=True, right_index=True)

# Function to fill values only for rows below the first occurrence of non-missing values
def fill_below_first_non_na(df, columns):
    for col in columns:
        mask = df[col].notna()
        first_non_na_idx = mask.idxmax() if mask.any() else None
        if first_non_na_idx:
            df.loc[first_non_na_idx+1:, col] = df.loc[first_non_na_idx, col]
    return df

# Apply the function to each group of 'product_id'
csv2 = csv2.groupby('product_id', group_keys=False).apply(fill_below_first_non_na, columns=columns_to_merge)

# Fill NaN values with 0
csv2[columns_to_merge] = csv2[columns_to_merge].fillna(0)

# Save the merged DataFrame to a new CSV file
csv2.to_csv('df_review_sift_lbp.csv', index=False)

print("The files have been successfully merged and saved to 'merged_file.csv'.")

The files have been successfully merged and saved to 'merged_file.csv'.


  csv2 = csv2.groupby('product_id', group_keys=False).apply(fill_below_first_non_na, columns=columns_to_merge)


In [4]:
# Import necessary libraries
import pandas as pd
from datetime import datetime

input_file = 'df_review_sift_lbp'
input_format = '.csv'

# Function to load data
def load_data(filename):
    return pd.read_csv(filename)

# Function to process data
def process_data(df):
    # Identify duplicates based on product_id and review_date
    duplicates = df[df.duplicated(subset=['product_id', 'review_date'], keep=False)]

    # Print duplicates if they exist
    if not duplicates.empty:
        print("Duplicates found:")
        print(duplicates)
    else:
        print("No duplicates found.")

    # Drop duplicates based on product_id and review_date
    df = df.drop_duplicates(subset=['product_id', 'review_date'])
    
    # Convert review_date to the specified datetime format
    df['datetime'] = pd.to_datetime(df['review_date']).dt.strftime('%d%b%Y %H:%M:%S')
    
    # Extract month and year from the datetime
    df['mon'] = pd.to_datetime(df['review_date']).dt.month
    df['year'] = pd.to_datetime(df['review_date']).dt.year
    
    return df

# Main function to run the process
def main():
    # Load data
    df = load_data(input_file + input_format)
    
    # Process data
    df_processed = process_data(df)
    
    # Save processed data
    df_processed.to_csv(input_file + '.csv', index=False)
    # df_processed.to_csv(input_file + '_processed' + '.csv')
    # df_processed.to_stata(input_file + '_processed' + '.dta', write_index=False)
    
    print("Data processing complete. Files saved.")

# Run the script
if __name__ == "__main__":
    main()

No duplicates found.
Data processing complete. Files saved.
