In [1]:
import cv2
import numpy as np
import requests

def load_image_from_url(url, timeout=10):
    """Download an image over HTTP and decode it with OpenCV.

    Args:
        url: Address of the image to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The array produced by ``cv2.imdecode(..., cv2.IMREAD_COLOR)``, or
        ``None`` when the response body is empty or cannot be decoded.

    Raises:
        requests.RequestException: On connection problems, timeouts, or a
            4xx/5xx HTTP status.
    """
    # Without a timeout, requests can block forever on a stalled server.
    resp = requests.get(url, timeout=timeout)
    # Surface HTTP errors instead of trying to decode an error page as an image.
    resp.raise_for_status()
    if not resp.content:
        # cv2.imdecode rejects an empty buffer; report "no image" instead.
        return None
    image = np.asarray(bytearray(resp.content), dtype="uint8")
    image = cv2.imdecode(image, cv2.IMREAD_COLOR)
    return image

def compute_normalized_histogram(image, bins=256):
    """Build one L2-normalized histogram per channel index 0, 1 and 2.

    Args:
        image: Image array handed to ``cv2.calcHist``.
        bins: Number of histogram bins over the value range [0, 256).

    Returns:
        A list of three histograms, one per channel index, each normalized
        in place with ``cv2.NORM_L2``.
    """
    def channel_histogram(channel):
        counts = cv2.calcHist([image], [channel], None, [bins], [0, 256])
        # normalize writes its result back into the same buffer
        cv2.normalize(counts, counts, norm_type=cv2.NORM_L2)
        return counts

    return [channel_histogram(channel) for channel in range(3)]

def compare_histograms(hist1, hist2, method=cv2.HISTCMP_CORREL):
    """Average ``cv2.compareHist`` over three per-channel histograms.

    Args:
        hist1: Sequence of three channel histograms for the first image.
        hist2: Sequence of three channel histograms for the second image.
        method: OpenCV comparison method flag passed to ``cv2.compareHist``.

    Returns:
        The mean of the three per-channel comparison scores.
    """
    per_channel = [
        cv2.compareHist(hist1[channel], hist2[channel], method)
        for channel in range(3)
    ]
    return sum(per_channel) / 3

# Image locations. Both URLs are identical here, so the comparison below is a
# self-comparison sanity check.
url_cgi = "https://m.media-amazon.com/images/I/71FC3tE1mhL._AC_UL1500_.jpg"
url_fgi = "https://m.media-amazon.com/images/I/71FC3tE1mhL._AC_UL1500_.jpg"

# Download and decode both images (CGI first, then FGI)
image_cgi, image_fgi = [load_image_from_url(address) for address in (url_cgi, url_fgi)]

# Per-channel normalized histograms for each image
hist_cgi, hist_fgi = [compute_normalized_histogram(picture) for picture in (image_cgi, image_fgi)]

# Score the two histogram sets against each other and report the result
similarity = compare_histograms(hist_cgi, hist_fgi)
print(f"Normalized Histogram Similarity: {similarity}")



Normalized Histogram Similarity: 1.0


In [13]:
import pandas as pd
import cv2
import numpy as np
import requests
import json
from tqdm import tqdm

def load_image_from_url(url, timeout=10):
    """Download an image over HTTP and decode it with OpenCV, best-effort.

    Args:
        url: Address of the image to fetch; falsy values are skipped.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The array produced by ``cv2.imdecode(..., cv2.IMREAD_COLOR)``, or
        ``None`` when the URL is missing, the request fails (including a
        timeout or a 4xx/5xx status), the body is empty, or it cannot be
        decoded.
    """
    if not url:
        return None
    try:
        # Without a timeout a single stalled server would hang the whole run.
        resp = requests.get(url, timeout=timeout)
        # Treat HTTP errors like any other request failure instead of
        # feeding an error page to the decoder.
        resp.raise_for_status()
    except requests.RequestException as e:
        print(f"Error loading image from {url}: {e}")
        return None
    if not resp.content:
        # cv2.imdecode rejects an empty buffer; report "no image" instead.
        return None
    image = np.asarray(bytearray(resp.content), dtype="uint8")
    return cv2.imdecode(image, cv2.IMREAD_COLOR)

def compute_normalized_histogram(image, bins=256):
    """Build one L2-normalized histogram per channel index 0, 1 and 2.

    Args:
        image: Image array handed to ``cv2.calcHist``, or ``None``.
        bins: Number of histogram bins over the value range [0, 256).

    Returns:
        ``None`` when ``image`` is ``None``; otherwise a list of three
        histograms, each normalized in place with ``cv2.NORM_L2``.
    """
    if image is None:
        return None

    def channel_histogram(channel):
        counts = cv2.calcHist([image], [channel], None, [bins], [0, 256])
        # normalize writes its result back into the same buffer
        cv2.normalize(counts, counts, norm_type=cv2.NORM_L2)
        return counts

    return [channel_histogram(channel) for channel in range(3)]

def compare_histograms(hist1, hist2, method=cv2.HISTCMP_CORREL):
    """Average ``cv2.compareHist`` over three per-channel histograms.

    Args:
        hist1: Sequence of three channel histograms, or ``None``.
        hist2: Sequence of three channel histograms, or ``None``.
        method: OpenCV comparison method flag passed to ``cv2.compareHist``.

    Returns:
        The mean of the three per-channel scores, or ``None`` when either
        histogram set is missing.
    """
    if hist1 is not None and hist2 is not None:
        per_channel = [
            cv2.compareHist(hist1[channel], hist2[channel], method)
            for channel in range(3)
        ]
        return sum(per_channel) / 3
    # At least one image could not be turned into histograms.
    return None

def extract_first_image_url(image_list_str):
    """Return the first element of a stringified list of image URLs.

    The text is parsed as JSON first and, if that fails, as a Python literal
    (which covers single-quoted list reprs). Parsing the text as-is, rather
    than swapping every apostrophe for a double quote, keeps entries that
    legitimately contain an apostrophe intact.

    Args:
        image_list_str: Cell value expected to hold a list written as text.
            Non-string values (e.g. the float NaN pandas uses for empty CSV
            cells) are treated as missing.

    Returns:
        The first list element, or ``None`` when the value is missing, empty,
        unparseable, or not a non-empty list.
    """
    import ast

    # NaN and other non-string cells have no string methods; skip them.
    if not isinstance(image_list_str, str):
        return None
    text = image_list_str.strip()
    if not text or text == '[]':
        return None
    try:
        image_list = json.loads(text)
    except json.JSONDecodeError:
        try:
            # literal_eval only evaluates literals, so this executes no code.
            image_list = ast.literal_eval(text)
        except (ValueError, SyntaxError, TypeError):
            return None
    if isinstance(image_list, (list, tuple)) and image_list:
        return image_list[0]
    return None

def calculate_similarity(row):
    """Score the histogram similarity of a row's CGI and FGI images.

    Args:
        row: Mapping with ``'cgi_image_url'`` and ``'fgi_image_url'`` entries.

    Returns:
        ``None`` when either URL is ``None``; otherwise whatever
        ``compare_histograms`` returns for the two images' histograms.
    """
    url_keys = ('cgi_image_url', 'fgi_image_url')
    # Nothing to compare unless both URLs are present.
    if any(row[key] is None for key in url_keys):
        return None
    images = [load_image_from_url(row[key]) for key in url_keys]
    hist_cgi, hist_fgi = [compute_normalized_histogram(picture) for picture in images]
    return compare_histograms(hist_cgi, hist_fgi)

def main():
    """Score every review row in 'df_review.csv' and save the results.

    Reads the CSV, derives one FGI and one CGI image URL per row, computes
    the 'color_similarity' column, then writes the full frame to CSV and a
    reduced frame (bulky text columns dropped) to a Stata file.
    """
    df = pd.read_csv('df_review.csv')

    # The first listed image of each kind becomes the URL to compare.
    url_sources = (('fgi_image_url', 'fgi_images'), ('cgi_image_url', 'cgi_images'))
    for url_column, list_column in url_sources:
        df[url_column] = df[list_column].apply(extract_first_image_url)

    tqdm.pandas(desc="Calculating Color Similarities")
    scores = df.progress_apply(calculate_similarity, axis=1)
    df['color_similarity'] = scores

    # Full frame goes to CSV
    df.to_csv('df_review_color.csv', index=False)
    print("Complete DataFrame saved to 'df_review_color.csv'.")

    # Stata export leaves out the listed columns (ignored if absent)
    columns_to_drop = ['review_text', 'cgi_images', 'fgi_images', 'features']
    stata_frame = df.drop(columns=columns_to_drop, errors='ignore')
    stata_frame.to_stata('df_review_color.dta', write_index=False)
    print("Modified DataFrame saved to 'df_review_color.dta' without specified columns.")


if __name__ == "__main__":
    main()

Calculating Color Similarities: 100%|██████████| 9484/9484 [00:29<00:00, 318.08it/s]
/var/folders/vz/_y_gw0w915v4z_x6ps3fjwwm0000gn/T/ipykernel_27492/1547458846.py:70: InvalidColumnName: 
Not all pandas column names were valid Stata variable names.
The following replacements have been made:

    Unnamed: 0   ->   Unnamed__0

If this is not what you expect, please make sure you have Stata-compliant
column names in your DataFrame (strings only, max 32 characters, only
alphanumerics and underscores, no Stata reserved words)

  df.drop(columns=columns_to_drop, errors='ignore').to_stata('df_review_color.dta', write_index=False)


Complete DataFrame saved to 'df_review_color.csv'.
Modified DataFrame saved to 'df_review_color.dta' without specified columns.


In [2]:
import pandas as pd
import cv2
import numpy as np
import requests
import json
from tqdm import tqdm

def load_image_from_url(url, timeout=10):
    """Fetch an image over HTTP and decode it with OpenCV, best-effort.

    Args:
        url: Address of the image to fetch; falsy values are skipped.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The array produced by ``cv2.imdecode(..., cv2.IMREAD_COLOR)``, or
        ``None`` when the URL is missing, the request fails (including a
        timeout or a 4xx/5xx status), the body is empty, or it cannot be
        decoded.
    """
    if not url:
        return None
    try:
        # A timeout keeps one unresponsive host from stalling the whole run.
        resp = requests.get(url, timeout=timeout)
        # HTTP errors are request failures too; do not decode error pages.
        resp.raise_for_status()
    except requests.RequestException as e:
        print(f"Error loading image from {url}: {e}")
        return None
    if not resp.content:
        # An empty buffer makes cv2.imdecode raise; report "no image".
        return None
    image = np.asarray(bytearray(resp.content), dtype="uint8")
    return cv2.imdecode(image, cv2.IMREAD_COLOR)

def compute_normalized_histogram(image, bins=256):
    """Return three L2-normalized channel histograms for ``image``.

    Args:
        image: Image array handed to ``cv2.calcHist``, or ``None``.
        bins: Number of histogram bins over the value range [0, 256).

    Returns:
        ``None`` when ``image`` is ``None``; otherwise a list holding the
        histograms for channel indices 0, 1 and 2, in that order.
    """
    if image is None:
        return None

    histograms = []
    channel = 0
    while channel < 3:
        counts = cv2.calcHist([image], [channel], None, [bins], [0, 256])
        # Source and destination are the same buffer: normalized in place.
        cv2.normalize(counts, counts, norm_type=cv2.NORM_L2)
        histograms.append(counts)
        channel += 1
    return histograms

def compare_histograms(hist1, hist2, method):
    """Average ``cv2.compareHist`` over three per-channel histograms.

    Args:
        hist1: Sequence of three channel histograms, or ``None``.
        hist2: Sequence of three channel histograms, or ``None``.
        method: OpenCV comparison method flag passed to ``cv2.compareHist``.

    Returns:
        The mean of the three per-channel scores, or ``None`` when either
        histogram set is missing.
    """
    if hist1 is not None and hist2 is not None:
        per_channel = [
            cv2.compareHist(hist1[channel], hist2[channel], method)
            for channel in range(3)
        ]
        return sum(per_channel) / 3
    # At least one image could not be turned into histograms.
    return None

def extract_first_image_url(image_list_str):
    """Pull the first entry out of a list of image URLs stored as text.

    JSON is tried first; text that is not valid JSON (for example a
    single-quoted Python list repr) is then parsed as a Python literal.
    The text is never rewritten before parsing, so an apostrophe inside an
    entry no longer corrupts it.

    Args:
        image_list_str: Cell value expected to hold a list written as text.
            Anything that is not a string (such as the float NaN pandas
            produces for empty CSV cells) counts as missing.

    Returns:
        The first list element, or ``None`` when the value is missing, empty,
        unparseable, or not a non-empty list.
    """
    import ast

    # Guard first: NaN is truthy and has no .strip()/.replace().
    if not isinstance(image_list_str, str):
        return None
    text = image_list_str.strip()
    if not text or text == '[]':
        return None

    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        try:
            # Literal-only evaluation; no arbitrary code is executed.
            parsed = ast.literal_eval(text)
        except (ValueError, SyntaxError, TypeError):
            return None

    if not isinstance(parsed, (list, tuple)) or not parsed:
        return None
    return parsed[0]

def calculate_similarities(row):
    """Compare a row's CGI and FGI images with four OpenCV histogram metrics.

    Args:
        row: Mapping with ``'cgi_image_url'`` and ``'fgi_image_url'`` entries.

    Returns:
        A four-element ``pd.Series`` ordered as correlation, chi-square,
        intersection, Bhattacharyya. All four entries are ``None`` when
        either URL is ``None``.
    """
    url_keys = ('cgi_image_url', 'fgi_image_url')
    # Nothing to compare unless both URLs are present.
    if any(row[key] is None for key in url_keys):
        return pd.Series([None] * 4)

    images = [load_image_from_url(row[key]) for key in url_keys]
    hist_cgi, hist_fgi = [compute_normalized_histogram(picture) for picture in images]

    methods = (
        cv2.HISTCMP_CORREL,
        cv2.HISTCMP_CHISQR,
        cv2.HISTCMP_INTERSECT,
        cv2.HISTCMP_BHATTACHARYYA,
    )
    return pd.Series([compare_histograms(hist_cgi, hist_fgi, flag) for flag in methods])

def main():
    """Compute four color-similarity metrics per review row and save them.

    Reads 'df_review.csv', derives one FGI and one CGI image URL per row,
    adds the four raw metric columns plus a ``*_scaled`` companion for each,
    then writes the full frame to CSV and a reduced frame to a Stata file.
    """
    df = pd.read_csv('df_review.csv')

    # The first listed image of each kind becomes the URL to compare.
    url_sources = (('fgi_image_url', 'fgi_images'), ('cgi_image_url', 'cgi_images'))
    for url_column, list_column in url_sources:
        df[url_column] = df[list_column].apply(extract_first_image_url)

    tqdm.pandas(desc="Calculating Color Similarities")
    metric_columns = [
        'color_similarity_corr',
        'color_similarity_chisqr',
        'color_similarity_intersect',
        'color_similarity_bhattacharyya',
    ]
    df[metric_columns] = df.progress_apply(calculate_similarities, axis=1)

    # Plain x100 scaling for the first three metrics
    plain_scaled = {
        'color_similarity_corr': 'color_similarity_corr_scaled',
        'color_similarity_chisqr': 'color_similarity_chisqr_scaled',
        'color_similarity_intersect': 'color_similarity_intersect_scaled',
    }
    for raw_name, scaled_name in plain_scaled.items():
        df[scaled_name] = df[raw_name] * 100
    # Bhattacharyya is inverted (1 - value) before scaling
    inverted = 1 - df['color_similarity_bhattacharyya']
    df['color_similarity_bhattacharyya_scaled'] = inverted * 100

    # Full frame goes to CSV
    df.to_csv('df_review_color.csv', index=False)
    print("Complete DataFrame saved to 'df_review_color.csv'.")

    # Stata export leaves out the listed columns (ignored if absent)
    columns_to_drop = ['review_text', 'cgi_images', 'fgi_images', 'features']
    stata_frame = df.drop(columns=columns_to_drop, errors='ignore')
    stata_frame.to_stata('df_review_color.dta', write_index=False)
    print("Modified DataFrame saved to 'df_review_color.dta' without specified columns.")


if __name__ == "__main__":
    main()


Calculating Color Similarities: 100%|██████████| 9484/9484 [00:30<00:00, 314.68it/s]
/var/folders/vz/_y_gw0w915v4z_x6ps3fjwwm0000gn/T/ipykernel_49918/1888943578.py:81: InvalidColumnName: 
Not all pandas column names were valid Stata variable names.
The following replacements have been made:

    Unnamed: 0   ->   Unnamed__0
    color_similarity_intersect_scaled   ->   color_similarity_intersect_scale
    color_similarity_bhattacharyya_scaled   ->   color_similarity_bhattacharyya_s

If this is not what you expect, please make sure you have Stata-compliant
column names in your DataFrame (strings only, max 32 characters, only
alphanumerics and underscores, no Stata reserved words)

  df.drop(columns=columns_to_drop, errors='ignore').to_stata('df_review_color.dta', write_index=False)


Complete DataFrame saved to 'df_review_color.csv'.
Modified DataFrame saved to 'df_review_color.dta' without specified columns.


In [6]:
import pandas as pd

# Load the color-similarity results
csv1 = pd.read_csv('df_review_color.csv')

# Load the original review data
csv2 = pd.read_csv('df_review.csv')

# Key the similarity results by 'Unnamed: 0' (reassigned instead of mutated in place)
csv1 = csv1.set_index('Unnamed: 0')

# Select the specific columns to merge from csv1
columns_to_merge = ['color_similarity_corr_scaled', 'color_similarity_chisqr_scaled',
                    'color_similarity_intersect_scaled', 'color_similarity_bhattacharyya_scaled']

# Left-join on the 'Unnamed: 0' key itself. Matching csv2's positional index
# against csv1's 'Unnamed: 0' labels only lines up when the key happens to be
# exactly 0..n-1; joining on the column is correct either way and keeps
# csv2's own row index.
csv2 = csv2.join(csv1[columns_to_merge], on='Unnamed: 0', how='left')

# Function to fill values only for rows below the first occurrence of non-missing values
def fill_below_first_non_na(df, columns):
    """Propagate each column's first non-missing value to every later row.

    For every column in ``columns``, the first non-NA value (in row order) is
    written to all rows after it, overwriting whatever those rows held. Rows
    above it, and columns that are entirely missing, are left untouched.

    Args:
        df: Frame to fill (typically one ``groupby`` group).
        columns: Names of the columns to process; assumed unique in ``df``.

    Returns:
        A filled copy of ``df``. The input is not modified, because mutating
        the object handed over by ``groupby.apply`` is unsupported in pandas.
    """
    result = df.copy()
    for col in columns:
        present = result[col].notna().to_numpy()
        if not present.any():
            continue
        # Work by position: a first hit on index label 0 is a valid result
        # (a truthiness check would skip it), and positions need no label
        # arithmetic, so non-integer or gappy indexes work too.
        first_pos = int(present.argmax())
        col_pos = result.columns.get_loc(col)
        result.iloc[first_pos + 1:, col_pos] = result.iloc[first_pos, col_pos]
    return result

# Apply the function to each group of 'product_id'
# NOTE(review): groupby drops rows whose 'product_id' is missing by default -
# confirm the key has no NaN values, otherwise those rows vanish here.
csv2 = csv2.groupby('product_id', group_keys=False).apply(fill_below_first_non_na, columns=columns_to_merge)

# Fill NaN values with 0
csv2[columns_to_merge] = csv2[columns_to_merge].fillna(0)

# Save the merged DataFrame to a new CSV file
merged_output_path = 'df_review_color_merge.csv'
csv2.to_csv(merged_output_path, index=False)

# Report the file that was actually written (the old message named a different file)
print(f"The files have been successfully merged and saved to '{merged_output_path}'.")


The files have been successfully merged and saved to 'merged_file.csv'.


  csv2 = csv2.groupby('product_id', group_keys=False).apply(fill_below_first_non_na, columns=columns_to_merge)


In [7]:
# Import necessary libraries
import pandas as pd
from datetime import datetime

# Base name and extension of the merged review file; main() concatenates them
input_file, input_format = 'df_review_color_merge', '.csv'

# Function to load data
def load_data(filename):
    """Read the CSV at ``filename`` into a DataFrame.

    Args:
        filename: Path or file-like object accepted by ``pd.read_csv``.

    Returns:
        The parsed DataFrame.
    """
    frame = pd.read_csv(filename)
    return frame

# Function to process data
def process_data(df):
    """Drop duplicate reviews and add formatted date columns.

    Rows sharing the same ``product_id`` and ``review_date`` are reported on
    stdout and reduced to their first occurrence. Three columns are then
    added: ``datetime`` (``review_date`` formatted as ``%d%b%Y %H:%M:%S``),
    ``mon`` (month number) and ``year``.

    Args:
        df: Frame with at least ``product_id`` and ``review_date`` columns.

    Returns:
        A new de-duplicated frame with the three extra columns; ``df`` itself
        is left unchanged.
    """
    key_columns = ['product_id', 'review_date']

    # Identify every row that takes part in a duplicate pair
    duplicates = df[df.duplicated(subset=key_columns, keep=False)]

    # Print duplicates if they exist
    if not duplicates.empty:
        print("Duplicates found:")
        print(duplicates)
    else:
        print("No duplicates found.")

    # Keep the first row per key. The explicit copy detaches the result from
    # ``df`` so the column assignments below do not raise
    # SettingWithCopyWarning when rows were actually dropped.
    deduped = df.drop_duplicates(subset=key_columns).copy()

    # Parse the dates once and reuse the result for all derived columns
    review_dates = pd.to_datetime(deduped['review_date'])
    deduped['datetime'] = review_dates.dt.strftime('%d%b%Y %H:%M:%S')
    deduped['mon'] = review_dates.dt.month
    deduped['year'] = review_dates.dt.year

    return deduped

# Main function to run the process
def main():
    """Load the merged review file, process it and save the result as CSV.

    Reads ``input_file + input_format``, passes it through ``process_data``
    and writes the outcome to ``input_file + '_processed.csv'``.
    """
    # Load data
    df = load_data(input_file + input_format)

    # Process data
    df_processed = process_data(df)

    # Save processed data. index=False matches the other exports in this
    # notebook: after de-duplication the row index is gappy and would
    # otherwise be written as an extra unnamed column.
    df_processed.to_csv(input_file + '_processed' + '.csv', index=False)
    # df_processed.to_stata(input_file + '_processed' + '.dta', write_index=False)

    print("Data processing complete. Files saved.")

# Run the script
if __name__ == "__main__":
    main()

No duplicates found.
Data processing complete. Files saved.
