In [30]:
# Required imports
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import TextBlob
import re
import json

In [31]:
# 1. Load Datasets
def load_amazon_reviews():
    """Read the Amazon product-review sample into a DataFrame.

    Returns:
        pandas.DataFrame: the contents of
        ``../data/amazon_com-product_reviews_sample.csv`` (path relative to
        the current working directory), parsed with ``pd.read_csv`` defaults.
    """
    csv_path = '../data/amazon_com-product_reviews_sample.csv'
    reviews = pd.read_csv(csv_path)
    return reviews

def load_content_based():
    """Read the content-based recommendation dataset into a DataFrame.

    Returns:
        pandas.DataFrame: the contents of
        ``../data/content_based_recommendation_dataset.csv`` (path relative
        to the current working directory), parsed with ``pd.read_csv``
        defaults.
    """
    csv_path = '../data/content_based_recommendation_dataset.csv'
    content = pd.read_csv(csv_path)
    return content

def load_reviews_ratings():
    """Read the reviews-and-ratings dataset into a DataFrame.

    Returns:
        pandas.DataFrame: the contents of ``../data/review_and_ratings.csv``
        (path relative to the current working directory), parsed with
        ``pd.read_csv`` defaults.
    """
    csv_path = '../data/review_and_ratings.csv'
    ratings = pd.read_csv(csv_path)
    return ratings


In [32]:
# 2. Clean Data
def clean_text(text):
    """Normalize free text to lowercase ASCII letters and whitespace.

    Every character that is not ``a-z``, ``A-Z`` or whitespace is removed
    (this drops digits and punctuation as well), the result is lowercased
    and leading/trailing whitespace is stripped. Interior whitespace is left
    as-is.

    Args:
        text: Any value; non-strings are converted with ``str()``.

    Returns:
        str: The cleaned text. Missing values (``None``, ``NaN``, ``pd.NA``,
        ``NaT``) yield the empty string.
    """
    # Without this guard str(NaN) / str(None) would survive the regex as the
    # literal words 'nan' / 'none' and end up in the cleaned columns as if
    # they were real text, defeating any later `.fillna('')`.
    if pd.api.types.is_scalar(text) and pd.isna(text):
        return ''

    # Remove special characters and lowercase
    letters_only = re.sub(r'[^a-zA-Z\s]', '', str(text))
    return letters_only.lower().strip()

def get_sentiment(text):
    """Return TextBlob's sentiment polarity for ``text``.

    The input is passed through ``str()`` first, so non-string values
    (including missing values) are scored on their string representation.

    Args:
        text: Value to score.

    Returns:
        The ``sentiment.polarity`` attribute of ``TextBlob(str(text))``.
    """
    blob = TextBlob(str(text))
    polarity = blob.sentiment.polarity
    return polarity

In [43]:
# 3. Process Amazon Reviews
def process_amazon_reviews(df):
    """Add cleaned-text and sentiment columns to the Amazon reviews frame.

    Args:
        df: DataFrame containing the 'Product Description' and
            'Review Content' columns.

    Returns:
        A copy of ``df`` with three extra columns: 'clean_description' and
        'clean_review' (the two text columns passed through ``clean_text``)
        and 'sentiment_score' (``get_sentiment`` of the raw review content).
        The input frame is not modified.
    """
    # Work on a private copy so the caller's frame is left untouched
    reviews = df.copy()

    # (new column, source column) pairs that are run through clean_text
    cleaned_columns = (
        ('clean_description', 'Product Description'),
        ('clean_review', 'Review Content'),
    )
    for target, source in cleaned_columns:
        reviews.loc[:, target] = reviews[source].apply(clean_text)

    # Sentiment is scored on the raw review text, not the cleaned version
    reviews.loc[:, 'sentiment_score'] = reviews['Review Content'].apply(get_sentiment)

    return reviews

In [44]:
# 4. Process Content Based Dataset
def process_content_based(df):
    """Min-max scale every int64/float64 column of the content-based dataset.

    Each selected column is rescaled to ``(x - min) / (max - min)``. Columns
    whose minimum equals their maximum are left unchanged to avoid dividing
    by zero. Non-numeric columns are passed through as-is. The column lists
    are printed for debugging.

    Note that *all* numeric columns are scaled, so any price or probability
    column in the returned frame is on a 0-1 scale rather than in its
    original units.

    Args:
        df: Input DataFrame; it is not modified.

    Returns:
        A copy of ``df`` with the numeric columns rescaled. Integer columns
        that get rescaled become float columns.
    """
    # Create a copy of the DataFrame
    df = df.copy()

    # Print columns for debugging
    print("Available columns:", df.columns.tolist())

    # Get numerical columns dynamically
    numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
    print("Numerical columns identified:", numerical_cols)

    for col in numerical_cols:
        col_min = df[col].min()
        col_max = df[col].max()
        if col_max != col_min:
            # Replace the column outright instead of writing through
            # `.loc[:, col]`: the scaled values are floats, and an in-place
            # write into an int64 column is an incompatible-dtype set
            # (deprecated in pandas 2.x, and able to cast values back to int).
            df[col] = (df[col] - col_min) / (col_max - col_min)

    return df

In [51]:
# 5. Merge Datasets
def create_final_dataset(amazon_df, content_df, reviews_df):
    """Assemble the final dataset from the processed input frames.

    Args:
        amazon_df: Processed Amazon reviews. Must contain 'Brand',
            'Category', 'Sub Category' and 'clean_description'.
            'sentiment_score', 'Rating' and 'Review Rating' are used when
            present.
        content_df: Content-based dataset. Only its price column
            ('Price of the product', falling back to 'price') is read.
        reviews_df: Accepted for interface compatibility; not used.

    Returns:
        DataFrame indexed like ``amazon_df`` with the columns:

        * name: "<Brand> - <Category>" with placeholders for missing values.
        * description: 'clean_description' with missing values as ''.
        * preferences: list ``[category, sub_category, sentiment_tag,
          rating_tag]``. Category values are lowercased ('unknown' when
          missing); sentiment_tag is 'positive' for a score > 0, otherwise
          'negative'; rating_tag is 'highly_rated' for a rating >= 4,
          otherwise 'low_rated'. The rating is taken from 'Rating' and, where
          that is missing, from 'Review Rating'.
        * price: values of the content price column taken by row position
          (first row with first row, and so on), padded with NaN when
          ``content_df`` has fewer rows; 0 when no price column exists.
          NOTE(review): the two frames are paired purely by position, not by
          any product key - confirm this is intended.
        * relationship: persona label looked up from the lowercased
          category, 'general' when there is no mapping.
    """
    # Print column info to debug
    print("Amazon DataFrame Info:")
    # DataFrame.info() writes its report itself and returns None, so wrapping
    # it in print() would append a stray "None" line.
    amazon_df.info()
    print("\nSample of Category and Sub Category:")
    print(amazon_df[['Category', 'Sub Category']].head())

    # Safe string conversion function
    def safe_lower(x):
        if pd.isna(x):
            return 'unknown'
        return str(x).lower()

    # Return the named column, or a constant column when it is absent
    def column_or_default(name, default):
        if name in amazon_df.columns:
            return amazon_df[name]
        return pd.Series(default, index=amazon_df.index)

    # Create final dataframe on the same index as the Amazon frame
    final_df = pd.DataFrame(index=amazon_df.index)

    # Process Amazon data
    brand = amazon_df['Brand'].fillna('Unknown Brand').astype(str)
    category = amazon_df['Category'].fillna('Unknown Category').astype(str)
    final_df['name'] = brand + ' - ' + category
    final_df['description'] = amazon_df['clean_description'].fillna('')

    # Create preferences from categories and sentiment.
    # 'Rating' can be entirely empty while 'Review Rating' holds the actual
    # scores, so a missing 'Rating' falls back to 'Review Rating'; if both
    # are missing the NaN comparison is False and the row is 'low_rated'.
    sentiment = column_or_default('sentiment_score', 0)
    rating = column_or_default('Rating', float('nan'))
    review_rating = column_or_default('Review Rating', float('nan'))

    # Built with a comprehension rather than a row-wise apply so that an
    # empty input frame still yields an (empty) column.
    preferences = [
        [
            safe_lower(cat),
            safe_lower(sub_cat),
            'positive' if score > 0 else 'negative',
            'highly_rated' if (fallback if pd.isna(rate) else rate) >= 4 else 'low_rated',
        ]
        for cat, sub_cat, score, rate, fallback in zip(
            amazon_df['Category'],
            amazon_df['Sub Category'],
            sentiment,
            rating,
            review_rating,
        )
    ]
    final_df['preferences'] = pd.Series(preferences, index=amazon_df.index, dtype=object)

    # Add price information from content_based dataset
    if len(content_df) > 0 and 'Price of the product' in content_df.columns:
        price_col = 'Price of the product'
    elif len(content_df) > 0 and 'price' in content_df.columns:
        price_col = 'price'
    else:
        price_col = None

    n_rows = len(final_df)
    if price_col:
        # Positional pairing; reindex pads with NaN when content_df is
        # shorter instead of failing on a length mismatch.
        prices = pd.Series(content_df[price_col].to_numpy()[:n_rows])
        final_df['price'] = prices.reindex(range(n_rows)).to_numpy()
    else:
        final_df['price'] = 0

    # Add relationship based on category
    category_mapping = {
        'electronics': 'tech_enthusiast',
        'clothing': 'fashion_lover',
        'books': 'reader',
        'home & kitchen': 'homemaker'
    }

    final_df['relationship'] = amazon_df['Category'].apply(
        lambda x: category_mapping.get(safe_lower(x), 'general')
    )

    return final_df[['name', 'description', 'preferences', 'price', 'relationship']]

# Try creating and saving the final dataset with error handling.
#
# This cell sits above the cells that define its inputs, so on a fresh
# top-to-bottom run those names do not exist yet. Check for them explicitly
# rather than letting the resulting NameError be reported as a generic
# "dataset creation" error.
_prerequisites = ('processed_amazon', 'processed_content', 'reviews_df', 'save_processed_data')
_missing = [name for name in _prerequisites if name not in globals()]

if _missing:
    print(f"Skipping dataset creation; not defined yet: {', '.join(_missing)}")
else:
    try:
        final_dataset = create_final_dataset(processed_amazon, processed_content, reviews_df)
        print("\nFinal Dataset Info:")
        # DataFrame.info() prints its report itself and returns None, so it
        # must not be wrapped in print() (that would add a stray "None").
        final_dataset.info()
        print("\nSample of Final Dataset:")
        print(final_dataset.head(2))

        # Save the dataset
        save_processed_data(final_dataset, 'processed_data.csv')
        print("\nData successfully saved!")
    except Exception as e:
        print(f"Error during dataset creation: {str(e)}")

Amazon DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 21 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Uniq Id                50 non-null     object 
 1   Crawl Timestamp        50 non-null     object 
 2   Billing Uniq Id        50 non-null     object 
 3   Rating                 0 non-null      float64
 4   Review Title           50 non-null     object 
 5   Review Rating          50 non-null     float64
 6   Review Date            50 non-null     object 
 7   User Id                50 non-null     object 
 8   Brand                  50 non-null     object 
 9   Category               49 non-null     object 
 10  Sub Category           49 non-null     object 
 11  Product Description    49 non-null     object 
 12  Asin                   50 non-null     object 
 13  Url                    50 non-null     object 
 14  Review Content         50 non-null   

In [52]:
# 6. Save Processed Data
def save_processed_data(df, filename):
    """Write ``df`` as CSV to ``../data/processed/<filename>``.

    The output directory (relative to the current working directory) is
    created when it does not exist yet, so the first save on a fresh
    checkout does not fail. The DataFrame index is not written.

    Args:
        df: DataFrame to persist.
        filename: File name inside ``../data/processed/``.
    """
    from pathlib import Path  # local import keeps this cell self-contained

    output_dir = Path('../data/processed')
    output_dir.mkdir(parents=True, exist_ok=True)
    df.to_csv(output_dir / filename, index=False)

In [53]:
# Load all datasets
# Each loader reads a fixed CSV path under ../data (relative to the working
# directory). The three frames bound here are the inputs of the processing
# and merge cells further down, so this cell has to run before them.
amazon_df = load_amazon_reviews()
content_df = load_content_based()
reviews_df = load_reviews_ratings()

In [45]:
# Process each dataset
# Binds `processed_amazon` and `processed_content`, which the inspection,
# merge and save cells below read.
# NOTE(review): any exception is only printed, not re-raised. If processing
# fails, one or both names stay unbound (or keep a stale value from an
# earlier run) and the later cells fail or silently use old data.
try:
    processed_amazon = process_amazon_reviews(amazon_df)
    processed_content = process_content_based(content_df)
    print("Processing completed successfully!")
    
    # Print some information about the processed datasets
    print("\nProcessed Amazon Reviews shape:", processed_amazon.shape)
    print("Processed Content-based shape:", processed_content.shape)
except Exception as e:
    print(f"Error during processing: {str(e)}")

Available columns: ['Number of clicks on similar products', 'Number of similar products purchased so far', 'Average rating given to similar products', 'Gender', 'Median purchasing price (in rupees)', 'Rating of the product', 'Brand of the product', 'Customer review sentiment score (overall)', 'Price of the product', 'Holiday', 'Season', 'Geographical locations', 'Probability for the product to be recommended to the person']
Numerical columns identified: ['Number of clicks on similar products', 'Number of similar products purchased so far', 'Average rating given to similar products', 'Median purchasing price (in rupees)', 'Rating of the product', 'Customer review sentiment score (overall)', 'Price of the product', 'Probability for the product to be recommended to the person']
Processing completed successfully!

Processed Amazon Reviews shape: (50, 21)
Processed Content-based shape: (1474, 13)


In [None]:
# Let's examine our processed datasets
# Read-only inspection of `processed_amazon` and `processed_content`:
# column names, the first two rows, summary statistics and null counts.
print("\nAmazon Reviews Dataset:")
print("----------------------")
print("Columns:", processed_amazon.columns.tolist())
print("\nSample data:")
print(processed_amazon.head(2))

print("\nContent-based Dataset:")
print("----------------------")
print("Columns:", processed_content.columns.tolist())
print("\nSample data:")
print(processed_content.head(2))

# Basic statistics of numerical columns
# Uses the same int64/float64 dtype selection as process_content_based.
# Note this rebinds the notebook-level name `numerical_cols`.
print("\nNumerical Statistics for Content-based Dataset:")
print("--------------------------------------------")
numerical_cols = processed_content.select_dtypes(include=['int64', 'float64']).columns
print(processed_content[numerical_cols].describe())

# Check for any missing values
# isnull().sum() gives the number of missing entries per column.
print("\nMissing Values Check:")
print("-------------------")
print("Amazon Reviews Dataset:\n", processed_amazon.isnull().sum())
print("\nContent-based Dataset:\n", processed_content.isnull().sum())

In [54]:
# Create and save final dataset
# Requires `processed_amazon`, `processed_content` and `reviews_df` from the
# loading/processing cells. create_final_dataset also prints debug info about
# the Amazon frame; the result is written to
# ../data/processed/processed_data.csv by save_processed_data.
final_dataset = create_final_dataset(processed_amazon, processed_content, reviews_df)
save_processed_data(final_dataset, 'processed_data.csv')

Amazon DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 21 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Uniq Id                50 non-null     object 
 1   Crawl Timestamp        50 non-null     object 
 2   Billing Uniq Id        50 non-null     object 
 3   Rating                 0 non-null      float64
 4   Review Title           50 non-null     object 
 5   Review Rating          50 non-null     float64
 6   Review Date            50 non-null     object 
 7   User Id                50 non-null     object 
 8   Brand                  50 non-null     object 
 9   Category               49 non-null     object 
 10  Sub Category           49 non-null     object 
 11  Product Description    49 non-null     object 
 12  Asin                   50 non-null     object 
 13  Url                    50 non-null     object 
 14  Review Content         50 non-null   

In [55]:
# Also save intermediate processed datasets
# Both files go to ../data/processed/ (see save_processed_data), next to
# processed_data.csv.
save_processed_data(processed_amazon, 'processed_amazon.csv')
save_processed_data(processed_content, 'processed_content.csv')

# Final status message; printed only when both saves above did not raise.
print("Data processing complete! Check the data/processed/ directory for output files.")

Data processing complete! Check the data/processed/ directory for output files.
