In [1]:
# 🧠 AI-Driven Recommendation Systems and Consumer Behavior in E-Commerce
## Objective
Understand how AI-powered recommendation systems influence consumer purchasing behavior and reduce time spent shopping.

## Data Sources
- [Kaggle - Multi-category store behavior data](https://www.kaggle.com/datasets/mkechinov/ecommerce-behavior-data-from-multi-category-store)
- Amazon product reviews & metadata (via API or Kaggle)

## Key Questions
- How do recommendations affect buying decisions?
- Can we segment users based on behavior?
- How does user behavior differ with and without AI recommendations?

SyntaxError: invalid syntax (2823770420.py, line 3)

In [2]:
# Table of Contents
# 1. Introduction and Background
# 2. Data Loading and Initial Exploration
# 3. Data Cleaning and Preprocessing
# 4. Exploratory Data Analysis
# 5. User Segmentation (Clustering)
# 6. Recommendation System Evaluation
# 7. A/B Testing Simulation
# 8. Time Series Analysis
# 9. NLP on Product Reviews
# 10. Initial Model Development
# 11. Conclusions and Next Steps

In [3]:
#Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import os
import json
import warnings

In [9]:
# For advanced analysis
from sklearn.cluster import KMeans, DBSCAN
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# For NLP
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [None]:
# Set visualization style
plt.style.use('seaborn-whitegrid')
sns.set(style="whitegrid")
warnings.filterwarnings('ignore')

In [None]:
# ## 1. Introduction and Background

# This project focuses on analyzing e-commerce behavior data to understand how AI-driven recommendations influence shopping behavior. We aim to explore:
# 
# - Product browsing and purchase trends
# - User segmentation using clustering techniques
# - Effectiveness of recommendation systems
# - Impact of AI recommendations on purchase likelihood and shopping time
# - Seasonal trends in shopping behavior
# - Sentiment analysis of product reviews
# 
# The project uses a comprehensive e-commerce dataset from a multi-category store, available on Kaggle.

# ## 2. Data Loading and Initial Exploration


In [11]:
url = 'https://data.rees46.com/datasets/marketplace/2020-Apr.csv.gz'

# Load the dataset
try:
    df = pd.read_csv(file_path, nrows=1000000, compression='gzip')
    
    print(f"Dataset loaded successfully with {df.shape[0]} rows and {df.shape[1]} columns.")
except Exception as e:
    print(f"Error loading dataset: {e}")

df.head()
df.info()
df.describe()

Dataset loaded successfully with 1000000 rows and 9 columns.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 9 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   event_time     1000000 non-null  object 
 1   event_type     1000000 non-null  object 
 2   product_id     1000000 non-null  int64  
 3   category_id    1000000 non-null  int64  
 4   category_code  915317 non-null   object 
 5   brand          895048 non-null   object 
 6   price          1000000 non-null  float64
 7   user_id        1000000 non-null  int64  
 8   user_session   999994 non-null   object 
dtypes: float64(1), int64(3), object(5)
memory usage: 68.7+ MB


Unnamed: 0,product_id,category_id,price,user_id
count,1000000.0,1000000.0,1000000.0,1000000.0
mean,32835680.0,2.172217e+18,311.880428,575229200.0
std,41870560.0,8.479174e+16,369.086812,45058750.0
min,1000978.0,2.053014e+18,0.0,104655800.0
25%,1307331.0,2.053014e+18,69.32,529669100.0
50%,8800773.0,2.232732e+18,187.91,573872100.0
75%,100011100.0,2.232732e+18,398.72,621063300.0
max,100186000.0,2.253831e+18,2574.07,635400100.0


In [13]:
missing_values = df.isnull().sum()
missing_percent = (missing_values/ len(df)) * 100
missing_df = pd.DataFrame({'Missing Values': missing_values,'Percentage': missing_percent})
missing_df[missing_df['Missing Values'] > 0]

Unnamed: 0,Missing Values,Percentage
category_code,84683,8.4683
brand,104952,10.4952
user_session,6,0.0006
