# Customer Segmentation

Generated by Auto-Analysis Web App

## Data Cleaning

Remove duplicate rows, then impute missing values (median for numeric columns, mode for categorical columns) to ensure data quality.

In [None]:

# Drop duplicates
# Done before imputation so repeated rows do not skew the medians and modes
# computed below.
df = df.drop_duplicates()

# Handle missing values (Simple Imputation)
# Numeric columns are filled with their median. A column with no observed
# values has a NaN median, so it is effectively left unchanged.
numeric_cols = df.select_dtypes(include=['number']).columns
for col in numeric_cols:
    df[col] = df[col].fillna(df[col].median())

# Fill categorical missing values with mode
cat_cols = df.select_dtypes(include=['object', 'category']).columns
for col in cat_cols:
    modes = df[col].mode()
    # mode() returns an empty Series when every value in the column is
    # missing; indexing it would raise, so such columns are skipped.
    if not modes.empty:
        df[col] = df[col].fillna(modes.iloc[0])
        

## Feature Engineering: RFM

Create Recency, Frequency, and Monetary value features from transaction data.

In [None]:

import pandas as pd  # imported here so this cell does not rely on an earlier one


def _pick_column(columns, keywords, role, exclude=()):
    """Return the first column whose lowercased name contains a keyword.

    Columns listed in ``exclude`` are skipped so that a single column is
    never assigned two roles (e.g. ``'Paid_Amount'`` matching both the
    ``'amount'`` and the ``'id'`` keyword).

    Raises:
        ValueError: if no remaining column matches any keyword.
    """
    for column in columns:
        if column in exclude:
            continue
        lowered = str(column).lower()
        if any(keyword in lowered for keyword in keywords):
            return column
    raise ValueError(
        f"Could not find a {role} column: no column name contains any of "
        f"{list(keywords)} (available columns: {list(columns)})"
    )


# Identify columns (Heuristic)
# Each role takes the first matching column that an earlier role has not
# already claimed; otherwise duplicate keys in the aggregation and rename
# mappings below would silently drop one of the RFM features.
date_col = _pick_column(df.columns, ('date', 'time'), 'date')
amount_col = _pick_column(
    df.columns, ('amount', 'price', 'spend'), 'amount', exclude=(date_col,)
)
id_col = _pick_column(
    df.columns, ('id', 'customer'), 'customer id', exclude=(date_col, amount_col)
)

# Convert to datetime
df[date_col] = pd.to_datetime(df[date_col])

# Calculate RFM
# The snapshot is one day after the latest transaction, so the most recent
# customer gets a Recency of 1 rather than 0.
snapshot_date = df[date_col].max() + pd.Timedelta(days=1)
rfm = df.groupby(id_col).agg({
    date_col: lambda x: (snapshot_date - x.max()).days,  # days since last purchase
    id_col: 'count',                                     # number of transactions
    amount_col: 'sum'                                    # total spend
}).rename(columns={
    date_col: 'Recency',
    id_col: 'Frequency',
    amount_col: 'Monetary'
})
df = rfm # Switch to RFM dataframe for clustering
            

## Data Scaling

Standardize features (zero mean, unit variance) using StandardScaler so that each feature carries equal weight in the clustering.

In [None]:

from sklearn.preprocessing import StandardScaler

# Cluster on the numeric columns only. With its default settings,
# StandardScaler rescales each column to zero mean and unit variance.
X = df.select_dtypes(include='number')
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)
        

## PCA (Dimensionality Reduction)

Reduce dimensions to 2 components for visualization and noise reduction.

In [None]:

from sklearn.decomposition import PCA

# Project the scaled features onto two principal components and store the
# coordinates on df so the plotting step can read them by column name.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
# Each row of the transpose is one component's coordinates.
df['PCA1'], df['PCA2'] = X_pca.T
        

## K-Means Clustering

Segment customers into three clusters using K-Means on the scaled features.

In [None]:

from sklearn.cluster import KMeans

# Determine optimal clusters (simplified for now, fixed to 3)
# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto' in version 1.4 (and warns about it in 1.2-1.3), so leaving it
# implicit yields different segments depending on the installed version.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
df['Cluster'] = kmeans.fit_predict(X_scaled)
        

## Cluster Visualization

Visualize clusters using PCA components.

In [None]:

import matplotlib.pyplot as plt
import seaborn as sns

# Scatter the customers in PCA space, coloured by their cluster label.
plt.figure(figsize=(10, 6))
ax = sns.scatterplot(
    data=df,
    x='PCA1',
    y='PCA2',
    hue='Cluster',
    palette='viridis',
)
ax.set_title('Customer Segments (PCA)')
plt.show()
        

## New Analysis Step

Describe your new analysis step here.

In [None]:
# Add your python code here
