In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics.pairwise import cosine_similarity


In [None]:
# Read the transaction export; the file is decoded as latin1.
csv_path = "online_retail.csv"
df = pd.read_csv(csv_path, encoding='latin1')

# Show the first rows as the cell output.
df.head()


In [None]:
# Keep only rows that carry a CustomerID; the per-customer aggregations
# further down group on this column.
has_customer = df['CustomerID'].notna()
df = df[has_customer]


In [None]:
# Exclude invoices whose number starts with 'C'
# (presumably cancellations — confirm against the dataset description).
invoice_ids = df['InvoiceNo'].astype(str)
df = df[~invoice_ids.str.startswith('C')]


In [None]:
# Keep only rows with strictly positive quantity and unit price.
# .copy() makes df an independent frame, so the column assignments in the
# following cells (InvoiceDate, TotalPrice) do not raise
# SettingWithCopyWarning for writing to a filtered slice.
df = df[(df['Quantity'] > 0) & (df['UnitPrice'] > 0)].copy()


In [None]:
# Parse the invoice timestamps so the .dt accessor and date arithmetic
# used in later cells are available.
invoice_timestamps = pd.to_datetime(df['InvoiceDate'])
df['InvoiceDate'] = invoice_timestamps


In [None]:
# Units sold per country (sum of Quantity), largest first, top ten only.
units_by_country = df.groupby('Country')['Quantity'].sum()
country_sales = units_by_country.sort_values(ascending=False).head(10)

plt.figure()
country_sales.plot.bar()
plt.title("Top 10 Countries by Sales")
plt.show()


In [None]:
# Units sold per product description (sum of Quantity), top ten only.
units_by_product = df.groupby('Description')['Quantity'].sum()
top_products = units_by_product.sort_values(ascending=False).head(10)

plt.figure()
top_products.plot.bar()
plt.title("Top 10 Selling Products")
plt.show()


In [None]:
# Collapse the invoice timestamps to calendar dates for the daily aggregation.
# Wrapping the column in pd.to_datetime makes this cell safe to re-run: after
# the first run the column holds datetime.date objects, and calling .dt on it
# directly would raise AttributeError.
# The column is deliberately left as dates: the recency calculation in a later
# cell subtracts these values and reads .days from the result.
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate']).dt.date
daily_sales = df.groupby('InvoiceDate')['Quantity'].sum()

plt.figure()
daily_sales.plot()
plt.title("Daily Sales Trend")
plt.xlabel("Invoice date")
plt.ylabel("Units sold")
plt.show()


In [None]:
# Revenue of each invoice line: quantity times unit price.
df['TotalPrice'] = df['Quantity'].mul(df['UnitPrice'])


In [None]:
# Latest invoice date in the data; recency is measured back from this point.
invoice_dates = df['InvoiceDate']
reference_date = invoice_dates.max()


In [None]:
# One row per customer:
#   Recency   - days between reference_date and the customer's latest invoice date
#   Frequency - number of distinct invoice numbers
#   Monetary  - sum of TotalPrice
rfm = df.groupby('CustomerID').agg(
    Recency=('InvoiceDate', lambda dates: (reference_date - dates.max()).days),
    Frequency=('InvoiceNo', 'nunique'),
    Monetary=('TotalPrice', 'sum'),
)

rfm.head()


In [None]:
# Histogram of every numeric column currently in rfm.
hist_size = (10, 6)
rfm.hist(figsize=hist_size)
plt.show()


In [None]:
# Standardize only the three RFM features. Selecting them by name keeps the
# 'Cluster' and 'Segment' columns that later cells add to rfm out of the
# scaler, so re-running this cell neither changes the feature matrix nor
# fails on the string-valued 'Segment' column.
rfm_features = ['Recency', 'Frequency', 'Monetary']
scaler = StandardScaler()
rfm_scaled = scaler.fit_transform(rfm[rfm_features])


In [None]:
# Fit KMeans for k = 2..10 on the scaled features and record each fit's
# inertia_ so the elbow can be read off the plot.
k_values = range(2, 11)
inertia = []

for k in k_values:
    km = KMeans(n_clusters=k, random_state=42).fit(rfm_scaled)
    inertia.append(km.inertia_)

fig, ax = plt.subplots()
ax.plot(k_values, inertia, marker='o')
ax.set_xlabel("Number of Clusters")
ax.set_ylabel("Inertia")
ax.set_title("Elbow Method")
plt.show()


In [None]:
# Fit the final 4-cluster model and store each customer's cluster id on rfm.
kmeans = KMeans(n_clusters=4, random_state=42)
kmeans.fit(rfm_scaled)
rfm['Cluster'] = kmeans.labels_


In [None]:
# Silhouette score of the cluster assignment on the scaled features
# (displayed as the cell output).
cluster_assignments = rfm['Cluster']
silhouette_score(rfm_scaled, cluster_assignments)


In [None]:
# Mean Recency / Frequency / Monetary per cluster.
# The columns are selected explicitly so the cell still works when re-run
# after the string-valued 'Segment' column has been added to rfm; averaging
# every column would then fail on the strings.
rfm.groupby('Cluster')[['Recency', 'Frequency', 'Monetary']].mean()


In [None]:
# Name the clusters from their own RFM profile instead of from fixed ids.
# KMeans numbers its clusters arbitrarily, so a hard-coded id -> name table
# can attach a name to the wrong group when the data or library version changes.
# Rule used here: the cluster with the highest mean Recency is 'At-Risk'; the
# remaining clusters are ranked by mean Monetary, highest first.
# NOTE(review): this rule is a heuristic — check it against the per-cluster
# means before relying on the segment names.
cluster_profile = rfm.groupby('Cluster')[['Recency', 'Monetary']].mean()
at_risk_cluster = cluster_profile['Recency'].idxmax()
clusters_by_value = (
    cluster_profile.drop(index=at_risk_cluster)
    .sort_values('Monetary', ascending=False)
    .index
)

cluster_labels = {at_risk_cluster: 'At-Risk'}
cluster_labels.update(zip(clusters_by_value, ['High-Value', 'Regular', 'Occasional']))

rfm['Segment'] = rfm['Cluster'].map(cluster_labels)
rfm.head()


In [None]:
# Recency against Monetary for every customer, colored by segment name.
plt.figure()
sns.scatterplot(data=rfm, x='Recency', y='Monetary', hue='Segment')
plt.title("Customer Segments")
plt.show()


In [None]:
# Customer x product matrix of total units purchased; 0 where a customer
# never bought the product.
# aggfunc is set explicitly: pivot_table defaults to 'mean', which would store
# the average quantity per invoice line instead of the total a customer bought
# when the same product appears on several of that customer's lines.
pivot = df.pivot_table(
    index='CustomerID',
    columns='Description',
    values='Quantity',
    aggfunc='sum',
    fill_value=0
)


In [None]:
# Item-item cosine similarity: transpose the pivot so each row is one
# product's vector across customers, then label both axes with the products.
product_vectors = pivot.T
similarity = cosine_similarity(product_vectors)
similarity_df = pd.DataFrame(
    similarity,
    index=product_vectors.index,
    columns=product_vectors.index,
)


In [None]:
def recommend_products(product_name, n=5):
    """Return up to ``n`` products most similar to ``product_name``.

    Looks the product up in the module-level ``similarity_df`` (a square
    product-by-product similarity table) and ranks the other products by
    their score in that product's column, highest first.

    Args:
        product_name: Label to look up in ``similarity_df``.
        n: Maximum number of recommendations; values below 1 give ``[]``.

    Returns:
        A list of product labels, or the string ``"Product not found"`` when
        ``product_name`` is not in ``similarity_df.index``.
    """
    if product_name not in similarity_df.index:
        return "Product not found"

    # Drop the query product by label instead of skipping the first sorted row:
    # another product with an equal top score can sort ahead of it, in which
    # case a positional skip would leave the query product in its own results.
    scores = similarity_df[product_name].drop(labels=product_name)
    ranked = scores.sort_values(ascending=False)
    return ranked.head(max(n, 0)).index.tolist()


In [None]:
# Example lookup: the five products most similar to this item.
recommend_products("WHITE HANGING HEART T-LIGHT HOLDER", n=5)
