# Flipkart Bluetooth Headphones Analysis Dashboard

In [None]:
import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# Load and Preprocess Data

In [None]:
# Default location of the scraped dataset. It is the path this notebook has
# always read from, so load_data() with no argument behaves as before; pass a
# different path to run the dashboard on another machine.
DEFAULT_DATA_PATH = "C:\\Users\\sanyam mahajan\\Documents\\Mpr\\scraping\\data_eda.csv"


@st.cache_data
def load_data(csv_path=DEFAULT_DATA_PATH):
    """Read the headphones CSV and add the derived columns the dashboard uses.

    Parameters
    ----------
    csv_path : str, optional
        Path of the CSV file to read. Defaults to ``DEFAULT_DATA_PATH``.

    Returns
    -------
    pandas.DataFrame
        The CSV contents with ``Price Range`` cleaned and three added columns:
        ``Value for Money`` (Rating / Price), ``Discount Efficiency``
        (Discount Amount / Price) and ``Price Segment`` (binned Price).
    """
    df = pd.read_csv(csv_path)

    # Replace the mis-encoded character sequence in Price Range with a hyphen.
    # The pattern has no regex metacharacters, so a literal match is equivalent
    # and states the intent.
    df['Price Range'] = df['Price Range'].str.replace('â_x0080_x0093', '-', regex=False)

    # A price of 0 would make both ratios infinite; map it to NaN so such rows
    # come out as missing instead of distorting the charts built on them.
    nonzero_price = df['Price'].replace(0, np.nan)

    # Rating obtained per rupee of price.
    df['Value for Money'] = df['Rating'] / nonzero_price

    # Discount expressed as a fraction of the price.
    df['Discount Efficiency'] = df['Discount Amount'] / nonzero_price

    # Price segmentation: (0, 500], (500, 1000], (1000, 2000], (2000, inf).
    bins = [0, 500, 1000, 2000, float('inf')]
    labels = ['Budget', 'Mid-Range', 'Premium', 'Luxury']
    df['Price Segment'] = pd.cut(df['Price'], bins=bins, labels=labels)

    return df


df = load_data()

# Dashboard Setup

In [None]:
# Page-level configuration (browser tab title, wide layout) and the main title.
# NOTE(review): Streamlit documents set_page_config as needing to be the first
# Streamlit command on a page; here it runs after the cached load_data() call.
# Confirm this ordering is accepted by the Streamlit version in use.
PAGE_TITLE = "Flipkart Headphones Analysis"
st.set_page_config(layout="wide", page_title=PAGE_TITLE)
st.title("🎧 Comprehensive Bluetooth Headphones Analysis")


# Sidebar Filters

# Apply Filters

In [None]:
# Sidebar controls.
# The filter below reads selected_brands, price_range, rating_filter,
# discount_filter, product_type and price_segment. No earlier cell assigns
# them, so the widgets that produce them are created here. Explicit keys keep
# these widgets distinct from similarly labelled ones further down the page.
st.sidebar.header("Filters")

selected_brands = st.sidebar.multiselect(
    "Brand",
    options=list(df['Brand'].dropna().unique()),
    key="filter_brands",
)

price_lo, price_hi = float(df['Price'].min()), float(df['Price'].max())
if price_hi <= price_lo:
    # A slider needs a non-empty range even if every product has the same price.
    price_hi = price_lo + 1.0
price_range = st.sidebar.slider(
    "Price (₹)", price_lo, price_hi, (price_lo, price_hi), key="filter_price")

# NOTE(review): the 0-5 rating scale and 0-100 discount scale are taken from how
# the rest of the dashboard formats these columns ("/5", progress bar 0-100).
rating_filter = st.sidebar.slider(
    "Rating", 0.0, 5.0, (0.0, 5.0), step=0.1, key="filter_rating")
discount_filter = st.sidebar.slider(
    "Discount (%)", 0, 100, (0, 100), key="filter_discount")

product_type = st.sidebar.selectbox(
    "Product Type",
    options=['All'] + list(df['Product Type'].dropna().unique()),
    key="filter_product_type",
)
price_segment = st.sidebar.multiselect(
    "Price Segment",
    options=list(df['Price Segment'].dropna().unique()),
    key="filter_price_segment",
)

# Row filter.
# The three range filters always apply. The three selection filters apply only
# when the user picked something (an empty selection / 'All' means "no filter").
row_mask = (
    df['Price'].between(*price_range)
    & df['Rating'].between(*rating_filter)
    & df['Discount %'].between(*discount_filter)
)
if selected_brands:
    row_mask &= df['Brand'].isin(selected_brands)
if product_type != 'All':
    row_mask &= df['Product Type'] == product_type
if price_segment:
    row_mask &= df['Price Segment'].isin(price_segment)

filtered_df = df[row_mask]

# Every section below aggregates, divides by, or fits models on filtered_df;
# with zero rows those steps fail, so stop with a message instead.
if filtered_df.empty:
    st.warning("No products match the current filters. Widen the filters to see the analysis.")
    st.stop()

# Section 1: Core Metrics

In [None]:
# Core KPI row: five headline metrics computed from the filtered products.
st.header("📊 Core Market Metrics")
kpi1, kpi2, kpi3, kpi4, kpi5 = st.columns(5)

filtered_count = len(filtered_df)

with kpi1:
    # Average price of the filtered set, compared with the whole dataset.
    avg_price = filtered_df['Price'].mean()
    price_gap = avg_price - df['Price'].mean()
    if pd.isna(price_gap):
        price_delta = None
    else:
        # st.metric reads the direction of a string delta from a leading "-",
        # so the sign has to come before the currency symbol.
        gap_sign = "-" if price_gap < 0 else "+"
        price_delta = f"{gap_sign}₹{abs(price_gap):.1f} vs Overall"
    st.metric("Avg Price", f"₹{avg_price:.1f}", delta=price_delta)

with kpi2:
    # Share of filtered products in the two highest price segments.
    premium_count = filtered_df['Price Segment'].isin(['Premium', 'Luxury']).sum()
    # Guard the division: an empty selection would otherwise raise ZeroDivisionError.
    premium_share = (premium_count / filtered_count) * 100 if filtered_count else 0.0
    st.metric("Premium Index",
              f"{premium_share:.1f}%",
              "Premium+Luxury Share")

with kpi3:
    st.metric("Discount Power",
              f"₹{filtered_df['Discount Amount'].sum():,.0f}",
              "Total Discounts Given")

with kpi4:
    # ",.0f" keeps the count free of a trailing ".0" when the column is float.
    st.metric("Rating Power",
              f"{filtered_df['Rating'].mean():.1f}/5 ⭐",
              f"Based on {filtered_df['Number of Ratings'].sum():,.0f} reviews")

with kpi5:
    st.metric("Market Diversity",
              f"{filtered_df['Brand'].nunique()} Brands",
              f"{filtered_df['Product Type'].nunique()} Categories")


# Section 2: Brand Analysis

In [None]:
# Brand section: three side-by-side charts (the first column is twice as wide).
st.header("🏷 Brand Intelligence")
share_col, rating_col, matrix_col = st.columns([2, 1, 1])

with share_col:
    # One row per brand: summed price, summed rating count, mean discount.
    per_brand = (
        filtered_df
        .groupby('Brand')
        .agg({'Price': 'sum', 'Number of Ratings': 'sum', 'Discount %': 'mean'})
        .reset_index()
    )

    # Tile size follows the summed price, tile colour the mean discount.
    share_fig = px.treemap(
        per_brand,
        path=['Brand'],
        values='Price',
        color='Discount %',
        hover_data=['Number of Ratings'],
        color_continuous_scale='RdYlGn',
        title="Brand Market Share (Size=Revenue, Color=Discounts)",
    )
    st.plotly_chart(share_fig, use_container_width=True)

with rating_col:
    # Spread of ratings within each brand.
    rating_fig = px.box(
        filtered_df,
        x='Brand',
        y='Rating',
        color='Brand',
        title="Brand Rating Distribution",
    )
    st.plotly_chart(rating_fig, use_container_width=True)

with matrix_col:
    # Number of products for every brand / product-type pair, shown as a heatmap.
    brand_by_type = pd.crosstab(filtered_df['Brand'], filtered_df['Product Type'])
    matrix_fig = px.imshow(
        brand_by_type,
        labels=dict(x="Category", y="Brand", color="Count"),
        title="Brand vs Category Matrix",
    )
    st.plotly_chart(matrix_fig, use_container_width=True)


# Section 3: Price Analysis

In [None]:
# Price section: a 3D price/rating/discount scatter and a per-segment summary.
st.header("💰 Price Optimization Analysis")
col1, col2 = st.columns(2)

with col1:
    # Price-Rating-Discount 3D analysis; marker size follows the rating count.
    fig = px.scatter_3d(filtered_df,
                        x='Price',
                        y='Rating',
                        z='Discount %',
                        color='Brand',
                        size='Number of Ratings',
                        hover_name='Product Name',
                        title="3D Pricing Strategy Analysis")
    st.plotly_chart(fig, use_container_width=True)

with col2:
    # Mean price, rating and discount per price segment.
    # 'Price Segment' comes from pd.cut and is categorical. Passing observed
    # explicitly keeps the result as it has been (segments with no products
    # stay in the table) and avoids the pandas FutureWarning about the
    # changing default.
    segment_analysis = filtered_df.groupby('Price Segment', observed=False).agg({
        'Price': 'mean',
        'Rating': 'mean',
        'Discount %': 'mean'
    }).reset_index()

    fig = px.bar(segment_analysis,
                 x='Price Segment',
                 y=['Price', 'Rating', 'Discount %'],
                 barmode='group',
                 title="Price Segment Comparison")
    st.plotly_chart(fig, use_container_width=True)

# Section 4: Discount Deep Dive

In [None]:
# Discount section: discount vs popularity scatter and a price/discount density map.
st.header("🎯 Discount Effectiveness")
col1, col2 = st.columns(2)

with col1:
    # Discount vs popularity: marker size follows price, colour follows rating,
    # with a LOWESS trend line. (A stray "a" token before the x argument made
    # this call a syntax error; it is removed.)
    fig = px.scatter(filtered_df,
                     x='Discount %',
                     y='Number of Ratings',
                     size='Price',
                     color='Rating',
                     trendline="lowess",
                     title="Discount Impact on Popularity & Ratings")
    st.plotly_chart(fig, use_container_width=True)

with col2:
    # How many products fall in each price / discount cell (20 x 20 bins).
    fig = px.density_heatmap(filtered_df,
                             x='Price',
                             y='Discount %',
                             nbinsx=20,
                             nbinsy=20,
                             title="Discount-Price Density Matrix")
    st.plotly_chart(fig, use_container_width=True)


# Section 5: Product Intelligence

In [None]:
# Product comparison: pick products and compare them on a radar chart.
st.subheader("Product Comparison Matrix")

# Each product name is offered once; missing names cannot be sliced for display.
product_options = filtered_df['Product Name'].dropna().drop_duplicates().tolist()

# Brand shown next to each option. Built once (first occurrence in df wins)
# instead of scanning df again for every option the widget renders.
brand_by_product = (
    df.drop_duplicates('Product Name')
      .set_index('Product Name')['Brand']
      .to_dict()
)

selected_products = st.multiselect(
    "Select products",
    options=product_options,
    format_func=lambda name: f"{name[:50]}... ({brand_by_product.get(name, 'Unknown')})",
)

if selected_products:
    compare_df = filtered_df[filtered_df['Product Name'].isin(selected_products)]

    radar_columns = ['Price', 'Rating', 'Discount %', 'Number of Ratings']
    radar_labels = ['Price', 'Rating', 'Discount %', 'Popularity']

    # Scale against all filtered products, not just the selection. Fitting on
    # the selection alone maps a single selected product to all zeros and any
    # pair to only 0s and 1s.
    scaler = MinMaxScaler()
    scaler.fit(filtered_df[radar_columns])
    normalized = scaler.transform(compare_df[radar_columns])

    fig = go.Figure()

    # Rows of `normalized` follow compare_df's row order, not the order in
    # which products were selected, so each trace takes its label from the row
    # it was computed from.
    for product_name, scaled_row in zip(compare_df['Product Name'], normalized):
        fig.add_trace(go.Scatterpolar(
            r=scaled_row,
            theta=radar_labels,
            fill='toself',
            name=str(product_name)[:30] + "..."
        ))

    fig.update_layout(
        # Values are min-max scaled, so the axis is fixed to the 0-1 range.
        polar=dict(radialaxis=dict(visible=True, range=[0, 1])),
        title="Product Comparison Radar Chart"
    )
    st.plotly_chart(fig, use_container_width=True)

# Section 6: Advanced Analytics

In [None]:
# Two charts side by side: price vs rating trend, and value for money by segment.
st.header("🔮 Predictive Insights")

trend_col, value_col = st.columns(2)

with trend_col:
    # Scatter of price against rating with a LOWESS trend line.
    price_rating_fig = px.scatter(
        filtered_df,
        x='Price',
        y='Rating',
        trendline="lowess",
        title="Price-Rating Relationship with Trend",
    )
    st.plotly_chart(price_rating_fig, use_container_width=True)

with value_col:
    # Distribution of 'Value for Money' per price segment, split by product type.
    value_fig = px.box(
        filtered_df,
        x='Price Segment',
        y='Value for Money',
        color='Product Type',
        title="Value for Money Analysis",
    )
    st.plotly_chart(value_fig, use_container_width=True)

# Section 7: Raw Data Explorer

In [None]:
# Table of the filtered products, most-rated first.
st.header("📁 Data Explorer")

rows_by_popularity = filtered_df.sort_values('Number of Ratings', ascending=False)

# Per-column rendering: product links are clickable, discount is a 0-100 bar.
explorer_columns = {
    "Product Link": st.column_config.LinkColumn(),
    "Discount %": st.column_config.ProgressColumn(
        format="%d%%",
        min_value=0,
        max_value=100,
    ),
}

st.data_editor(
    rows_by_popularity,
    column_config=explorer_columns,
    hide_index=True,
    use_container_width=True,
)

# Section 8: Machine Learning Models

In [None]:
# Page heading that introduces the machine-learning sections that follow.
st.header("🤖 Practical ML Implementations")

## 1. Price Prediction Model (Regression)

In [None]:
# Price prediction: fit a random forest on the filtered products and report
# how well it predicts price on a held-out 20% of them.
st.subheader("💰 Price Prediction Engine")
with st.expander("Train Price Predictor"):
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.metrics import r2_score, mean_absolute_error

    # Smallest number of usable rows for which an 80/20 split still leaves
    # at least two test rows to score on.
    min_rows = 10

    # Feature engineering. Rows with a missing numeric value are dropped:
    # scikit-learn refuses NaN targets (and, depending on version, NaN features).
    numeric_cols = ['Rating', 'Discount %', 'Number of Ratings', 'Price']
    ml_df = filtered_df[['Brand', 'Product Type', 'Rating', 'Discount %',
                         'Number of Ratings', 'Price']].dropna(subset=numeric_cols)

    if len(ml_df) < min_rows:
        st.info(f"Need at least {min_rows} products with complete data to train the price model.")
    else:
        # One-hot encode the categorical features.
        ml_df = pd.get_dummies(ml_df, columns=['Brand', 'Product Type'])

        X = ml_df.drop('Price', axis=1)
        y = ml_df['Price']

        # Train-test split
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # Fixed seed: the script reruns on every widget interaction, and an
        # unseeded forest would report different scores each time.
        model = RandomForestRegressor(n_estimators=100, random_state=42)
        model.fit(X_train, y_train)

        # Prediction and evaluation on the held-out rows.
        y_pred = model.predict(X_test)
        r2 = r2_score(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)

        # Actual vs predicted price.
        fig = px.scatter(x=y_test, y=y_pred,
                         labels={'x': 'Actual Price', 'y': 'Predicted Price'},
                         title=f"Price Prediction Performance (R² = {r2:.2f}, MAE = ₹{mae:.1f})")
        st.plotly_chart(fig)

        # Feature importance, largest first.
        importance = pd.DataFrame({
            'Feature': X.columns,
            'Importance': model.feature_importances_
        }).sort_values('Importance', ascending=False)

        fig = px.bar(importance.head(10), x='Feature', y='Importance',
                     title='Top 10 Important Features for Price Prediction')
        st.plotly_chart(fig)


## 2. Rating Classifier (Classification)

In [None]:
# Rating classifier: learn Low / Medium / High rating bands from the filtered
# products and predict the band for a product described in the form below.
st.subheader("⭐ Rating Category Predictor")
with st.expander("Predict Rating Category"):
    from sklearn.ensemble import GradientBoostingClassifier

    feature_cols = ['Price', 'Discount %', 'Brand', 'Product Type', 'Number of Ratings']
    numeric_cols = ['Price', 'Discount %', 'Number of Ratings']
    categorical_cols = ['Brand', 'Product Type']

    # Create rating categories: (0, 3.5] Low, (3.5, 4.2] Medium, (4.2, 5] High.
    rating_df = filtered_df.copy()
    rating_df['Rating Category'] = pd.cut(rating_df['Rating'],
                                          bins=[0, 3.5, 4.2, 5],
                                          labels=['Low', 'Medium', 'High'])

    # Drop rows the model cannot use: ratings outside the bins (no category)
    # and rows with a missing numeric feature.
    rating_df = rating_df.dropna(subset=numeric_cols + ['Rating Category'])

    # Feature selection and one-hot encoding.
    X = pd.get_dummies(rating_df[feature_cols], columns=categorical_cols).astype(float)
    y = rating_df['Rating Category'].astype(str)

    if y.nunique() < 2:
        # A classifier needs at least two classes to learn from.
        st.info("The filtered products fall into a single rating category; "
                "widen the filters to train the classifier.")
    else:
        # Train model. Seeded so repeated reruns give the same predictions.
        clf = GradientBoostingClassifier(random_state=42)
        clf.fit(X, y)

        # Live prediction interface
        st.markdown("*Predict for New Product:*")
        col1, col2, col3 = st.columns(3)
        with col1:
            price = st.number_input("Price (₹)", min_value=0)
        with col2:
            discount = st.slider("Discount (%)", 0, 100)
        with col3:
            brand = st.selectbox("Brand", filtered_df['Brand'].unique())

        # Stored under its own name so the `product_type` read by the filter
        # cell is not overwritten by this form.
        new_product_type = st.selectbox("Product Type", filtered_df['Product Type'].unique())
        num_ratings = st.number_input("Expected Number of Ratings", min_value=0)

        if st.button("Predict Rating Category"):
            input_data = pd.DataFrame([[price, discount, brand, new_product_type, num_ratings]],
                                      columns=feature_cols)
            input_encoded = pd.get_dummies(input_data, columns=categorical_cols)

            # Align with the training columns; brands/types not chosen become 0.
            input_encoded = input_encoded.reindex(columns=X.columns, fill_value=0).astype(float)

            prediction = clf.predict(input_encoded)[0]
            probability = clf.predict_proba(input_encoded)[0].max()

            st.success(f"Predicted Rating Category: *{prediction}* (Confidence: {probability:.0%})")


## 3. Product Clustering (Unsupervised Learning)

In [None]:

# Product clustering: k-means on four standardised numeric features, with an
# elbow plot to help choose the number of clusters.
st.subheader("🔍 Product Clustering Analysis")
with st.expander("Explore Product Segments"):
    from sklearn.cluster import KMeans
    from sklearn.preprocessing import StandardScaler

    cluster_cols = ['Price', 'Rating', 'Discount %', 'Number of Ratings']
    max_clusters = 5  # upper end of both the elbow plot and the slider

    # Prepare data. dropna: k-means cannot take missing values. copy: the
    # Cluster column added below must go on this frame, not on a slice of
    # filtered_df.
    cluster_df = filtered_df[cluster_cols].dropna().copy()

    if len(cluster_df) < max_clusters:
        # k-means needs at least as many rows as clusters.
        st.info(f"Need at least {max_clusters} products with complete data to cluster.")
    else:
        # Normalization
        scaler = StandardScaler()
        scaled_data = scaler.fit_transform(cluster_df)

        # Within-cluster sum of squares for k = 1..max_clusters.
        k_values = list(range(1, max_clusters + 1))
        wcss = []
        for k in k_values:
            kmeans = KMeans(n_clusters=k, init='k-means++', random_state=42, n_init=10)
            kmeans.fit(scaled_data)
            wcss.append(kmeans.inertia_)

        # Elbow method visualization
        fig1 = px.line(x=k_values, y=wcss,
                       title='Elbow Method for Optimal Clusters',
                       labels={'x': 'Number of Clusters', 'y': 'WCSS'})
        st.plotly_chart(fig1)

        # Final clustering, with the same settings as the elbow runs so the
        # chosen k gives the model the elbow plot described.
        n_clusters = st.slider("Select Number of Clusters", 2, max_clusters, 3)
        kmeans = KMeans(n_clusters=n_clusters, init='k-means++', random_state=42, n_init=10)
        clusters = kmeans.fit_predict(scaled_data)

        # Cluster ids are labels, not quantities: store them as strings so the
        # chart uses discrete colours instead of a continuous colour scale.
        cluster_df['Cluster'] = clusters.astype(str)
        fig2 = px.scatter_matrix(cluster_df,
                                 dimensions=cluster_cols,
                                 color='Cluster',
                                 category_orders={'Cluster': sorted(cluster_df['Cluster'].unique())},
                                 title="Product Cluster Analysis")
        st.plotly_chart(fig2)

## 4. Discount Effectiveness Predictor

In [None]:
# Discount impact: logistic regression predicting whether a product's rating
# count is above the median of the filtered set, from price, discount, brand
# and product type.
st.subheader("🎯 Discount Impact Analyzer")
with st.expander("Predict Discount Impact"):
    from sklearn.linear_model import LogisticRegression
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler

    feature_cols = ['Price', 'Discount %', 'Brand', 'Product Type']
    categorical_cols = ['Brand', 'Product Type']

    # Rows with a missing numeric feature or rating count cannot be used for fitting.
    discount_df = filtered_df.dropna(subset=['Price', 'Discount %', 'Number of Ratings']).copy()

    # Create target variable (High Popularity): 1 when the rating count is
    # above the median of the rows being modelled, else 0.
    median_ratings = discount_df['Number of Ratings'].median()
    discount_df['High Popularity'] = np.where(
        discount_df['Number of Ratings'] > median_ratings, 1, 0)

    # Prepare data
    X = pd.get_dummies(discount_df[feature_cols], columns=categorical_cols).astype(float)
    y = discount_df['High Popularity']

    if y.nunique() < 2:
        # Happens when there are no rows, or every product has the same rating count.
        st.info("Not enough variation in popularity among the filtered products "
                "to train this model; widen the filters.")
    else:
        # Train model. Price and one-hot columns are on very different scales,
        # so features are standardised first and the solver gets more iterations.
        model = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
        model.fit(X, y)

        # Prediction interface
        st.markdown("*Will this discount strategy work?*")
        col1, col2 = st.columns(2)
        with col1:
            disc_price = st.number_input("Product Price (₹)", min_value=0)
        with col2:
            disc_pct = st.slider("Planned Discount (%)", 0, 100)

        disc_brand = st.selectbox("Product Brand", filtered_df['Brand'].unique())
        disc_type = st.selectbox("Product Type", filtered_df['Product Type'].unique(),
                                 key="product_type_selectbox")

        if st.button("Predict Popularity"):
            input_data = pd.DataFrame([[disc_price, disc_pct, disc_brand, disc_type]],
                                      columns=feature_cols)
            input_encoded = pd.get_dummies(input_data, columns=categorical_cols)
            # Align with the training columns; brands/types not chosen become 0.
            input_encoded = input_encoded.reindex(columns=X.columns, fill_value=0).astype(float)

            prediction = model.predict(input_encoded)[0]
            # Probability of class 1 (High Popularity).
            proba = model.predict_proba(input_encoded)[0][1]

            if prediction == 1:
                st.success(f"High Popularity Expected ({proba:.0%} confidence)")
            else:
                st.error(f"Low Popularity Risk ({1-proba:.0%} confidence)")