# In [2]:
import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split

# Function to load data
def load_data():
    """Read the train and test CSV files from GitHub.

    Returns:
        A ``(train_data, test_data)`` tuple of DataFrames, in that order.
    """
    sources = (
        'https://raw.githubusercontent.com/subhasishsinha12/MLProjects/main/existing_base_train.csv',
        'https://raw.githubusercontent.com/subhasishsinha12/MLProjects/main/existing_base_test.csv',
    )
    # Train is fetched first, then test -- same order as the URLs above.
    train_frame, test_frame = (pd.read_csv(url) for url in sources)
    return train_frame, test_frame

# Function to preprocess data
def preprocess_data(data):
    """Build the model inputs from a raw customer DataFrame.

    The columns 'REF_NO', 'post_code', 'post_area' and 'region' are
    discarded, 'Revenue_Grid' is split off as the target, and the remaining
    columns are transformed: float64/int64 columns are standardised and
    object columns are one-hot encoded. Columns of any other dtype are not
    listed in either transformer.

    The input frame is NOT modified, so it can still be displayed in full or
    passed to this function again.

    Args:
        data: DataFrame containing the four columns listed above plus
            'Revenue_Grid'.

    Returns:
        A tuple ``(X, y)`` where ``X`` is the output of
        ``ColumnTransformer.fit_transform`` on the feature columns and ``y``
        is the 'Revenue_Grid' column of ``data``.

    Raises:
        KeyError: If any of the dropped columns or 'Revenue_Grid' is missing.
    """
    target_col = 'Revenue_Grid'

    # drop() without inplace returns a new frame, leaving the caller's
    # DataFrame untouched. The target is removed here as well so the column
    # lists below are derived from features only -- this avoids the old
    # `numerical_cols.remove(target)` which failed for a non-numeric target.
    features = data.drop(
        ['REF_NO', 'post_code', 'post_area', 'region', target_col], axis=1
    )

    categorical_cols = features.select_dtypes(include=['object']).columns.tolist()
    numerical_cols = features.select_dtypes(include=['float64', 'int64']).columns.tolist()

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_cols),
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ])

    X = preprocessor.fit_transform(features)
    y = data[target_col]
    return X, y

# Fetch the train and test frames
train_data, test_data = load_data()

# Preprocess the training frame, then hold out 20% of it for validation
X, y = preprocess_data(train_data)
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit both classifiers on the training split (fixed seed for repeatability)
decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree.fit(X_train, y_train)

random_forest = RandomForestClassifier(random_state=42)
random_forest.fit(X_train, y_train)

# Streamlit app
st.title("Retail Banking Customer Analysis")

# Data overview: first rows of the training frame
if st.checkbox("Show Data Overview"):
    st.write(train_data.head())

# Data visualization: class counts of the target column
if st.checkbox("Visualize Data"):
    st.subheader("Target Variable Distribution")
    fig, ax = plt.subplots()
    # Draw on the axes created above explicitly rather than relying on
    # pyplot's implicit "current axes".
    sns.countplot(x='Revenue_Grid', data=train_data, ax=ax)
    st.pyplot(fig)
    # pyplot keeps a reference to every figure it creates; close this one
    # once rendered so figures do not pile up across script reruns.
    plt.close(fig)

# Model predictions on the validation split (no metrics are computed here)
if st.checkbox("Show Model Predictions"):
    st.subheader("Decision Tree Predictions")
    y_pred_dt = decision_tree.predict(X_val)
    st.write(y_pred_dt)

    st.subheader("Random Forest Predictions")
    y_pred_rf = random_forest.predict(X_val)
    st.write(y_pred_rf)

# Footer
st.markdown("---")
st.write("Retail Banking Customer Analysis App")

# To run the app, use `streamlit run app.py` in your terminal