**Importing the dependencies**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils import resample
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score

# import all classifications models 
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier ,GradientBoostingClassifier


from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import pickle

In [None]:
# !pip install xgboost
# !pip install lightgbm

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

**Data Collection and Processing**

In [None]:
# laod the dataset to a pandas dataframe
# data = pd.read_csv("Rainfall.csv")

data = pd.read_csv(r'https://media.githubusercontent.com/media/shahil04/ds_materials/refs/heads/main/8.0_Machine%20Learning/ml_class/ml_projects/rain_fall_predictions/Rainfall.csv')
data.head()

In [None]:
data.info()

In [None]:
data.shape

In [None]:
# cheack null 
data.isnull().sum()

In [None]:
data = data.dropna()

In [None]:
data.columns

In [None]:
# remove extra  spaces in all columns
data.columns = data.columns.str.strip()

In [None]:
data.columns

In [None]:
print("Data Info:")
data.info()

In [None]:
data.head()
# show the stating data where day are for jan month

In [None]:
data["day"].unique()

In [None]:
data = data.drop(columns=["day"])

In [None]:
data.head()

In [None]:
# checking the number of missing values
print(data.isnull().sum())

In [None]:
# converting the yes & no to 1 and 0 respectively
data["rainfall"] = data["rainfall"].map({"yes": 1, "no": 0})

In [None]:
data.head()

**Exploratory Data Analysis (EDA)**

In [None]:
data.shape

In [None]:
# setting plot style for all the plots
sns.set(style="whitegrid")

In [None]:
data.describe()

In [None]:
data.columns

In [None]:
# check the distributions using histograms
plt.figure(figsize=(15, 10))

for i, column in enumerate(['pressure', 'maxtemp', 'temparature', 'mintemp', 'dewpoint', 'humidity','cloud', 'sunshine', 'windspeed'], 1):
  plt.subplot(3, 3, i)
  sns.histplot(data[column], kde=True)
  plt.title(f"Distribution of {column}")

plt.tight_layout()
plt.show()

In [None]:
# show the label data Y count
plt.figure(figsize=(6, 4))
sns.countplot(x="rainfall", data=data)
plt.title("Distribution of Rainfall")
plt.show()

In [None]:
data.corr()

In [None]:
# correlation matrix --> for check the relationship bw x(features ) and y(label)
plt.figure(figsize=(10, 8))
sns.heatmap(data.corr(), annot=True, cmap="coolwarm", fmt=".2f")
plt.title("Correlation heatmap")
plt.show()

In [None]:
# check the outliers  
plt.figure(figsize=(15, 10))

for i, column in enumerate(['pressure', 'maxtemp', 'temparature', 'mintemp', 'dewpoint', 'humidity','cloud', 'sunshine', 'windspeed'], 1):
  plt.subplot(3, 3, i)
  sns.boxplot(data[column])
  plt.title(f"Boxplot of {column}")

plt.tight_layout()
plt.show()

In [None]:
# remove outliers
data[data['pressure']>1030]

data = data[data['pressure']<1030]

**Data Preprocessing**

In [None]:
# drop highly correlated column bewtween x feature --> colinearity
data = data.drop(columns=['maxtemp', 'temparature', 'mintemp'])

In [None]:
data.head()

In [None]:
print(data["rainfall"].value_counts())

In [None]:
# separate majority and minority class
df_majority = data[data["rainfall"] == 1]

df_minority = data[data["rainfall"] == 0]

In [None]:
print(df_majority.shape)
print(df_minority.shape)

In [None]:
# downsample majority class to match minority count
df_majority_downsampled = resample(df_majority, replace=False, n_samples=len(df_minority), random_state=42)

In [None]:
df_majority_downsampled.shape

In [None]:
df_downsampled = pd.concat([df_majority_downsampled, df_minority])

In [None]:
df_downsampled.shape

In [None]:
len(df_downsampled)

In [None]:
df_downsampled.head()

In [None]:
# shuffle the final dataframe
df_downsampled = df_downsampled.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
df_downsampled.head()

In [None]:
df_downsampled["rainfall"].value_counts()

In [None]:
# split features and target as X and y
X = df_downsampled.drop(columns=["rainfall"])
y = df_downsampled["rainfall"]

In [None]:
print(X)

In [None]:
print(y)

In [None]:
# splitting the data into training data and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

**Model Training**

In [None]:
rf_model = RandomForestClassifier(random_state=42)

param_grid_rf = {
    "n_estimators": [50, 100, 200],
    "max_features": ["sqrt", "log2"],
    "max_depth": [None, 10, 20, 30],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4]
}

In [None]:
# Hypertuning using GridSearchCV
grid_search_rf = GridSearchCV(estimator=rf_model, param_grid=param_grid_rf, cv=2, n_jobs=-1, verbose=3)

grid_search_rf.fit(X_train, y_train)

In [None]:
best_rf_model = grid_search_rf.best_estimator_

print("best parameters for Random Forest:", grid_search_rf.best_params_)

**Model Evaluation**

In [None]:
cv_scores = cross_val_score(best_rf_model, X_train, y_train, cv=5)
print("Cross-validation scores:", cv_scores)
print("Mean cross-validation score:", np.mean(cv_scores))

In [None]:
# test set performance
y_pred = best_rf_model.predict(X_test)

print("Test set Accuracy:", accuracy_score(y_test, y_pred))
print("Test set Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

### apply all models

In [None]:
models = {
    'lr': LogisticRegression(),
    'svc': SVC(),
    'knn': KNeighborsClassifier(),
    'nb': GaussianNB(),
    'dt': DecisionTreeClassifier(max_depth=4),
    'rf': RandomForestClassifier(),
    'ada': AdaBoostClassifier(),
    'gb': GradientBoostingClassifier(),
    'xgb': XGBClassifier(),
    'lgbm': LGBMClassifier()
}

accuracy_scores = {}
for  name, model in models.items():
  print(name, model)
  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)
  # print(name, accuracy_score(y_test,y_pred))
  accuracy_scores[name] = accuracy_score(y_test,y_pred)
  print('===================================')



In [None]:
# show all 
accuracy_scores

In [None]:
# now  use best models without tuning

best_model = DecisionTreeClassifier(max_depth=4)
best_model.fit(X_train, y_train)
y_pred = best_model.predict(X_test)

In [None]:
# tuning 

# Parameter grids for each model
param_grids = {
    'lr': {
        'penalty': ['l1', 'l2'],
        'C': [0.1, 1, 10],
        'solver': ['liblinear']
    },
    'svc': {
        'C': [0.1, 1, 10],
        'gamma': ['scale', 'auto'],
        'kernel': ['rbf', 'poly']
    },
    'knn': {
        'n_neighbors': [3, 5, 7, 9],
        'weights': ['uniform', 'distance']
    },
    'nb': {
        # GaussianNB has no major hyperparameters
        'var_smoothing': [1e-09, 1e-08, 1e-07]
    },
    'dt': {
        'max_depth': [2, 4, 6, 8],
        'criterion': ['gini', 'entropy']
    },
    'rf': {
        'n_estimators': [100, 200],
        'max_depth': [4, 6, 8, None],
        'min_samples_split': [2, 5]
    },
    'ada': {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 1]
    },
    'gb': {
        'n_estimators': [100, 200],
        'learning_rate': [0.01, 0.1],
        'max_depth': [3, 4, 5]
    },
    'xgb': {
        'n_estimators': [100, 200],
        'learning_rate': [0.01, 0.1],
        'max_depth': [3, 4, 5],
        'subsample': [0.8, 1]
    },
    'lgbm': {
        'n_estimators': [100, 200],
        'learning_rate': [0.01, 0.1],
        'num_leaves': [31, 50, 70]
    }
}

# To store results
best_models = {}
best_scores = {}
best_params = {}

# LOOP grid search for each model
for name, model in models.items():
    print(f"\nüîç Running GridSearchCV for: {name.upper()}")

    grid = GridSearchCV(
        estimator=model,
        param_grid=param_grids[name],
        cv=3,
        scoring='accuracy',
        n_jobs=-1,
        verbose=0
    )

    grid.fit(X_train, y_train)

    best_models[name] = grid.best_estimator_
    best_scores[name] = grid.best_score_
    best_params[name] = grid.best_params_

    print(f"‚û° Best Score: {grid.best_score_}")
    print(f"‚û° Best Params: {grid.best_params_}")

print("\n============================")
print(" FINAL COMPARISON SUMMARY ")
print("============================")

for name in best_models:
    print(f"\nModel: {name.upper()}")
    print(f"Best CV Score: {best_scores[name]}")
    print(f"Best Params: {best_params[name]}")


## **Prediction on unknown data** 

or take input from users

In [None]:
pressure = float(input("enter the pressure : "))
dewpoint = float(input("enter the pressure : "))

pd.DataFrame([pressure,dewpoint])
# rainfall_prediction_model.pkl"

In [None]:
input_data = (1015.9, 19.9, 95, 81, 0.0, 40.0, 13.7)

input_df = pd.DataFrame([input_data], columns=['pressure', 'dewpoint', 'humidity', 'cloud', 'sunshine','winddirection', 'windspeed'])

In [None]:
input_df

In [None]:
prediction = best_rf_model.predict(input_df)

In [None]:
print(prediction)

In [None]:
prediction[0]

In [None]:
prediction = best_rf_model.predict(input_df)
print("Prediction result:", "yes Rainfall happen " if prediction[0] == 1 else "No Rainfall not happen")

In [None]:
# save model and feature names to a pickle file
model_data = {"model": best_rf_model, "feature_names": X.columns.tolist()}

with open("rainfall_prediction_model.pkl", "wb") as file:
  pickle.dump(model_data, file)

**Load the saved model and file and use it for prediction**

In [None]:
import pickle
import pandas as pd

In [None]:
# load the trained model and feature names from the pickle file
with open("rainfall_prediction_model.pkl", "rb") as file:
  model_data = pickle.load(file)

In [None]:
model = model_data["model"]
feature_names = model_data["feature_names"]

In [None]:
input_data = (1015.9, 19.9, 95, 81, 0.0, 40.0, 13.7)

input_df = pd.DataFrame([input_data], columns=feature_names)

In [None]:
prediction = best_rf_model.predict(input_df)
print("Prediction result:", "Rainfall" if prediction[0] == 1 else "No Rainfall")

In [None]:
# 30dc3ec276c49ac523473c3cfb2848e0

https://api.openweathermap.org/data/3.0/onecall?lat={lat}&lon={lon}&exclude={part}&appid={API key}

In [82]:
import requests

API_KEY = "30dc3ec276c49ac523473c3cfb2848e0"   # Replace with your key

def get_coordinates(city_name):
    geocode_url = f"http://api.openweathermap.org/geo/1.0/direct?q={city_name}&limit=1&appid={API_KEY}"

    response = requests.get(geocode_url)
    data = response.json()

    if len(data) == 0:
        print("City not found!")
        return None, None
    
    lat = data[0]['lat']
    lon = data[0]['lon']
    return lat, lon

# Example
lat, lon = get_coordinates("Delhi")
print("Latitude:", lat, "Longitude:", lon)


Latitude: 28.6517178 Longitude: 77.2219388


In [None]:
import requests

API_KEY = "30dc3ec276c49ac523473c3cfb2848e0"   # Replace with your key

def get_data(city_name):
    geocode_url = f"http://api.openweathermap.org/geo/1.0/direct?q={city_name}&limit=1&appid={API_KEY}"

    response = requests.get(geocode_url)
    data = response.json()

    if len(data) == 0:
        print("City not found!")
        return None, None
    
    lat = data[0]['lat']
    lon = data[0]['lon']
    return lat, lon

# Example
lat, lon = get_data("Delhi")
print("Latitude:", lat, "Longitude:", lon)

geocode_url = f"https://api.openweathermap.org/data/3.0/onecall?lat={lat}&lon={lon}&appid={API_KEY}"

response = requests.get(geocode_url)
data = response.json()

if len(data) == 0:
    print("City not found!")

else:
    print(data)


Latitude: 28.6517178 Longitude: 77.2219388
{'cod': 401, 'message': 'Please note that using One Call 3.0 requires a separate subscription to the One Call by Call plan. Learn more here https://openweathermap.org/price. If you have a valid subscription to the One Call by Call plan, but still receive this error, then please see https://openweathermap.org/faq#error401 for more info.'}


In [85]:
import requests

API_KEY = "30dc3ec276c49ac523473c3cfb2848e0"

def get_data(city_name):
    geocode_url = f"http://api.openweathermap.org/geo/1.0/direct?q={city_name}&limit=1&appid={API_KEY}"

    response = requests.get(geocode_url)
    data = response.json()

    if len(data) == 0:
        print("City not found!")
        return None, None
    
    lat = data[0]['lat']
    lon = data[0]['lon']
    return lat, lon

# Get coordinates
lat, lon = get_data("Delhi")
print("Latitude:", lat, "Longitude:", lon)

# FREE 5-day / 3-hour forecast API
forecast_url = f"https://api.openweathermap.org/data/2.5/forecast?lat={lat}&lon={lon}&units=metric&appid={API_KEY}"

response = requests.get(forecast_url)
data = response.json()

# Check response
if "list" not in data:
    print("Error:", data)
else:
    print(data)
    # # Print first 5 forecast entries
    # for f in data["list"][:5]:
    #     print(
    #         f['dt_txt'],
    #         "Temp:", f['main']['temp'],
    #         "Humidity:", f['main']['humidity'],
    #         "Weather:", f['weather'][0]['description']
    #     )


Latitude: 28.6517178 Longitude: 77.2219388
{'cod': '200', 'message': 0, 'cnt': 40, 'list': [{'dt': 1763629200, 'main': {'temp': 25.06, 'feels_like': 24.77, 'temp_min': 25.06, 'temp_max': 26.86, 'pressure': 1016, 'sea_level': 1016, 'grnd_level': 990, 'humidity': 44, 'temp_kf': -1.8}, 'weather': [{'id': 800, 'main': 'Clear', 'description': 'clear sky', 'icon': '01d'}], 'clouds': {'all': 0}, 'wind': {'speed': 0.5, 'deg': 217, 'gust': 0.82}, 'visibility': 10000, 'pop': 0, 'sys': {'pod': 'd'}, 'dt_txt': '2025-11-20 09:00:00'}, {'dt': 1763640000, 'main': {'temp': 25.44, 'feels_like': 24.95, 'temp_min': 25.44, 'temp_max': 26.2, 'pressure': 1016, 'sea_level': 1016, 'grnd_level': 990, 'humidity': 35, 'temp_kf': -0.76}, 'weather': [{'id': 800, 'main': 'Clear', 'description': 'clear sky', 'icon': '01n'}], 'clouds': {'all': 0}, 'wind': {'speed': 0.17, 'deg': 301, 'gust': 0.81}, 'visibility': 10000, 'pop': 0, 'sys': {'pod': 'n'}, 'dt_txt': '2025-11-20 12:00:00'}, {'dt': 1763650800, 'main': {'temp':

In [None]:
# https://api.openweathermap.org/data/3.0/onecall?lat=33.44&lon=-94.04&appid={API key}

In [None]:
import requests
import pandas as pd

API_KEY = "30dc3ec276c49ac523473c3cfb2848e0"

# STEP 1: Get coordinates
def get_lat_lon(city):
    url = f"http://api.openweathermap.org/geo/1.0/direct?q={city}&limit=1&appid={API_KEY}"
    res = requests.get(url).json()
    return res[0]['lat'], res[0]['lon']

# STEP 2: Fetch forecast (FREE API)
def get_forecast(lat, lon):
    url = f"https://api.openweathermap.org/data/2.5/forecast?lat={lat}&lon={lon}&units=metric&appid={API_KEY}"
    return requests.get(url).json()

# STEP 3: Convert API ‚Üí ML model input features
def api_to_model_input(entry):
    temp = entry['main']['temp']
    humidity = entry['main']['humidity']
    pressure = entry['main']['pressure']
    cloud = entry['clouds']['all']
    windspeed = entry['wind']['speed']
    winddirection = entry['wind']['deg']

    # Calculate dewpoint
    dewpoint = temp - ((100 - humidity) / 5)

    # Estimate sunshine
    if cloud < 20:
        sunshine = 10
    elif cloud < 60:
        sunshine = 5
    else:
        sunshine = 1

    return [pressure, dewpoint, humidity, cloud, sunshine, winddirection, windspeed]


# =====================
# USE THE FUNCTIONS
# =====================

lat, lon = get_lat_lon("Delhi")
forecast = get_forecast(lat, lon)

# Take the **first forecast slot**
first_entry = forecast['list'][0]

# Convert to ML model features
input_data = api_to_model_input(first_entry)

# Create DataFrame
input_df = pd.DataFrame([input_data], 
    columns=['pressure', 'dewpoint', 'humidity', 'cloud', 'sunshine', 'winddirection', 'windspeed'])

print(input_df)

   pressure  dewpoint  humidity  cloud  sunshine  winddirection  windspeed
0      1016     14.26        41      0        10            217        0.5


In [None]:
model.predict(input_df)

NameError: name 'model' is not defined

**To Try:**
1. SMOTE for class balancing
2. PCA for dimensionality reduction
3. Simpler models like Logistic Regression (with Feature scaling)
4. Model Selection with hyperparameter tuning

Below is a **complete Streamlit project** you can run locally for **Rainfall Prediction** using your uploaded dataset (`Rainfall.csv`).
I‚Äôm giving you **full project structure + code for each file** so you can copy-paste and run immediately.

---

# ‚úÖ **üåßÔ∏è Rainfall Prediction Streamlit Project**

---

# üìÅ **Project Structure**

```
rainfall_prediction_app/
‚îÇ
‚îú‚îÄ‚îÄ app.py
‚îú‚îÄ‚îÄ model.py
‚îú‚îÄ‚îÄ requirements.txt
‚îî‚îÄ‚îÄ Rainfall.csv   ‚Üê your dataset
```

---

# üîÆ **1. model.py ‚Äî Train & Save Model**

```python
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import joblib

def train_model():

    df = pd.read_csv("Rainfall.csv")

    # --- BASIC CLEANING (customize based on your dataset) ---
    df = df.dropna()

    # Assuming "Rainfall" is the target variable
    X = df.drop("Rainfall", axis=1)
    y = df["Rainfall"]

    # Convert categorical columns if present
    X = pd.get_dummies(X, drop_first=True)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    model = RandomForestRegressor()
    model.fit(X_train, y_train)

    joblib.dump((model, X.columns), "rainfall_model.pkl")

    return "Model trained & saved as rainfall_model.pkl"

if __name__ == "__main__":
    print(train_model())
```

---

# üé® **2. app.py ‚Äî Streamlit UI for Predictions**

```python
import streamlit as st
import pandas as pd
import joblib

st.title("üåßÔ∏è Rainfall Prediction App")
st.write("Upload input values and get rainfall predictions!")

# Load model
model, feature_names = joblib.load("rainfall_model.pkl")

# Sidebar Input Form
st.sidebar.header("Input Features")

input_data = {}

for feature in feature_names:
    input_data[feature] = st.sidebar.number_input(f"Enter {feature}", value=0.0)

# Convert to DataFrame
input_df = pd.DataFrame([input_data])

# Prediction
if st.button("Predict Rainfall"):
    prediction = model.predict(input_df)[0]
    st.success(f"üåßÔ∏è Predicted Rainfall: **{prediction:.2f} mm**")
```

---

# üì¶ **3. requirements.txt**

```
streamlit
pandas
scikit-learn
joblib
```

---

# ‚ñ∂Ô∏è **How to Run the App Locally**

### **Step 1: Install dependencies**

```
pip install -r requirements.txt
```

### **Step 2: Train the model**

```
python model.py
```

### **Step 3: Run Streamlit**

```
streamlit run app.py
```

---

# üéâ Your Rainfall Prediction App is Ready!

If you want:

‚úÖ Add graphs
‚úÖ Add EDA dashboard
‚úÖ Add model selection (RandomForest/XGBoost/Linear Regression)
‚úÖ Add UI styling
‚úÖ Deploy to Streamlit Cloud or Hugging Face

Just tell me ‚Äî I can upgrade this into a full production-level project.


Great! Below is the **fully upgraded Streamlit Rainfall Prediction Project** with:

‚úÖ **Model Selection**
‚ÄÉ‚Äî Random Forest
‚ÄÉ‚Äî XGBoost
‚ÄÉ‚Äî Linear Regression

‚úÖ **EDA Dashboard**
‚ÄÉ‚Äî Dataset preview
‚ÄÉ‚Äî Summary statistics
‚ÄÉ‚Äî Missing values
‚ÄÉ‚Äî Correlation heatmap
‚ÄÉ‚Äî Histograms & boxplots

‚úÖ **Graphs**
‚ÄÉ‚Äî Prediction vs Actual
‚ÄÉ‚Äî Feature importance

---

# üìÅ **UPDATED PROJECT STRUCTURE**

```
rainfall_prediction_app/
‚îÇ
‚îú‚îÄ‚îÄ app.py
‚îú‚îÄ‚îÄ train_model.py
‚îú‚îÄ‚îÄ requirements.txt
‚îî‚îÄ‚îÄ Rainfall.csv
```

---

# üìå **1. train_model.py (Full Model Training + Saving)**

```python
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
import joblib

def train_all_models():

    df = pd.read_csv("Rainfall.csv")
    df = df.dropna()

    y = df["Rainfall"]
    X = df.drop("Rainfall", axis=1)

    # One-Hot Encode categorical columns
    X = pd.get_dummies(X, drop_first=True)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    models = {
        "RandomForest": RandomForestRegressor(),
        "LinearRegression": LinearRegression(),
        "XGBoost": XGBRegressor(objective='reg:squarederror')
    }

    # Train & save each model
    for model_name, model in models.items():
        model.fit(X_train, y_train)
        joblib.dump((model, X.columns), f"{model_name}.pkl")

    return "All models trained & saved!"

if __name__ == "__main__":
    print(train_all_models())
```

---

# üìå **2. app.py (Full Streamlit App: EDA + Model Selection + Prediction + Graphs)**

```python
import streamlit as st
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import joblib

# -------------------------------------------
# App Title
# -------------------------------------------
st.title("üåßÔ∏è Rainfall Prediction System")
st.write("Select model, explore EDA, and predict rainfall.")

# -------------------------------------------
# Load Dataset
# -------------------------------------------
df = pd.read_csv("Rainfall.csv")

# -------------------------------------------
# Sidebar Options
# -------------------------------------------
st.sidebar.header("Navigation")
options = st.sidebar.radio(
    "Go to",
    ["EDA Dashboard", "Prediction"]
)

# -------------------------------------------
# EDA Dashboard
# -------------------------------------------
if options == "EDA Dashboard":
    st.subheader("üìä Exploratory Data Analysis")

    st.write("### ‚û§ Dataset Preview")
    st.dataframe(df.head())

    st.write("### ‚û§ Summary Statistics")
    st.dataframe(df.describe())

    st.write("### ‚û§ Missing Values")
    st.dataframe(df.isnull().sum())

    st.write("### ‚û§ Correlation Heatmap")
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(df.corr(), annot=True, cmap="coolwarm", ax=ax)
    st.pyplot(fig)

    st.write("### ‚û§ Feature Distributions")
    for col in df.select_dtypes(include=np.number).columns:
        fig, ax = plt.subplots()
        sns.histplot(df[col], kde=True, ax=ax)
        st.pyplot(fig)

    st.write("### ‚û§ Boxplots")
    for col in df.select_dtypes(include=np.number).columns:
        fig, ax = plt.subplots()
        sns.boxplot(x=df[col], ax=ax)
        st.pyplot(fig)

# -------------------------------------------
# Prediction Page
# -------------------------------------------
elif options == "Prediction":

    st.subheader("üîÆ Predict Rainfall")

    # Model Selection
    model_name = st.selectbox(
        "Choose Model",
        ["RandomForest", "LinearRegression", "XGBoost"]
    )

    # Load selected model
    model, feature_names = joblib.load(f"{model_name}.pkl")

    st.write(f"### ‚úî Loaded model: **{model_name}**")

    # Input Fields
    input_data = {}

    st.write("### üîß Enter Feature Values")

    for feature in feature_names:
        value = st.number_input(f"{feature}", value=0.0)
        input_data[feature] = value

    input_df = pd.DataFrame([input_data])

    # Predict
    if st.button("Predict Rainfall"):
        prediction = model.predict(input_df)[0]
        st.success(f"üåßÔ∏è Predicted Rainfall: **{prediction:.2f} mm**")

    # Feature Importance - only for RF & XGB
    if st.checkbox("Show Feature Importance (RF & XGB only)"):
        if model_name in ["RandomForest", "XGBoost"]:
            importances = model.feature_importances_
            fig, ax = plt.subplots(figsize=(8, 6))
            sns.barplot(x=feature_names, y=importances)
            plt.xticks(rotation=90)
            st.pyplot(fig)
        else:
            st.info("Feature importance not available for Linear Regression.")
```

---

# üìå **3. requirements.txt**

```
streamlit
pandas
scikit-learn
xgboost
joblib
seaborn
matplotlib
```

---

# ‚ñ∂Ô∏è **How to Run the App**

### **Step 1 ‚Äî Install packages**

```
pip install -r requirements.txt
```

### **Step 2 ‚Äî Train all models**

```
python train_model.py
```

### **Step 3 ‚Äî Start Streamlit app**

```
streamlit run app.py
```

---

# üéâ DONE!

Your app now includes:

### ‚úÖ Full EDA Dashboard

### ‚úÖ Interactive visualizations

### ‚úÖ Model selection

### ‚úÖ Feature importance

### ‚úÖ Prediction interface

### ‚úÖ XGBoost + RandomForest + Linear Regression

---

If you want:

üöÄ Add model comparison
üåê Deploy on Streamlit Cloud
üìä Add time-series graphs
üìÅ Add file upload option
üéõ Advanced preprocessing (scaling, encoding)

Just tell me ‚Äî I can upgrade!
