In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
# Read the salary dataset, then preview its first rows together with its
# (rows, columns) shape.
data = pd.read_csv('Salary Data.csv')
display(data.head(), data.shape)

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary
0,32.0,Male,Bachelor's,Software Engineer,5.0,90000.0
1,28.0,Female,Master's,Data Analyst,3.0,65000.0
2,45.0,Male,PhD,Senior Manager,15.0,150000.0
3,36.0,Female,Bachelor's,Sales Associate,7.0,60000.0
4,52.0,Male,Master's,Director,20.0,200000.0


(375, 6)

In [3]:
# --- Clean the raw frame -----------------------------------------------------
# Each step rebinds `data` to a new frame instead of mutating it in place, so
# the result of every step is explicit.
data = data.rename(columns={'Education Level': 'Qualification'})

print("Missing values before dropping:")
print(data.isnull().sum())
data = data.dropna()
print("\nMissing values after dropping:")
print(data.isnull().sum())
print("\nShape after dropping missing values:")
print(data.shape)

print("\nDuplicate rows before dropping:")
print(data.duplicated().sum())
data = data.drop_duplicates()
print("\nDuplicate rows after dropping:")
print(data.duplicated().sum())
print("\nShape after dropping duplicates:")
print(data.shape)

# --- Encode the categorical columns ------------------------------------------
# One LabelEncoder per column. A single shared encoder is refitted by every
# `fit_transform` call, so only the mapping of the last column would remain
# available afterwards (`classes_`, `inverse_transform`).
categorical_columns = ['Gender', 'Qualification', 'Job Title']
encoders = {column: LabelEncoder() for column in categorical_columns}
for column in categorical_columns:
    data[column] = encoders[column].fit_transform(data[column])

# Backward-compatible alias: `encoder` previously ended up fitted on 'Job Title'.
encoder = encoders['Job Title']

display(data.head())

Missing values before dropping:
Age                    2
Gender                 2
Qualification          2
Job Title              2
Years of Experience    2
Salary                 2
dtype: int64

Missing values after dropping:
Age                    0
Gender                 0
Qualification          0
Job Title              0
Years of Experience    0
Salary                 0
dtype: int64

Shape after dropping missing values:
(373, 6)

Duplicate rows before dropping:
49

Duplicate rows after dropping:
0

Shape after dropping duplicates:
(324, 6)


Unnamed: 0,Age,Gender,Qualification,Job Title,Years of Experience,Salary
0,32.0,1,0,159,5.0,90000.0
1,28.0,0,1,17,3.0,65000.0
2,45.0,1,2,130,15.0,150000.0
3,36.0,0,0,101,7.0,60000.0
4,52.0,1,1,22,20.0,200000.0


In [4]:
# Separate the target column from the feature columns.
y = data['Salary']
X = data.drop(columns=['Salary'])

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Report the dimensions of every partition.
for split_name, split in [("X_train", X_train), ("X_test", X_test),
                          ("y_train", y_train), ("y_test", y_test)]:
    print(f"Shape of {split_name}:", split.shape)

Shape of X_train: (259, 5)
Shape of X_test: (65, 5)
Shape of y_train: (259,)
Shape of y_test: (65,)


In [5]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression

# Candidate regressors; each one is registered under its own class name.
estimators = [
    LinearRegression(),
    RandomForestRegressor(random_state=42),
    GradientBoostingRegressor(random_state=42),
]
models = {type(estimator).__name__: estimator for estimator in estimators}

# Fit every candidate on the training partition.
for name in models:
    model = models[name]
    model.fit(X_train, y_train)
    print("{} trained successfully.".format(name))

LinearRegression trained successfully.
RandomForestRegressor trained successfully.
GradientBoostingRegressor trained successfully.


In [6]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Maps model name -> {'MAE': ..., 'MSE': ...} measured on the test partition.
results = {}

for name in models:
    model = models[name]
    y_pred = model.predict(X_test)
    # Score the predictions with both error metrics in one pass.
    mae, mse = (metric(y_test, y_pred) for metric in (mean_absolute_error, mean_squared_error))
    results[name] = dict(MAE=mae, MSE=mse)
    print("{}: MAE = {:.4f}, MSE = {:.4f}".format(name, mae, mse))

LinearRegression: MAE = 10570.7864, MSE = 205754135.7173
RandomForestRegressor: MAE = 9317.2308, MSE = 160898749.1438
GradientBoostingRegressor: MAE = 9534.1216, MSE = 156350792.5000


In [7]:
import joblib

# Rank the evaluated models by their test-set MAE; the smallest error wins
# (on a tie, the model that was registered first is kept).
best_model_name_mae = min(results.items(), key=lambda item: item[1]['MAE'])[0]
best_model_mae = models[best_model_name_mae]

# Persist the winning estimator to disk.
joblib.dump(best_model_mae, "model.pkl")

print("Best model based on MAE: {}".format(best_model_name_mae))
print("\nBest model saved as model.pkl")

Best model based on MAE: RandomForestRegressor

Best model saved as model.pkl


In [8]:
!pip install streamlit

Collecting streamlit
  Downloading streamlit-1.47.0-py3-none-any.whl.metadata (9.0 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.47.0-py3-none-any.whl (9.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m68.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m92.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl (79 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hInst

In [42]:
import streamlit as st
import pandas as pd
import joblib

# Save model_columns
joblib.dump(X_train.columns.tolist(), "model_columns.pkl")

# Save test_predictions.
# The predictions must come from the model that was saved to model.pkl. The
# `y_pred` variable left over from the evaluation loop belongs to whichever
# model happened to be evaluated last, which is not necessarily the best one.
best_model_test_pred = best_model_mae.predict(X_test)
joblib.dump((X_test, y_test, best_model_test_pred), "test_predictions.pkl")

# Define the Streamlit application code
app_code = """
import streamlit as st
import pandas as pd
import joblib
import matplotlib.pyplot as plt

# Load assets
model = joblib.load("model.pkl")
model_columns = joblib.load("model_columns.pkl")
X_test, y_test, y_pred = joblib.load("test_predictions.pkl")
original_data = pd.read_csv("Salary Data.csv")
original_data.dropna(inplace=True)

# Category -> integer code tables.
# The notebook encoded these columns with scikit-learn's LabelEncoder, which
# numbers the classes in sorted label order. The same ordering is rebuilt here
# from the same NaN-free data so the app feeds the model the codes it was
# trained on (order of first appearance would give different codes).
CATEGORY_CLASSES = {
    "Gender": sorted(original_data["Gender"].unique()),
    "Qualification": sorted(original_data["Education Level"].unique()),
    "Job Title": sorted(original_data["Job Title"].unique()),
}

# --- Streamlit UI ---

st.set_page_config(page_title="Salary Predictor", page_icon="💸", layout="centered")
st.markdown("<h1 style='text-align: center; color: #FF4B4B;'>Employee Salary Predictor</h1>", unsafe_allow_html=True)
st.markdown("<p style='text-align: center; color: gray;'>Predict your salary with AI.</p><hr>", unsafe_allow_html=True)


st.markdown(\"""
    <style>
     body {
        background-color: #f4f6f9;
    }
    .main {
        border-radius: 15px;
        padding: 2rem;
        box-shadow: 0 10px 30px rgba(0,0,0,0.1);
    }
    .css-1aumxhk {
        font-size: 1.1rem;
    }
    </style>
\""", unsafe_allow_html=True)

st.markdown("<h2 style='text-align: center;'>🧾 Enter Employee Details</h2>", unsafe_allow_html=True)


with st.form("predict_form"):
    col1, col2 = st.columns(2)

    with col1:
        age = st.slider("🎂 Age", 18, 65, 30)
        gender = st.selectbox("👤 Gender", ["Male", "Female"])
        education = st.selectbox("🎓 Qualification", original_data['Education Level'].unique())

    with col2:
        job = st.selectbox("💼 Job Role", original_data['Job Title'].unique())
        exp = st.slider("⏳ Years of Experience", 0, 40, 3)

    submitted = st.form_submit_button("✨ Predict Salary")

    if submitted:
        # Construct input data
        input_dict = {col: 0 for col in model_columns}
        input_dict["Age"] = age
        input_dict["Years of Experience"] = exp

        # Encoding inputs with the same codes the model saw during training
        gender_encoded = CATEGORY_CLASSES["Gender"].index(gender)
        qualification_encoded = CATEGORY_CLASSES["Qualification"].index(education)
        job_title_encoded = CATEGORY_CLASSES["Job Title"].index(job)

        input_dict["Gender"] = gender_encoded
        input_dict["Qualification"] = qualification_encoded
        input_dict["Job Title"] = job_title_encoded

        X_input = pd.DataFrame([input_dict], columns=model_columns)

        # Validations
        # NOTE(review): `exp > (age - 20)` rejects every input with age 18 or 19,
        # although the age slider allows them -- confirm this is intended.
        if exp >= age or exp > (age - 20) or age < (exp + 18):
            st.error("Invalid combination of age and experience!")
            st.stop()
        if education == "Master's" and age < 23:
            st.error("Age & Qualification doesn't match! Please adjust.")
            st.stop()
        if education == "PhD" and age < 26:
            st.error("Age & Qualification doesn't match! Please adjust.")
            st.stop()

        # Prediction
        salary = model.predict(X_input)[0]
        st.success(f"🎉 Great news! Your predicted monthly salary is : *₹ {salary:,.2f}*")
        st.info("💡 Keep in mind, This prediction is based on available data and does not guarantee actual compensation.")

        # Save the report in session state
        report_text = f'''
        📄 Salary Prediction Report
        ----------------------------
        👤 Gender: {gender}
        🎂 Age: {age}
        🎓 Qualification: {education}
        💼 Job Role: {job}
        ⏳ Experience: {exp} years

        💸 Predicted Monthly Salary: ₹ {salary:,.2f}

        Disclaimer:
        This prediction is based on available data and does not guarantee actual compensation.
        '''
        st.session_state['report_text'] = report_text
        st.session_state['show_download'] = True


# Display download button after form is submitted
if st.session_state.get('show_download'):
    st.download_button(
        label="📥 Download Report",
        data=st.session_state['report_text'],
        file_name="salary_prediction_report.txt",
        mime="text/plain"
    )


# Feature importance
if hasattr(model, 'coef_'): # Check if model has coef_ attribute (for linear models)
    with st.expander("🧠 Feature Importance"):
        coef_df = pd.DataFrame({"Feature": model_columns, "Coefficient": model.coef_})
        coef_df = coef_df.sort_values(by="Coefficient", key=abs, ascending=False)
        st.bar_chart(coef_df.set_index("Feature"))
elif hasattr(model, 'feature_importances_'): # Check if model has feature_importances_ attribute (for tree-based models)
     with st.expander("🧠 Feature Importance"):
        importance_df = pd.DataFrame({"Feature": model_columns, "Importance": model.feature_importances_})
        importance_df = importance_df.sort_values(by="Importance", ascending=False)
        st.bar_chart(importance_df.set_index("Feature"))


# Scatter plot
with st.expander("📈 Scatter Plot: Actual vs Predicted Salary"):
    fig, ax = plt.subplots()
    ax.scatter(y_test, y_pred, alpha=0.5, color='blue')
    ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], color='red', linestyle='--')
    ax.set_xlabel("Actual Salary")
    ax.set_ylabel("Predicted Salary")
    ax.set_title("Actual vs Predicted Salary")
    st.pyplot(fig)
"""

# The app source contains emoji and the rupee sign, so write it as UTF-8
# explicitly instead of relying on the platform's default encoding.
with open("app.py", "w", encoding="utf-8") as f:
    f.write(app_code)

print("app.py created successfully.")

app.py created successfully.


In [15]:
!pip install ngrok




In [19]:
!ngrok authtoken 306b3AXFLq21KM8RtHVecDV32xb_7m9NGqPs15rrrtGjjdAGr

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [43]:
!pip install streamlit pyngrok --quiet

from pyngrok import ngrok

# Start the tunnel (v3 compatible)
public_url = ngrok.connect("http://localhost:8501")
print("Streamlit app running at:", public_url)

# Start Streamlit app
!streamlit run app.py &


Streamlit app running at: NgrokTunnel: "https://f500a98258c1.ngrok-free.app" -> "http://localhost:8501"

Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://104.154.104.90:8501[0m
[0m
[34m  Stopping...[0m
