## Cell 1: Salary Prediction Project Introduction

In [None]:
# Salary Prediction Project: EDA and Model Building
# NOTE(review): the introduction below is a bare string expression inside a code
# cell, not a comment; consider moving it to a markdown cell so it renders as prose.

'''This notebook covers exploratory data analysis, feature engineering, model training, and evaluation for predicting employee salary (CTC) based on profile and experience.'''


## Cell 2: Import Required Libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
import os

## Cell 3: Load Dataset

In [None]:
# Load data
# `os` and `pd` come from the notebook's import cell; the duplicate in-cell
# `import os` was removed so all dependencies stay declared in one place.
#
# The CSV is resolved relative to the *parent* of the current working directory,
# i.e. <parent of cwd>/data/expected_ctc.csv.
# NOTE(review): this presumes the kernel is started from a sub-folder of the
# project root (e.g. a notebooks/ directory) — confirm against the repo layout.
BASE_DIR = os.path.dirname(os.getcwd())
DATA_PATH = os.path.join(BASE_DIR, 'data', 'expected_ctc.csv')

# Fail fast with an actionable message: a wrong working directory is the most
# likely cause of a missing file, and the bare read_csv error does not say so.
if not os.path.isfile(DATA_PATH):
    raise FileNotFoundError(
        f"Dataset not found at {DATA_PATH!r}. The path is built from the parent "
        f"of the current working directory ({os.getcwd()!r}); start the kernel "
        "from a sub-folder of the project root or adjust BASE_DIR."
    )

df = pd.read_csv(DATA_PATH)
# Preview the first rows as the cell's rich output.
df.head()

## Cell 4: Data Overview and Missing Value Check

In [None]:
# Data overview: dimensions, a short preview, and per-column missing counts.
shape_summary = df.shape
sample_rows = df.head()
missing_per_column = df.isna().sum()

print('Data Shape:', shape_summary)
# Each header and its table are separated by a newline, so the printed text
# matches separate print() calls for the header and the table.
print('\nData Sample:', sample_rows, sep='\n')
print('\nMissing Values:', missing_per_column, sep='\n')

## Cell 5: Visualize Expected CTC Distribution

In [None]:
# Histogram of the target column (30 bins) with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['Expected_CTC'], bins=30, kde=True, ax=ax)

# Explicit labels so the figure stands alone when the notebook is skimmed.
ax.set_title('Distribution of Expected CTC')
ax.set_xlabel('Expected CTC')
ax.set_ylabel('Frequency')
plt.show()

## Cell 6: Correlation Heatmap for Numerical Features

In [None]:
# Numeric columns (including the target) whose pairwise correlations are shown.
num_features = [
    'Total_Experience',
    'No_Of_Companies_worked',
    'Current_CTC',
    'Expected_CTC',
]

# Correlation matrix computed with DataFrame.corr() defaults.
corr_matrix = df[num_features].corr()

fig, ax = plt.subplots(figsize=(8, 6))
# annot=True writes each coefficient inside its heatmap cell.
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()

## Cell 7: Feature Selection and Train-Test Split

In [None]:
# Feature selection and train-test split
# Predictor columns used by the model, followed by the column to predict.
features = [
    'Industry', 'Department', 'Role', 'Education',
    'Total_Experience', 'No_Of_Companies_worked', 'Current_CTC',
]
target = 'Expected_CTC'

# Design matrix and response taken straight from the loaded frame.
X, y = df[features], df[target]

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical across re-runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Cell 8: Preprocessing and Pipeline Setup

In [None]:
# Column groups: each group is routed to its own transformer below.
categorical_features = ['Industry', 'Department', 'Role', 'Education']
numerical_features = ['Total_Experience', 'No_Of_Companies_worked', 'Current_CTC']

# ColumnTransformer, Pipeline and LinearRegression are provided by the notebook's
# import cell (as are OneHotEncoder and StandardScaler); the duplicate imports
# that used to sit here were removed so dependencies are declared in one place.

# Preprocessing:
#   - 'cat': one-hot encode the categorical columns; handle_unknown='ignore'
#     keeps transform from raising on categories that were not seen during fit.
#   - 'num': standardize the numeric columns with StandardScaler.
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ('num', StandardScaler(), numerical_features)
])

# Full model: preprocessing followed by an ordinary linear regression, so the
# transformers are fitted only on whatever data is passed to model.fit().
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

## Cell 9: Model Training and Evaluation

In [None]:
# Model training and evaluation
# Fit the whole pipeline (preprocessing + regressor) on the training split only.
model.fit(X=X_train, y=y_train)
print("Model training complete.")

# Score the fitted pipeline on the held-out split; the value is reported as R^2.
score = model.score(X=X_test, y=y_test)
print(f"Test R^2 Score: {score:.4f}")

## Summary and Next Steps

- Explored and visualized the salary dataset
- Built and evaluated a regression model for salary prediction
- Next: Tune model, try other algorithms, or deploy for HR use