In [None]:
# Exploratory Data Analysis (EDA) and Model Building for Salary Prediction

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
import joblib
import os

# Load data
# The project root is the parent of the directory that contains this script.
# The script path is made absolute first: a relative __file__ (e.g. "eda.py")
# would otherwise reduce BASE_DIR to an empty string and resolve 'data/'
# against the current working directory instead of the project root.
try:
    _script_dir = os.path.dirname(os.path.abspath(__file__))
except NameError:
    # __file__ is not defined in notebook / interactive sessions.
    # NOTE(review): falls back to the working directory, assuming the session
    # was started from the folder that holds this script — confirm.
    _script_dir = os.getcwd()
BASE_DIR = os.path.dirname(_script_dir)
DATA_PATH = os.path.join(BASE_DIR, 'data', 'expected_ctc.csv')
df = pd.read_csv(DATA_PATH)

# EDA
# Textual overview of the frame: its dimensions, the first rows, and the
# number of null entries in each column.
print("Data Shape:", df.shape)
print("\nData Sample:", df.head(), sep="\n")
print("\nMissing Values:", df.isna().sum(), sep="\n")

# Visualize distributions
# Histogram (30 bins) of the Expected_CTC column with a KDE curve overlaid,
# drawn on an explicitly created 10x6 figure.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['Expected_CTC'], bins=30, kde=True, ax=ax)
ax.set_title('Distribution of Expected CTC')
ax.set_xlabel('Expected CTC')
ax.set_ylabel('Frequency')
plt.show()

# Correlation heatmap for numerical features
# Columns included in the pairwise correlation matrix (the target is part of
# the list so its correlation with each predictor is shown as well).
num_features = [
    'Total_Experience',
    'No_Of_Companies_worked',
    'Current_CTC',
    'Expected_CTC',
]
fig, ax = plt.subplots(figsize=(8, 6))
corr_matrix = df[num_features].corr()
# annot=True writes each coefficient into its cell.
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()

# Feature selection
# `target` is the column to predict; `features` lists the predictor columns
# fed to the model.
target = 'Expected_CTC'
features = [
    'Industry', 'Department', 'Role', 'Education',
    'Total_Experience', 'No_Of_Companies_worked', 'Current_CTC',
]

# Split data
# 80/20 train/test partition; the fixed random_state keeps the split
# reproducible between runs.
X, y = df[features], df[target]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Preprocessing
# Column groups routed to the two preprocessing branches below.
categorical_features = ['Industry', 'Department', 'Role', 'Education']
numerical_features = ['Total_Experience', 'No_Of_Companies_worked', 'Current_CTC']

# Categorical columns are one-hot encoded; handle_unknown='ignore' lets the
# encoder accept categories at predict time that were not seen during fit.
# Numerical columns are standardized.
# NOTE(review): there is no imputation step — confirm the data has no missing
# values in these columns, since LinearRegression does not accept NaN.
_cat_branch = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
_num_branch = ('num', StandardScaler(), numerical_features)
preprocessor = ColumnTransformer(transformers=[_cat_branch, _num_branch])

# Model pipeline
# Preprocessing and regression are chained so that a single fit/predict call
# applies both, and the saved artifact carries the fitted transformers.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
])

# Train model
model.fit(X_train, y_train)
print("Model training complete.")

# Evaluate on test set
# Pipeline.score delegates to the final regressor's score (R^2).
score = model.score(X_test, y_test)
print(f"Test R^2 Score: {score:.4f}")

# Save model
# Persist the whole fitted pipeline (preprocessing + regressor) under
# <BASE_DIR>/models/.
MODEL_PATH = os.path.join(BASE_DIR, 'models', 'model.joblib')
# joblib.dump does not create missing parent directories, so make sure the
# target folder exists; exist_ok=True keeps re-runs from failing.
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
joblib.dump(model, MODEL_PATH)
print(f"Model saved to {MODEL_PATH}")
