In [None]:
import pandas as pd

# Read the employee records from the CSV file in the working directory.
data = pd.read_csv('employee_data.csv')

# Sanity check of the load: the first rows, then the (rows, columns) shape.
for summary in (data.head(), data.shape):
    print(summary)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Structural overview of the frame. DataFrame.info() writes its report to
# stdout itself and returns None, so it is called directly; wrapping it in
# print() would append a stray "None" line to the output.
data.info()
print(data.describe())
print(data.columns)

# Visualize target balance (example: 'promoted'): one bar per class, so a
# skewed target is visible before any model is trained.
sns.countplot(x='promoted', data=data)
plt.title("Target Class Distribution")
plt.show()


In [None]:
# Missing-value count per column, reported before any cleaning happens.
print(data.isnull().sum())

# Step 1: impute the columns that have a sensible fill value.
# This has to run BEFORE rows are dropped; otherwise every row with a gap is
# already gone and the imputation never changes anything.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on `data[col]`: that form acts on an intermediate
# object and does not update the frame under pandas copy-on-write.
data['training_score'] = data['training_score'].fillna(data['training_score'].mean())
data['department'] = data['department'].fillna(data['department'].mode()[0])

# Step 2: drop the rows that still have a missing value in any other column.
data.dropna(inplace=True)


In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace each 'education_level' label with an integer code.
# NOTE(review): LabelEncoder is documented for target labels; confirm that an
# integer coding is appropriate for this feature.
label_enc = LabelEncoder()
education_codes = label_enc.fit_transform(data['education_level'])
data['education_level'] = education_codes

# Expand the multi-class columns into indicator columns. drop_first=True
# leaves out the first level of each column.
nominal_columns = ['department', 'job_role']
data = pd.get_dummies(data, columns=nominal_columns, drop_first=True)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
num_features = ['salary', 'training_score', 'experience']

# Learn the mean / standard deviation from the training rows only, so that
# statistics of the held-out rows do not leak into the features the models
# are trained on.
# The partition is reproduced here with the same test_size and random_state
# that the train/test split cell uses on X / y. For a given row count those
# settings select the same rows, so KEEP THE TWO CALLS IN SYNC.
train_rows = train_test_split(data, test_size=0.2, random_state=42)[0]
scaler.fit(train_rows[num_features])

# Apply the training-set scaling to every row (train and test alike).
data[num_features] = scaler.transform(data[num_features])


In [None]:
from sklearn.model_selection import train_test_split

# Separate the label column from the predictor columns.
y = data['promoted']  # Replace 'promoted' with your actual target
X = data.drop(columns='promoted')

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
from sklearn.linear_model import LogisticRegression

# fit() hands back the estimator itself, so construction and training are
# chained; the trained model then labels the held-out rows.
log_model = LogisticRegression().fit(X_train, y_train)
y_pred_log = log_model.predict(X_test)


In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Both estimators draw random numbers while training. random_state is pinned
# (to the same seed used for the train/test split) so that a fresh
# "Restart & Run All" reproduces the same models and the same metrics.

# Decision Tree
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)
y_pred_dt = dt_model.predict(X_test)

# Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)


In [None]:
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# Support vector classifier with the library defaults: train, then predict
# the held-out rows.
svm_model = SVC().fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)

# k-nearest-neighbours classifier voting over the 5 closest training rows.
knn_model = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
y_pred_knn = knn_model.predict(X_test)


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Display name -> test-set predictions of that classifier. Despite the
# variable name, the values are prediction arrays, not fitted estimators.
models = dict([
    ("Logistic Regression", y_pred_log),
    ("Decision Tree", y_pred_dt),
    ("Random Forest", y_pred_rf),
    ("SVM", y_pred_svm),
    ("KNN", y_pred_knn),
])

# One report per classifier, each metric computed against y_test and shown
# with two decimals, followed by a separator line.
for name in models:
    y_pred = models[name]
    print(f"\n{name} Evaluation:")
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
    print(f"Precision: {precision_score(y_test, y_pred):.2f}")
    print(f"Recall: {recall_score(y_test, y_pred):.2f}")
    print(f"F1 Score: {f1_score(y_test, y_pred):.2f}")
    print("-" * 40)


In [None]:
from sklearn.model_selection import GridSearchCV

# Random Forest example
# Candidate values for the exhaustive search: 3 x 3 x 2 = 18 combinations.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5]
}

# The base forest is seeded so that every candidate is trained
# deterministically; without it, the winning parameters and the saved
# estimator can change from one run to the next.
# Candidates are ranked by F1 over 3-fold cross-validation on the training
# data only, using all available cores (n_jobs=-1).
grid_rf = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=3,
    scoring='f1',
    n_jobs=-1,
)
grid_rf.fit(X_train, y_train)

print("Best Params:", grid_rf.best_params_)
# best_estimator_ is the winning configuration refitted on all of X_train.
best_rf_model = grid_rf.best_estimator_


In [None]:
from sklearn.feature_selection import SelectKBest, f_classif

# Keep the 10 highest-scoring features, but never request more features than
# the matrix actually has.
k_best = min(10, X_train.shape[1])
selector = SelectKBest(score_func=f_classif, k=k_best)

# Score the features on the training rows only: fitting on the full X / y
# would let the held-out labels influence which columns are kept.
# The existing X_train / X_test partition is reused, so y_train and y_test
# stay valid and are not re-bound by a second split.
X_train_sel = selector.fit_transform(X_train, y_train)
X_test_sel = selector.transform(X_test)

# Full matrix reduced to the same columns, kept for any code that still
# expects X_selected.
X_selected = selector.transform(X)


In [None]:
import joblib

# Write the tuned random forest (best_rf_model) to disk in the working
# directory.
joblib.dump(value=best_rf_model, filename='best_model.pkl')

# Restore it in a later session (only load files you trust; unpickling can
# execute arbitrary code):
# loaded_model = joblib.load('best_model.pkl')
