In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount recursively and print the full path of every file found.
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [3]:
!pip install vaderSentiment --quiet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import classification_report, roc_auc_score
import xgboost as xgb
import joblib
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [6]:
# Read the train and test CSV files from the Kaggle input mount into DataFrames.
train_df = pd.read_csv('/kaggle/input/employee-dataset/train_with_feedback.csv')
test_df = pd.read_csv('/kaggle/input/employee-dataset/test_with_feedback.csv')

In [7]:
# Preview the first rows of the training frame.
print(train_df.head())
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# must not be wrapped in print() (that would append a stray "None" line).
train_df.info()

   Employee ID  Age  Gender  Years at Company    Job Role  Monthly Income  \
0         8410   31    Male                19   Education            5390   
1        64756   59  Female                 4       Media            5534   
2        30257   24  Female                10  Healthcare            8159   
3        65791   36  Female                 7   Education            3989   
4        65026   56    Male                41   Education            4821   

  Work-Life Balance Job Satisfaction Performance Rating  Number of Promotions  \
0         Excellent           Medium            Average                     2   
1              Poor             High                Low                     3   
2              Good             High                Low                     0   
3              Good             High               High                     1   
4              Fair        Very High            Average                     0   

   ... Job Level  Company Size Company Tenure Remo

# Preprocessing & Feature Engineering

In [8]:
# Show all column names of the training frame as a plain list.
print(train_df.columns.tolist())

['Employee ID', 'Age', 'Gender', 'Years at Company', 'Job Role', 'Monthly Income', 'Work-Life Balance', 'Job Satisfaction', 'Performance Rating', 'Number of Promotions', 'Overtime', 'Distance from Home', 'Education Level', 'Marital Status', 'Number of Dependents', 'Job Level', 'Company Size', 'Company Tenure', 'Remote Work', 'Leadership Opportunities', 'Innovation Opportunities', 'Company Reputation', 'Employee Recognition', 'Attrition', 'Feedback']


In [9]:
# Fill missing values column by column. The fill value is always computed on the
# training frame (median for numeric columns, most frequent value otherwise) and
# then reused for the test frame, so no test statistics are used.
# NOTE(review): this loop also covers the 'Attrition' target and the free-text
# 'Feedback' column; imputing those with the mode is questionable - confirm.
for col in train_df.columns:
    train_col = train_df[col]

    # Detect numeric columns by dtype family instead of the literal names
    # 'int64'/'float64', so int32/float32/nullable numerics also get the median.
    # Booleans are excluded to keep them on the mode branch, as before.
    is_numeric = (
        pd.api.types.is_numeric_dtype(train_col)
        and not pd.api.types.is_bool_dtype(train_col)
    )
    if is_numeric:
        fill_value = train_col.median()
    else:
        modes = train_col.mode()
        # mode() is empty when the column holds no non-missing value at all.
        fill_value = modes.iloc[0] if not modes.empty else None

    # Nothing sensible to fill with (column entirely missing): leave it as is.
    if fill_value is None or pd.isna(fill_value):
        continue

    train_df[col] = train_col.fillna(fill_value)
    # A column may exist only in the training frame; skip it for the test frame
    # instead of raising KeyError.
    if col in test_df.columns:
        test_df[col] = test_df[col].fillna(fill_value)

In [10]:
# Columns that are replaced by integer codes; the fitted encoder of each column
# is stored in le_dict under the column name.
cat_cols = ['Gender', 'Job Role', 'Marital Status', 'Overtime', 'Remote Work']
le_dict = {}

for col in cat_cols:
    # Learn the label -> integer mapping from the training column only ...
    le = LabelEncoder().fit(train_df[col])
    # ... and apply that same mapping to both frames.
    train_df[col] = le.transform(train_df[col])
    test_df[col] = le.transform(test_df[col])
    le_dict[col] = le

# Attrition Prediction Model (XGBoost)

In [11]:
# Convert the remaining string (object) columns to pandas' category dtype.
#
# Categories are built from the values of BOTH frames and shared between them:
# when each frame infers its own categories, the same label can end up with a
# different integer code in train and test, which matters for any consumer that
# reads the codes of category columns.
for col in train_df.columns:
    if train_df[col].dtype != 'object':
        continue

    if col in test_df.columns and test_df[col].dtype == 'object':
        combined = pd.concat([train_df[col], test_df[col]], ignore_index=True)
        # astype('category') yields the default (sorted, unordered) categories
        # of the combined values; reuse exactly that dtype for both frames.
        shared_dtype = pd.CategoricalDtype(combined.astype('category').cat.categories)
        train_df[col] = train_df[col].astype(shared_dtype)
        test_df[col] = test_df[col].astype(shared_dtype)
    else:
        train_df[col] = train_df[col].astype('category')

# Object columns that exist only in the test frame are still converted.
for col in test_df.columns:
    if test_df[col].dtype == 'object':
        test_df[col] = test_df[col].astype('category')

In [12]:
# Show the distinct values of the target column.
print(train_df['Attrition'].unique())

['Stayed', 'Left']
Categories (2, object): ['Left', 'Stayed']


In [13]:
# Normalise the target labels (trim whitespace, lowercase) so the mapping below
# does not depend on the exact spelling in the CSV.
train_df['Attrition'] = train_df['Attrition'].str.strip().str.lower()

# Binary target: 0 = stayed, 1 = left.
y = train_df['Attrition'].map({'stayed': 0, 'left': 1})

# Series.map() silently produces NaN for any label missing from the dict; fail
# loudly here instead of passing NaN targets on to the classifier.
unmapped_labels = train_df.loc[y.isna(), 'Attrition'].unique().tolist()
if unmapped_labels:
    raise ValueError(f"Unexpected Attrition labels: {unmapped_labels}")
y = y.astype('int64')

# Every column except the target is kept as a feature.
# NOTE(review): that includes 'Employee ID' (looks like a row identifier) and the
# raw 'Feedback' text - confirm both are meant to be model inputs.
X = train_df.drop(columns=['Attrition'])

In [14]:
# Sanity check: list the feature columns that are neither numeric, boolean nor
# category. 'number' is used instead of 'int'/'float' so that every numeric
# width (int8, uint16, float32, ...) counts as numeric.
print("Non-numeric columns after conversion:")
print(X.select_dtypes(exclude=['number', 'bool', 'category']).columns.tolist())

Non-numeric columns after conversion:
[]


In [16]:
# Hold out 20% of the rows for validation; the fixed random_state makes the split repeatable.
# NOTE(review): the split is not stratified on y - confirm that is intended.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
# Gradient-boosted tree classifier for the binary attrition target.
# The deprecated `use_label_encoder` argument is no longer passed: y is already
# encoded as 0/1, and recent XGBoost releases only warn that the parameter is unused.
model = xgb.XGBClassifier(
    eval_metric='logloss',
    # Consume category-dtype columns natively. Categorical support depends on the
    # histogram-based tree method, so request it explicitly instead of relying on
    # the version-dependent default.
    enable_categorical=True,
    tree_method='hist',
    random_state=42,
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
)

model.fit(X_train, y_train)

In [18]:
# Score the held-out validation rows: column 1 of predict_proba is the
# probability of class 1, predict() gives the hard class labels.
y_proba = model.predict_proba(X_val)[:, 1]
y_pred = model.predict(X_val)

# Per-class precision/recall/F1, followed by the ROC AUC of the class-1 probabilities.
print("Classification Report:", classification_report(y_val, y_pred), sep="\n")
print("ROC AUC Score:", roc_auc_score(y_val, y_proba))

Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.95      0.94      6253
           1       0.94      0.93      0.93      5667

    accuracy                           0.94     11920
   macro avg       0.94      0.94      0.94     11920
weighted avg       0.94      0.94      0.94     11920

ROC AUC Score: 0.9899890367781398


In [19]:
# Write the fitted classifier to 'attrition_xgb_model.pkl' (relative path) with joblib.
# NOTE(review): loading this file presumably needs a compatible xgboost version -
# confirm, or additionally export the model in the library's native format.
joblib.dump(model, 'attrition_xgb_model.pkl')
print("Model saved as attrition_xgb_model.pkl")

Model saved as attrition_xgb_model.pkl


# VADER Sentiment Analysis on Feedback Column

In [21]:
# Create one VADER analyzer instance at module level; get_sentiment_score reads it on every call.
analyzer = SentimentIntensityAnalyzer()

In [23]:
def get_sentiment_score(text):
    """Return the 'compound' sentiment score of a single piece of text.

    Missing values and empty or whitespace-only text yield 0.0. Anything else
    is converted with str() and scored by the module-level ``analyzer``.
    """
    has_content = not pd.isna(text) and str(text).strip() != ''
    if not has_content:
        return 0.0
    polarity = analyzer.polarity_scores(str(text))
    return polarity['compound']

In [25]:
def get_sentiment_label(score, threshold=0.05):
    """Bucket a compound sentiment score into a coarse label.

    Args:
        score: Numeric sentiment score.
        threshold: Symmetric cut-off. Scores at or above ``threshold`` are
            'Positive', scores at or below ``-threshold`` are 'Negative'.
            Defaults to 0.05, the value that was previously hard-coded.

    Returns:
        'Positive', 'Negative' or 'Neutral'. A NaN score compares False against
        both bounds and is therefore labelled 'Neutral'.
    """
    if score >= threshold:
        return 'Positive'
    if score <= -threshold:
        return 'Negative'
    return 'Neutral'

In [26]:
# Add a numeric sentiment score and its coarse label to both frames.
for frame in (train_df, test_df):
    # 'Feedback' may be category-dtype at this point. Applying a function to a
    # categorical Series can return a categorical result and, depending on the
    # pandas version, leave missing entries as NaN instead of calling the function.
    # Force a plain float column and score missing feedback as 0.0, which is what
    # get_sentiment_score returns for missing text.
    frame['sentiment_score'] = (
        frame['Feedback'].apply(get_sentiment_score).astype(float).fillna(0.0)
    )
    frame['sentiment_label'] = frame['sentiment_score'].apply(get_sentiment_label)

In [27]:
def sentiment_pipeline(text):
    """Return the 'compound' sentiment score for a single piece of text.

    Missing values and empty or whitespace-only text yield 0.0 without touching
    the analyzer. Anything else is converted with str() and scored.

    The SentimentIntensityAnalyzer is created on the first call that needs it and
    cached on the function object, instead of being rebuilt for every call.
    """
    if pd.isna(text) or str(text).strip() == '':
        return 0.0

    cached_analyzer = getattr(sentiment_pipeline, '_analyzer', None)
    if cached_analyzer is None:
        cached_analyzer = SentimentIntensityAnalyzer()
        sentiment_pipeline._analyzer = cached_analyzer

    return cached_analyzer.polarity_scores(str(text))['compound']

In [29]:
# Smoke-test inputs: a positive, a negative and a mixed sentence, plus the two
# "no text" cases (empty string and None).
sample_feedbacks = [
    "I love working here, the team is amazing!",
    "The management is terrible and I feel undervalued.",
    "It's okay, some days are better than others.",
    "",
    None,
]

# map() is lazy, so each text is scored right before its line is printed.
for feedback, score in zip(sample_feedbacks, map(sentiment_pipeline, sample_feedbacks)):
    print(f"Feedback: {feedback}\nSentiment Score: {score:.3f}\n")

Feedback: I love working here, the team is amazing!
Sentiment Score: 0.852

Feedback: The management is terrible and I feel undervalued.
Sentiment Score: -0.477

Feedback: It's okay, some days are better than others.
Sentiment Score: 0.586

Feedback: 
Sentiment Score: 0.000

Feedback: None
Sentiment Score: 0.000



In [30]:
# Write the scoring function to 'sentiment_pipeline.pkl' (relative path) with joblib.
# NOTE(review): pickle-based serialisers normally store a plain function by its
# module-qualified name rather than its code, so loading this file presumably
# requires `sentiment_pipeline` to be defined in the loading process - confirm.
joblib.dump(sentiment_pipeline, 'sentiment_pipeline.pkl')
print("Saved 'sentiment_pipeline.pkl' successfully.")

Saved 'sentiment_pipeline.pkl' successfully.
