In [None]:
pip install scikit-learn


In [None]:
import pandas as pd
# Load the attrition data from a CSV file located in the working directory.
# NOTE(review): `df` is rebound by the fetch_openml cell that follows, so the
# frame loaded here is overwritten on a top-to-bottom run — confirm which of
# the two sources is the intended one.
df = pd.read_csv(filepath_or_buffer='employee_attrition.csv')


In [None]:
from sklearn.datasets import fetch_openml
# Fetch version 1 of the 'employee-attrition' dataset from OpenML and keep
# the combined pandas frame from the returned bunch as `df`.
attrition_bunch = fetch_openml(name='employee-attrition', version=1, as_frame=True)
df = attrition_bunch.frame


In [None]:
# Quick look at the loaded frame: first rows, schema, missing values, stats.
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None".
df.info()                  # Check data types and non-null counts
print(df.isnull().sum())   # Check missing values per column
print(df.describe())       # Summary statistics for numeric columns


In [None]:
from sklearn.impute import SimpleImputer
import numpy as np

# Example: replace missing values in the age, salary and experience columns
# with the mean of each column.
# NOTE(review): the imputer is fitted on the whole frame, before the
# train/test split done later in this notebook, so rows that end up in the
# test set contribute to the imputed means — confirm this is acceptable.
impute_cols = ['Age', 'Salary', 'Experience']
imputer = SimpleImputer(strategy='mean')
imputer.fit(df[impute_cols])
df[impute_cols] = imputer.transform(df[impute_cols])


In [None]:
# Example: one-hot encode the 'Department' and 'Gender' columns. With
# drop_first=True the first level of each column is dropped. `df` is rebound
# to the encoded frame, so the original two columns are gone afterwards.
categorical_cols = ['Department', 'Gender']
df = pd.get_dummies(data=df, columns=categorical_cols, drop_first=True)


In [None]:
# IQR-based outlier filter on 'Salary' and 'Experience': a row is kept only
# if neither column lies more than 1.5 * IQR outside its quartiles.
outlier_features = df[['Salary', 'Experience']]
Q1 = outlier_features.quantile(0.25)
Q3 = outlier_features.quantile(0.75)
IQR = Q3 - Q1

below_fence = outlier_features < (Q1 - 1.5 * IQR)
above_fence = outlier_features > (Q3 + 1.5 * IQR)
# True for rows where no inspected column falls outside the fences.
condition = ~(below_fence | above_fence).any(axis=1)
df_cleaned = df[condition]


In [None]:
# Separate the target from the features (assuming 'Attrition' is the target
# column): y holds that column, X holds every other column.
y = df_cleaned['Attrition']
X = df_cleaned.drop(columns=['Attrition'])


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state=42 keeps the split
# repeatable across runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Report the (rows, columns) shape of each feature partition.
for label, partition in (("Training Set Size:", X_train), ("Testing Set Size:", X_test)):
    print(label, partition.shape)


In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize the numeric columns. The scaler is fitted on the training
# partition only and then applied to both partitions.
numeric_cols = ['Age', 'Salary', 'Experience']
scaler_standard = StandardScaler().fit(X_train[numeric_cols])

for partition in (X_train, X_test):
    partition[numeric_cols] = scaler_standard.transform(partition[numeric_cols])


In [None]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale the numeric columns, fitting on the training partition only.
# NOTE(review): these columns of X_train / X_test were already overwritten
# with standardized values by scaler_standard in the preceding cell, so this
# rescales the standardized data — confirm whether the two scalers are meant
# to be chained or are alternatives.
scaler_minmax = MinMaxScaler().fit(X_train[numeric_cols])

for partition in (X_train, X_test):
    partition[numeric_cols] = scaler_minmax.transform(partition[numeric_cols])


In [None]:
# Show the first rows of the training features, then of the training target.
for preview in (X_train.head(), y_train.head()):
    print(preview)


In [None]:
# Re-attach the target to a copy of the training features and write the
# result to CSV without the index column.
# NOTE(review): only the training partition is saved here; X_test / y_test
# are not written — confirm this is intended.
processed_df = X_train.assign(Attrition=y_train)

processed_df.to_csv('employee_attrition_preprocessed.csv', index=False)
