In [1]:
# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from spmf import Spmf
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [2]:

# Load Dataset
df = pd.read_csv("IBM-HR-Employee-Attrition.csv")

# Data Preprocessing
# Encode categorical variables as integers
from sklearn.preprocessing import LabelEncoder
label_encoders = {}
for column in df.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le

# Check for missing values
if df.isnull().sum().sum() > 0:
    df.fillna(method='ffill', inplace=True)  # Simple forward-fill

# Transform Dataset to Sequential Format
def transform_to_sequence(df, sequence_columns):
    sequences = []
    for _, row in df[sequence_columns].iterrows():
        sequences.append(' '.join(map(str, row)) + " -2")  # End sequence with -2
    return sequences

sequence_columns = ['Age', 'DistanceFromHome', 'Education', 'JobLevel', 'MonthlyIncome', 'OverTime']
sequences = transform_to_sequence(df, sequence_columns)

# Save sequences to file
with open('sequences.txt', 'w') as f:
    for sequence in sequences:
        f.write(sequence + '\n')

# Sequential Pattern Mining using SPMF
spmf = Spmf("SPADE", input_filename="sequences.txt", output_filename="patterns.txt",
            arguments=[0.5])  # Minimum support = 50%
spmf.run()

# Read and process mined patterns
with open('patterns.txt', 'r') as f:
    patterns = f.readlines()
patterns = [p.strip() for p in patterns if p.strip()]

# Transform Patterns into Features for Classification
# Example: Binary features indicating the presence of each pattern
def create_features_from_patterns(patterns, sequences):
    features = []
    for sequence in sequences:
        row = []
        for pattern in patterns:
            row.append(1 if pattern in sequence else 0)
        features.append(row)
    return np.array(features)

features = create_features_from_patterns(patterns, sequences)

# Classification
X = features
y = df['Attrition']  # Target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Evaluate Model
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

>/D:/NIIT Documents/Sem VII/Predictive Modelling/employee-attrition-prediction/spmf.jar
 Total time ~ 11 ms
 Frequent sequences count : 3
 Join count : 3
 Max memory (mb):10.782730102539062
Content at file patterns.txt


              precision    recall  f1-score   support

           0       0.87      1.00      0.93       255
           1       0.00      0.00      0.00        39

    accuracy                           0.87       294
   macro avg       0.43      0.50      0.46       294
weighted avg       0.75      0.87      0.81       294



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [2]:
df = pd.read_csv("IBM-HR-Employee-Attrition.csv")
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NumCompaniesWorked,Over18,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,2,Female,94,3,2,Sales Executive,4,Single,5993,19479,8,Y,Yes,11,3,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,3,Male,61,2,2,Research Scientist,2,Married,5130,24907,1,Y,No,23,4,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,4,Male,92,2,1,Laboratory Technician,3,Single,2090,2396,6,Y,Yes,15,3,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,4,Female,56,3,1,Research Scientist,3,Married,2909,23159,1,Y,Yes,11,3,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,1,Male,40,3,1,Laboratory Technician,2,Married,3468,16632,9,Y,No,12,3,4,80,1,6,3,3,2,2,2,2


In [3]:
df['Attrition'] = df['Attrition'].map({'Yes': 1, 'No': 0})

In [22]:
df['Sequence'] = df.apply(lambda row: [row['Attrition'], row['YearsAtCompany'], row['EnvironmentSatisfaction'], row['DailyRate']], axis=1)


In [23]:
with open("spmf_input.txt", "w") as file:
    for sequence in df['Sequence']:
        file.write(" -1 ".join(map(str, sequence)) + " -2\n")

# Apply the TKS algorithm
spmf = Spmf("TKS", arguments=[10], input_filename="spmf_input.txt", output_filename="output.txt", spmf_bin_location_dir="D:\\NIIT Documents\\Sem VII\\Predictive Modelling\\employee-attrition-prediction")
spmf.run()
print(spmf.to_pandas_dataframe())

>/D:/NIIT Documents/Sem VII/Predictive Modelling/employee-attrition-prediction/spmf.jar
Minsup after preprocessing : 80
Max candidates: 13 Candidates explored  : 20
Pattern found count : 10
Time preprocessing: 34 ms 
Total time: 56 ms 
Max memory (mb) : 13.762130737304688
Final minsup value: 196
Intersection count 0 


  pattern   sup
0     [5]   196
1  [0, 1]   310
2     [2]   392
3  [0, 3]   474
4  [0, 2]   331
5     [1]   544
6  [0, 4]   447
7     [0]  1249
8     [4]   519
9     [3]   547
