#  a. Understanding the dataset

In [1]:
import pandas as pd
# Load the dataset into a pandas DataFrame; 'input.csv' is resolved relative to
# the notebook's current working directory.
dataset = pd.read_csv('input.csv')

In [4]:
# Show the dataset dimensions as a (rows, columns) tuple:
# number of records first, then number of attributes.
dataset.shape

(1470, 35)

In [5]:
# Show summary statistics (count, mean, std, min, quartiles, max) for the
# numeric columns of the dataset; text columns are not included in this table.
dataset.describe()

Unnamed: 0,Age,DailyRate,DistanceFromHome,Education,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
count,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,...,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0
mean,36.92381,802.485714,9.192517,2.912925,1.0,1024.865306,2.721769,65.891156,2.729932,2.063946,...,2.712245,80.0,0.793878,11.279592,2.79932,2.761224,7.008163,4.229252,2.187755,4.123129
std,9.135373,403.5091,8.106864,1.024165,0.0,602.024335,1.093082,20.329428,0.711561,1.10694,...,1.081209,0.0,0.852077,7.780782,1.289271,0.706476,6.126525,3.623137,3.22243,3.568136
min,18.0,102.0,1.0,1.0,1.0,1.0,1.0,30.0,1.0,1.0,...,1.0,80.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
25%,30.0,465.0,2.0,2.0,1.0,491.25,2.0,48.0,2.0,1.0,...,2.0,80.0,0.0,6.0,2.0,2.0,3.0,2.0,0.0,2.0
50%,36.0,802.0,7.0,3.0,1.0,1020.5,3.0,66.0,3.0,2.0,...,3.0,80.0,1.0,10.0,3.0,3.0,5.0,3.0,1.0,3.0
75%,43.0,1157.0,14.0,4.0,1.0,1555.75,4.0,83.75,3.0,3.0,...,4.0,80.0,1.0,15.0,3.0,3.0,9.0,7.0,3.0,7.0
max,60.0,1499.0,29.0,5.0,1.0,2068.0,4.0,100.0,4.0,5.0,...,4.0,80.0,3.0,40.0,6.0,4.0,40.0,18.0,15.0,17.0


In [6]:
# Per-column tally of missing (NaN / None) entries
dataset.isna().sum()

Age                         0
Attrition                   0
BusinessTravel              0
DailyRate                   0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EmployeeCount               0
EmployeeNumber              0
EnvironmentSatisfaction     0
Gender                      0
HourlyRate                  0
JobInvolvement              0
JobLevel                    0
JobRole                     0
JobSatisfaction             0
MaritalStatus               0
MonthlyIncome               0
MonthlyRate                 0
NumCompaniesWorked          0
Over18                      0
OverTime                    0
PercentSalaryHike           0
PerformanceRating           0
RelationshipSatisfaction    0
StandardHours               0
StockOptionLevel            0
TotalWorkingYears           0
TrainingTimesLastYear       0
WorkLifeBalance             0
YearsAtCompany              0
YearsInCurrentRole          0
YearsSinceLastPromotion     0
YearsWithCurrManager        0
dtype: int64

In [7]:
# Count the rows that are exact duplicates of an earlier row in the dataset.
dataset.duplicated().sum()

0

 b. Data cleaning

In [4]:
# If the dataset has any missing values, replace them with the column-wise mean.
# A mean only exists for numeric columns: numeric_only=True restricts the
# computation to those, which avoids the TypeError pandas >= 2.0 raises when
# asked to average text columns. Missing values in text columns are therefore
# left untouched by this step.
if dataset.isna().sum().sum() > 0:
    dataset.fillna(dataset.mean(numeric_only=True), inplace=True)

In [8]:
# If you find any duplicates in the dataset, keep just one copy of the data
# (the first occurrence of each duplicated row is the one retained).
if dataset.duplicated().any():
    dataset.drop_duplicates(inplace=True)

In [9]:
# Drop rows whose target value ('Attrition') is missing: a record without a
# label cannot be used to train or evaluate the classifier. Modifies `dataset`
# in place.
dataset.dropna(subset=['Attrition'], inplace=True)

 c. Creation of input and output features

In [9]:
# Separate the target column from the predictors:
# `labels` holds the 'Attrition' column, `features` holds every other column.
labels = dataset['Attrition']
features = dataset.drop(columns=['Attrition'])

 d. Conversion of features into numeric values

In [10]:
# Multi-valued categorical columns that get expanded into indicator columns
categorical_columns = [
    'BusinessTravel',
    'Department',
    'EducationField',
    'JobRole',
    'MaritalStatus',
]

# Replace each listed column with one indicator column per category value
features = pd.get_dummies(data=features, columns=categorical_columns)

In [11]:
from sklearn.preprocessing import LabelEncoder

# Map the remaining text columns to integer codes, one encoder per column
binary_categorical_columns = ['Gender', 'OverTime', 'Over18']

for column in binary_categorical_columns:
    # Learn the column's distinct values first, then swap them for their codes
    encoder = LabelEncoder().fit(features[column])
    features[column] = encoder.transform(features[column])


In [7]:
# Convert the target column into integer class codes; the fitted encoder is
# kept in `label_encoder` so the codes can be mapped back to the original values
label_encoder = LabelEncoder().fit(labels)
labels = label_encoder.transform(labels)

e. Scaling of the features

In [12]:
# Compute minmax scaling and standard scaling
from sklearn.preprocessing import MinMaxScaler, StandardScaler

def scale_features(features, method='minmax'):
    """Return a scaled copy of ``features`` as a new DataFrame.

    Parameters
    ----------
    features : DataFrame
        Numeric feature table; its column labels are reused for the result.
    method : str, default 'minmax'
        'minmax' applies ``MinMaxScaler``; 'standard' applies ``StandardScaler``.

    Returns
    -------
    DataFrame
        The scaled values under the original column labels. The frame is built
        from the scaler's output array, so it carries a fresh default index.

    Raises
    ------
    ValueError
        If ``method`` is neither 'minmax' nor 'standard'.
    """
    # Reject unknown methods up front so no scaler is ever constructed for them
    if method not in ('minmax', 'standard'):
        raise ValueError("Invalid method. Choose 'minmax' or 'standard'.")

    scaler = MinMaxScaler() if method == 'minmax' else StandardScaler()

    # The scaler is fitted on, and applied to, the very same data
    scaled_values = scaler.fit_transform(features)
    return pd.DataFrame(scaled_values, columns=features.columns)

# Build both scaled variants of the feature table: min-max first, then standard
features_minmax, features_standard = (
    scale_features(features, method=scaling_method)
    for scaling_method in ('minmax', 'standard')
)

In [16]:
# Import necessary libraries
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Function to evaluate model performance with different scaled features
def evaluate_model(features, labels, test_size=0.2, random_state=42):
    """Train a logistic-regression classifier and return its hold-out accuracy.

    Parameters
    ----------
    features : array-like or DataFrame
        Feature matrix, one row per sample.
    labels : array-like
        Target values, positionally aligned with the rows of ``features``.
    test_size : float or int, default 0.2
        Passed through to ``train_test_split``; size of the held-out test set.
    random_state : int or None, default 42
        Passed through to ``train_test_split``; seeds the shuffle so the split
        is reproducible.

    Returns
    -------
    float
        Accuracy of the fitted classifier on the held-out test set.

    Notes
    -----
    NOTE(review): in this notebook the features passed in were scaled on the
    full dataset before this function splits them, so the test rows have
    already influenced the scaler's statistics. Confirm whether that is
    acceptable for the comparison being made.
    """
    # Step 1: Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=test_size, random_state=random_state
    )

    # Step 2: Initialize the Logistic Regression classifier (library defaults)
    clf = LogisticRegression()

    # Step 3: Train the classifier on the training data
    clf.fit(X_train, y_train)

    # Step 4: Make predictions on the test set
    y_pred = clf.predict(X_test)

    # Step 5: Evaluate the classifier's performance
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy

# Score the same classifier on each scaled variant of the features
accuracy_minmax = evaluate_model(features_minmax, labels)
accuracy_standard = evaluate_model(features_standard, labels)

# Report both accuracies, Min-Max first and Standard second
for scaling_name, accuracy_value in (
    ("Min-Max", accuracy_minmax),
    ("Standard", accuracy_standard),
):
    print(f"Accuracy with {scaling_name} Scaling: {accuracy_value:.2f}")

Accuracy with Min-Max Scaling: 0.89
Accuracy with Standard Scaling: 0.88


f. Correlation Analysis

In [17]:
from sklearn.preprocessing import LabelEncoder

# `features_minmax` holds the min-max scaled features and `features` supplies
# the column labels; `labels` is the encoded target variable.
features_df = pd.DataFrame(features_minmax, columns=features.columns)

# Create a DataFrame for the target variable
target_df = pd.DataFrame(labels, columns=['Attrition'])

# Extract the target series
target_series = target_df['Attrition']

# Calculate correlation between features and the target.
# A column with a single distinct value has zero variance, so its correlation
# is undefined and computing it makes numpy divide by a zero standard deviation
# (emitting a RuntimeWarning). Correlate only the columns that actually vary,
# then reindex to the full column list so constant columns still show up as NaN.
varying_columns = features_df.columns[features_df.nunique() > 1]
correlations = (
    features_df[varying_columns]
    .corrwith(target_series)
    .reindex(features_df.columns)
)
print(correlations)

Age                                 -0.159205
DailyRate                           -0.056652
DistanceFromHome                     0.077924
Education                           -0.031373
EmployeeCount                             NaN
EmployeeNumber                      -0.010577
EnvironmentSatisfaction             -0.103369
Gender                               0.029453
HourlyRate                          -0.006846
JobInvolvement                      -0.130016
JobLevel                            -0.169105
JobSatisfaction                     -0.103481
MonthlyIncome                       -0.159840
MonthlyRate                          0.015170
NumCompaniesWorked                   0.043494
Over18                                    NaN
OverTime                             0.246118
PercentSalaryHike                   -0.013478
PerformanceRating                    0.002889
RelationshipSatisfaction            -0.045872
StandardHours                             NaN
StockOptionLevel                  

  c /= stddev[:, None]
  c /= stddev[None, :]
