In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
!pip install lazypredict
# Load the HR dataset into `df`, the working frame that the later cells read and modify.
# NOTE(review): hard-coded absolute path (looks like a Colab location) — adjust when running elsewhere.
df = pd.read_csv('/content/HRDataset_v14.csv')

Collecting lazypredict
  Downloading lazypredict-0.2.16-py2.py3-none-any.whl.metadata (13 kB)
Collecting pytest-runner (from lazypredict)
  Downloading pytest_runner-6.0.1-py3-none-any.whl.metadata (7.3 kB)
Collecting mlflow>=2.0.0 (from lazypredict)
  Downloading mlflow-2.21.3-py3-none-any.whl.metadata (30 kB)
Collecting mlflow-skinny==2.21.3 (from mlflow>=2.0.0->lazypredict)
  Downloading mlflow_skinny-2.21.3-py3-none-any.whl.metadata (31 kB)
Collecting alembic!=1.10.0,<2 (from mlflow>=2.0.0->lazypredict)
  Downloading alembic-1.15.2-py3-none-any.whl.metadata (7.3 kB)
Collecting docker<8,>=4.0.0 (from mlflow>=2.0.0->lazypredict)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow>=2.0.0->lazypredict)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow>=2.0.0->lazypredict)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-ski

In [None]:
def get_df_info(df):
    """Print a structural overview of ``df`` and return its summary statistics.

    Written to stdout, in order: shape, column names, dtypes, the ``df.info()``
    report, the number of unique values per column, the null count of every
    column that has nulls, and the number of duplicated rows. Section titles
    are wrapped in ANSI bold escape codes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; it is only read.

    Returns
    -------
    pandas.DataFrame
        ``df.describe()`` transposed (one row per described column).
    """
    print("\n\033[1mShape of DataFrame:\033[0m ", df.shape)
    print("\n\033[1mColumns in DataFrame:\033[0m ", df.columns.to_list())
    print("\n\033[1mData types of columns:\033[0m\n", df.dtypes)

    print("\n\033[1mInformation about DataFrame:\033[0m")
    df.info()

    print("\n\033[1mNumber of unique values in each column:\033[0m")
    for name in df.columns:
        distinct = df[name].nunique()
        print(f"\033[1m{name}\033[0m: {distinct}")

    print("\n\033[1mNull values in columns:\033[0m")
    missing = df.isnull().sum()
    missing = missing[missing > 0]  # keep only columns that actually contain nulls
    if missing.empty:
        print("There are no null values in the DataFrame.")
    else:
        for name, n_missing in missing.items():
            print(f"\033[1m{name}\033[0m: {n_missing}")

    n_duplicates = df.duplicated().sum()
    print("\n\033[1mNumber of duplicate rows:\033[0m ", n_duplicates)

    print("\n\033[1mDescriptive statistics of DataFrame:\033[0m\n")
    return df.describe().T

# Call the function; as the cell's last expression, the returned describe() table is displayed.
get_df_info(df)



[1mShape of DataFrame:[0m  (311, 36)

[1mColumns in DataFrame:[0m  ['Employee_Name', 'EmpID', 'MarriedID', 'MaritalStatusID', 'GenderID', 'EmpStatusID', 'DeptID', 'PerfScoreID', 'FromDiversityJobFairID', 'Salary', 'Termd', 'PositionID', 'Position', 'State', 'Zip', 'DOB', 'Sex', 'MaritalDesc', 'CitizenDesc', 'HispanicLatino', 'RaceDesc', 'DateofHire', 'DateofTermination', 'TermReason', 'EmploymentStatus', 'Department', 'ManagerName', 'ManagerID', 'RecruitmentSource', 'PerformanceScore', 'EngagementSurvey', 'EmpSatisfaction', 'SpecialProjectsCount', 'LastPerformanceReview_Date', 'DaysLateLast30', 'Absences']

[1mData types of columns:[0m
 Employee_Name                  object
EmpID                           int64
MarriedID                       int64
MaritalStatusID                 int64
GenderID                        int64
EmpStatusID                     int64
DeptID                          int64
PerfScoreID                     int64
FromDiversityJobFairID          int64
Salary

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
EmpID,311.0,10156.0,89.922189,10001.0,10078.5,10156.0,10233.5,10311.0
MarriedID,311.0,0.398714,0.490423,0.0,0.0,0.0,1.0,1.0
MaritalStatusID,311.0,0.810289,0.943239,0.0,0.0,1.0,1.0,4.0
GenderID,311.0,0.434084,0.496435,0.0,0.0,0.0,1.0,1.0
EmpStatusID,311.0,2.392283,1.794383,1.0,1.0,1.0,5.0,5.0
DeptID,311.0,4.610932,1.083487,1.0,5.0,5.0,5.0,6.0
PerfScoreID,311.0,2.977492,0.587072,1.0,3.0,3.0,3.0,4.0
FromDiversityJobFairID,311.0,0.093248,0.291248,0.0,0.0,0.0,0.0,1.0
Salary,311.0,69020.684887,25156.63693,45046.0,55501.5,62810.0,72036.0,250000.0
Termd,311.0,0.334405,0.472542,0.0,0.0,0.0,1.0,1.0


In [None]:
from datetime import datetime


# Step 1: Convert the 'DOB' column to datetime
def parse_date(date_str, max_year=2021):
    """Parse a date string into a ``pd.Timestamp``.

    Two layouts are tried in order: ``%m/%d/%y`` (two-digit year) and
    ``%m-%d-%Y`` (four-digit year).

    Parameters
    ----------
    date_str : str
        Raw date text. Missing values (``None`` / ``NaN`` / ``NaT``) yield ``pd.NaT``.
    max_year : int, default 2021
        Latest plausible year for a two-digit-year date. ``%y`` maps 00-68 to
        2000-2068, so a value such as ``'04/17/66'`` parses as 2066; any
        two-digit-year result later than ``max_year`` is moved back 100 years.

    Returns
    -------
    pandas.Timestamp or pandas.NaT
        ``pd.NaT`` when the value is missing or matches neither layout.
    """
    if pd.isna(date_str):
        return pd.NaT

    try:
        parsed = pd.to_datetime(date_str, format='%m/%d/%y')
    except (ValueError, TypeError):
        # Only parsing failures are tolerated; anything else (e.g. KeyboardInterrupt) propagates.
        pass
    else:
        if parsed.year > max_year:
            parsed = parsed.replace(year=parsed.year - 100)
        return parsed

    try:
        return pd.to_datetime(date_str, format='%m-%d-%Y')
    except (ValueError, TypeError):
        return pd.NaT  # Return Not a Time for any unparseable dates

# Replace the raw DOB values with the result of parse_date, applied element by element.
df['DOB'] = df['DOB'].apply(parse_date)

# Step 2: Calculate age as of 2021
# reference_date is a module-level name that calculate_age reads when it is called.
reference_date = pd.Timestamp('2021-12-31')  # You can adjust this date if needed

def calculate_age(born):
    """Return the age in whole years at the module-level ``reference_date``.

    ``None`` is returned when ``born`` is null. Otherwise the difference in
    calendar years is reduced by one if the (month, day) of ``born`` falls
    after the (month, day) of ``reference_date``.
    """
    if pd.isnull(born):
        return None
    year_gap = reference_date.year - born.year
    # True (counts as 1) when the birthday has not yet occurred in the reference year.
    birthday_pending = (born.month, born.day) > (reference_date.month, reference_date.day)
    return year_gap - birthday_pending

# Derive Age from the parsed DOB column.
df['Age'] = df['DOB'].apply(calculate_age)

# Step 3: Handle any potential negative ages
# NOTE(review): a negative age means the birth date lies after reference_date; clamping to 0
# masks such rows instead of surfacing them.
df.loc[df['Age'] < 0, 'Age'] = 0

# Print the first few rows to verify
print(df[['DOB', 'Age']].head())

         DOB  Age
0 1983-07-10   38
1 1975-05-05   46
2 1988-09-19   33
3 1988-09-27   33
4 1989-09-08   32


In [None]:
# Frequency table of the computed ages.
df['Age'].value_counts()

Unnamed: 0_level_0,count
Age,Unnamed: 1_level_1
0,42
35,21
38,21
42,19
34,19
33,18
37,16
40,14
32,14
36,13


In [None]:
def parse_date(date_str):
    """Debug variant of the date parser: report values that match no layout.

    Tries ``%m/%d/%y`` first, then ``%m-%d-%Y``. When neither matches, the
    offending value is printed and ``pd.NaT`` is returned. No century
    correction is applied here, so two-digit years are returned exactly as
    ``%y`` resolves them.

    Only parsing failures (``ValueError`` / ``TypeError``) are caught; other
    exceptions such as ``KeyboardInterrupt`` propagate instead of being
    silently turned into ``NaT``.
    """
    try:
        return pd.to_datetime(date_str, format='%m/%d/%y')
    except (ValueError, TypeError):
        try:
            return pd.to_datetime(date_str, format='%m-%d-%Y')
        except (ValueError, TypeError):
            print(f"Failed to parse date: {date_str}")  # Debug print
            return pd.NaT

# Run the debug parser over DOB. If the earlier cell has been executed, the column
# already holds its parsed values rather than the raw strings.
df['DOB'] = df['DOB'].apply(parse_date)

reference_date = pd.Timestamp('2021-12-31')

def calculate_age(born):
    """Debug variant of the age calculation at the module-level ``reference_date``.

    Prints a message for a null birth date (and returns ``None``) and for any
    birth date that produces a negative age (the negative value is still
    returned).
    """
    if pd.isnull(born):
        print("Null birthdate encountered")  # Debug print
        return None
    # True (counts as 1) when the birthday has not yet occurred in the reference year.
    birthday_pending = (born.month, born.day) > (reference_date.month, reference_date.day)
    age = reference_date.year - born.year - birthday_pending
    if age < 0:
        print(f"Negative age encountered: {born}")  # Debug print
    return age

# Recompute Age with the debug version of calculate_age (prints for null / negative cases).
df['Age'] = df['DOB'].apply(calculate_age)

# Same clamp as before: negative ages are overwritten with 0.
df.loc[df['Age'] < 0, 'Age'] = 0

# Print summary statistics
print(df['Age'].value_counts())
print(f"Number of zero ages: {(df['Age'] == 0).sum()}")

# Print problematic entries
print("\nEntries with age 0:")
print(df[df['Age'] == 0][['DOB', 'Age']])


Negative age encountered: 2066-04-17 00:00:00
Negative age encountered: 2058-12-27 00:00:00
Negative age encountered: 2067-01-16 00:00:00
Negative age encountered: 2064-07-30 00:00:00
Negative age encountered: 2064-06-01 00:00:00
Negative age encountered: 2066-11-22 00:00:00
Negative age encountered: 2063-05-15 00:00:00
Negative age encountered: 2051-01-02 00:00:00
Negative age encountered: 2065-09-09 00:00:00
Negative age encountered: 2052-01-18 00:00:00
Negative age encountered: 2051-02-25 00:00:00
Negative age encountered: 2067-04-19 00:00:00
Negative age encountered: 2055-04-14 00:00:00
Negative age encountered: 2063-08-28 00:00:00
Negative age encountered: 2068-07-06 00:00:00
Negative age encountered: 2064-10-12 00:00:00
Negative age encountered: 2052-08-18 00:00:00
Negative age encountered: 2066-03-17 00:00:00
Negative age encountered: 2066-03-22 00:00:00
Negative age encountered: 2064-04-13 00:00:00
Negative age encountered: 2059-08-19 00:00:00
Negative age encountered: 2054-09-

In [None]:
def parse_date(date_str, max_year=2021):
    """Parse a date value, pulling implausible future years back one century.

    Two layouts are tried in order: ``%m/%d/%y`` and ``%m-%d-%Y``. A result of
    the first attempt whose year is later than ``max_year`` is moved back 100
    years (``%y`` resolves 00-68 to 2000-2068). The four-digit-year fallback
    is returned unchanged.

    Parameters
    ----------
    date_str : str or pandas.Timestamp
        Value to parse. Missing values (``None`` / ``NaN`` / ``NaT``) yield ``pd.NaT``.
    max_year : int, default 2021
        Latest year accepted without the century correction.

    Returns
    -------
    pandas.Timestamp or pandas.NaT
        ``pd.NaT`` (after printing the value) when neither layout matches.
    """
    if pd.isna(date_str):
        return pd.NaT

    try:
        dt = pd.to_datetime(date_str, format='%m/%d/%y')
    except (ValueError, TypeError):
        # Only parsing failures fall through to the second layout.
        dt = None

    if dt is not None:
        # If the year is after max_year, assume it's actually 100 years earlier
        if dt.year > max_year:
            dt = dt.replace(year=dt.year - 100)
        return dt

    try:
        return pd.to_datetime(date_str, format='%m-%d-%Y')
    except (ValueError, TypeError):
        print(f"Failed to parse date: {date_str}")  # Debug print
        return pd.NaT

# Run the century-correcting parser over DOB (overwrites the column in place).
df['DOB'] = df['DOB'].apply(parse_date)

reference_date = pd.Timestamp('2021-12-31')

def calculate_age(born):
    """Return the age in whole years at the module-level ``reference_date``.

    A null ``born`` prints a debug message and returns ``None``.
    """
    if pd.isnull(born):
        print("Null birthdate encountered")  # Debug print
        return None
    # True (counts as 1) when the birthday has not yet occurred in the reference year.
    birthday_pending = (born.month, born.day) > (reference_date.month, reference_date.day)
    return reference_date.year - born.year - birthday_pending

# Recompute Age from the corrected DOB column; no clamping is applied in this cell.
df['Age'] = df['DOB'].apply(calculate_age)

# Print summary statistics
print(df['Age'].value_counts())
print(f"Number of zero ages: {(df['Age'] == 0).sum()}")

# Print entries that were previously problematic
# NOTE(review): the row labels below are hard-coded; they presumably come from the
# "Entries with age 0" listing of the previous cell — confirm if the data changes.
print("\nPreviously problematic entries:")
print(df.loc[df.index.isin([16, 21, 24, 25, 32, 35, 40, 41, 59, 61, 68, 69, 89, 98, 99, 112, 124, 131, 140, 142, 143, 150, 185, 201, 206, 225, 230, 244, 246, 257, 260, 263, 273, 277, 278, 279, 281, 282, 290, 296, 301, 304]), ['DOB', 'Age']])

Age
38    21
35    21
34    19
42    19
33    18
37    16
32    14
40    14
36    13
47    12
51    12
44    11
46     9
41     9
48     9
45     9
43     8
39     8
49     7
52     7
53     7
57     5
56     5
31     5
55     4
54     4
29     3
30     3
58     3
69     3
70     2
63     2
66     2
50     2
67     2
62     1
60     1
68     1
Name: count, dtype: int64
Number of zero ages: 0

Previously problematic entries:
           DOB  Age
16  1966-04-17   55
21  1958-12-27   63
24  1967-01-16   54
25  1964-07-30   57
32  1964-06-01   57
35  1966-11-22   55
40  1963-05-15   58
41  1951-01-02   70
59  1965-09-09   56
61  1952-01-18   69
68  1951-02-25   70
69  1967-04-19   54
89  1955-04-14   66
98  1963-08-28   58
99  1968-07-06   53
112 1964-10-12   57
124 1952-08-18   69
131 1966-03-17   55
140 1966-03-22   55
142 1964-04-13   57
143 1959-08-19   62
150 1954-09-21   67
185 1968-05-30   53
201 1967-06-03   54
206 1952-02-11   69
225 1961-06-19   60
230 1954-10-12   67
244 1964-01-

In [None]:
# Frequency table of the recomputed ages.
df['Age'].value_counts()


Unnamed: 0_level_0,count
Age,Unnamed: 1_level_1
38,21
35,21
34,19
42,19
33,18
37,16
32,14
40,14
36,13
47,12


In [None]:
def parse_date(date_str, max_year=2021):
    """Parse a date value by trying a fixed list of layouts in order.

    Month-first layouts are tried before day-first ones, so an ambiguous value
    such as ``'03/04/2015'`` is read as March 4th. The first layout that
    matches wins. Any result whose year is later than ``max_year`` is moved
    back 100 years; note that this applies to every layout, including the
    four-digit-year ones.

    Parameters
    ----------
    date_str : str or pandas.Timestamp
        Value to parse. Missing values yield ``pd.NaT``.
    max_year : int, default 2021
        Latest year accepted without the century correction.

    Returns
    -------
    pandas.Timestamp or pandas.NaT
        ``pd.NaT`` (after printing the value) when no layout matches.
    """
    if pd.isna(date_str):
        return pd.NaT

    date_formats = ['%m/%d/%Y', '%m-%d-%Y', '%m/%d/%y', '%m-%d-%y', '%d/%m/%Y', '%d-%m-%Y', '%d/%m/%y', '%d-%m-%y']

    for fmt in date_formats:
        try:
            dt = pd.to_datetime(date_str, format=fmt)
            # If the year is after max_year, assume it's actually 100 years earlier
            if dt.year > max_year:
                dt = dt.replace(year=dt.year - 100)
            return dt
        except (ValueError, TypeError):
            # Layout did not match (or produced an invalid date): try the next one.
            # Other exceptions, e.g. KeyboardInterrupt, are no longer swallowed.
            continue

    print(f"Failed to parse date: {date_str}")
    return pd.NaT

# Parse DateofHire and DateofTermination
df['DateofHire'] = df['DateofHire'].apply(parse_date)
df['DateofTermination'] = df['DateofTermination'].apply(parse_date)

# Define reference date
# NOTE(review): this rebinds the same global that the age cells set to 2021-12-31;
# re-running an age cell after this one would use 2021-04-30 instead.
reference_date = pd.Timestamp('2021-04-30')

# Calculate Tenure
def calculate_tenure(row):
    """Return the tenure in years (days / 365.25) for one employee row.

    The span starts at ``DateofHire``. It ends at ``DateofTermination`` when
    ``Termd`` equals 1 and a termination date is present; otherwise it ends at
    the module-level ``reference_date``. ``None`` is returned when the hire
    date is missing, and negative spans are reported as 0.
    """
    hired_on = row['DateofHire']
    if pd.isna(hired_on):
        return None

    has_left = row['Termd'] == 1 and not pd.isna(row['DateofTermination'])
    span_end = row['DateofTermination'] if has_left else reference_date
    tenure = (span_end - hired_on).days / 365.25

    return max(tenure, 0)  # Ensure tenure is not negative

# Row-wise application: calculate_tenure receives each row of df as a Series.
df['Tenure'] = df.apply(calculate_tenure, axis=1)

# Print summary statistics
print(df['Tenure'].describe())

# Print a few sample rows to verify
# NOTE(review): .sample() is called without random_state, so the rows shown differ between runs.
print("\nSample rows:")
print(df[['DateofHire', 'DateofTermination', 'Termd', 'Tenure']].sample(10))

# Check for any remaining issues
print("\nRows with NaN Tenure:")
print(df[df['Tenure'].isna()][['DateofHire', 'DateofTermination', 'Termd', 'Tenure']])

print("\nRows with zero Tenure:")
print(df[df['Tenure'] == 0][['DateofHire', 'DateofTermination', 'Termd', 'Tenure']])

count    311.000000
mean       6.267610
std        2.800746
min        0.071184
25%        4.472279
50%        6.581793
75%        7.811088
max       15.304586
Name: Tenure, dtype: float64

Sample rows:
    DateofHire DateofTermination  Termd    Tenure
47  2011-10-03               NaT      0  9.574264
230 2014-05-12               NaT      0  6.967830
259 2011-09-26        2013-09-25      1  1.998631
36  2016-01-28               NaT      0  5.253936
13  2012-02-20               NaT      0  9.190965
52  2013-07-08               NaT      0  7.811088
130 2015-03-30               NaT      0  6.086242
247 2009-01-05        2018-07-30      1  9.563313
46  2011-06-27        2015-11-15      1  4.386037
290 2011-08-01               NaT      0  9.746749

Rows with NaN Tenure:
Empty DataFrame
Columns: [DateofHire, DateofTermination, Termd, Tenure]
Index: []

Rows with zero Tenure:
Empty DataFrame
Columns: [DateofHire, DateofTermination, Termd, Tenure]
Index: []


In [None]:
# Frequency table of the computed tenure values (in years).
df['Tenure'].value_counts()

Unnamed: 0_level_0,count
Tenure,Unnamed: 1_level_1
6.086242,11
6.584531,10
6.316222,8
7.811088,8
7.581109,8
...,...
1.341547,1
3.463381,1
3.121150,1
7.071869,1


In [None]:
def parse_date(date_str, max_year=2021):
    """Parse a date value by trying a fixed list of layouts in order.

    NOTE(review): this cell redefines ``parse_date`` with the same layout list
    as the tenure cell; the later definition shadows the earlier one.

    Month-first layouts are tried before day-first ones, and the first layout
    that matches wins. Any result whose year is later than ``max_year`` is
    moved back 100 years; this applies to every layout, including the
    four-digit-year ones.

    Parameters
    ----------
    date_str : str or pandas.Timestamp
        Value to parse. Missing values yield ``pd.NaT``.
    max_year : int, default 2021
        Latest year accepted without the century correction.

    Returns
    -------
    pandas.Timestamp or pandas.NaT
        ``pd.NaT`` (after printing the value) when no layout matches.
    """
    if pd.isna(date_str):
        return pd.NaT

    date_formats = ['%m/%d/%Y', '%m-%d-%Y', '%m/%d/%y', '%m-%d-%y', '%d/%m/%Y', '%d-%m-%Y', '%d/%m/%y', '%d-%m-%y']

    for fmt in date_formats:
        try:
            dt = pd.to_datetime(date_str, format=fmt)
            # If the year is after max_year, assume it's actually 100 years earlier
            if dt.year > max_year:
                dt = dt.replace(year=dt.year - 100)
            return dt
        except (ValueError, TypeError):
            # Layout did not match (or produced an invalid date): try the next one.
            # Other exceptions, e.g. KeyboardInterrupt, are no longer swallowed.
            continue

    print(f"Failed to parse date: {date_str}")
    return pd.NaT

# Parse LastPerformanceReview_Date if it hasn't been parsed already
# (an 'object' dtype is taken to mean the column has not been converted yet)
if df['LastPerformanceReview_Date'].dtype == 'object':
    df['LastPerformanceReview_Date'] = df['LastPerformanceReview_Date'].apply(parse_date)

# Ensure DateofTermination is parsed (in case it wasn't done in the previous step)
if df['DateofTermination'].dtype == 'object':
    df['DateofTermination'] = df['DateofTermination'].apply(parse_date)

# Define reference date
reference_date = pd.Timestamp('2021-04-30')

# Calculate LastPerformanceReview_Days
def calculate_review_days(row):
    """Return the number of days since the last performance review for one row.

    The count runs from ``LastPerformanceReview_Date`` to ``DateofTermination``
    when ``Termd`` equals 1 and a termination date is present; otherwise it
    runs to the module-level ``reference_date``. ``None`` is returned when the
    review date is missing, and negative counts are reported as 0.
    """
    reviewed_on = row['LastPerformanceReview_Date']
    if pd.isna(reviewed_on):
        return None

    has_left = row['Termd'] == 1 and not pd.isna(row['DateofTermination'])
    span_end = row['DateofTermination'] if has_left else reference_date
    days = (span_end - reviewed_on).days

    return max(days, 0)  # Ensure the number of days is not negative

# Row-wise application: calculate_review_days receives each row of df as a Series.
df['LastPerformanceReview_Days'] = df.apply(calculate_review_days, axis=1)

# Print summary statistics
print(df['LastPerformanceReview_Days'].describe())

# Print a few sample rows to verify
# NOTE(review): .sample() is called without random_state, so the rows shown differ between runs.
print("\nSample rows:")
print(df[['LastPerformanceReview_Date', 'DateofTermination', 'Termd', 'LastPerformanceReview_Days']].sample(10))

# Check for any remaining issues
print("\nRows with NaN LastPerformanceReview_Days:")
print(df[df['LastPerformanceReview_Days'].isna()][['LastPerformanceReview_Date', 'DateofTermination', 'Termd', 'LastPerformanceReview_Days']])

# A zero here can also be a negative span that calculate_review_days clamped to 0.
print("\nRows with zero LastPerformanceReview_Days:")
print(df[df['LastPerformanceReview_Days'] == 0][['LastPerformanceReview_Date', 'DateofTermination', 'Termd', 'LastPerformanceReview_Days']])

count    311.000000
mean     609.829582
std      312.773215
min        0.000000
25%      295.000000
50%      807.000000
75%      829.500000
max      849.000000
Name: LastPerformanceReview_Days, dtype: float64

Sample rows:
    LastPerformanceReview_Date DateofTermination  Termd  \
121                 2013-02-01        2014-10-31      1   
128                 2019-01-30               NaT      0   
29                  2019-01-15               NaT      0   
5                   2019-01-07               NaT      0   
208                 2019-02-07               NaT      0   
276                 2012-02-15        2013-04-01      1   
7                   2019-02-25               NaT      0   
159                 2019-02-04               NaT      0   
303                 2014-06-02        2015-06-27      1   
292                 2017-02-15        2017-02-22      1   

     LastPerformanceReview_Days  
121                         637  
128                         821  
29                       

In [None]:
# Frequency table of the days-since-last-review values.
df['LastPerformanceReview_Days'].value_counts()


Unnamed: 0_level_0,count
LastPerformanceReview_Days,Unnamed: 1_level_1
837,18
802,12
830,10
795,9
834,8
...,...
361,1
815,1
204,1
463,1


In [None]:
# Columns removed before modelling, listed one per line.
columns_to_drop = [
    'Employee_Name',
    'EmpID',
    'MarriedID',
    'MaritalStatusID',
    'GenderID',
    'EmpStatusID',
    'DeptID',
    'PerfScoreID',
    'DateofHire',
    'DateofTermination',
    'FromDiversityJobFairID',
    'PositionID',
    'TermReason',
    'ManagerID',
    'LastPerformanceReview_Date',
    'DOB',
    'State',
    'Zip',
    'CitizenDesc',
    'HispanicLatino',
    'RaceDesc',
    'EmploymentStatus',
]

# Rebind df to a copy without those columns (raises KeyError if any of them is absent,
# e.g. when this cell is executed a second time).
df = df.drop(columns_to_drop, axis=1)

In [None]:
from sklearn.preprocessing import LabelEncoder


# Identify categorical columns
# (every column whose dtype is 'object' or 'category' at this point)
categorical_columns = df.select_dtypes(include=['object', 'category']).columns

print("Identified categorical columns:")
print(categorical_columns.tolist())

# Create a LabelEncoder object
# This single instance is shared: label_encode refits it for each column it processes.
le = LabelEncoder()

# Function to apply label encoding and handle unknown categories
def label_encode(df, column):
    """Label-encode ``column`` of ``df`` into a new ``<column>_encoded`` column.

    The column is first overwritten with its string representation. The
    module-level encoder ``le`` is then refit on it, so after the call ``le``
    holds the classes of this column only. The label -> code mapping is
    printed. ``df`` is modified in place and also returned.
    """
    # Stringify so non-string categories are handled as text labels.
    as_text = df[column].astype(str)
    df[column] = as_text
    codes = le.fit(as_text).transform(as_text)
    df[column + '_encoded'] = codes
    # Create a mapping dictionary
    mapping = {label: code for label, code in zip(le.classes_, le.transform(le.classes_))}
    print(f"\nEncoding for {column}:")
    print(mapping)
    return df

# Apply label encoding to each categorical column
# (each call adds a '<col>_encoded' column and prints the mapping for that column)
for col in categorical_columns:
    df = label_encode(df, col)


# Remove original categorical columns
df = df.drop(columns=categorical_columns)

Identified categorical columns:
['Position', 'Sex', 'MaritalDesc', 'Department', 'ManagerName', 'RecruitmentSource', 'PerformanceScore']

Encoding for Position:
{'Accountant I': np.int64(0), 'Administrative Assistant': np.int64(1), 'Area Sales Manager': np.int64(2), 'BI Developer': np.int64(3), 'BI Director': np.int64(4), 'CIO': np.int64(5), 'Data Analyst': np.int64(6), 'Data Analyst ': np.int64(7), 'Data Architect': np.int64(8), 'Database Administrator': np.int64(9), 'Director of Operations': np.int64(10), 'Director of Sales': np.int64(11), 'Enterprise Architect': np.int64(12), 'IT Director': np.int64(13), 'IT Manager - DB': np.int64(14), 'IT Manager - Infra': np.int64(15), 'IT Manager - Support': np.int64(16), 'IT Support': np.int64(17), 'Network Engineer': np.int64(18), 'President & CEO': np.int64(19), 'Principal Data Architect': np.int64(20), 'Production Manager': np.int64(21), 'Production Technician I': np.int64(22), 'Production Technician II': np.int64(23), 'Sales Manager': np.in

In [None]:
# Separating features(X) and target(y)
# Here the target is the termination flag 'Termd'.
# NOTE(review): X and y are reassigned with 'Salary' as the target in the modelling cell further down.
X = df.drop('Termd', axis=1)
y = df['Termd']

In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

def auto_scale(df):
    """Return a copy of ``df`` with every int64/float64 column rescaled.

    A scaler is chosen per column: ``MinMaxScaler`` when the column's absolute
    skewness exceeds 1 or more than 10% of its values lie outside the
    1.5 * IQR fences, ``StandardScaler`` otherwise. Columns of any other dtype
    are copied through unchanged, and the input frame is not modified.
    """
    def decide_scaler(column):
        # Outlier fences from the interquartile range
        lower_q = column.quantile(0.25)
        upper_q = column.quantile(0.75)
        fence = 1.5 * (upper_q - lower_q)
        beyond_fences = (column < (lower_q - fence)) | (column > (upper_q + fence))

        heavily_skewed = abs(column.skew()) > 1
        many_outliers = beyond_fences.sum() > 0.1 * len(column)

        # If data is highly skewed or has many outliers, use MinMaxScaler
        if heavily_skewed or many_outliers:
            return MinMaxScaler()
        return StandardScaler()

    scaled_df = df.copy()

    numeric_names = [name for name in df.columns if df[name].dtype in ['int64', 'float64']]
    for name in numeric_names:
        chosen = decide_scaler(df[name])
        # Scalers expect a 2-D input, hence the reshape to a single-column array.
        scaled_df[name] = chosen.fit_transform(df[name].values.reshape(-1, 1))

    return scaled_df

# Usage
# NOTE(review): only X is rebound to the scaled copy; df stays unscaled, and the modelling
# cell further down rebuilds X from df, so this scaled frame is not what the models are fit on.
X = auto_scale(X)

In [None]:
# Preview the encoded frame (auto_scale was applied to X, not to df, so these values are unscaled).
df.head()

Unnamed: 0,Salary,Termd,EngagementSurvey,EmpSatisfaction,SpecialProjectsCount,DaysLateLast30,Absences,Age,Tenure,LastPerformanceReview_Days,Position_encoded,Sex_encoded,MaritalDesc_encoded,Department_encoded,ManagerName_encoded,RecruitmentSource_encoded,PerformanceScore_encoded
0,62506,0,4.6,5,0,0,1,38,9.820671,834,22,1,3,3,17,5,0
1,104437,1,4.96,3,6,0,17,46,1.215606,113,30,1,1,2,19,4,1
2,64955,1,3.02,3,0,0,3,33,1.223819,132,23,0,1,3,15,5,1
3,64991,0,4.84,5,0,0,15,33,13.311431,848,22,0,1,3,8,4,1
4,50825,1,5.0,4,0,0,2,32,5.158111,218,22,0,0,3,20,3,1


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from lazypredict.Supervised import LazyRegressor

# Salary regression with LazyRegressor: split the data, fit and score the candidate models,
# take the top-ranked one, predict for one hand-built employee record and save the model.

# Assuming your data is in a DataFrame called 'df'
# Separate features (X) and target (y)
# Let's predict 'Salary' - change if you want to predict something else
X = df.drop('Salary', axis=1)  # Features
y = df['Salary']               # Target variable

# Split data into training and test sets
# (fixed random_state so the 80/20 split is the same on every run)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and fit LazyRegressor
reg = LazyRegressor(verbose=0, ignore_warnings=True, custom_metric=None, predictions=True)
models, predictions = reg.fit(X_train, X_test, y_train, y_test)

# Show model performance comparison
print(models)

# Get the best performing model: the first row of the score table as returned by reg.fit
# NOTE(review): this relies on LazyRegressor returning the table already ranked best-first — confirm.
best_model_name = models.index[0]
best_model = reg.models[best_model_name]
print(f"\nBest model: {best_model_name}")

# Make prediction for new data
# Create a sample new data point with all required features
new_data = pd.DataFrame({
    'Termd': [0],
    'EngagementSurvey': [4.60],
    'EmpSatisfaction': [5],
    'SpecialProjectsCount': [0],
    'DaysLateLast30': [0],
    'Absences': [1],
    'Age': [38],
    'Tenure': [9.82],
    'LastPerformanceReview_Days': [834],
    'Position_encoded': [22],
    'Sex_encoded': [1],
    'MaritalDesc_encoded': [3],
    'Department_encoded': [3],
    'ManagerName_encoded': [17],
    'RecruitmentSource_encoded': [5],
    'PerformanceScore_encoded': [0]
})

# Ensure column order matches training data
new_data = new_data[X_train.columns]

# Make prediction
predicted_salary = best_model.predict(new_data)
# Fix: the closing parenthesis of this print call was missing, which made the whole cell a SyntaxError.
print(f"\nPredicted Salary: ${predicted_salary[0]:,.2f}")

# To save the best model for later use:
import joblib
joblib.dump(best_model, 'best_salary_predictor.pkl')

  0%|          | 0/42 [00:00<?, ?it/s]

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001277 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 315
[LightGBM] [Info] Number of data points in the train set: 248, number of used features: 16
[LightGBM] [Info] Start training from score 68305.274194
                               Adjusted R-Squared  R-Squared     RMSE  \
Model                                                                   
HistGradientBoostingRegressor                0.51       0.63 17196.01   
ExtraTreesRegressor                          0.43       0.58 18492.90   
RandomForestRegressor                        0.40       0.55 19016.99   
BaggingRegressor                             0.39       0.55 19140.54   
AdaBoostRegressor                            0.37       0.53 19427.88   
LGBMRegressor                                0.37       0.53 19483.86   
GradientB

['best_salary_predictor.pkl']

In [None]:
# Sort by R-Squared (descending)
# `models` is the score table returned by reg.fit in the previous cell.
print(models.sort_values(by='R-Squared', ascending=False))

                               Adjusted R-Squared  R-Squared     RMSE  \
Model                                                                   
HistGradientBoostingRegressor                0.51       0.63 17196.01   
ExtraTreesRegressor                          0.43       0.58 18492.90   
RandomForestRegressor                        0.40       0.55 19016.99   
BaggingRegressor                             0.39       0.55 19140.54   
AdaBoostRegressor                            0.37       0.53 19427.88   
LGBMRegressor                                0.37       0.53 19483.86   
GradientBoostingRegressor                    0.34       0.51 19883.60   
KNeighborsRegressor                          0.30       0.48 20535.93   
PoissonRegressor                             0.23       0.43 21434.00   
SGDRegressor                                 0.21       0.41 21844.86   
TransformedTargetRegressor                   0.20       0.41 21881.27   
LinearRegression                             0.20  

In [None]:

]