In [1]:
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [11]:
import pandas as pd

file_path = '../data/Layoffs_Dataset.csv'

df = pd.read_csv(file_path)

In [15]:
import numpy as np

# Convert "Layoff Count" ranges to numeric averages
def convert_layoff_count(value):
    if pd.isna(value):
        return np.nan
    value = str(value).strip()
    if '–' in value:  # en dash
        try:
            low, high = map(int, value.replace(',', '').split('–'))
            return (low + high) / 2
        except:
            return np.nan
    elif '-' in value:  # hyphen
        try:
            low, high = map(int, value.replace(',', '').split('-'))
            return (low + high) / 2
        except:
            return np.nan
    elif '%' in value or not value.isdigit():
        return np.nan
    else:
        return float(value)

df['Layoff Count Clean'] = df['Layoff Count'].apply(convert_layoff_count)

# Extract year from 'Date of Layoff'
df['Layoff Year'] = pd.to_datetime(df['Date of Layoff'], errors='coerce').dt.year
df['Layoff Year'] = df['Layoff Year'].fillna(
    df['Date of Layoff'].str.extract(r'(\d{4})')[0].astype(float)
)

In [16]:
# Clustering: KMeans
kmeans = KMeans(n_clusters=3, random_state=42)
df['Layoff Cluster'] = kmeans.fit_predict(df[['Layoff Count Clean']].fillna(0))

In [17]:
# Regression: Encode funding and industry
df_model = df[['Layoff Count Clean', 'Funding Status', 'Industry']].dropna()
le_funding = LabelEncoder()
le_industry = LabelEncoder()
df_model['Funding_Code'] = le_funding.fit_transform(df_model['Funding Status'])
df_model['Industry_Code'] = le_industry.fit_transform(df_model['Industry'])

In [18]:
# Model
X = df_model[['Funding_Code', 'Industry_Code']]
y = df_model['Layoff Count Clean']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
model = LinearRegression().fit(X_train, y_train)
print("Regression Coefficients:", model.coef_)
print("Intercept:", model.intercept_)
print("R^2 Score:", model.score(X_test, y_test))

Regression Coefficients: [833.50063904 166.74762826]
Intercept: -1376.368559430343
R^2 Score: -0.1949543876000317
