Imports and Load Dataset

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load dataset
df = pd.read_csv('../data/layoffs.csv')
df.head()


Unnamed: 0,company,location,total_laid_off,date,percentage_laid_off,industry,source,stage,funds_raised,country,date_added
0,Smartsheet,Seattle,120.0,10/2/2025,,Other,https://www.geekwire.com/2025/smartsheet-cuts-...,Post-IPO,$152,United States,10/6/2025
1,Google,SF Bay Area,50.0,10/2/2025,,Consumer,https://www.sfchronicle.com/tech/article/googl...,Post-IPO,$26,United States,10/6/2025
2,Paycom,Oklahoma City,500.0,10/1/2025,,HR,https://www.oklahoman.com/story/business/infor...,Post-IPO,,United States,10/1/2025
3,Google,SF Bay Area,100.0,10/1/2025,,Consumer,https://www.cnbc.com/2025/10/01/google-cloud-u...,Post-IPO,$26,United States,10/6/2025
4,Simpl,"Bengaluru,Non-U.S.",80.0,10/1/2025,,Finance,https://inc42.com/buzz/bnpl-startup-simpl-lays...,Series B,$72,India,10/1/2025


Basic Info and Cleanup

In [7]:
# Overview
print(df.shape)
df.info()
df.describe()

# Standardize column names
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')

# Check for nulls
df.isnull().sum().sort_values(ascending=False).head(10)


(4178, 11)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4178 entries, 0 to 4177
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   company              4178 non-null   object 
 1   location             4177 non-null   object 
 2   total_laid_off       2727 non-null   float64
 3   date                 4178 non-null   object 
 4   percentage_laid_off  2641 non-null   object 
 5   industry             4176 non-null   object 
 6   source               4175 non-null   object 
 7   stage                4173 non-null   object 
 8   funds_raised         3711 non-null   object 
 9   country              4176 non-null   object 
 10  date_added           4178 non-null   object 
dtypes: float64(1), object(10)
memory usage: 359.2+ KB


percentage_laid_off    1537
total_laid_off         1451
funds_raised            467
stage                     5
source                    3
country                   2
industry                  2
location                  1
company                   0
date                      0
dtype: int64

Handle Missing Data

In [8]:
# Fill missing industry names with 'Unknown'
df['industry'] = df['industry'].fillna('Unknown')

# Drop rows with missing company or date (essential fields)
df.dropna(subset=['company', 'date'], inplace=True)

# Convert date column to datetime
df['date'] = pd.to_datetime(df['date'])


Feature Engineering

In [9]:
# Extract year and month
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month_name()

# Ensure total_laid_off column is numeric
df['total_laid_off'] = pd.to_numeric(df['total_laid_off'], errors='coerce').fillna(0)

# If there’s a percentage_laid_off column already, keep it clean
if 'percentage_laid_off' in df.columns:
    df['percentage_laid_off'] = pd.to_numeric(df['percentage_laid_off'], errors='coerce').fillna(0)

# Final check
df.head()


Unnamed: 0,company,location,total_laid_off,date,percentage_laid_off,industry,source,stage,funds_raised,country,date_added,year,month
0,Smartsheet,Seattle,120.0,2025-10-02,0.0,Other,https://www.geekwire.com/2025/smartsheet-cuts-...,Post-IPO,$152,United States,10/6/2025,2025,October
1,Google,SF Bay Area,50.0,2025-10-02,0.0,Consumer,https://www.sfchronicle.com/tech/article/googl...,Post-IPO,$26,United States,10/6/2025,2025,October
2,Paycom,Oklahoma City,500.0,2025-10-01,0.0,HR,https://www.oklahoman.com/story/business/infor...,Post-IPO,,United States,10/1/2025,2025,October
3,Google,SF Bay Area,100.0,2025-10-01,0.0,Consumer,https://www.cnbc.com/2025/10/01/google-cloud-u...,Post-IPO,$26,United States,10/6/2025,2025,October
4,Simpl,"Bengaluru,Non-U.S.",80.0,2025-10-01,0.0,Finance,https://inc42.com/buzz/bnpl-startup-simpl-lays...,Series B,$72,India,10/1/2025,2025,October


Save Clean Data

In [11]:
df.to_csv('../reports/cleaned_layoffs.csv', index=False)
print("✅ Cleaned dataset saved successfully (company size not required).")


✅ Cleaned dataset saved successfully (company size not required).
