<a href="https://colab.research.google.com/github/shrbh025/Layoffs-EDA-and-Prediction/blob/main/Layoffs_Predictive_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Cleansing and data transformation

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, accuracy_score, classification_report
import matplotlib.pyplot as plt

In [None]:
# Load the layoffs dataset from a CSV expected in the working directory,
# then list its columns to see which fields are available.
layoff_df = pd.read_csv('layoffs_data.csv')
layoff_df.columns

Index(['Company', 'Location_HQ', 'Industry', 'Laid_Off_Count', 'Percentage',
       'Date', 'Source', 'Funds_Raised', 'Stage', 'Date_Added', 'Country',
       'List_of_Employees_Laid_Off'],
      dtype='object')

**List_of_Employees_Laid_Off** and **Source**: Contain links to documents or news articles

**Date_Added**: This field contains when the data was added to the dataset

**Location_HQ**: The headquarters location does not impact the time series analysis; it is kept in the frame because the classification section later uses it as a feature

In [None]:
# Discard the link/metadata columns described above; they are not used by
# any later step in this notebook.
layoff_df = layoff_df.drop(
    ["List_of_Employees_Laid_Off", "Source", "Date_Added"],
    axis="columns",
)

Drop all rows with **NaN values** in the columns that are essential for the time series analysis

In [None]:
# Keep only rows where both the layoff percentage and the layoff head-count
# are present.
layoff_df = layoff_df.dropna(subset=["Percentage", "Laid_Off_Count"])

Removing **invalid** values where percentage is zero, as in a dataset of Layoffs we cannot have a record with zero layoffs

In [None]:
# Capture the rows reporting a 0% layoff before discarding them, so the reader
# can see exactly which records are removed.
zero_percentage_rows = layoff_df.query('Percentage == 0')

# Actually remove them: the original cell only displayed these rows, so the
# invalid records stayed in `layoff_df` despite the stated intent.
layoff_df = layoff_df.query('Percentage != 0')

zero_percentage_rows

Unnamed: 0,Company,Location_HQ,Industry,Laid_Off_Count,Percentage,Date,Funds_Raised,Stage,Country
2431,TaskUs,Los Angeles,Support,52.0,0.0,2022-06-21,279.0,Post-IPO,United States


# Time Series Data Preparation

In [None]:
# Collapse the per-company records into one row per date: total number of
# people laid off on that date, ordered chronologically.
ts_df = (
    layoff_df.loc[:, ["Date", "Laid_Off_Count"]]
    .groupby("Date")["Laid_Off_Count"]
    .sum()
    .reset_index()
    .sort_values(by="Date")
)
ts_df

Unnamed: 0,Date,Laid_Off_Count
0,2020-03-12,20.0
1,2020-03-13,14.0
2,2020-03-16,146.0
3,2020-03-18,4.0
4,2020-03-19,134.0
...,...,...
533,2024-01-22,530.0
534,2024-01-23,10082.0
535,2024-01-24,80.0
536,2024-01-25,465.0


In [None]:
# Determine rolling statistics over a window of 14 observations.
# Roll over the numeric column only: `ts_df` also carries the string `Date`
# column, and rolling over it emits the "nuisance columns" deprecation warning
# on older pandas and raises on pandas 2.x. Selecting with a list keeps the
# results as one-column DataFrames, so `moving_avg["Laid_Off_Count"]` still works.
laid_off_counts = ts_df[["Laid_Off_Count"]]
moving_avg = laid_off_counts.rolling(14).mean()
moving_std = laid_off_counts.rolling(14).std()

import plotly.express as px
plot_df = pd.DataFrame({"actual": ts_df["Laid_Off_Count"], "moving_avg": moving_avg["Laid_Off_Count"] , "moving_std": moving_std["Laid_Off_Count"]})
date_layoffs = (plot_df)

# Plot the raw daily totals against their rolling mean and standard deviation.
fig_date = px.line(
    date_layoffs,
    x = ts_df.Date,
    y = plot_df.columns,
    title = "Rolling Mean & Standard Deviation",
    color_discrete_map={
                 "moving_avg": "green",
                 "actual": "gray",
                 "moving_std": "red"
             },
    template = 'plotly_dark'
)

fig_date.update_layout(title_x = 0.5)
fig_date.show()

  moving_avg = ts_df.rolling(14).mean()
  moving_std = ts_df.rolling(14).std()


In [None]:
# Work on a copy: `y = ts_df` would only alias the frame, so the smoothing
# below would overwrite `ts_df` and re-running this cell would smooth the
# already-smoothed values again.
y = ts_df.copy()

# Replace the daily totals with their 14-observation rolling mean. Roll over
# the numeric column only; rolling over the whole frame also hits the `Date`
# column, which is deprecated on older pandas and raises on pandas 2.x.
y["Laid_Off_Count"] = y["Laid_Off_Count"].rolling(14).mean()

# convert the date information to a datetime object
y['Date'] = pd.to_datetime(y["Date"])
y = y.set_index('Date')
# The first 13 rows have no full window and are NaN after the rolling mean.
y = y.dropna()


Dropping of nuisance columns in rolling operations is deprecated; in a future version this will raise TypeError. Select only valid columns before calling the operation. Dropped columns were Index(['Date'], dtype='object')



In [None]:
!pip install pmdarima
import pmdarima

from pmdarima import auto_arima

Collecting pmdarima
  Downloading pmdarima-2.0.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m22.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pmdarima
Successfully installed pmdarima-2.0.4


In [None]:
# Chronological hold-out split. Label-based slicing on a DatetimeIndex is
# inclusive on both ends, so `y.loc[:'2023-11-01']` and `y.loc['2023-11-01':]`
# would both contain an observation dated 2023-11-01. Boolean masks keep the
# two sets disjoint: train is strictly before the split date, test is on/after.
split_date = pd.Timestamp('2023-11-01')

y = y.dropna()
test = y.loc[y.index >= split_date]
train = y.loc[y.index < split_date]

**Dickey-Fuller Test:**

In [None]:
# Augmented Dickey-Fuller test on the smoothed series.
# H0: Not Stationary
# H1: Stationary

from statsmodels.tsa.stattools import adfuller
print ('Results of Dickey-Fuller Test:')
y = y.dropna()
dftest = adfuller(y, autolag='AIC')

# Label the first four entries of the result tuple for display.
adf_labels = ['Test Statistic', 'p-value', '#Lags Used', 'Number of Observations Used']
pd.Series(dftest[:4], index=adf_labels)

Results of Dickey-Fuller Test:


Test Statistic                  -3.004118
p-value                          0.034512
#Lags Used                      17.000000
Number of Observations Used    507.000000
dtype: float64

Since the p-value is less than .05, we reject the null hypothesis.

This means the time series is stationary.

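In other words, its statistical properties (mean, variance and autocovariance) do not change over time. Strictly speaking, the test rejects the presence of a unit root; a stationary series can still be autocorrelated.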

# Time Series Forecasting

In [None]:
# Stepwise ARIMA order search on the training window only.
# NOTE(review): no seasonal period `m` is passed, and the recorded trace lists
# only (0,0,0)[0] seasonal orders — confirm whether seasonality was intended.
model = auto_arima(
    train,
    stationary=True,
    seasonal=True,
    trace=True,
    error_action='ignore',
    suppress_warnings=True,
)

Performing stepwise search to minimize aic
 ARIMA(2,0,2)(0,0,0)[0] intercept   : AIC=6406.547, Time=1.66 sec
 ARIMA(0,0,0)(0,0,0)[0] intercept   : AIC=7873.695, Time=0.03 sec
 ARIMA(1,0,0)(0,0,0)[0] intercept   : AIC=6418.947, Time=0.05 sec
 ARIMA(0,0,1)(0,0,0)[0] intercept   : AIC=inf, Time=0.21 sec
 ARIMA(0,0,0)(0,0,0)[0]             : AIC=8288.617, Time=0.03 sec
 ARIMA(1,0,2)(0,0,0)[0] intercept   : AIC=6419.665, Time=0.27 sec
 ARIMA(2,0,1)(0,0,0)[0] intercept   : AIC=6420.205, Time=1.52 sec
 ARIMA(3,0,2)(0,0,0)[0] intercept   : AIC=6413.577, Time=6.72 sec
 ARIMA(2,0,3)(0,0,0)[0] intercept   : AIC=6403.491, Time=5.01 sec
 ARIMA(1,0,3)(0,0,0)[0] intercept   : AIC=6420.198, Time=1.93 sec
 ARIMA(3,0,3)(0,0,0)[0] intercept   : AIC=inf, Time=7.17 sec
 ARIMA(2,0,4)(0,0,0)[0] intercept   : AIC=6402.557, Time=4.37 sec
 ARIMA(1,0,4)(0,0,0)[0] intercept   : AIC=6418.979, Time=2.53 sec
 ARIMA(3,0,4)(0,0,0)[0] intercept   : AIC=inf, Time=7.43 sec
 ARIMA(2,0,5)(0,0,0)[0] intercept   : AIC=6399.2

In [None]:
# Forecast one step per row of the hold-out window.
n_test_periods = len(test)
arima_pred = model.predict(n_periods=n_test_periods)

# Re-attach the hold-out dates so predictions line up with `test` when plotted.
arima_pred_df = pd.DataFrame(
    arima_pred.values,
    index=test.index,
    columns=["Laid_Off_Count"],
)


No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.



In [None]:
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error
import math

# Score the hold-out forecast: RMSE is in the units of the smoothed series,
# MAPE is a relative error.
mse = mean_squared_error(test, arima_pred)
rmse = math.sqrt(mse)
mape = mean_absolute_percentage_error(test, arima_pred)

print('MAPE: ', round(mape, 2))
print('RMSE: ', round(rmse, 2))

MAPE:  0.37
RMSE:  264.04


In [None]:
import plotly.express as px

# One column per series; pandas aligns them on the date index, leaving NaN
# where a series has no value for a given date.
forecast_series = {
    "train split": train["Laid_Off_Count"],
    "test split": test["Laid_Off_Count"],
    "prediction": arima_pred_df["Laid_Off_Count"],
}
final_df = pd.DataFrame(forecast_series)
date_layoffs = final_df

fig_date = px.line(
    date_layoffs,
    x=final_df.index,
    y=final_df.columns,
    title='Trend for Layoffs',
    template='plotly_dark',
)
fig_date.update_layout(title_x=0.5)
fig_date.show()

In [None]:
# Last timestamp in the smoothed series; the future forecast below starts
# from this date.
y.index[-1]

Timestamp('2024-01-26 00:00:00')

# **The Future**

In [None]:
from pandas.tseries.offsets import DateOffset

# Refit on the full smoothed history (train + test) before forecasting ahead.
model = auto_arima(
    y,
    stationary=True,
    seasonal=True,
    trace=True,
    error_action='ignore',
    suppress_warnings=True,
)

# Daily calendar dates starting at the last observation; the first entry is
# that observation itself, so it is skipped when building the future index.
last_observed = y.index[-1]
future_dates = [last_observed + DateOffset(days=offset) for offset in range(0, 120)]
future_df = pd.DataFrame(index=future_dates[1:], columns=y.columns)

horizon = future_df.shape[0]
arima_pred = model.predict(n_periods=horizon)
arima_pred_df = pd.DataFrame({"Laid_Off_Count": arima_pred.values}, index=future_df.index)

import plotly.express as px

# Past and forecast values side by side, aligned on the date index.
past_and_future = {
    "Past": y["Laid_Off_Count"],
    "Future": arima_pred_df["Laid_Off_Count"],
}
final_df = pd.DataFrame(past_and_future)
date_layoffs = final_df

fig_date = px.line(
    date_layoffs,
    x=final_df.index,
    y=final_df.columns,
    title='Layoffs Future prediction',
    template='plotly_dark',
)
fig_date.update_layout(title_x=0.5)
fig_date.show()

Performing stepwise search to minimize aic
 ARIMA(2,0,2)(0,0,0)[0] intercept   : AIC=6810.000, Time=3.70 sec
 ARIMA(0,0,0)(0,0,0)[0] intercept   : AIC=8362.354, Time=0.11 sec
 ARIMA(1,0,0)(0,0,0)[0] intercept   : AIC=6823.761, Time=0.22 sec
 ARIMA(0,0,1)(0,0,0)[0] intercept   : AIC=inf, Time=0.56 sec
 ARIMA(0,0,0)(0,0,0)[0]             : AIC=8805.401, Time=0.11 sec
 ARIMA(1,0,2)(0,0,0)[0] intercept   : AIC=6824.302, Time=0.64 sec
 ARIMA(2,0,1)(0,0,0)[0] intercept   : AIC=6825.004, Time=1.52 sec
 ARIMA(3,0,2)(0,0,0)[0] intercept   : AIC=inf, Time=3.99 sec
 ARIMA(2,0,3)(0,0,0)[0] intercept   : AIC=6806.812, Time=2.98 sec
 ARIMA(1,0,3)(0,0,0)[0] intercept   : AIC=6824.581, Time=0.31 sec
 ARIMA(3,0,3)(0,0,0)[0] intercept   : AIC=6815.038, Time=1.67 sec
 ARIMA(2,0,4)(0,0,0)[0] intercept   : AIC=6805.910, Time=1.33 sec
 ARIMA(1,0,4)(0,0,0)[0] intercept   : AIC=6823.719, Time=0.84 sec
 ARIMA(3,0,4)(0,0,0)[0] intercept   : AIC=inf, Time=1.84 sec
 ARIMA(2,0,5)(0,0,0)[0] intercept   : AIC=6801.6


No supported index is available. Prediction results will be given with an integer index beginning at `start`.


No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.



# Classification

**Data Preparation**

In [None]:
# Independent copy for the classification section, so the feature engineering
# below does not modify `layoff_df`.
layoff_cf_df = layoff_df.copy()

In [None]:
def risk_levels(row, threshold=None):
    """Flag a record as high risk when its layoff percentage reaches a threshold.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with a 'Percentage' entry.
        threshold: Cut-off to compare against. When omitted, the mean of
            `layoff_cf_df.Percentage` is used, as in the original behaviour.
            Passing a precomputed value avoids recomputing that mean for
            every row when the function is applied row-wise.

    Returns:
        bool: True when the row's percentage is greater than or equal to the
        threshold, otherwise False.
    """
    if threshold is None:
        threshold = layoff_cf_df.Percentage.mean()
    # bool(...) converts a NumPy boolean into a plain Python bool.
    return bool(row['Percentage'] >= threshold)

# Label a record as risky when its layoff percentage is at or above the
# dataset mean. The comparison is vectorized and the mean is computed once;
# a row-wise apply would recompute the mean for every row.
layoff_cf_df['Risk'] = layoff_cf_df['Percentage'] >= layoff_cf_df['Percentage'].mean()

In [None]:
# Remove identifier/date columns and the two layoff-size columns (`Risk` was
# derived from `Percentage`), then discard rows with any missing feature.
non_feature_columns = ["Company", "Laid_Off_Count", "Percentage", "Date"]
layoff_cf_df = layoff_cf_df.drop(columns=non_feature_columns).dropna()
layoff_cf_df

Unnamed: 0,Location_HQ,Industry,Funds_Raised,Stage,Country,Risk
0,SF Bay Area,Sales,65.0,Post-IPO,United States,False
3,Bengaluru,Food,3600.0,Unknown,India,False
4,Boulder,Logistics,299.0,Series B,United States,False
9,Stockholm,Media,275.0,Post-IPO,Sweden,False
13,Walldorf,Other,1300.0,Post-IPO,Germany,False
...,...,...,...,...,...,...
3367,Denver,Travel,79.0,Series C,United States,False
3368,Austin,Support,6.0,Seed,United States,True
3370,Los Angeles,Transportation,45.0,Unknown,United States,False
3371,SF Bay Area,Consumer,1.0,Seed,United States,True


In [None]:
# Number of distinct values per column, to judge how many dummy columns
# one-hot encoding would create.
layoff_cf_df.nunique()

Location_HQ     130
Industry         31
Funds_Raised    544
Stage            16
Country          37
Risk              2
dtype: int64

When we convert these features to dummy variables to make the dataset numerical, it can lead to a very high number of columns. So there is a need to **reduce some level of granularity**

In [None]:
# List the distinct funding stages to see which ones can be merged.
layoff_cf_df.Stage.unique()

array(['Post-IPO', 'Unknown', 'Series B', 'Series D', 'Acquired',
       'Series C', 'Series H', 'Series F', 'Series E', 'Series A',
       'Series G', 'Private Equity', 'Seed', 'Subsidiary', 'Series I',
       'Series J'], dtype=object)

In [None]:
# Merge every "Series X" funding round into a single "Series" category to
# reduce the number of dummy columns.
layoff_cf_df.loc[layoff_cf_df['Stage'].str.startswith('Series'), 'Stage'] = 'Series'

# Shuffle with a fixed seed: an unseeded shuffle changes the row order on every
# run, which makes the seeded train/test split below non-reproducible.
layoff_cf_df = layoff_cf_df.sample(frac=1, random_state=42).reset_index(drop=True)

# One-hot encode the categorical features; `Funds_Raised` stays numeric.
X = layoff_cf_df.drop(columns=['Risk'])
X = pd.get_dummies(X, prefix=['Stage', 'Industry', 'Country', 'Location_HQ'], columns=['Stage', 'Industry', 'Country', 'Location_HQ'])
y = layoff_cf_df['Risk']

# Split first, then fit the scaler on the training rows only. Fitting it on
# all rows would leak the test set's mean and variance into training.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept so any cell that still refers to `scaled_X` continues to work; it now
# uses the training-set statistics.
scaled_X = scaler.transform(X)

**Logistic**

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline classifier with default hyper-parameters, fitted on the training split.
logistic_model = LogisticRegression().fit(X_train, y_train)

# Score the held-out split.
y_pred = logistic_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
logistic_report = classification_report(y_test, y_pred)

print(f"Logistic Regression Accuracy: {accuracy:.2f}")
print("Logistic Regression Classification Report:")
print(logistic_report)

Logistic Regression Accuracy: 0.66
Logistic Regression Classification Report:
              precision    recall  f1-score   support

       False       0.72      0.85      0.78       301
        True       0.36      0.20      0.26       127

    accuracy                           0.66       428
   macro avg       0.54      0.53      0.52       428
weighted avg       0.61      0.66      0.62       428



**Random Forest**

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score
# train_test_split, accuracy_score and classification_report are already
# imported in the notebook's first cell and are not re-imported here.

# Seed the forest: without a fixed random_state the bootstrap samples and
# feature subsets differ on every run, so the selected parameters and the
# report below would not be reproducible.
rf_classifier = RandomForestClassifier(random_state=42)

# Define the parameter grid for GridSearchCV
param_grid = {
    'n_estimators': [50, 100, 200],
    'min_samples_split': [2, 4, 8],
    'min_samples_leaf': [1, 2, 4]
}

# 5-fold cross-validated search that selects the combination with the best
# recall on the positive (Risk == True) class.
grid_search = GridSearchCV(rf_classifier, param_grid, cv=5, scoring='recall')

# Fit the model to the training data
clf = grid_search.fit(X_train, y_train)

# Get the best parameters and best estimator
best_params = grid_search.best_params_
best_estimator = grid_search.best_estimator_

print("Best Parameters:", best_params)

# Predict on the test set using the best estimator
y_pred = best_estimator.predict(X_test)

# Evaluate the model
report = classification_report(y_test, y_pred)
print("Random Forest Classification Report:\n", report)

Best Parameters: {'min_samples_leaf': 1, 'min_samples_split': 4, 'n_estimators': 200}
Random Forest Classification Report:
               precision    recall  f1-score   support

       False       0.78      0.86      0.82       301
        True       0.56      0.42      0.48       127

    accuracy                           0.73       428
   macro avg       0.67      0.64      0.65       428
weighted avg       0.71      0.73      0.72       428

