In [1]:
import pandas as pd
import numpy as np
#from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [2]:
import pandas as pd
import numpy as np


# Set a random seed for reproducibility
np.random.seed(42)

# Generate 1000 random employee IDs
employee_ids = np.arange(1, 1001)

# Generate random ages following a normal distribution
ages = np.random.normal(loc=35, scale=5, size=1000).astype(int)

# Generate random salaries following a right-skewed distribution
salaries = np.random.lognormal(mean=10, sigma=0.5, size=1000).astype(int)

# Generate random years of experience following a uniform distribution
experience_years = np.random.randint(0, 31, size=1000)

# Generate random performance scores following a normal distribution
performance_scores = np.random.normal(loc=75, scale=10, size=1000)

# Generate a categorical 'Location' variable with 3 locations
locations = np.random.choice(['NY', 'LA', 'Chicago'], size=1000)

# Generate random department labels for each employee
departments = np.random.choice(['HR', 'Finance', 'Marketing', 'Engineering', 'Sales'], size=1000)

# Create a DataFrame with the generated data
data = {
    'Employee_ID': employee_ids,
    'Age': ages,
    'Salary': salaries,
    'Experience_Years': experience_years,
    'Performance_Score': performance_scores,
    'Department': departments,  # Categorical variable
    'Location': locations  # New categorical variable 'Location'
}

df = pd.DataFrame(data)



In [3]:
def reg(data, independent_variables, dependent_variable, add_const=1, robust=0, se_calc='HC3', clustering_vars=None, filtering_conditions=None):
    """
    Perform linear regression with various options.
    
    Parameters:
        data (DataFrame): The dataset containing the variables.
        independent_variables (list): A list of independent variable names.
        dependent_variable (str): The name of the dependent variable.
        add_const (int, optional): Whether to add a constant term (intercept) to the regression. Default is 1 (add constant).
        robust (int, optional): Whether to use robust standard errors. Default is 0 (no robust standard errors).
        se_calc (str, optional): The type of standard errors to calculate. Default is 'HC3'.
        clustering_vars (list, optional): A list of variables to be used for clustering standard errors. Default is None (no clustering).
        filtering_conditions (list, optional): A list of filtering conditions to subset the data. Each condition should be a string. Default is None (no filtering).

    Returns:
        summary (Summary): A summary of the regression results.
    """
    
    # Apply filtering conditions to subset the data
    if filtering_conditions is not None:
        # Ensure filtering_conditions is a list
        if type(filtering_conditions) == str:
            filtering_conditions = [filtering_conditions]
        for cond in filtering_conditions:
            input_str = cond
            split_lst = input_str.split()
            string_operator = split_lst[1]
            # Filter the data based on the conditions
            eval(f'data["{split_lst[0]}"] {string_operator} "{split_lst[2]}"')
    
    # Extract independent and dependent variables
    X = data[independent_variables]
    y = data[dependent_variable]
    
    # Add a constant term if requested
    if add_const == 1:
        X = sm.add_constant(X)

    # Create the regression model
    model = sm.OLS(y, X)
    
    # Fit the model with various options
    if robust == 0 and clustering_vars is None:
        result = model.fit()
    elif clustering_vars is None:
        result = model.fit(cov_type=se_calc)
    else:
        # Ensure clustering_vars is a list
        if type(clustering_vars) == str:
            clustering_vars = [clustering_vars]
        for var in clustering_vars:
            # Create categorical codes for clustering
            data[var + '_group'] = data[var].astype('category').cat.codes
        # Fit the model with clustering standard errors
        result = model.fit(cov_type='cluster', cov_kwds={'groups': data[[var+"_group" for var in clustering_vars]]})
    
    # Return a summary of the regression results
    return result.summary()


reg(data = df, independent_variables='Salary',
    dependent_variable='Performance_Score', robust=1,
    filtering_conditions=["Location != NY",'Location != LA'],
    clustering_vars = "Department")


0,1,2,3
Dep. Variable:,Performance_Score,R-squared:,0.0
Model:,OLS,Adj. R-squared:,-0.001
Method:,Least Squares,F-statistic:,0.02232
Date:,"Sat, 20 Jan 2024",Prob (F-statistic):,0.888
Time:,12:13:55,Log-Likelihood:,-3732.0
No. Observations:,1000,AIC:,7468.0
Df Residuals:,998,BIC:,7478.0
Df Model:,1,,
Covariance Type:,cluster,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,75.2876,1.010,74.559,0.000,73.309,77.267
Salary,-5.786e-06,3.87e-05,-0.149,0.881,-8.17e-05,7.01e-05

0,1,2,3
Omnibus:,0.167,Durbin-Watson:,2.06
Prob(Omnibus):,0.92,Jarque-Bera (JB):,0.2
Skew:,0.03,Prob(JB):,0.905
Kurtosis:,2.967,Cond. No.,62800.0


In [9]:
def rreg(data, independent_variables, dependent_variable, add_const = 1, clustering_vars=None,
       filtering_conditions = None):
    if filtering_conditions != None:
        for cond in filtering_conditions:
            input_str = cond
            split_lst = input_str.split()
            string_operator = split_lst[1]
            eval(f'data["{split_lst[0]}"] {string_operator} "{split_lst[2]}"')
    X = data[independent_variables]
    y = data[dependent_variable]
    if add_const == 1:
        model = sm.RLM(y, sm.add_constant(X))
    else:
        model = sm.RLM(y, X)
    if clustering_vars==None:
        result = model.fit()
    else:
        for var in clustering_vars:
            data[var + '_group'] = data[var].astype('category').cat.codes
        result = model.fit(cov_type='cluster', 
                           cov_kwds={'groups': data[[var+"_group" for var in clustering_vars]]})
    return result.summary()

rreg(data = df, independent_variables=['Salary','Experience_Years']
    ,dependent_variable='Performance_Score',filtering_conditions=["Location == NY"],
    clustering_vars=['Department'])


TypeError: RLM.fit() got an unexpected keyword argument 'cov_type'

In [5]:
import pandas as pd
from linearmodels.panel import PanelOLS
from linearmodels.panel import FamaMacBeth

def rreg(data, independent_variables, dependent_variable, add_const=1, clustering_vars=None, filtering_conditions=None):
    if filtering_conditions is not None:
        for cond in filtering_conditions:
            input_str = cond
            split_lst = input_str.split()
            string_operator = split_lst[1]
            data = data[eval(f'data["{split_lst[0]}"] {string_operator} "{split_lst[2]}"')]
    
    X = data[independent_variables]
    y = data[dependent_variable]
    
    if add_const == 1:
        X = sm.add_constant(X)

    model = PanelOLS(y, X, entity_effects=True, time_effects=False)
    
    if clustering_vars is not None:
        cluster_entity = data[clustering_vars].astype('category')
        model = FamaMacBeth(y, X, entity_effects=True)
    
    results = model.fit(cov_type='kernel', kernel='bartlett', cluster_entity=cluster_entity)
    return results.summary

rreg(data=df, independent_variables=['Salary', 'Experience_Years'], dependent_variable='Performance_Score', filtering_conditions=["Location == 'NY'"], clustering_vars=['Department'])


ValueError: Series can only be used with a 2-level MultiIndex