In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

For questions 1 to 3:

Perform a linear regression to predict Y from X1, X2, and X3. Use the file homework_1.1.csv.

In [10]:
# Load the first homework dataset; the first CSV column is used as the row index.
df = pd.read_csv(filepath_or_buffer='homework_1.1.csv', index_col=0)
df.head(n=5)

Unnamed: 0,X1,X2,X3,Y
0,-0.440646,-0.390227,0.156718,-0.877671
1,-3.810099,-1.304665,-1.105117,-10.130388
2,-1.425451,-0.340049,1.115908,0.284068
3,-1.32575,0.161906,-0.25467,-1.994344
4,3.120263,1.487343,-1.164839,2.03003


In [12]:
# List the column labels to confirm which predictor and target columns were loaded.
df.columns

Index(['X1', 'X2', 'X3', 'Y'], dtype='object')

In [22]:
from scipy import stats  # NOTE: `stats` is not referenced anywhere in this cell

# Predictor table (three columns) and response series for the joint fit.
X = df.loc[:, ['X1', 'X2', 'X3']]
y = df.loc[:, 'Y']

# Fit Y on X1, X2 and X3 together with scikit-learn's least-squares estimator.
linreg = LinearRegression().fit(X, y)

# Fitted slopes (same order as the columns of X) and the fitted intercept.
coefficients, intercept = linreg.coef_, linreg.intercept_

print(f"Coefficients: {coefficients}")
print(f"Intercept: {intercept}")

Coefficients: [1.00713766 1.96456859 2.97548854]
Intercept: 0.0026430033841689515


In [17]:

# Response vector shared by every fit in this cell.
Y = df['Y'].values

# One single-column (2-D) design matrix per predictor.
X1, X2, X3 = (df[[name]].values for name in ('X1', 'X2', 'X3'))

# Simple regressions: Y on each predictor on its own.
model_X1, model_X2, model_X3 = (
    LinearRegression().fit(column, Y) for column in (X1, X2, X3)
)

# Multiple regression: Y on all three predictors at once.
X_all = df[['X1', 'X2', 'X3']].values
model_all = LinearRegression().fit(X_all, Y)

# Slope of each one-variable fit, keyed by predictor name.
simple_coefs = {
    name: model.coef_[0]
    for name, model in zip(('X1', 'X2', 'X3'), (model_X1, model_X2, model_X3))
}

# Slopes of the joint fit, keyed the same way.
multiple_coefs = {
    name: coef for name, coef in zip(['X1', 'X2', 'X3'], model_all.coef_)
}

# How far each slope moves between the one-variable fit and the joint fit.
differences = {
    name: abs(simple_coefs[name] - multiple_coefs[name])
    for name in simple_coefs
}

# One row per predictor, one column per quantity.
coef_comparison = pd.DataFrame({
    'Simple Regression Coef': simple_coefs,
    'Multiple Regression Coef': multiple_coefs,
    'Absolute Difference': differences
})

In [18]:
# Rank predictors by how much their slope changes between the two fits, largest change first.
coef_comparison.sort_values('Absolute Difference', ascending=False)

Unnamed: 0,Simple Regression Coef,Multiple Regression Coef,Absolute Difference
X2,4.083613,1.964569,2.119044
X1,1.841761,1.007138,0.834623
X3,3.097041,2.975489,0.121553


In [23]:
# Design matrix: a leading column of ones (the intercept) followed by the columns of X_all
X = np.column_stack((np.ones(X_all.shape[0]), X_all))
n, p = X.shape  # n rows (observations), p columns (intercept + slopes)

# (X'X)^-1 appears in both the estimator and its covariance, so invert once and reuse it
XtX_inv = np.linalg.inv(X.T @ X)

# OLS estimate beta_hat = (X'X)^-1 X'Y.
# X'Y is formed first so the product never materialises a p-by-n intermediate matrix.
beta_hat = XtX_inv @ (X.T @ Y)

# Residuals and the residual variance estimate, using n - p degrees of freedom
residuals = Y - X @ beta_hat
sigma_squared = (residuals @ residuals) / (n - p)

# Variance-covariance matrix of beta_hat: sigma^2 (X'X)^-1
var_beta_hat = sigma_squared * XtX_inv

# Standard errors are the square roots of the diagonal of that matrix
standard_errors = np.sqrt(np.diag(var_beta_hat))

# t-statistic for each coefficient against the null hypothesis that it is zero
t_statistics = beta_hat / standard_errors

# Summary table, one row per coefficient in design-matrix column order
coef_labels = ['Intercept', 'X1', 'X2', 'X3']
t_stats_df = pd.DataFrame({
    'Coefficient': beta_hat,
    'Std Error': standard_errors,
    't-Statistic': t_statistics
}, index=coef_labels)

t_stats_df

Unnamed: 0,Coefficient,Std Error,t-Statistic
Intercept,0.002643,0.015904,0.166181
X1,1.007138,0.016515,60.984011
X2,1.964569,0.03687,53.283212
X3,2.975489,0.015131,196.64524


In [24]:
from scipy import stats

# Residual degrees of freedom: observations minus estimated parameters
df_resid = n - p

# Two-tailed p-values from the t distribution's survival function.
# sf(t) evaluates the upper tail directly; forming 1 - cdf(t) instead cancels to
# exactly 0.0 as soon as cdf(t) rounds to 1.0, which happens for large |t|.
p_values = 2 * stats.t.sf(np.abs(t_statistics), df=df_resid)

# Add the p-values to the existing coefficient summary table
t_stats_df['p-Value'] = p_values

t_stats_df

Unnamed: 0,Coefficient,Std Error,t-Statistic,p-Value
Intercept,0.002643,0.015904,0.166181,0.868049
X1,1.007138,0.016515,60.984011,0.0
X2,1.964569,0.03687,53.283212,0.0
X3,2.975489,0.015131,196.64524,0.0


X3 is the most significant predictor of Y, because it has the largest t-statistic in absolute value (≈196.65), indicating the strongest signal relative to its standard error.

In [37]:
from sklearn.neighbors import NearestNeighbors

df2 = pd.read_csv('homework_1.2.csv', index_col=0)

# Separate the X == 0 rows from the X == 1 rows, renumbering each subset from 0
df2_x0 = df2.loc[df2['X'].eq(0)].reset_index(drop=True)
df2_x1 = df2.loc[df2['X'].eq(1)].reset_index(drop=True)

# Index the Z values of the X == 0 rows for single-nearest-neighbour lookup
nn = NearestNeighbors(n_neighbors=1)
nn.fit(df2_x0[['Z']])

# For every X == 1 row: distance to, and position of, its closest X == 0 row in Z
distances, indices = nn.kneighbors(df2_x1[['Z']])

# The matched X == 0 rows, in the same order as the X == 1 rows
# (an X == 0 row appears once for each X == 1 row it is closest to)
matches = df2_x0.iloc[indices.ravel()].reset_index(drop=True)

# Put each X == 1 row side by side with its match and the Z gap between them
matched_df = (
    df2_x1
    .rename(columns={"Y": "Y_x1", "Z": "Z_x1"})
    .assign(
        Y_x0=matches['Y'],
        Z_x0=matches['Z'],
        Z_distance=distances.ravel(),
    )
)

# Preview the matched table
matched_df.head()

Unnamed: 0,X,Y_x1,Z_x1,Y_x0,Z_x0,Z_distance
0,1,1.215189,0.715189,0.716327,0.716327,0.001138
1,1,1.145894,0.645894,0.653108,0.653108,0.007214
2,1,0.937587,0.437587,0.437032,0.437032,0.000555
3,1,1.391773,0.891773,0.778157,0.778157,0.113616
4,1,1.463663,0.963663,0.778157,0.778157,0.185506


What is the effect? Here the effect is the average Y value over the full X = 1 sample minus the average Y value over the matched X = 0 sample, where the matched X = 0 sample contains the best (nearest in Z) match for each X = 1 value. So we use the matched sample of X = 0 and the full sample of X = 1.

In [38]:
# Mean Y over every X = 1 row
avg_y_x1 = matched_df.loc[:, 'Y_x1'].mean()

# Mean Y over the X = 0 rows picked as nearest matches (one per X = 1 row)
avg_y_x0_matched = matched_df.loc[:, 'Y_x0'].mean()

# Effect estimate: X = 1 mean minus matched X = 0 mean
effect = avg_y_x1 - avg_y_x0_matched

(avg_y_x1, avg_y_x0_matched, effect)

(1.1255971378548504, 0.5822370726634649, 0.5433600651913855)

What is the distance of the farthest match in this set?

In [39]:
# Worst match quality: the largest Z gap between any X = 1 row and the
# X = 0 row it was paired with in matched_df
max_distance = matched_df['Z_distance'].max()
max_distance

0.2102170871093757

For questions 6 and 7:



Use NearestNeighbors to match data based on the variable Z, given the file homework_1.2.csv.
Try approach B: Pick all of the matches in X = 0 that are within a distance 0.2 of each X = 1. Duplicates are okay, in case a given sample with X = 0 is a good match for multiple items with X = 1. 

In [40]:
# Radius search instead of a fixed neighbour count: every X = 0 row whose Z lies
# within 0.2 of the query Z counts as a match
nn_radius = NearestNeighbors(radius=0.2)
nn_radius.fit(df2_x0[['Z']])

# For each X = 1 row, the distances to and positions of all X = 0 rows in range
radius_matches = nn_radius.radius_neighbors(df2_x1[['Z']], return_distance=True)

# Total number of matched pairs (an X = 0 row is counted once per X = 1 row it matches)
total_matches = sum(len(neighs) for neighs in radius_matches[1])

# Extract the columns once as arrays so the loop does plain positional lookups
# instead of building a whole row Series for every field of every pair
x1_y = df2_x1['Y'].to_numpy()
x1_z = df2_x1['Z'].to_numpy()
x0_y = df2_x0['Y'].to_numpy()
x0_z = df2_x0['Z'].to_numpy()

# One record per (X = 1 row, matched X = 0 row) pair.
# An X = 1 row with no X = 0 row inside the radius contributes no records.
matched_pairs = []
for i, (dist_list, idx_list) in enumerate(zip(*radius_matches)):
    for dist, idx in zip(dist_list, idx_list):
        matched_pairs.append({
            'X1_index': i,
            'X1_Y': x1_y[i],
            'X1_Z': x1_z[i],
            'X0_Y': x0_y[idx],
            'X0_Z': x0_z[idx],
            'Z_distance': dist
        })

# Explicit columns keep the schema intact even when no pair was found, so later
# cells that group by 'X1_index' still see that column
matched_radius_df = pd.DataFrame(
    matched_pairs,
    columns=['X1_index', 'X1_Y', 'X1_Z', 'X0_Y', 'X0_Z', 'Z_distance'],
)
matched_radius_df.head()

Unnamed: 0,X1_index,X1_Y,X1_Z,X0_Y,X0_Z,Z_distance
0,0,1.215189,0.715189,0.548814,0.548814,0.166376
1,0,1.215189,0.715189,0.602763,0.602763,0.112426
2,0,1.215189,0.715189,0.544883,0.544883,0.170306
3,0,1.215189,0.715189,0.528895,0.528895,0.186294
4,0,1.215189,0.715189,0.568045,0.568045,0.147145


In [41]:
# One group per X = 1 row that has at least one matched X = 0 row
by_x1_row = matched_radius_df.groupby('X1_index')

# Per group: the X = 1 row's own Y (identical on every row of the group)
# and the mean Y of its matched X = 0 rows
grouped_effects = pd.DataFrame({
    'mean_Y_x1': by_x1_row['X1_Y'].first(),
    'mean_Y_x0': by_x1_row['X0_Y'].mean(),
})

# Per-row effect: own Y minus the mean Y of the matches
grouped_effects['effect'] = grouped_effects['mean_Y_x1'].sub(grouped_effects['mean_Y_x0'])

# Overall effect: the plain average of the per-row effects
overall_effect = grouped_effects['effect'].mean()
overall_effect

0.5688516533881482