In [1]:
import pandas as pd
import statsmodels.api as sm

# Load the dataset
csv_file_path = 'transformed_ebay_data.csv'  # Replace with your actual CSV file path
df = pd.read_csv(csv_file_path)

# Select predictors and target variable
predictors = [
    'remainder__Time duration',
    'num__Feedback Score',
    'cat__Listing Type_Auction',
    'cat__Listing Type_AuctionWithBIN',
    'cat__Listing Type_FixedPrice',
    'cat__Listing Type_StoreInventory',
    'cat__Shipping Type_Calculated',
    'cat__Shipping Type_CalculatedDomesticFlatInternational',
    'cat__Shipping Type_Flat'
]

# Prepare the data
X = df[predictors]
y = df['num__Price']

# Add a constant to the predictors for the intercept
X = sm.add_constant(X)

# Fit the regression model
model = sm.OLS(y, X).fit()

# Display the regression summary
print(model.summary())


                            OLS Regression Results                            
Dep. Variable:             num__Price   R-squared:                       0.015
Model:                            OLS   Adj. R-squared:                  0.015
Method:                 Least Squares   F-statistic:                     145.7
Date:                Sun, 24 Nov 2024   Prob (F-statistic):          1.93e-244
Time:                        23:56:05   Log-Likelihood:            -1.0828e+05
No. Observations:               76717   AIC:                         2.166e+05
Df Residuals:                   76708   BIC:                         2.167e+05
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                                                             coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------

## Significant Predictors (p-value < 0.05):
num__Feedback Score:

Coefficient: -0.0735
Interpretation: A higher feedback score decreases the price by approximately 0.0735 units, suggesting competitive pricing by sellers with high feedback scores.
cat__Listing Type_AuctionWithBIN:

Coefficient: -0.0316
Interpretation: Items listed as "Auction with Buy It Now" tend to have lower prices compared to other listing types.
cat__Shipping Type_Calculated:

Coefficient: 0.1957
Interpretation: Shipping types that are calculated tend to increase the price by approximately 0.1957 units.
cat__Shipping Type_Flat:

Coefficient: -0.2078
Interpretation: Flat-rate shipping reduces the price by approximately 0.2078 units.

## Non-Significant Predictors (p-value > 0.05):
remainder__Time duration:

Coefficient: -4.36e-05
Time duration has no significant impact on price after accounting for other predictors.
Other Listing Types:

These predictors (e.g., cat__Listing Type_FixedPrice, cat__Listing Type_StoreInventory) do not have a statistically significant relationship with price.

## Conclusions
Key Drivers of Price:

Feedback Score: Sellers with higher feedback scores tend to price items lower.
Shipping Type: Calculated shipping increases prices, while flat-rate shipping decreases them.
Auction with BIN: This listing type is associated with lower prices.
Limited Role of Time Duration:

Time duration does not have a meaningful impact on price in this context.
Model Fit:

The low R-squared indicates that there are other factors influencing price that are not included in the model.


## Next Steps
Address Multicollinearity:

Use variance inflation factor (VIF) analysis to identify and remove correlated predictors.
Explore Additional Predictors:

Include features such as product categories, item conditions, or market trends.
Refine the Model:

Consider transforming variables (e.g., log transformations) to improve the fit.
Test interaction terms between predictors (e.g., Feedback Score and Listing Type).

In [2]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Step 1: Calculate VIF for each predictor
X_vif = sm.add_constant(X)  # Add constant for VIF calculation
vif_data = pd.DataFrame()
vif_data["Feature"] = X_vif.columns
vif_data["VIF"] = [variance_inflation_factor(X_vif.values, i) for i in range(X_vif.shape[1])]

# Display the VIF values
print(vif_data)

# Step 2: Remove predictors with high VIF
# Assuming a threshold of VIF > 10 for high multicollinearity
high_vif_features = vif_data[vif_data["VIF"] > 10]["Feature"].tolist()
X_reduced = X.drop(columns=high_vif_features, errors="ignore")

# Add constant to reduced predictors
X_reduced = sm.add_constant(X_reduced)

# Step 3: Refit the regression model with reduced predictors
model_reduced = sm.OLS(y, X_reduced).fit()

# Step 4: Display the regression summary
print(model_reduced.summary())


  return 1 - self.ssr/self.centered_tss
  vif = 1. / (1. - r_squared_i)


                                             Feature       VIF
0                                              const  0.000000
1                           remainder__Time duration  1.097996
2                                num__Feedback Score  1.102330
3                          cat__Listing Type_Auction       inf
4                   cat__Listing Type_AuctionWithBIN       inf
5                       cat__Listing Type_FixedPrice       inf
6                   cat__Listing Type_StoreInventory       inf
7                      cat__Shipping Type_Calculated  1.127186
8  cat__Shipping Type_CalculatedDomesticFlatInter...  1.000841
9                            cat__Shipping Type_Flat  1.168007
                            OLS Regression Results                            
Dep. Variable:             num__Price   R-squared:                       0.014
Model:                            OLS   Adj. R-squared:                  0.014
Method:                 Least Squares   F-statistic:                  