In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Data preprocessing and EDA
# Read the auction records from the "eBay auctions" worksheet; the first
# row of the sheet supplies the column names.
file_path = "ebayAuctions.xlsx"
df = pd.read_excel(
    io=file_path,
    sheet_name="eBay auctions",
    header=0,
)
# Print column dtypes and non-null counts for the freshly loaded frame.
df.info()

In [None]:
# Normalise the column labels: trim surrounding whitespace and remove any
# "?" characters, then rename the two camelCase columns to PascalCase.
df.columns = [col.strip().replace("?", "") for col in df.columns]
pascal_case_names = {"sellerRating": "SellerRating", "endDay": "EndDay"}
df.rename(columns=pascal_case_names, inplace=True)
# Preview the first rows with the cleaned headers.
df.head()

In [None]:
# Re-inspect the frame after the column clean-up: info() prints dtypes and
# non-null counts; describe() is the cell's last expression, so its summary
# table is the displayed output.
df.info()
df.describe()

In [None]:
# Cast the target to bool. As a bool column it is not matched by the
# np.number dtype selection made further down.
# NOTE(review): astype(bool) maps NaN to True — assumes the column has no
# missing values; confirm against the df.info() output.
df['Competitive'] = df['Competitive'].astype(bool)

In [None]:
# Partition the column names by dtype: number-typed columns versus
# object/category-typed columns.
numeric = list(df.select_dtypes(include="number").columns)
categorical = list(df.select_dtypes(include=["object", "category"]).columns)

# Show both lists as the cell output.
(numeric, categorical)

In [None]:
def mark_outliers(s, lower=0.01, upper=0.99):
    """Flag the values of ``s`` that fall strictly outside a quantile band.

    Parameters
    ----------
    s : pandas.Series
        Numeric values to examine.
    lower, upper : float
        Quantile levels (between 0 and 1) that bound the band.

    Returns
    -------
    pandas.Series
        Boolean Series aligned with ``s``: True where the value is below the
        ``lower`` quantile or above the ``upper`` quantile. Values equal to a
        bound, and missing values, are reported as False.
    """
    lower_bound = s.quantile(lower)
    upper_bound = s.quantile(upper)
    below_band = s.lt(lower_bound)
    above_band = s.gt(upper_bound)
    return below_band | above_band

# Only continuous features get an outlier flag; each flag is stored in a
# new boolean column named "<feature>_outlier".
cont_cols = ["OpenPrice", "ClosePrice", "SellerRating"]

for feature in cont_cols:
    flag_col = f"{feature}_outlier"
    df[flag_col] = mark_outliers(df[feature])

In [None]:
# Build the modelling table: one-hot encode every categorical column
# (keeping all levels) and append the numeric columns plus the target.
dummies = pd.get_dummies(df[categorical], drop_first=False)
data = pd.concat([df[numeric + ['Competitive']], dummies], axis=1)
# info() prints by itself; describe() only renders when it is the last
# expression of the cell, so it has to come second or its summary table
# is computed and silently discarded.
data.info()
data.describe()

In [None]:
# Bar chart of the share of competitive vs. non-competitive auctions.
fig1, ax1 = plt.subplots(figsize=(6, 4))
# `counts` receives the Axes object returned by .plot().
counts = (
    df['Competitive']
    .value_counts(normalize=True)
    .plot(kind='bar', color=['skyblue', 'salmon'], ax=ax1)
)
ax1.set_title("Competitive vs Non-Competitive Auctions")
ax1.set_ylabel("Proportion")
plt.show()

In [None]:
# Box plot of ClosePrice for non-competitive vs. competitive auctions.
fig2 = plt.figure(figsize=(6,4))
ax2 = fig2.add_subplot(1,1,1)
lp0 = df.loc[df['Competitive']==0,'ClosePrice'].dropna()
lp1 = df.loc[df['Competitive']==1,'ClosePrice'].dropna()
# Name the boxes through the axis rather than boxplot(labels=...): that
# keyword was renamed to tick_labels in Matplotlib 3.9 and the old spelling
# is deprecated. boxplot() places the boxes at x = 1, 2 by default.
ax2.boxplot([lp0, lp1])
ax2.set_xticks([1, 2])
ax2.set_xticklabels(['Non-comp','Comp'])
ax2.set_title('ClosePrice by Competitiveness')
plt.tight_layout()
plt.show()

In [None]:
# Same comparison as the raw ClosePrice box plot, on a log(1 + x) scale.
fig2 = plt.figure(figsize=(6,4))
ax2 = fig2.add_subplot(1,1,1)
lp0 = np.log1p(df.loc[df['Competitive']==0,'ClosePrice'].dropna())
lp1 = np.log1p(df.loc[df['Competitive']==1,'ClosePrice'].dropna())
# Name the boxes through the axis rather than boxplot(labels=...): that
# keyword was renamed to tick_labels in Matplotlib 3.9 and the old spelling
# is deprecated. boxplot() places the boxes at x = 1, 2 by default.
ax2.boxplot([lp0, lp1])
ax2.set_xticks([1, 2])
ax2.set_xticklabels(['Non-comp','Comp'])
ax2.set_title('Log(1+ClosePrice) by Competitiveness')
plt.tight_layout()
plt.show()

In [None]:
# One box plot per numeric column, split by the Competitive flag.
for col in numeric:
    plt.figure(figsize=(6, 4))
    axis = sns.boxplot(data=df, x='Competitive', y=col)
    axis.set_title(f"{col} by Competitiveness")
    plt.show()

In [None]:
log_cols = ['OpenPrice', 'ClosePrice', 'SellerRating']

# Log-scale plots: box plot of log(1 + value) per column, split by target.
for col in log_cols:
    plt.figure(figsize=(6,4))
    sns.boxplot(x='Competitive', y=np.log1p(df[col]), data=df)
    # seaborn takes the y-axis label from the Series name, i.e. the raw
    # column name; relabel so the axis states that the values are
    # log-transformed.
    plt.ylabel(f"Log(1+{col})")
    plt.title(f"Log(1+{col}) by Competitiveness")
    plt.show()

In [None]:
# Pairwise correlations between the numeric columns and the target,
# drawn as an annotated heatmap.
corr_matrix = df[numeric + ['Competitive']].corr()
plt.figure(figsize=(6, 4))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='coolwarm')
plt.title("Correlation Heatmap")
plt.show()

In [None]:
# For every categorical column: percentage of competitive auctions per
# level, drawn as bars sorted from highest to lowest rate.
for col in categorical:
    mean_by_level = df.groupby(col)['Competitive'].mean()
    comp_rates = mean_by_level.sort_values(ascending=False) * 100
    plt.figure(figsize=(8, 4))
    sns.barplot(x=comp_rates.index, y=comp_rates.values, palette="viridis")
    plt.title(f"% Competitive by {col}")
    plt.ylabel("% Competitive Auctions")
    plt.xticks(rotation=45, ha="right")
    plt.show()

In [None]:
# Top 10 categories by number of listings, in descending frequency order.
topcats = df['Category'].value_counts().nlargest(10).index.tolist()
# Keep only rows in those categories, take the mean of the boolean target
# per category (True counts as 1) as a percentage, and reorder the result
# to follow the frequency ranking.
# Group by the column name so the key comes from the filtered rows
# themselves instead of relying on index alignment with the full column.
in_top = df['Category'].isin(topcats)
perc_comp = (df.loc[in_top].groupby('Category')['Competitive'].mean() * 100).reindex(topcats)
fig3 = plt.figure(figsize=(10,4))
ax3 = fig3.add_subplot(1,1,1)
ax3.bar(range(len(topcats)), perc_comp)
ax3.set_xticks(range(len(topcats)))
ax3.set_xticklabels(topcats, rotation=45, ha='right')
ax3.set_ylabel('% Competitive auctions')
ax3.set_title(f"% Competitive by Category (top {len(topcats)})")
plt.tight_layout()
plt.show()

In [None]:
# Relative markup may matter more than the absolute prices:
#   ratio close to 1   -> the auction closed near its opening price
#   ratio well above 1 -> the price rose substantially during the auction
# Both prices are shifted by 1 before dividing, and the ratio is then
# log1p-transformed for plotting.
open_shifted = df['OpenPrice'] + 1
close_shifted = df['ClosePrice'] + 1
df['PriceRatio'] = close_shifted / open_shifted
df['PriceRatio_log'] = np.log1p(df['PriceRatio'])
sns.boxplot(data=df, x='Competitive', y='PriceRatio_log')
plt.title("Log(1 + Price Ratio) by Competitiveness")
plt.show()


In [None]:
# Bin SellerRating into four quantile-based tiers and compare the mean of
# the target per tier (hypothesis: competitive auctions are concentrated
# among higher-rated sellers).
tier_labels = ["Low", "Mid", "High", "Top"]
df['SellerTier'] = pd.qcut(df['SellerRating'], q=len(tier_labels), labels=tier_labels)
sns.barplot(data=df, x='SellerTier', y='Competitive', estimator=np.mean)
plt.title("Competitiveness by Seller Tier")
plt.show()

In [None]:
# Competitive share for each auction Duration value: the bar height is the
# mean of the boolean target within that duration (hypothesis: short and
# long auctions attract bidders differently).
sns.barplot(
    data=df,
    x='Duration',
    y='Competitive',
    estimator=np.mean,
)
plt.title("Competitiveness by Auction Duration")
plt.show()

In [None]:
# Interaction check: mean of the target for every Category x SellerTier
# combination, shown as a percentage-annotated heatmap (hypothesis:
# high-rated sellers dominate the competitive categories).
pivot = pd.pivot_table(
    df,
    index="Category",
    columns="SellerTier",
    values="Competitive",
    aggfunc="mean",
)
plt.figure(figsize=(10, 6))
sns.heatmap(pivot, annot=True, cmap="YlGnBu", fmt=".1%")
plt.title("Competitiveness by Category & Seller Tier")
plt.show()

In [None]:
# Decision Trees
import graphviz
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score
!pip install dmba
from dmba import classificationSummary

In [None]:
# Print the dtypes and non-null counts of the modelling table before it is
# prepared for the classifier.
data.info()

In [None]:
# Convert every boolean column of the modelling table to integers
# (False -> 0, True -> 1), one column at a time.
bool_cols = data.select_dtypes(include='bool').columns
for flag in bool_cols:
    data[flag] = data[flag].astype(int)

In [None]:
# Hold out 40% of the rows for testing and fit a classification tree on
# all remaining columns; each leaf must hold at least 50 training rows.
y = data["Competitive"]
X = data.drop(columns=["Competitive"])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=1
)
fullClassTree = DecisionTreeClassifier(min_samples_leaf=50, random_state=1)
fullClassTree.fit(X_train, y_train)
y_predicted = fullClassTree.predict(X_test)
# Accuracy on the held-out rows (displayed as the cell output).
accuracy_score(y_true=y_test, y_pred=y_predicted)

In [None]:
# Confusion-matrix summaries for the full tree: training split first,
# then the held-out test split.
for actual, features in ((y_train, X_train), (y_test, X_test)):
    classificationSummary(actual, fullClassTree.predict(features))

In [None]:
# Names used when rendering the full tree.
feature_names = list(X.columns)
# Take the class labels from the fitted estimator: classes_ is sorted in
# ascending order, which is the order plot_tree expects for class_names.
# y.unique() returns labels in order of first appearance and can therefore
# attach the wrong label to each class.
list_int = list(fullClassTree.classes_)
class_names = [str(label) for label in list_int]
# Plain-text dump of the fitted tree's split rules.
text_representation = tree.export_text(fullClassTree, feature_names = feature_names)
print(text_representation)

In [None]:
fig = plt.figure(figsize=(25,20))
tree.plot_tree(fullClassTree, feature_names=feature_names, class_names=class_names,
              rounded=True, filled=True)
%matplotlib inline
plt.show()

In [None]:
# Importance score of every predictor in the full tree, labelled by column
# name and listed from most to least important.
feature_importances = pd.Series(
    data=fullClassTree.feature_importances_,
    index=X_train.columns,
)
feature_importances.sort_values(ascending=False)

In [None]:
# For new auctions: a second tree that uses only OpenPrice and SellerRating.
# For better visualization, log-transform SellerRating and OpenPrice.
# Work on a copy: a plain assignment would alias `data`, so the *_log
# columns would be written into `data` itself and picked up by any model
# re-fitted from it.
data_clean = data.copy()
for c in ["OpenPrice","SellerRating"]:
    data_clean[f"{c}_log"] = np.log(data_clean[c]+1)

X = data_clean[["OpenPrice_log", "SellerRating_log"]]
y = data_clean["Competitive"]
# Same 60/40 split and leaf-size constraint as the full tree.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.4,random_state=1)
newClassTree = DecisionTreeClassifier(random_state=1, min_samples_leaf=50)
newClassTree.fit(X_train, y_train)
y_predicted = newClassTree.predict(X_test)
# Accuracy on the held-out rows (displayed as the cell output).
accuracy_score(y_test, y_predicted)

In [None]:
# Confusion-matrix summaries for the two-predictor tree: training split
# first, then the held-out test split.
for actual, features in ((y_train, X_train), (y_test, X_test)):
    classificationSummary(actual, newClassTree.predict(features))

In [None]:
# Names used when rendering the two-predictor tree.
feature_names = list(X.columns)
# Take the class labels from the fitted estimator: classes_ is sorted in
# ascending order, which is the order plot_tree expects for class_names.
# y.unique() returns labels in order of first appearance and can therefore
# attach the wrong label to each class.
list_int = list(newClassTree.classes_)
class_names = [str(label) for label in list_int]
# Plain-text dump of the fitted tree's split rules.
text_representation = tree.export_text(newClassTree, feature_names = feature_names)
print(text_representation)

In [None]:
fig = plt.figure(figsize=(25,20))
tree.plot_tree(newClassTree, feature_names=feature_names, class_names=class_names,
              rounded=True, filled=True)
%matplotlib inline
plt.show()

In [None]:
# Decision Tree boundary over scatter plot
x1 = X_test["OpenPrice_log"]
x2 = X_test["SellerRating_log"]

# 100 x 100 grid spanning the observed range of both test-set features.
x1_range = np.linspace(x1.min(), x1.max(), 100)
x2_range = np.linspace(x2.min(), x2.max(), 100)
xx, yy = np.meshgrid(x1_range, x2_range)
grid_points = np.c_[xx.ravel(), yy.ravel()]
# The tree was fitted on a DataFrame, so predict on a DataFrame carrying the
# same column names in the training order; a bare ndarray triggers
# scikit-learn's feature-name mismatch warning.
grid_frame = pd.DataFrame(grid_points, columns=["OpenPrice_log", "SellerRating_log"])
grid_frame = grid_frame[list(X_test.columns)]
Z = newClassTree.predict(grid_frame)
Z = Z.reshape(xx.shape)

plt.figure(figsize=(8,6))

# Shaded background: predicted class at every grid point.
plt.contourf(xx, yy, Z, levels=10, cmap="coolwarm", alpha=0.2)

# Test-set auctions coloured by their actual class.
plt.scatter(x1[y_test==0], x2[y_test==0], color='blue', label='Non-Competitive', alpha=0.6)
plt.scatter(x1[y_test==1], x2[y_test==1], color='red', label='Competitive', alpha=0.6)

plt.xlabel('OpenPrice_log')
plt.ylabel('SellerRating_log')
plt.title('Auctions by Log SellerRating and Log OpenPrice')
plt.legend()
plt.show()