In [None]:
import numpy as np
import pandas as pd

# Report the installed numpy and pandas versions (numpy first, then pandas).
for lib in (np, pd):
    print(lib.__version__)


In [None]:
import pandas as pd

# Read the listings CSV (path is relative to the kernel's working directory)
# and preview the first five rows.
df = pd.read_csv(filepath_or_buffer="../data/depop_vintage_womens_tees.csv")
df.head(n=5)


In [None]:
import os
# Show the kernel's working directory; the relative CSV paths used in this
# notebook are resolved against it.
os.getcwd()


In [None]:
# Load the listings CSV. Other cells in this notebook read the same file from
# "../data/...", so the location depends on the kernel's working directory.
# Fall back to the parent directory when "data/..." is not present instead of
# raising FileNotFoundError.
from pathlib import Path

csv_path = Path("data/depop_vintage_womens_tees.csv")
if not csv_path.exists():
    csv_path = Path("..") / csv_path

df = pd.read_csv(csv_path)
df.head()


In [None]:
import pandas as pd

# Load the listings CSV. The file is read from "../data/..." elsewhere in this
# notebook, so try "data/..." first and fall back to the parent directory when
# it is missing rather than failing with FileNotFoundError.
from pathlib import Path

csv_path = Path("data/depop_vintage_womens_tees.csv")
if not csv_path.exists():
    csv_path = Path("..") / csv_path

df = pd.read_csv(csv_path)
df.head()

In [None]:
import os

# List the entries of the current working directory (no argument = ".").
os.listdir()


In [None]:
import os
# List the entries of the "data" directory, resolved relative to the working
# directory; raises FileNotFoundError if that directory does not exist there.
os.listdir("data")


In [None]:
# (Re)load the listings CSV into `df`. Because the notebook also reads this
# file from "../data/...", look in "data/..." first and fall back to the parent
# directory so the cell works from either working directory.
from pathlib import Path

csv_path = Path("data/depop_vintage_womens_tees.csv")
if not csv_path.exists():
    csv_path = Path("..") / csv_path

df = pd.read_csv(csv_path)
df.head()


In [None]:
# Normalize the two categorical text columns to lowercase so that values
# differing only by case collapse into a single category.
for col in ("condition", "brand"):
    df[col] = df[col].str.lower()


In [None]:
# Preview the first rows after lowercasing `condition` and `brand`.
df.head()


In [None]:
# Print column dtypes and non-null counts for the frame.
df.info()


In [None]:
# Summary statistics for the frame's columns (pandas defaults).
df.describe()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# `likes` is loaded from the CSV as text (it contains capped values such as
# "99+") and is only converted to numbers further down the notebook. Plot a
# numeric copy so the y-axis is a real scale rather than a list of string
# categories. `df` itself is left untouched.
likes_numeric = pd.to_numeric(
    df["likes"].astype(str).str.strip().str.replace("+", "", regex=False),
    errors="coerce",
)

fig, ax = plt.subplots()
sns.scatterplot(data=df.assign(likes=likes_numeric), x="price", y="likes", ax=ax)
ax.set_title("Likes vs Price")
plt.show()


Engagement generally decreases as price increases, though some higher-priced listings still receive strong engagement, suggesting brand and condition influence visibility beyond price alone.

In [None]:
# A box plot needs a numeric y variable, but `likes` is still text at this
# point in the notebook (values such as "99+"). Convert a copy for plotting:
# strip whitespace, remove "+", and coerce anything unparseable to NaN.
likes_numeric = pd.to_numeric(
    df["likes"].astype(str).str.strip().str.replace("+", "", regex=False),
    errors="coerce",
)

fig, ax = plt.subplots()
sns.boxplot(data=df.assign(likes=likes_numeric), x="condition", y="likes", ax=ax)
ax.set_title("Likes by Condition")
plt.show()


Listings in better condition categories tend to receive higher engagement, though overlap exists across conditions.

In [None]:
# The ten most frequent brand values in the data.
top_brands = df["brand"].value_counts().head(10).index

# `likes` has not been converted to numbers yet at this point (it is text with
# values such as "99+"), so build a numeric copy for the y-axis without
# modifying `df`.
likes_numeric = pd.to_numeric(
    df["likes"].astype(str).str.strip().str.replace("+", "", regex=False),
    errors="coerce",
)
plot_df = df.assign(likes=likes_numeric)

fig, ax = plt.subplots()
sns.boxplot(
    data=plot_df[plot_df["brand"].isin(top_brands)],
    x="brand",
    y="likes",
    ax=ax,
)
plt.xticks(rotation=45)  # brand names overlap when drawn horizontally
ax.set_title("Likes by Brand (Top 10)")
plt.show()


Certain brands consistently outperform others in engagement, even at similar price points.

In [None]:
# `likes` is still text here (the CSV holds capped values such as "99+"), so a
# median cannot be computed on it directly. Build a numeric view first:
# strip whitespace, remove "+", and coerce anything unparseable to NaN.
likes_numeric = pd.to_numeric(
    df["likes"].astype(str).str.strip().str.replace("+", "", regex=False),
    errors="coerce",
)

# Label a listing 1 when its likes are at or above the median, else 0.
# Rows with missing likes compare as False and are therefore labeled 0.
median_likes = likes_numeric.median()
df["high_engagement"] = (likes_numeric >= median_likes).astype(int)

df["high_engagement"].value_counts()


Listings were labeled as high or low engagement based on the median number of likes, enabling future classification modeling.

In [None]:
import numpy as np

# Convert the text `likes` column into integers.

# Keep the original values for transparency. Snapshot them only the first time
# this cell runs: on a re-run `likes` is already cleaned, and overwriting
# `likes_raw` would silently discard the original text (e.g. "99+").
if "likes_raw" not in df.columns:
    df["likes_raw"] = df["likes"]

# 1) Standardize to string and strip surrounding whitespace
likes_text = df["likes"].astype(str).str.strip()

# 2) Remove "+" so capped values such as "99+" become "99"
likes_text = likes_text.str.replace("+", "", regex=False)

# 3) Convert to numeric; anything unparseable (including blanks) becomes NaN
likes_numeric = pd.to_numeric(likes_text, errors="coerce")

# 4) Missing likes are filled with 0 so the column can be stored as int
df["likes"] = likes_numeric.fillna(0).astype(int)

df[["likes_raw", "likes"]].head(15)


In [None]:
# Split listings at the median like count: 1 = at or above the median, 0 = below.
median_likes = df["likes"].median()
df["high_engagement"] = df["likes"].ge(median_likes).astype(int)

# Show the threshold together with the resulting class sizes.
(median_likes, df["high_engagement"].value_counts())


In [None]:
# Count rows whose original (pre-cleaning) likes text contains the substring "99".
raw_likes_text = df["likes_raw"].astype(str)
raw_likes_text.str.contains("99", na=False).sum()


In [None]:
# Summary statistics for the `likes` column.
df["likes"].describe()


In [None]:
# Coerce `days_since_posted` to numbers first; unparseable entries become NaN.
days_numeric = pd.to_numeric(df["days_since_posted"], errors="coerce")

# Fill the gaps with the median of the *coerced* values. Taking the median of
# the raw column instead would fail if it contained any non-numeric text.
df["days_since_posted"] = days_numeric.fillna(days_numeric.median())

# `price` is coerced the same way; its missing values are left as NaN here.
df["price"] = pd.to_numeric(df["price"], errors="coerce")


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, accuracy_score
from sklearn.linear_model import LinearRegression, LogisticRegression

# Column groups used by the preprocessing step.
categorical = ["condition", "brand"]
numeric = ["price", "days_since_posted"]

# Feature matrix and the two targets: raw likes (regression) and the
# median-split label (classification).
X = df[["price", "condition", "brand", "days_since_posted"]]
y_reg, y_clf = df["likes"], df["high_engagement"]

# One-hot encode the categorical columns (categories not seen during fit are
# ignored at transform time) and pass the numeric columns through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical),
        ("num", "passthrough", numeric),
    ],
)


In [None]:
# (rows, columns) of the working frame at this point.
df.shape


In [None]:
# Regression: predict likes
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression

X = df[["price", "condition", "brand", "days_since_posted"]]
y = df["likes"]

categorical = ["condition", "brand"]
numeric = ["price", "days_since_posted"]

# LinearRegression rejects NaN inputs, and the feature columns can still hold
# missing values at this point (blank CSV rows, values coerced to NaN). Impute
# inside the pipeline so the imputation statistics are learned from the
# training split only:
#   - categorical: most frequent value, then one-hot encoding
#   - numeric: column median
preprocessor = ColumnTransformer(
    transformers=[
        (
            "cat",
            Pipeline(
                steps=[
                    ("imputer", SimpleImputer(strategy="most_frequent")),
                    ("onehot", OneHotEncoder(handle_unknown="ignore")),
                ]
            ),
            categorical,
        ),
        (
            "num",
            Pipeline(steps=[("imputer", SimpleImputer(strategy="median"))]),
            numeric,
        ),
    ]
)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

reg_model = Pipeline(
    steps=[
        ("prep", preprocessor),
        ("model", LinearRegression()),
    ]
)

reg_model.fit(X_train, y_train)
preds = reg_model.predict(X_test)

# Mean absolute error on the held-out rows, in units of likes.
mae = mean_absolute_error(y_test, preds)
mae


In [None]:
# Count missing values in each model feature column.
X = df.loc[:, ["price", "condition", "brand", "days_since_posted"]]
X.isnull().sum()


In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Column groups fed to the preprocessing step.
numeric = ["price", "days_since_posted"]
categorical = ["condition", "brand"]

# Numeric features: fill gaps with the column median.
numeric_transformer = Pipeline(steps=[("imputer", SimpleImputer(strategy="median"))])

# Categorical features: fill gaps with the most frequent value, then one-hot
# encode (categories not seen during fit are ignored at transform time).
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ],
)

# Apply each sub-pipeline to its own column group.
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", categorical_transformer, categorical),
        ("num", numeric_transformer, numeric),
    ],
)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression

# Baseline regression: predict the number of likes from listing metadata.
X = df[["price", "condition", "brand", "days_since_posted"]]
y = df["likes"]

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing and the linear model are fitted together on the training split.
reg_model = Pipeline(steps=[("prep", preprocessor), ("model", LinearRegression())])
preds = reg_model.fit(X_train, y_train).predict(X_test)

# Mean absolute error on the held-out rows, in units of likes.
mae = mean_absolute_error(y_test, preds)
mae


In [None]:
# Report how many values are missing in the features and in the regression target.
y_reg = df["likes"]
X = df.loc[:, ["price", "condition", "brand", "days_since_posted"]]

print("X NaNs:\n", X.isnull().sum(), "\n")
print("y_reg NaNs:", y_reg.isnull().sum())


In [None]:
# Remove fully blank rows first, then any row that is missing a column the
# models need (the four features or the likes target).
df = df.dropna(how="all").dropna(
    subset=["price", "condition", "brand", "days_since_posted", "likes"]
)

df.shape


In [None]:
# Re-check missing values in the features and the target after dropping rows.
feature_frame = df[["price", "condition", "brand", "days_since_posted"]]
X, y_reg = feature_frame, df["likes"]

print("X NaNs:\n", X.isna().sum(), "\n")
print("y_reg NaNs:", y_reg.isna().sum())


In [None]:
# Recompute the median split on the current rows:
# 1 = likes at or above the median, 0 = below it.
median_likes = df["likes"].median()
is_high = df["likes"] >= median_likes
df["high_engagement"] = is_high.astype(int)

(median_likes, df["high_engagement"].value_counts())


In [None]:
import pandas as pd
import numpy as np

# 1) drop rows in which every column is empty
df = df.dropna(how="all")

# 2) likes: text -> integer
#    strip whitespace, remove "+" ("99+" -> "99"), parse as numbers, then drop
#    the rows that could not be parsed before casting to int
likes_text = df["likes"].astype(str).str.strip().str.replace("+", "", regex=False)
df["likes"] = pd.to_numeric(likes_text, errors="coerce")
df = df.dropna(subset=["likes"])
df["likes"] = df["likes"].astype(int)

# 3) make sure the numeric features really are numeric (bad values -> NaN)
for col in ("price", "days_since_posted"):
    df[col] = pd.to_numeric(df[col], errors="coerce")

# 4) drop rows that lack any feature required for modeling
df = df.dropna(subset=["price", "condition", "brand", "days_since_posted"])

# Show the resulting dtypes together with the remaining (rows, columns).
(df[["likes", "price", "days_since_posted"]].dtypes, df.shape)


In [None]:
# Final median split on the cleaned rows: 1 when likes >= median, otherwise 0.
median_likes = df["likes"].median()
high_mask = df["likes"].ge(median_likes)
df["high_engagement"] = high_mask.astype(int)

# Threshold and class balance, displayed as a tuple.
(median_likes, df["high_engagement"].value_counts())


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline

# Regression on the cleaned data: predict likes from price, condition, brand
# and days since posting.
y = df["likes"]
X = df[["price", "condition", "brand", "days_since_posted"]]

# 80/20 split with a fixed seed so the evaluation rows are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# `preprocessor` is the ColumnTransformer defined in an earlier cell.
reg_model = Pipeline([("prep", preprocessor), ("model", LinearRegression())])
reg_model.fit(X_train, y_train)

preds = reg_model.predict(X_test)

# Mean absolute error on the held-out rows, in units of likes.
mae = mean_absolute_error(y_test, preds)
mae


### Regression Results
The regression model predicts listing engagement (likes) with a mean absolute
error of approximately 23 likes. Given the long-tailed and noisy nature of
marketplace engagement data, this provides a reasonable baseline rather than
precise prediction.


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Classification target: the median-split label (1 = high engagement).
y = df["high_engagement"]

# Same 80/20 split and seed as the regression; `X` is the feature frame built
# in the preceding modeling cell.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

clf_model = Pipeline(
    steps=[
        ("prep", preprocessor),
        ("model", LogisticRegression(max_iter=1000)),
    ]
)

clf_model.fit(X_train, y_train)
preds = clf_model.predict(X_test)

acc = accuracy_score(y_test, preds)

# Naive baseline = accuracy of always predicting the more common class.
# `y_test.mean()` alone is only the share of positives, which understates the
# baseline whenever positives are the minority, so take the larger class share.
positive_share = y_test.mean()
baseline = max(positive_share, 1 - positive_share)

acc, baseline


### Classification Results
The classifier outperformed a naive baseline, indicating that pricing and
listing metadata (price, brand, condition, and time since posting) contain
meaningful signal for predicting engagement.


In [None]:
import pandas as pd

# Pair each preprocessed feature name with its logistic-regression coefficient.
feature_names = clf_model.named_steps["prep"].get_feature_names_out()
coefs = clf_model.named_steps["model"].coef_[0]

importance = pd.DataFrame({"feature": feature_names, "coef": coefs})
importance = importance.sort_values(by="coef", ascending=False)

# The ten largest (most positive) coefficients.
importance.head(10)


### Feature Importance Insights
Brand recognition and item condition were the strongest predictors of
engagement. Listings associated with recognizable brands showed a higher
likelihood of strong engagement, while higher prices generally reduced
engagement probability.


In [1]:
import pandas as pd
# Load the raw listings CSV from the sibling data directory and preview it.
df = pd.read_csv(filepath_or_buffer="../data/depop_vintage_womens_tees.csv")
df.head(n=5)


Unnamed: 0,listing_title,price,category,brand,condition,likes,days_since_posted
0,The Mountain Women's White and Blue T-shirt,11.0,vintage_womens_tee,The Mountain,Excellent,30,1.0
1,Urban Outfitters Women's Green T-shirt,6.0,vintage_womens_tee,Urban Outfitters,Excellent,3,2.0
2,American Vintage Women's Cream and Pink T-shirt,15.0,vintage_womens_tee,Unknown,Good,10,1.0
3,Nike Women's Grey T-shirt,25.0,vintage_womens_tee,Nike,Good,21,4.0
4,American Vintage Women's Yellow and Red T-shirt,16.0,vintage_womens_tee,Unknown,Excellent,19,1.0


In [2]:
import pandas as pd
import numpy as np

# Load the raw listings CSV, report its size, and preview the first ten rows.
df = pd.read_csv(filepath_or_buffer="../data/depop_vintage_womens_tees.csv")

print("rows, cols:", df.shape)
df.head(n=10)


rows, cols: (199, 7)


Unnamed: 0,listing_title,price,category,brand,condition,likes,days_since_posted
0,The Mountain Women's White and Blue T-shirt,11.0,vintage_womens_tee,The Mountain,Excellent,30,1.0
1,Urban Outfitters Women's Green T-shirt,6.0,vintage_womens_tee,Urban Outfitters,Excellent,3,2.0
2,American Vintage Women's Cream and Pink T-shirt,15.0,vintage_womens_tee,Unknown,Good,10,1.0
3,Nike Women's Grey T-shirt,25.0,vintage_womens_tee,Nike,Good,21,4.0
4,American Vintage Women's Yellow and Red T-shirt,16.0,vintage_womens_tee,Unknown,Excellent,19,1.0
5,L.L.Bean Women's multi T-shirt,15.0,vintage_womens_tee,L. L. Bean,Excellent,47,4.0
6,Nike Women's Red T-shirt,12.0,vintage_womens_tee,Nike,Excellent,4,3.0
7,Brandy Melville Women's Pink T-shirt,15.75,vintage_womens_tee,Brandy Melville,Good,55,2.0
8,Women's Red T-shirt,6.0,vintage_womens_tee,Unknown,Good,15,1.0
9,Victoria's Secret Women's Blue and Navy Shirt,11.75,vintage_womens_tee,Victoria's Secret,Excellent,26,3.0


In [3]:
# Per-column overview: dtype, missing count, missing percentage (2 decimals)
# and number of distinct non-null values, with the most incomplete columns first.
na_mask = df.isna()

summary = pd.DataFrame(
    {
        "dtype": df.dtypes.astype(str),
        "missing": na_mask.sum(),
        "missing_pct": (na_mask.mean() * 100).round(2),
        "n_unique": df.nunique(dropna=True),
    }
)
summary = summary.sort_values(by="missing_pct", ascending=False)

summary.head(25)


Unnamed: 0,dtype,missing,missing_pct,n_unique
listing_title,object,10,5.03,163
price,float64,10,5.03,68
brand,object,10,5.03,69
likes,object,10,5.03,56
condition,object,10,5.03,5
days_since_posted,float64,10,5.03,31
category,object,0,0.0,1


In [4]:
# Find columns whose (lowercased) name contains an engagement-related keyword
# such as likes, saves, comments, views or clicks.
keywords = ("like", "save", "comment", "engage", "view", "click")
candidates = [
    col
    for col in df.columns
    if any(word in col.lower() for word in keywords)
]
candidates


['likes']