In [25]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.metrics import (
    classification_report, confusion_matrix, roc_auc_score,
    average_precision_score, precision_score, recall_score, f1_score
)

Load in the Data

In [26]:
train_df = pd.read_parquet('../train_final_v3.parquet')
val_df = pd.read_parquet('../validationA_v3.parquet')
cal_df = pd.read_parquet('../validationB_v3.parquet')

PreProcess the Data

In [27]:
# change name of embedding columns
dfs = [train_df, val_df, cal_df]

for df in dfs:
    # rename embed_{n} as summary_embedding_{n}
    df.rename(columns={f"embed_{i}": f"summary_embedding_{i}" for i in range(384)}, inplace=True)
    # rename embedding_{n} as reviewtext_embedding_{n}
    df.rename(columns={f"embedding_{i}": f"reviewtext_embedding_{i}" for i in range(384)}, inplace=True)

In [28]:
def extract_targets(dfs, target_column='match'):
    targets = []
    for df in dfs:
        targets.append(df[target_column])
        df.drop(columns=[target_column], inplace=True)
    return targets

In [29]:
# extract target and drop from design matrix
dfs = [train_df, val_df, cal_df]
targets = extract_targets(dfs)

train_y, val_y, cal_y = targets

In [30]:
# non-embedding features
begin_features = train_df.columns[:16].to_list()
end_features = train_df.columns[-6:].to_list()
non_embedding_features = begin_features + end_features

In [31]:
# categorical features
categorical_features = 'category'

# change and check data types 
for df in dfs:
    df['product_lifespan'] = df['product_lifespan'].dt.days
    df['missing_price'] = df['missing_price'].astype(int)
    df.drop(['min_date', 'max_date'], axis=1, inplace=True)

categorical_cols = train_df.select_dtypes(include=['object', 'category']).columns.tolist()
numeric_cols = train_df.select_dtypes(include=['number']).columns.tolist()

EDA on Training Data

In [9]:
# correlation matrix for non-embedding features
train_df[begin_features[1:]+end_features].corr()

Unnamed: 0,missing_price,item_rank,avg_rating,min_rating,percent_positive,percent_negative,avg_verified_reviewers,min_date,max_date,product_lifespan,...,unique_reviewer_count,avg_reviews_per_day,reviews_per_product,avg_review_length_words,mean_sentiment_score,mean_complaint_similarity,mean_shipping_similarity,max_complaint_similarity,shipping_similarity_at_max_complaint,sentiment_score_at_max_complaint
missing_price,1.0,0.397232,-0.073722,-0.084188,-0.066885,0.059782,-0.080962,-0.094594,-0.263965,-0.132964,...,-0.076755,-0.035336,-0.075929,0.081868,-0.059014,0.023654,-0.009715,-0.086209,-0.078643,-0.024765
item_rank,0.397232,1.0,-0.103719,-0.284773,-0.093563,0.086678,-0.151444,-0.14203,-0.609545,-0.391541,...,-0.199884,-0.044102,-0.196587,0.136703,-0.109818,0.055039,0.004621,-0.361435,-0.263765,0.01422
avg_rating,-0.073722,-0.103719,1.0,0.797841,0.928199,-0.902056,0.103396,0.029055,0.062871,0.024453,...,0.011853,0.027177,0.011709,-0.116737,0.687027,-0.274461,-0.187014,-0.179334,-0.119191,0.512224
min_rating,-0.084188,-0.284773,0.797841,1.0,0.739617,-0.721084,0.050736,-0.075741,0.18349,0.248919,...,0.075076,0.024598,0.073721,-0.073316,0.551026,-0.207011,-0.163593,0.082534,0.03515,0.32804
percent_positive,-0.066885,-0.093563,0.928199,0.739617,1.0,-0.80211,0.0874,0.017633,0.05116,0.026545,...,0.011486,0.025616,0.01137,-0.105495,0.643896,-0.255409,-0.175981,-0.165793,-0.110991,0.477634
percent_negative,0.059782,0.086678,-0.902056,-0.721084,-0.80211,1.0,-0.084272,0.004132,-0.029223,-0.030873,...,-0.011292,-0.02265,-0.01124,0.066051,-0.646173,0.256231,0.196444,0.167256,0.12728,-0.483966
avg_verified_reviewers,-0.080962,-0.151444,0.103396,0.050736,0.0874,-0.084272,1.0,0.329849,0.287963,-0.105624,...,0.003472,0.033327,0.003216,-0.3386,0.109111,-0.056607,0.114678,-0.046647,0.080529,0.082968
min_date,-0.094594,-0.14203,0.029055,-0.075741,0.017633,0.004132,0.329849,1.0,0.519009,-0.638843,...,-0.116391,0.00744,-0.120776,-0.29538,0.077075,-0.035293,0.128746,-0.191535,-0.020447,0.088981
max_date,-0.263965,-0.609545,0.062871,0.18349,0.05116,-0.029223,0.287963,0.519009,1.0,0.32604,...,0.169746,0.041838,0.166758,-0.253813,0.105516,-0.043318,0.075688,0.27342,0.251189,0.00247
product_lifespan,-0.132964,-0.391541,0.024453,0.248919,0.026545,-0.030873,-0.105624,-0.638843,0.32604,1.0,...,0.281507,0.029428,0.283666,0.098239,0.009726,4.4e-05,-0.074266,0.457925,0.248697,-0.096188


In [10]:
# filter the correlation matrix to see which features have higher than |0.5| correlation
feature_corr_matrix = train_df[train_df.columns[1:]].corr()

mask = np.triu(np.ones_like(feature_corr_matrix, dtype=bool))

high_corr = feature_corr_matrix.where(~mask).stack().reset_index()
high_corr.columns = ['Feature_1', 'Feature_2', 'Correlation']

pos_corr_result = (high_corr[high_corr['Correlation'] >= 0.5])

pos_corr_result = pos_corr_result.sort_values(by='Correlation', ascending=False)

# Show Results of the filter
pos_corr_result

Unnamed: 0,Feature_1,Feature_2,Correlation
89,reviews_per_product,unique_reviewer_count,0.987056
8,percent_positive,avg_rating,0.928199
5,min_rating,avg_rating,0.797841
6277,summary_embedding_97,summary_embedding_46,0.751040
9,percent_positive,min_rating,0.739617
...,...,...,...
55045,summary_embedding_317,summary_embedding_84,0.500751
9640,summary_embedding_124,summary_embedding_34,0.500694
68949,summary_embedding_356,summary_embedding_299,0.500498
67010,summary_embedding_351,summary_embedding_200,0.500167


In [11]:
# top 10 positively correlated features
# need to drop reviews_per_product or num_of_rating
pos_corr_result.head(10)

Unnamed: 0,Feature_1,Feature_2,Correlation
89,reviews_per_product,unique_reviewer_count,0.987056
8,percent_positive,avg_rating,0.928199
5,min_rating,avg_rating,0.797841
6277,summary_embedding_97,summary_embedding_46,0.75104
9,percent_positive,min_rating,0.739617
310861,sentiment_score_at_max_complaint,mean_sentiment_score,0.72286
6265,summary_embedding_97,summary_embedding_34,0.713441
1879,summary_embedding_46,summary_embedding_34,0.702397
42307,summary_embedding_276,summary_embedding_97,0.697054
306155,mean_sentiment_score,avg_rating,0.687027


In [12]:
# negative correlation between features
neg_corr_result = (high_corr[high_corr['Correlation'] <= -0.5])

neg_corr_result = neg_corr_result.sort_values(by='Correlation', ascending=True)

# Show Results of the filter
neg_corr_result

Unnamed: 0,Feature_1,Feature_2,Correlation
12,percent_negative,avg_rating,-0.902056
193657,reviewtext_embedding_223,reviewtext_embedding_127,-0.897805
28345,summary_embedding_223,summary_embedding_127,-0.839951
14,percent_negative,percent_positive,-0.802110
13,percent_negative,min_rating,-0.721084
...,...,...,...
52697,summary_embedding_310,summary_embedding_32,-0.500465
62663,summary_embedding_339,summary_embedding_167,-0.500393
1099,summary_embedding_32,summary_embedding_3,-0.500260
42383,summary_embedding_276,summary_embedding_173,-0.500213


In [13]:
# top 10 negatively correlated features
neg_corr_result.head(10)

Unnamed: 0,Feature_1,Feature_2,Correlation
12,percent_negative,avg_rating,-0.902056
193657,reviewtext_embedding_223,reviewtext_embedding_127,-0.897805
28345,summary_embedding_223,summary_embedding_127,-0.839951
14,percent_negative,percent_positive,-0.80211
13,percent_negative,min_rating,-0.721084
69812,summary_embedding_359,summary_embedding_46,-0.706595
52711,summary_embedding_310,summary_embedding_46,-0.704846
52762,summary_embedding_310,summary_embedding_97,-0.70116
257417,reviewtext_embedding_319,avg_review_length_words,-0.699696
58765,summary_embedding_328,summary_embedding_97,-0.690493


In [32]:
for df in (train_df, val_df):
    df.drop(['percent_positive', 'percent_negative', 'unique_reviewer_count'], axis=1, inplace=True)

Pre-Processing and PCA on Embedding Vectors

Make a Model

In [None]:
# Combine with ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ]
)

# Full pipeline
model_pipeline = Pipeline([
    ('preprocessing', preprocessor),
    ('logreg', LogisticRegression(penalty=None, class_weight='balanced'))])

In [52]:
model_pipeline.fit(train_df, train_y.values)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [53]:
preds = model_pipeline.predict(val_df)
probs = model_pipeline.predict_proba(val_df)[:,1]

In [54]:
# get feature importance from the pipeline log reg
logreg = model_pipeline.named_steps['logreg']

# get preprocessed columns
preprocessor = model_pipeline.named_steps['preprocessing']

# Get names from each transformer
num_features = preprocessor.named_transformers_['num'].get_feature_names_out(numeric_cols)
cat_features = preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_cols)

# Combine all feature names
all_features = np.concatenate([num_features, cat_features])

# get coefficients for feature importance
coefficients = pd.Series(logreg.coef_[0], index=all_features)
coefficients = coefficients.sort_values(key=np.abs, ascending=False)



In [68]:
coefficients.head(20)

category_Games                          -3.850213
category_Party Supplies                 -2.309550
category_Tricycles, Scooters & Wagons    2.305755
category_Baby & Toddler Toys             1.463390
category_Dolls & Accessories             1.341731
category_Sports & Outdoor Play           1.337283
category_Building Toys                  -1.266354
category_Arts & Crafts                   1.229312
category_Hobbies                        -1.157462
category_Puzzles                        -1.073150
category_Action Figures & Statues       -1.057455
reviewtext_embedding_354                -0.959197
reviewtext_embedding_223                -0.919786
item_rank                               -0.896850
reviewtext_embedding_367                 0.888946
reviewtext_embedding_119                 0.864387
summary_embedding_217                    0.852088
category_Grown-Up Toys                  -0.841802
unique_reviewer_count                    0.810886
reviewtext_embedding_192                 0.801326


In [55]:
def evaluate_model(model_name, y_true, y_pred, y_prob):
    return {
        model_name: {
        #"report": classification_report(y_true, y_pred, output_dict=True),
        "conf_mat": confusion_matrix(y_true, y_pred),
        "prec": precision_score(y_true, y_pred),
        "recall": recall_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred),
        "pr_auc": average_precision_score(y_true, y_prob)
    }}
    
all_results = {}
all_results.update(evaluate_model("LogisticRegression", y_true=val_y, y_pred = preds, y_prob = probs))


In [56]:
all_results

{'LogisticRegression': {'conf_mat': array([[21236,  2764],
         [   84,    80]]),
  'prec': 0.02812939521800281,
  'recall': 0.4878048780487805,
  'f1': 0.05319148936170213,
  'pr_auc': np.float64(0.025077464564341696)}}

Remove some features

In [62]:
selected_features = ['category_games',
                     'item_rank',
                     'unique_reviewer_count',
                     'max_complaint_similarity',
                     'avg_review_length_words'
                     ]

numeric_cols = ['item_rank', 'unique_reviewer_count', 'max_complaint_similarity', 'avg_review_length_words']
categorical_cols = ['category']

In [64]:
# Define the preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ]
)

# Define the pipeline
model_pipe_red_feat = Pipeline([
    ('preprocessing', preprocessor),
    ('logreg', LogisticRegression(penalty=None, class_weight='balanced', max_iter=1000))
])

# filter train and validation sets by selected features
train_red_feat_df = train_df[numeric_cols + categorical_cols]
val_red_feat_df = val_df[numeric_cols + categorical_cols]

In [65]:
model_pipe_red_feat.fit(train_red_feat_df, train_y)
pred_red_feat = model_pipe_red_feat.predict(val_red_feat_df)
probs_red_feat = model_pipe_red_feat.predict_proba(val_red_feat_df)[:,1]


In [66]:
all_results.update(evaluate_model("LogisticRegression_red_feat", y_true=val_y,
                                  y_pred = pred_red_feat,
                                  y_prob = probs_red_feat))

In [67]:
all_results

{'LogisticRegression': {'conf_mat': array([[21236,  2764],
         [   84,    80]]),
  'prec': 0.02812939521800281,
  'recall': 0.4878048780487805,
  'f1': 0.05319148936170213,
  'pr_auc': np.float64(0.025077464564341696)},
 'LogisticRegression_red_feat': {'conf_mat': array([[18258,  5742],
         [   40,   124]]),
  'prec': 0.02113876576883737,
  'recall': 0.7560975609756098,
  'f1': 0.041127694859038146,
  'pr_auc': np.float64(0.04225383796728305)}}

In [None]:
from sklearn.decomposition import PCA
PCA()