In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import root_mean_squared_error
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

Load in the Data

In [50]:
# Read the training and validation splits (parquet files in the parent directory).
train_df, val_df = (
    pd.read_parquet(path)
    for path in ('../train_final_v2.parquet', '../validation_v2.parquet')
)

Preprocess the Data

In [51]:
# Columns removed from both splits before modeling (described by the original
# author as non-numeric). The list is stated once so train and validation
# cannot drift apart.
cols_to_drop = ['asin', 'title', 'incident_indices', 'component_no', 'avg_review_length_chars']
train_df, val_df = (frame.drop(columns=cols_to_drop) for frame in (train_df, val_df))

In [4]:
# Show the last 10 column names of the training frame.
train_df.columns[-10:]

Index(['embedding_380', 'embedding_381', 'embedding_382', 'embedding_383',
       'mean_sentiment_score', 'mean_complaint_similarity',
       'mean_shipping_similarity', 'max_complaint_similarity',
       'shipping_similarity_at_max_complaint',
       'sentiment_score_at_max_complaint'],
      dtype='object')

In [52]:
# Give the two embedding families descriptive prefixes:
#   embed_{n}     -> summary_embedding_{n}
#   embedding_{n} -> reviewtext_embedding_{n}
# The two sets of old names are disjoint and rename matches whole labels, so a
# single combined mapping applied once is equivalent to two separate passes.
embedding_renames = {
    **{f"embed_{i}": f"summary_embedding_{i}" for i in range(384)},
    **{f"embedding_{i}": f"reviewtext_embedding_{i}" for i in range(384)},
}
for frame in (train_df, val_df):
    frame.rename(columns=embedding_renames, inplace=True)

In [53]:
# Separate the label column from the features in each split.
# The right-hand side is evaluated in full before either name is rebound, so the
# label is read from the frame that still contains it.
target_col = 'match'

# training split
y, train_df = train_df[target_col], train_df.drop(columns=[target_col])

# validation split
y_val, val_df = val_df[target_col], val_df.drop(columns=[target_col])

In [7]:
# Non-embedding features: the first 17 and the last 6 columns of train_df.
# These are positional slices, so they depend on the current column order.
begin_features = list(train_df.columns[:17])
end_features = list(train_df.columns[-6:])
non_embedding_features = [*begin_features, *end_features]

In [8]:
# categorical features
# NOTE(review): this is a bare string (a single column name), not a list, and no
# visible cell reads it -- the modeling cells use `categorical_cols` derived via
# select_dtypes instead.
categorical_features = 'category'

EDA on Training Data

In [9]:
# Pairwise correlations among the non-embedding features. The first entry of
# begin_features is skipped (presumably the non-numeric `category` column --
# TODO confirm).
eda_cols = begin_features[1:] + end_features
train_df[eda_cols].corr()

Unnamed: 0,missing_price,item_rank,avg_rating,min_rating,num_of_rating,percent_positive,percent_negative,avg_verified_reviewers,min_date,max_date,...,unique_reviewer_count,avg_reviews_per_day,reviews_per_product,avg_review_length_words,mean_sentiment_score,mean_complaint_similarity,mean_shipping_similarity,max_complaint_similarity,shipping_similarity_at_max_complaint,sentiment_score_at_max_complaint
missing_price,1.0,0.396998,-0.07314,-0.083552,-0.07362,-0.066451,0.059721,-0.081448,-0.095101,-0.264436,...,-0.074876,-0.036504,-0.07362,0.081616,-0.058113,0.02249,-0.009731,-0.087292,-0.079638,-0.023271
item_rank,0.396998,1.0,-0.104434,-0.28505,-0.194049,-0.093928,0.087294,-0.152149,-0.141977,-0.610756,...,-0.198323,-0.04764,-0.194049,0.137218,-0.10943,0.055077,0.006224,-0.362423,-0.263053,0.015062
avg_rating,-0.07314,-0.104434,1.0,0.798802,0.01213,0.928174,-0.902021,0.107157,0.030263,0.065127,...,0.012283,0.028963,0.01213,-0.118264,0.687557,-0.275681,-0.188312,-0.179058,-0.119813,0.51259
min_rating,-0.083552,-0.28505,0.798802,1.0,0.072745,0.740905,-0.721813,0.053863,-0.074327,0.185872,...,0.074476,0.026985,0.072745,-0.074848,0.551427,-0.208528,-0.165705,0.082436,0.034336,0.328276
num_of_rating,-0.07362,-0.194049,0.01213,0.072745,1.0,0.011818,-0.011596,0.00288,-0.124462,0.1644,...,0.984984,0.009908,1.0,0.004978,0.015685,-0.000582,-0.025495,0.306453,0.156577,-0.073252
percent_positive,-0.066451,-0.093928,0.928174,0.740905,0.011818,1.0,-0.801507,0.090413,0.017897,0.052664,...,0.011949,0.027577,0.011818,-0.106405,0.643982,-0.256183,-0.177249,-0.16495,-0.111181,0.477656
percent_negative,0.059721,0.087294,-0.902021,-0.721813,-0.011596,-0.801507,1.0,-0.088357,0.002478,-0.031875,...,-0.011665,-0.024318,-0.011596,0.067935,-0.648026,0.257954,0.197521,0.167599,0.127898,-0.485259
avg_verified_reviewers,-0.081448,-0.152149,0.107157,0.053863,0.00288,0.090413,-0.088357,1.0,0.328749,0.287161,...,0.00311,0.033797,0.00288,-0.336538,0.111373,-0.054559,0.115028,-0.045131,0.080518,0.084125
min_date,-0.095101,-0.141977,0.030263,-0.074327,-0.124462,0.017897,0.002478,0.328749,1.0,0.51708,...,-0.120588,0.007576,-0.124462,-0.294593,0.077495,-0.034053,0.129742,-0.191294,-0.02115,0.088985
max_date,-0.264436,-0.610756,0.065127,0.185872,0.1644,0.052664,-0.031875,0.287161,0.51708,1.0,...,0.168201,0.042645,0.1644,-0.25202,0.105689,-0.042916,0.074228,0.275011,0.25037,0.001538


In [10]:
# Pairwise correlations across every column except the first one.
train_df.iloc[:, 1:].corr()

Unnamed: 0,missing_price,item_rank,avg_rating,min_rating,num_of_rating,percent_positive,percent_negative,avg_verified_reviewers,min_date,max_date,...,reviewtext_embedding_380,reviewtext_embedding_381,reviewtext_embedding_382,reviewtext_embedding_383,mean_sentiment_score,mean_complaint_similarity,mean_shipping_similarity,max_complaint_similarity,shipping_similarity_at_max_complaint,sentiment_score_at_max_complaint
missing_price,1.000000,0.396998,-0.073140,-0.083552,-0.073620,-0.066451,0.059721,-0.081448,-0.095101,-0.264436,...,-0.036883,0.026650,0.049814,0.037874,-0.058113,0.022490,-0.009731,-0.087292,-0.079638,-0.023271
item_rank,0.396998,1.000000,-0.104434,-0.285050,-0.194049,-0.093928,0.087294,-0.152149,-0.141977,-0.610756,...,-0.029441,-0.040925,-0.000166,-0.007145,-0.109430,0.055077,0.006224,-0.362423,-0.263053,0.015062
avg_rating,-0.073140,-0.104434,1.000000,0.798802,0.012130,0.928174,-0.902021,0.107157,0.030263,0.065127,...,0.163365,0.120406,-0.088510,-0.053543,0.687557,-0.275681,-0.188312,-0.179058,-0.119813,0.512590
min_rating,-0.083552,-0.285050,0.798802,1.000000,0.072745,0.740905,-0.721813,0.053863,-0.074327,0.185872,...,0.104893,0.091108,-0.052749,-0.015424,0.551427,-0.208528,-0.165705,0.082436,0.034336,0.328276
num_of_rating,-0.073620,-0.194049,0.012130,0.072745,1.000000,0.011818,-0.011596,0.002880,-0.124462,0.164400,...,-0.018178,0.009791,0.025425,0.037029,0.015685,-0.000582,-0.025495,0.306453,0.156577,-0.073252
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
mean_complaint_similarity,0.022490,0.055077,-0.275681,-0.208528,-0.000582,-0.256183,0.257954,-0.054559,-0.034053,-0.042916,...,-0.141228,-0.278807,0.173835,0.307109,-0.220979,1.000000,0.596503,0.651030,0.401651,-0.186174
mean_shipping_similarity,-0.009731,0.006224,-0.188312,-0.165705,-0.025495,-0.177249,0.197521,0.115028,0.129742,0.074228,...,-0.141759,-0.089801,0.059298,0.208330,-0.119002,0.596503,1.000000,0.356591,0.685962,-0.094425
max_complaint_similarity,-0.087292,-0.362423,-0.179058,0.082436,0.306453,-0.164950,0.167599,-0.045131,-0.191294,0.275011,...,-0.124370,-0.195989,0.153375,0.237633,-0.132655,0.651030,0.356591,1.000000,0.606069,-0.255365
shipping_similarity_at_max_complaint,-0.079638,-0.263053,-0.119813,0.034336,0.156577,-0.111181,0.127898,0.080518,-0.021150,0.250370,...,-0.119150,-0.079572,0.054777,0.159591,-0.067449,0.401651,0.685962,0.606069,1.000000,-0.145047


In [11]:
# Find feature pairs whose correlation is at least +0.5.
feature_corr_matrix = train_df.iloc[:, 1:].corr()

# True on and above the diagonal; inverting it keeps only the strictly-lower
# triangle, so self-correlations and mirrored duplicates are masked out.
mask = np.triu(np.ones_like(feature_corr_matrix, dtype=bool))

# Long format: one row per (Feature_1, Feature_2) pair.
# Note: despite its name, `high_corr` holds every pair; thresholds are applied below.
high_corr = (
    feature_corr_matrix
    .where(~mask)
    .stack()
    .reset_index()
    .set_axis(['Feature_1', 'Feature_2', 'Correlation'], axis=1)
)

# Strongest positive correlations first.
pos_corr_result = (
    high_corr.loc[high_corr['Correlation'].ge(0.5)]
    .sort_values('Correlation', ascending=False)
)

# Show Results of the filter
pos_corr_result

Unnamed: 0,Feature_1,Feature_2,Correlation
95,reviews_per_product,num_of_rating,1.000000
70,unique_reviewer_count,num_of_rating,0.984984
103,reviews_per_product,unique_reviewer_count,0.984984
12,percent_positive,avg_rating,0.928174
5,min_rating,avg_rating,0.798802
...,...,...,...
27099,summary_embedding_217,summary_embedding_55,0.500853
74490,summary_embedding_370,summary_embedding_169,0.500726
44367,summary_embedding_282,summary_embedding_98,0.500573
67578,summary_embedding_352,summary_embedding_34,0.500145


In [12]:
# Ten most positively correlated feature pairs (the frame is already sorted descending).
# TODO: reviews_per_product and num_of_rating are perfectly correlated -- drop one of them.
pos_corr_result.iloc[:10]

Unnamed: 0,Feature_1,Feature_2,Correlation
95,reviews_per_product,num_of_rating,1.0
70,unique_reviewer_count,num_of_rating,0.984984
103,reviews_per_product,unique_reviewer_count,0.984984
12,percent_positive,avg_rating,0.928174
5,min_rating,avg_rating,0.798802
6390,summary_embedding_97,summary_embedding_46,0.751139
13,percent_positive,min_rating,0.740905
311650,sentiment_score_at_max_complaint,mean_sentiment_score,0.722845
6378,summary_embedding_97,summary_embedding_34,0.713454
1941,summary_embedding_46,summary_embedding_34,0.701901


In [13]:
# Feature pairs whose correlation is at most -0.5, most negative first.
neg_corr_result = (
    high_corr.loc[high_corr['Correlation'].le(-0.5)]
    .sort_values('Correlation')
)

# Show Results of the filter
neg_corr_result

Unnamed: 0,Feature_1,Feature_2,Correlation
17,percent_negative,avg_rating,-0.902021
194280,reviewtext_embedding_223,reviewtext_embedding_127,-0.897861
28584,summary_embedding_223,summary_embedding_127,-0.838924
20,percent_negative,percent_positive,-0.801507
18,percent_negative,min_rating,-0.721813
...,...,...,...
1147,summary_embedding_32,summary_embedding_3,-0.500507
63018,summary_embedding_339,summary_embedding_167,-0.500483
41347,summary_embedding_272,summary_embedding_3,-0.500206
41091,summary_embedding_271,summary_embedding_34,-0.500143


In [14]:
# Ten most negatively correlated feature pairs (the frame is already sorted ascending).
neg_corr_result.iloc[:10]

Unnamed: 0,Feature_1,Feature_2,Correlation
17,percent_negative,avg_rating,-0.902021
194280,reviewtext_embedding_223,reviewtext_embedding_127,-0.897861
28584,summary_embedding_223,summary_embedding_127,-0.838924
20,percent_negative,percent_positive,-0.801507
18,percent_negative,min_rating,-0.721813
70187,summary_embedding_359,summary_embedding_46,-0.705972
53037,summary_embedding_310,summary_embedding_46,-0.705865
258136,reviewtext_embedding_319,avg_review_length_words,-0.701248
53088,summary_embedding_310,summary_embedding_97,-0.700853
59109,summary_embedding_328,summary_embedding_97,-0.690237


Make a Model

In [33]:
# initialize logistic regression model without penalty
# (penalty=None turns regularization off; every other setting is left at its default)
log_reg = LogisticRegression(penalty=None)

In [55]:
# Convert the timedelta `product_lifespan` column to whole days so it is picked up
# as a numeric feature below.
# The dtype guard makes this cell safe to re-run: after the first run the column is
# already an integer and no longer has a `.dt` accessor, so an unguarded conversion
# would raise.
for frame in (train_df, val_df):
    if pd.api.types.is_timedelta64_dtype(frame['product_lifespan']):
        frame['product_lifespan'] = frame['product_lifespan'].dt.days

# Column groups consumed by the preprocessing ColumnTransformer.
categorical_cols = train_df.select_dtypes(include=['object', 'category']).columns.tolist()
numeric_cols = train_df.select_dtypes(include=['number']).columns.tolist()

In [36]:
# Column-wise preprocessing: standardize the numeric columns and one-hot encode the
# categorical ones. handle_unknown='ignore' keeps prediction from failing on
# categories that were not present when the encoder was fitted.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ]
)

# Full pipeline: preprocessing followed by the logistic regression.
# There is deliberately no second StandardScaler here. The numeric columns are
# already standardized inside the ColumnTransformer; a pipeline-level scaler would
# only re-scale them, standardize the one-hot indicator columns as well, and fail
# outright if the transformer ever emitted a sparse matrix (centering is not
# supported on sparse input).
model_pipeline = Pipeline([
    ('preprocessing', preprocessor),
    ('logreg', log_reg),
])

In [37]:
# Fit the whole pipeline (preprocessing + logistic regression) on the training
# features; the target is handed over as a plain array via `.values`.
model_pipeline.fit(train_df, y.values)

In [57]:
# Predict on the validation features with the fitted pipeline.
# `predict` returns class labels here, not probabilities.
preds = model_pipeline.predict(val_df)

In [58]:
# RMSE between the validation labels and the predicted class labels.
# `root_mean_squared_error` is imported in the notebook's top import cell.
# NOTE(review): if `match` is a 0/1 label (TODO confirm), this value is just the
# square root of the misclassification rate; a classification metric such as
# precision/recall or ROC AUC on predict_proba would be more informative.
root_mean_squared_error(y_val, preds)

0.08540393920399116