In [6]:
import pandas as pd
from datetime import datetime

# Load the data
df_review = pd.read_csv('df_review_processed.csv')

# Convert timestamp to datetime
df_review['review_date'] = pd.to_datetime(df_review['timestamp'], unit='ms')

# Display the first few rows of the DataFrame
df_review.head()


Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,product_id,rating,text_len,title_len,month,reviewer_expe,cgi_dummy,after_dummy,...,TextLen,TitleLen,CumulativeTextLen,CumulativeTitleLen,fgi_images,features,first_cgi_date,datetime,mon,year
0,0,0,B01IIS2A3K,4.0,15,2,77,0,0,0,...,15,2,45.928571,5.5,['https://m.media-amazon.com/images/I/41k+Fit7...,"['95% Rayon, 5% Spandex', '&check; Made in USA...",2018-10-15 07:25:55.627,12Jan2016 00:47:06,1,2016
1,1,1,B01IIS2A3K,4.0,47,12,82,0,0,0,...,47,12,15.0,2.0,['https://m.media-amazon.com/images/I/41k+Fit7...,"['95% Rayon, 5% Spandex', '&check; Made in USA...",2018-10-15 07:25:55.627,10Jun2016 21:51:10,6,2016
2,2,2,B01IIS2A3K,5.0,17,2,82,0,0,0,...,17,2,31.0,7.0,['https://m.media-amazon.com/images/I/41k+Fit7...,"['95% Rayon, 5% Spandex', '&check; Made in USA...",2018-10-15 07:25:55.627,15Jun2016 02:08:57,6,2016
3,3,3,B01IIS2A3K,5.0,12,3,82,0,0,0,...,12,3,26.333333,5.333333,['https://m.media-amazon.com/images/I/41k+Fit7...,"['95% Rayon, 5% Spandex', '&check; Made in USA...",2018-10-15 07:25:55.627,29Jun2016 00:37:39,6,2016
4,4,4,B01IIS2A3K,4.0,57,5,85,0,0,0,...,57,5,22.75,4.75,['https://m.media-amazon.com/images/I/41k+Fit7...,"['95% Rayon, 5% Spandex', '&check; Made in USA...",2018-10-15 07:25:55.627,08Sep2016 03:44:45,9,2016


In [7]:
# Assuming `first_cgi_date` is available in the DataFrame `df_product`
df_product = df_review.groupby('product_id').agg({
    'review_date': 'min',
    'first_cgi_date': 'first'
}).reset_index()

# Convert `first_cgi_date` to datetime
df_product['first_cgi_date'] = pd.to_datetime(df_product['first_cgi_date'])

# Define early and late treated products
df_product['early_treated'] = (df_product['first_cgi_date'].dt.month == df_product['first_cgi_date'].dt.month.min()).astype(int)
df_product['late_treated'] = (df_product['first_cgi_date'].dt.month > df_product['first_cgi_date'].dt.month.min()).astype(int)

# Merge back to the review DataFrame
df_review = df_review.merge(df_product[['product_id', 'first_cgi_date', 'early_treated', 'late_treated']], on='product_id', how='left')

# Ensure we use the correct column names after the merge
df_review.rename(columns={'first_cgi_date_y': 'first_cgi_date'}, inplace=True)

# Calculate the difference in months between the review date and the first CGI date
df_review['months_since_first_cgi'] = ((df_review['review_date'] - df_review['first_cgi_date']) / pd.Timedelta(days=30)).astype(int)

# Exclude reviews in the first month
df_review = df_review[df_review['months_since_first_cgi'] > 0]

# Create an adjusted treatment variable
df_review['adjusted_treatment'] = df_review['late_treated']

# Check the first few rows
df_review.head()


Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,product_id,rating,text_len,title_len,month,reviewer_expe,cgi_dummy,after_dummy,...,features,first_cgi_date_x,datetime,mon,year,first_cgi_date,early_treated,late_treated,months_since_first_cgi,adjusted_treatment
48,48,48,B071HJ5L7P,3.0,74,3,148,0,0,1,...,"['90% Viscose, 10% Spandex', 'Imported', 'Pull...",2021-09-17 23:43:31.161,20Dec2021 22:17:16,12,2021,2021-09-17 23:43:31.161,0,1,3,1
49,49,49,B071HJ5L7P,5.0,5,3,150,0,0,1,...,"['90% Viscose, 10% Spandex', 'Imported', 'Pull...",2021-09-17 23:43:31.161,21Feb2022 12:56:32,2,2022,2021-09-17 23:43:31.161,0,1,5,1
50,50,50,B071HJ5L7P,3.0,7,4,154,0,0,1,...,"['90% Viscose, 10% Spandex', 'Imported', 'Pull...",2021-09-17 23:43:31.161,20Jun2022 07:54:40,6,2022,2021-09-17 23:43:31.161,0,1,9,1
51,51,51,B071HJ5L7P,5.0,33,3,154,0,0,1,...,"['90% Viscose, 10% Spandex', 'Imported', 'Pull...",2021-09-17 23:43:31.161,24Jun2022 07:26:30,6,2022,2021-09-17 23:43:31.161,0,1,9,1
52,52,52,B071HJ5L7P,5.0,4,2,156,0,0,1,...,"['90% Viscose, 10% Spandex', 'Imported', 'Pull...",2021-09-17 23:43:31.161,12Aug2022 18:50:28,8,2022,2021-09-17 23:43:31.161,0,1,10,1


In [8]:
df_review.columns

Index(['Unnamed: 0.1', 'Unnamed: 0', 'product_id', 'rating', 'text_len',
       'title_len', 'month', 'reviewer_expe', 'cgi_dummy', 'after_dummy',
       'timestamp', 'review_text', 'cgi_images', 'helpful_vote', 'treat_dummy',
       'Volume', 'Valence', 'Variance', 'AfterTreat', 'review_date', 'TextLen',
       'TitleLen', 'CumulativeTextLen', 'CumulativeTitleLen', 'fgi_images',
       'features', 'first_cgi_date_x', 'datetime', 'mon', 'year',
       'first_cgi_date', 'early_treated', 'late_treated',
       'months_since_first_cgi', 'adjusted_treatment'],
      dtype='object')

In [11]:
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Create a dummy variable for post-treatment period
df_review['post_cgi'] = (df_review['review_date'] > df_review['first_cgi_date']).astype(int)

# Create interaction term for DiD
df_review['treatment_post'] = df_review['adjusted_treatment'] * df_review['post_cgi']

# Define the formula for the linear regression model with interaction term
formula = 'rating ~ adjusted_treatment + post_cgi + treatment_post + Volume + Valence + Variance + CumulativeTextLen + CumulativeTitleLen + reviewer_expe + C(mon)'

# Fit the model
model = smf.ols(formula, data=df_review)
result = model.fit()

# Print the summary of the model
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:                 rating   R-squared:                       0.040
Model:                            OLS   Adj. R-squared:                  0.031
Method:                 Least Squares   F-statistic:                     4.568
Date:                Wed, 12 Jun 2024   Prob (F-statistic):           5.95e-10
Time:                        01:36:30   Log-Likelihood:                -3235.0
No. Observations:                2018   AIC:                             6508.
Df Residuals:                    1999   BIC:                             6615.
Df Model:                          18                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
Intercept              0.8608      0