In [16]:
# linear regression
import pandas as pd
from sklearn import linear_model
from sklearn.model_selection import train_test_split
import numpy as np

This ipynb contains a script that does the following:
- reads in the CSV file `reviews_with_sentiment.csv` containing previously established columns of interest in addition to columns of compound sentiment scores for each aspect of customer experience (i.e., cleanliness, safety, and location)
- casts the id columns (`listing_id`, `review_id`) to ints so they are readable when using the `.head()` function
- performs a linear regression, with `overall_sent_compound` as the target variable and `cleanliness_sent_compound`, `location_sent_compound`, `safety_sent_compound` as the explanatory variables
    - first, an overall linear regression is run on the entire dataset to look at the value of the coefficients
    - next, a linear regression is run on each price point group (low, medium, and high) made possible by `get_price_point()` which creates a new column that categorizes each observation into its appropriate price point group based on how its price falls within the quantiles of the overall prices in the entire data set

In [8]:
# Load the reviews table that already carries the per-aspect and overall
# compound sentiment columns.
df = pd.read_csv(filepath_or_buffer="../DATA/reviews_with_sentiment.csv")
# Print dtypes and non-null counts as a quick sanity check on the load.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12666 entries, 0 to 12665
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   listing_id                 12666 non-null  float64
 1   review_id                  12666 non-null  float64
 2   comments                   12666 non-null  object 
 3   price                      12666 non-null  int64  
 4   city                       12666 non-null  object 
 5   cleanliness                12666 non-null  int64  
 6   safety                     12666 non-null  int64  
 7   location                   12666 non-null  int64  
 8   cleanliness_sent_compound  3843 non-null   float64
 9   location_sent_compound     2996 non-null   float64
 10  safety_sent_compound       719 non-null    float64
 11  overall_sent_compound      12666 non-null  float64
dtypes: float64(6), int64(4), object(2)
memory usage: 1.2+ MB


In [13]:
# The id columns are loaded as float64, which pandas shows in scientific
# notation; store them as int64 so the ids print as plain integers.
# NOTE(review): float64 represents integers exactly only up to 2**53, so ids
# larger than that may already have been rounded before this cast -- confirm
# against the upstream data.
for id_col in ("listing_id", "review_id"):
    df[id_col] = df[id_col].astype("int64")

In [14]:
# Preview the first five rows to check how the id columns now display.
df.head(n=5)

Unnamed: 0,listing_id,review_id,comments,price,city,cleanliness,safety,location,cleanliness_sent_compound,location_sent_compound,safety_sent_compound,overall_sent_compound
0,984218460168529664,1098675989379762048,Perfect home!,244,Austin,0,0,0,,,,0.6114
1,1141608326810219776,1205220528192611072,This was a major win...was in Vegas for a week...,36,Clark_co,1,0,1,0.7783,0.0,,0.9833
2,45301708,1083518538803351552,This was a perfect place to stay. I had so muc...,64,Clark_co,1,0,0,0.7537,,,0.9836
3,902142120588141952,1263867375191220992,Absolutely loved my stay even if it was last m...,59,Chicago,0,0,0,,,,0.9094
4,917160370919078016,1245033135585708032,"First off, not handicap accessible at all! You...",57,Chicago,0,1,1,,0.0,0.0,0.9076


In [31]:
# Target: overall review sentiment. Features: the three aspect-level sentiment scores.
aspect_cols = ["cleanliness_sent_compound", "location_sent_compound", "safety_sent_compound"]
y = df["overall_sent_compound"]
X = df[aspect_cols]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)

# Placeholder imputation: a missing aspect score is treated as 0 for now.
X_train = X_train.fillna(0)
X_test = X_test.fillna(0)

# Fit a linear regression without an intercept term.
# NOTE(review): with fit_intercept=False and NaNs filled with 0, every row that
# has no aspect scores is predicted as exactly 0, which likely explains the
# negative held-out R^2 this cell reports -- consider fitting an intercept.
reg = linear_model.LinearRegression(fit_intercept=False)
reg.fit(X_train, y_train)
y_hat = reg.predict(X_test)

print("Yhat: ", y_hat)
print('Rsq: ', reg.score(X_test, y_test))  # R^2 on the held-out rows
rmse = np.sqrt(np.mean((y_test - y_hat) ** 2))
print('RMSE: ', rmse)  # root-mean-squared error on the held-out rows

# One row per explanatory variable with its fitted coefficient.
results = pd.DataFrame({'variable': reg.feature_names_in_, 'coefficient': reg.coef_})
results


Yhat:  [0.         0.49168667 0.73077971 ... 0.22748354 0.         0.        ]
Rsq:  -4.9518711203614965
RMSE:  0.6551529219722169


Unnamed: 0,variable,coefficient
0,cleanliness_sent_compound,1.017232
1,location_sent_compound,0.832663
2,safety_sent_compound,0.570866


Now, we will split the data by each price range to be able to answer our question of which aspect of customer experience impacts the sentiment score of reviews across price ranges.

In [27]:
# Summary statistics of the price column, including its quartiles.
df.loc[:, "price"].describe()

count    12666.000000
mean       158.195089
std        201.568437
min         10.000000
25%         88.000000
50%        120.000000
75%        175.000000
max      10000.000000
Name: price, dtype: float64

In [28]:
# Bucket prices into three price points using the quartiles of the price column:
#   low    : price <  q1
#   medium : q1 <= price <= q3
#   high   : price >  q3
q1, q3 = (df["price"].quantile(q) for q in (0.25, 0.75))
def get_price_point(price, low_cutoff=None, high_cutoff=None):
    """Classify a single price as "low", "medium" or "high".

    Parameters
    ----------
    price : number
        The price to classify.
    low_cutoff : number, optional
        Prices strictly below this value are "low". Defaults to the
        notebook-level ``q1``.
    high_cutoff : number, optional
        Prices strictly above this value are "high". Defaults to the
        notebook-level ``q3``.

    Returns
    -------
    str
        "low" if ``price < low_cutoff``, "medium" if
        ``low_cutoff <= price <= high_cutoff`` (both ends inclusive),
        otherwise "high".

    Notes
    -----
    A NaN price fails both comparisons and therefore falls through to "high".
    """
    # Fall back to the notebook-level quartiles so existing one-argument calls
    # (e.g. ``Series.apply(get_price_point)``) behave exactly as before.
    if low_cutoff is None:
        low_cutoff = q1
    if high_cutoff is None:
        high_cutoff = q3

    if price < low_cutoff:
        return "low"
    elif low_cutoff <= price <= high_cutoff:
        return "medium"
    else:
        return "high"
    
# Label every row with its price point, then preview the new column.
df["price_point"] = df["price"].map(get_price_point)
df.head(n=5)

Unnamed: 0,listing_id,review_id,comments,price,city,cleanliness,safety,location,cleanliness_sent_compound,location_sent_compound,safety_sent_compound,overall_sent_compound,price_point
0,984218460168529664,1098675989379762048,Perfect home!,244,Austin,0,0,0,,,,0.6114,high
1,1141608326810219776,1205220528192611072,This was a major win...was in Vegas for a week...,36,Clark_co,1,0,1,0.7783,0.0,,0.9833,low
2,45301708,1083518538803351552,This was a perfect place to stay. I had so muc...,64,Clark_co,1,0,0,0.7537,,,0.9836,low
3,902142120588141952,1263867375191220992,Absolutely loved my stay even if it was last m...,59,Chicago,0,0,0,,,,0.9094,low
4,917160370919078016,1245033135585708032,"First off, not handicap accessible at all! You...",57,Chicago,0,1,1,,0.0,0.0,0.9076,low


In [41]:
price_ranges = ["low", "medium", "high"]
aspect_cols = ["cleanliness_sent_compound", "location_sent_compound", "safety_sent_compound"]

# Repeat the no-intercept regression separately within each price point so the
# aspect coefficients can be compared across groups.
for price_label in price_ranges:
    print(price_label)

    # Rows belonging to the current price point.
    group_df = df.loc[df["price_point"] == price_label]

    y = group_df["overall_sent_compound"]
    X = group_df[aspect_cols]

    # 80/20 train/test split with a fixed seed for reproducibility.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)

    # Placeholder imputation: a missing aspect score is treated as 0 for now.
    X_train = X_train.fillna(0)
    X_test = X_test.fillna(0)

    # NOTE(review): fit_intercept=False together with the zero fill predicts
    # exactly 0 for rows with no aspect scores, which likely explains the
    # negative held-out R^2 values -- consider fitting an intercept.
    reg = linear_model.LinearRegression(fit_intercept=False)
    reg.fit(X_train, y_train)
    y_hat = reg.predict(X_test)

    print('Rsq: ', reg.score(X_test, y_test))  # R^2 on the held-out rows
    rmse = np.sqrt(np.mean((y_test - y_hat) ** 2))
    print('RMSE: ', rmse)  # root-mean-squared error on the held-out rows

    # One row per explanatory variable with its fitted coefficient,
    # followed by a blank line to separate the groups in the output.
    results = pd.DataFrame({'variable': reg.feature_names_in_, 'coefficient': reg.coef_})
    print(results, end="\n\n")


low
Rsq:  -4.087030948811404
RMSE:  0.6257014240003221
                    variable  coefficient
0  cleanliness_sent_compound     1.016784
1     location_sent_compound     0.802198
2       safety_sent_compound     0.523185

medium
Rsq:  -4.547941352129233
RMSE:  0.645969499965157
                    variable  coefficient
0  cleanliness_sent_compound     1.005925
1     location_sent_compound     0.826274
2       safety_sent_compound     0.594374

high
Rsq:  -6.018563467795654
RMSE:  0.6889075509347834
                    variable  coefficient
0  cleanliness_sent_compound     1.011170
1     location_sent_compound     0.892225
2       safety_sent_compound     0.532990

