In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('data/McDonald_s_Reviews.csv', encoding='latin-1', index_col='reviewer_id')
df = df.loc[df['review'].str.contains(r'[^\x00-\x7F]+') == False]

dis_address = pd.read_csv('data/disBERT_sentiment_by_address.csv', encoding='latin-1')

In [7]:
grouped_df = df.groupby('store_address')
df.columns

Index(['store_name', 'category', 'store_address', 'latitude ', 'longitude',
       'rating_count', 'review_time', 'review', 'rating'],
      dtype='object')

In [31]:
def get_mode(series, default_value=None):
    if not series.empty:
        return series.value_counts().index[0]
    else:
        return default_value

In [34]:
merged_df = dis_address.merge(df[['store_address', 'latitude ', 'longitude', 'review']], on='store_address', how='left')
print(merged_df.columns)
# Group by 'store_address' and aggregate the 'review' column using the join function
merged_df = merged_df.groupby('store_address').agg({
    'latitude ': 'mean',
    'longitude': 'mean',
    'positive': 'mean',
    'negative': 'mean',
    'neutral': 'mean',
    'rating': 'mean',
    'review': lambda x: ' '.join(str(val) for val in x)
}).reset_index()

# The 'merged_df' DataFrame will now contain all the desired columns from both 'df' and 'dis_address'
merged_df

Index(['store_address', 'positive', 'negative', 'neutral', 'rating',
       'latitude ', 'longitude', 'review'],
      dtype='object')


Unnamed: 0,store_address,latitude,longitude,positive,negative,neutral,rating,review
0,"1044 US-11, Champlain, NY 12919, United States",44.98141,-73.45982,0.364466,0.222201,0.413333,3.256055,Stopped in for a quick visit at this newly bui...
1,"10451 Santa Monica Blvd, Los Angeles, CA 90025...",34.056593,-118.426,0.409715,0.290561,0.299724,3.0,I asked for a refund for poorly made oatmeal. ...
2,"10901 Riverside Dr, North Hollywood, CA 91602,...",34.152507,-118.367904,0.37034,0.252946,0.376715,3.450893,"Looks like chicken nuggets bite piece served, ..."
3,"1100 N US Hwy 377, Roanoke, TX 76262, United S...",33.009318,-97.222925,0.480093,0.277782,0.242125,3.528933,"Yesterday (2/3/2023), I stopped at Roanoke McD..."
4,"111 Madison St, Oak Park, IL 60302, United States",41.879656,-87.777913,0.439468,0.226855,0.333676,3.414184,"Just to preface, I'm not a McDonald's regular ..."
5,"1121 Garnet Ave, San Diego, CA 92109, United S...",32.797661,-117.24947,0.368962,0.27422,0.356819,2.8688,I found this McDonald's on Garnet Ave. in PB. ...
6,"11382 US-441, Orlando, FL 32837, United States",28.399986,-81.405103,0.377427,0.241997,0.380576,3.212733,Visit this place as it is close to my house. E...
7,"114 Delancey St, New York, NY 10002, United St...",40.718838,-73.98828,0.059112,0.052097,0.888791,3.333333,Pretty decent McDonalds to get decently priced...
8,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,0.428812,0.286506,0.284682,3.304762,Why does it look like someone spit on my food?...
9,"1415 E State Rd, Fern Park, FL 32730, United S...",28.65535,-81.342692,0.433503,0.243376,0.323121,3.514668,Not only was the speed of service poor I waite...


In [36]:
# drop the rows with missing values and reset the index
merged_df = merged_df.dropna().reset_index(drop=True)
merged_df

Unnamed: 0,store_address,latitude,longitude,positive,negative,neutral,rating,review
0,"1044 US-11, Champlain, NY 12919, United States",44.98141,-73.45982,0.364466,0.222201,0.413333,3.256055,Stopped in for a quick visit at this newly bui...
1,"10451 Santa Monica Blvd, Los Angeles, CA 90025...",34.056593,-118.426,0.409715,0.290561,0.299724,3.0,I asked for a refund for poorly made oatmeal. ...
2,"10901 Riverside Dr, North Hollywood, CA 91602,...",34.152507,-118.367904,0.37034,0.252946,0.376715,3.450893,"Looks like chicken nuggets bite piece served, ..."
3,"1100 N US Hwy 377, Roanoke, TX 76262, United S...",33.009318,-97.222925,0.480093,0.277782,0.242125,3.528933,"Yesterday (2/3/2023), I stopped at Roanoke McD..."
4,"111 Madison St, Oak Park, IL 60302, United States",41.879656,-87.777913,0.439468,0.226855,0.333676,3.414184,"Just to preface, I'm not a McDonald's regular ..."
5,"1121 Garnet Ave, San Diego, CA 92109, United S...",32.797661,-117.24947,0.368962,0.27422,0.356819,2.8688,I found this McDonald's on Garnet Ave. in PB. ...
6,"11382 US-441, Orlando, FL 32837, United States",28.399986,-81.405103,0.377427,0.241997,0.380576,3.212733,Visit this place as it is close to my house. E...
7,"114 Delancey St, New York, NY 10002, United St...",40.718838,-73.98828,0.059112,0.052097,0.888791,3.333333,Pretty decent McDonalds to get decently priced...
8,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,0.428812,0.286506,0.284682,3.304762,Why does it look like someone spit on my food?...
9,"1415 E State Rd, Fern Park, FL 32730, United S...",28.65535,-81.342692,0.433503,0.243376,0.323121,3.514668,Not only was the speed of service poor I waite...


In [38]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
def remove_stopword(x):
    return [y for y in x if y not in stopwords.words('english')]
merged_df['review'] = merged_df['review'].apply(lambda x:remove_stopword(x))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\91909\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [40]:
merged_df

Unnamed: 0,store_address,latitude,longitude,positive,negative,neutral,rating,review
0,"1044 US-11, Champlain, NY 12919, United States",44.98141,-73.45982,0.364466,0.222201,0.413333,3.256055,"[S, p, p, e, , n, , f, r, , , q, u, c, k, ..."
1,"10451 Santa Monica Blvd, Los Angeles, CA 90025...",34.056593,-118.426,0.409715,0.290561,0.299724,3.0,"[I, , k, e, , f, r, , , r, e, f, u, n, , ..."
2,"10901 Riverside Dr, North Hollywood, CA 91602,...",34.152507,-118.367904,0.37034,0.252946,0.376715,3.450893,"[L, k, , l, k, e, , c, h, c, k, e, n, , n, ..."
3,"1100 N US Hwy 377, Roanoke, TX 76262, United S...",33.009318,-97.222925,0.480093,0.277782,0.242125,3.528933,"[Y, e, e, r, , (, 2, /, 3, /, 2, 0, 2, 3, ), ..."
4,"111 Madison St, Oak Park, IL 60302, United States",41.879656,-87.777913,0.439468,0.226855,0.333676,3.414184,"[J, u, , , p, r, e, f, c, e, ,, , I, ', , ..."
5,"1121 Garnet Ave, San Diego, CA 92109, United S...",32.797661,-117.24947,0.368962,0.27422,0.356819,2.8688,"[I, , f, u, n, , h, , M, c, D, n, l, ', , ..."
6,"11382 US-441, Orlando, FL 32837, United States",28.399986,-81.405103,0.377427,0.241997,0.380576,3.212733,"[V, , h, , p, l, c, e, , , , , c, l, e, ..."
7,"114 Delancey St, New York, NY 10002, United St...",40.718838,-73.98828,0.059112,0.052097,0.888791,3.333333,"[P, r, e, , e, c, e, n, , M, c, D, n, l, , ..."
8,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,0.428812,0.286506,0.284682,3.304762,"[W, h, , e, , , l, k, , l, k, e, , e, n, ..."
9,"1415 E State Rd, Fern Park, FL 32730, United S...",28.65535,-81.342692,0.433503,0.243376,0.323121,3.514668,"[N, , n, l, , w, , h, e, , p, e, e, , f, ..."


: 

In [39]:
# export the DataFrame to a csv file
merged_df.to_csv('data/merged_df.csv', index=False)