In [1]:
# Import packages
import pandas as pd
import numpy as np
from google_play_scraper import Sort, reviews_all

In [2]:
# Get all reviews
us_reviews = reviews_all(
    'com.google.android.apps.fitness',
    sleep_milliseconds=0,  # defaults to 0
    lang='en',  # defaults to 'en'
    country='us',  # defaults to 'us'
    sort=Sort.NEWEST,  # defaults to Sort.MOST_RELEVANT
)

In [3]:
# Convert to Pandas DF
df = pd.DataFrame(np.array(us_reviews), columns=['review'])
df = df.join(pd.DataFrame(df.pop('review').tolist()))
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66291 entries, 0 to 66290
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   reviewId              66291 non-null  object        
 1   userName              66291 non-null  object        
 2   userImage             66291 non-null  object        
 3   content               66272 non-null  object        
 4   score                 66291 non-null  int64         
 5   thumbsUpCount         66291 non-null  int64         
 6   reviewCreatedVersion  55902 non-null  object        
 7   at                    66291 non-null  datetime64[ns]
 8   replyContent          13769 non-null  object        
 9   repliedAt             13769 non-null  datetime64[ns]
dtypes: datetime64[ns](2), int64(2), object(6)
memory usage: 5.1+ MB


In [4]:
# Check distribution of scores across full set of reviews
df.score.value_counts()

1    20914
5    16729
2    10538
3     9915
4     8071
Name: score, dtype: int64

In [4]:
# Select only reviews that contain the words or phrases "interface", "user experience", "UX" or "UI"
df_subset = df[df['content'].str.contains("interface|user experience|UX|UI") == True]

In [5]:
df_subset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1383 entries, 89 to 65949
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   reviewId              1383 non-null   object        
 1   userName              1383 non-null   object        
 2   userImage             1383 non-null   object        
 3   content               1383 non-null   object        
 4   score                 1383 non-null   int64         
 5   thumbsUpCount         1383 non-null   int64         
 6   reviewCreatedVersion  1267 non-null   object        
 7   at                    1383 non-null   datetime64[ns]
 8   replyContent          194 non-null    object        
 9   repliedAt             194 non-null    datetime64[ns]
dtypes: datetime64[ns](2), int64(2), object(6)
memory usage: 118.9+ KB


In [10]:
#Extract year of review into new variable
df_subset['year'] = pd.DatetimeIndex(df_subset['at']).year

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_subset['year'] = pd.DatetimeIndex(df_subset['at']).year


In [19]:
#Filter to only reviews from the last year
df_subset_recent = df_subset[df_subset['year'] == 2021]

In [20]:
df_subset_recent.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 106 entries, 89 to 11251
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   reviewId              106 non-null    object        
 1   userName              106 non-null    object        
 2   userImage             106 non-null    object        
 3   content               106 non-null    object        
 4   score                 106 non-null    int64         
 5   thumbsUpCount         106 non-null    int64         
 6   reviewCreatedVersion  100 non-null    object        
 7   at                    106 non-null    datetime64[ns]
 8   replyContent          39 non-null     object        
 9   repliedAt             39 non-null     datetime64[ns]
 10  year                  106 non-null    int64         
dtypes: datetime64[ns](2), int64(3), object(6)
memory usage: 9.9+ KB
