In [5]:
import pandas as pd

# Read the Google Play review export into a DataFrame
file_path = 'dataset/REVIEW_BANK_MOBILE_GOOGLE_PLAY16052024.csv'
df_new = pd.read_csv(file_path)

# Coerce the count-like columns to numeric; unparseable entries become NaN
for numeric_col in ('score', 'thumbsUpCount'):
    df_new[numeric_col] = pd.to_numeric(df_new[numeric_col], errors='coerce')

# Coerce the timestamp columns to datetime; unparseable entries become NaT
for datetime_col in ('at', 'repliedAt'):
    df_new[datetime_col] = pd.to_datetime(df_new[datetime_col], errors='coerce')

In [7]:
# Build the cleaned frame in one pass:
#   1. discard identifier/image columns that the analysis does not use
#   2. discard rows without review text, since 'content' is the critical field
df_cleaned_new = (
    df_new
    .drop(columns=['reviewId', 'userImage'])
    .dropna(subset=['content'])
)

In [10]:
# Summarise column dtypes and non-null counts of the cleaned frame
df_cleaned_new.info()

# Preview the leading rows as a sanity check on the cleaning steps
df_cleaned_new.head(n=5)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10776 entries, 0 to 10775
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   userName              10776 non-null  object        
 1   content               10776 non-null  object        
 2   score                 10776 non-null  int64         
 3   thumbsUpCount         10776 non-null  int64         
 4   reviewCreatedVersion  9679 non-null   object        
 5   at                    10776 non-null  datetime64[ns]
 6   replyContent          7338 non-null   object        
 7   repliedAt             7338 non-null   datetime64[ns]
 8   appVersion            9679 non-null   object        
 9   sortOrder             10776 non-null  object        
 10  appId                 10776 non-null  object        
dtypes: datetime64[ns](2), int64(2), object(7)
memory usage: 926.2+ KB


Unnamed: 0,userName,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,appVersion,sortOrder,appId
0,Ocha Lambana,Twice installing and uninstalling. Error all t...,1,127,1.3.7.91,2022-11-25 12:10:12,"Hai Sahabat Allo, mohon maaf untuk ketidaknyam...",2022-11-28 01:39:10,1.3.7.91,most_relevant,com.alloapp.yump
1,Anggie Angela Suhaeri,Very poor UI/UX. Took several times to scan ID...,1,300,1.3.7.10,2022-05-18 00:29:35,Mohon maaf atas ketidaknyamanannya. Saat ini a...,2022-06-09 07:44:49,1.3.7.10,most_relevant,com.alloapp.yump
2,Aulia Rahma Qonita,i hope you're working on it cause this app is ...,1,57,1.3.7.97,2023-02-23 15:44:30,"Hai Sahabat Allo, mohon maaf untuk ketidaknyam...",2023-02-24 02:18:28,1.3.7.97,most_relevant,com.alloapp.yump
3,Prastowo Judana,I installed this apps because got informed tha...,1,3,1.40.05,2024-03-25 13:58:45,Hi Allo Friends 👋🏻 Kami menyesal karena kamu m...,2024-03-26 09:33:15,1.40.05,most_relevant,com.alloapp.yump
4,Yelli yoselino,"This is a very stupid app, when login with my ...",1,0,1.40.25,2024-05-02 00:45:13,Hi Allo Friends 👋🏻Kami menyesal karena kamu me...,2024-05-02 04:30:56,1.40.25,most_relevant,com.alloapp.yump


In [11]:
# Keep unanswered reviews as NaT in 'repliedAt' instead of filling them with ''.
# Filling a datetime64 column with a string upcasts it to object dtype
# (Timestamps mixed with str), which breaks `.dt` accessors and date arithmetic.
# Re-parsing also makes this cell safe to re-run: any '' left behind by an
# earlier string fill is coerced back to NaT and the datetime dtype is restored.
df_cleaned_new['repliedAt'] = pd.to_datetime(df_cleaned_new['repliedAt'], errors='coerce')

# Count the remaining missing values per column ('repliedAt' NaT entries are
# counted as missing, i.e. reviews that never received a reply timestamp)
missing_values_after = df_cleaned_new.isnull().sum()


In [12]:
# Display the per-column missing-value counts stored in `missing_values_after`
missing_values_after

userName                   0
content                    0
score                      0
thumbsUpCount              0
reviewCreatedVersion    1097
at                         0
replyContent            3438
repliedAt               3438
appVersion              1097
sortOrder                  0
appId                      0
dtype: int64

In [14]:
# Per-column count of missing entries in the cleaned frame
missing_values = df_cleaned_new.isnull().sum()

# The same counts expressed as a percentage of all rows
missing_values_percentage = missing_values / len(df_cleaned_new) * 100

# Combine both views into one summary table, indexed by column name
missing_values_df = (
    missing_values
    .to_frame(name='Missing Values')
    .assign(Percentage=missing_values_percentage)
)
missing_values_df

Unnamed: 0,Missing Values,Percentage
userName,0,0.0
content,0,0.0
score,0,0.0
thumbsUpCount,0,0.0
reviewCreatedVersion,1097,10.18003
at,0,0.0
replyContent,3438,31.904232
repliedAt,3438,31.904232
appVersion,1097,10.18003
sortOrder,0,0.0
