In [1]:
import pandas as pd

# Location of the review CSV export consumed by this notebook
file_path = 'BankDigital/dataset/Newest_REVIEW_BANK_MOBILE_GOOGLE_PLAY_Update21092024.csv'

# Read the file and normalise column types in a single chain:
#   * 'score' / 'thumbsUpCount' -> numeric (unparseable entries become NaN)
#   * 'at' / 'repliedAt'        -> datetime (unparseable entries become NaT)
df_new = pd.read_csv(file_path).assign(
    score=lambda frame: pd.to_numeric(frame['score'], errors='coerce'),
    thumbsUpCount=lambda frame: pd.to_numeric(frame['thumbsUpCount'], errors='coerce'),
    at=lambda frame: pd.to_datetime(frame['at'], errors='coerce'),
    repliedAt=lambda frame: pd.to_datetime(frame['repliedAt'], errors='coerce'),
)

In [2]:
# Build the cleaned frame in one pass:
#   1. discard the 'reviewId' and 'userImage' columns, which the analysis does not use
#   2. discard rows whose 'content' is missing, since the review text is critical
df_cleaned_new = (
    df_new
    .drop(columns=['reviewId', 'userImage'])
    .dropna(subset=['content'])
)

In [3]:
# Summarise every remaining column: dtype and non-null count
df_cleaned_new.info()

# Preview the leading rows (5 is head()'s default) to confirm the cleaning result
df_cleaned_new.head(n=5)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99920 entries, 0 to 99919
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   userName              99920 non-null  object        
 1   content               99920 non-null  object        
 2   score                 99920 non-null  int64         
 3   thumbsUpCount         99920 non-null  int64         
 4   reviewCreatedVersion  78367 non-null  object        
 5   at                    99920 non-null  datetime64[ns]
 6   replyContent          77397 non-null  object        
 7   repliedAt             77397 non-null  datetime64[ns]
 8   appVersion            78367 non-null  object        
 9   sortOrder             99920 non-null  object        
 10  appId                 99920 non-null  object        
dtypes: datetime64[ns](2), int64(2), object(7)
memory usage: 8.4+ MB


Unnamed: 0,userName,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,appVersion,sortOrder,appId
0,Nickyta Np,Sangat mengecewakan . Cs sangat lambat Untuk k...,1,219,1.42.34,2024-07-22 02:44:38,"Hai Sahabat Allo, mohon maaf untuk ketidaknyam...",2024-07-22 09:48:19,1.42.34,most_relevant,com.alloapp.yump
1,Suhardiyanto Ttv,Sebetulnya bagus untuk mempermudah dan kelanca...,1,73,1.42.44,2024-07-30 20:46:15,"Hai Sahabat Allo, mohon maaf untuk ketidaknyam...",2024-07-31 11:52:45,1.42.44,most_relevant,com.alloapp.yump
2,Luis Rahim,Aplikasi ribet. Terlalu Banyak iming iming yg ...,1,2,1.43.31,2024-09-09 18:47:56,"Hai Sahabat Allo, mohon maaf untuk ketidaknyam...",2024-09-10 09:53:38,1.43.31,most_relevant,com.alloapp.yump
3,Orang Tua,"Selalu menawarkan paylater, setelah di gunakan...",1,0,1.43.22,2024-09-15 02:29:28,"Hai Sahabat Allo, mohon maaf untuk ketidaknyam...",2024-09-15 09:51:54,1.43.22,most_relevant,com.alloapp.yump
4,alexander fernandes,"ulasan terbaru yah, saya coba mengganti alamat...",1,5,1.43.31,2024-09-06 20:06:21,Hi Allo Friends 👋🏻 Kami menyesal karena kamu m...,2024-09-07 10:14:32,1.43.31,most_relevant,com.alloapp.yump


In [4]:
# Keep 'repliedAt' as a proper datetime column with NaT marking reviews that have no reply.
#
# Filling a datetime64 column with '' is not a valid way to remove its gaps: depending on
# the pandas version the empty string is either parsed back to NaT (so nothing changes and
# the null count stays the same) or the column is upcast to object dtype, which breaks
# later datetime operations. NaT is the native missing marker for datetimes, so we keep it.
#
# Re-applying to_datetime with errors='coerce' is idempotent and also repairs the column
# if an earlier run of this cell left '' placeholders in it ('' is coerced back to NaT).
df_cleaned_new['repliedAt'] = pd.to_datetime(df_cleaned_new['repliedAt'], errors='coerce')

# Per-column null counts after the step above; 'repliedAt' still reports its NaT rows
missing_values_after = df_cleaned_new.isnull().sum()


In [6]:
# Bare last expression: renders `missing_values_after` as this cell's output
missing_values_after

userName                    0
content                     0
score                       0
thumbsUpCount               0
reviewCreatedVersion    21553
at                          0
replyContent            22523
repliedAt               22523
appVersion              21553
sortOrder                   0
appId                       0
dtype: int64

In [7]:
# Count of nulls per column
missing_values = df_cleaned_new.isnull().sum()

# Same counts expressed as a share of all rows, in percent
missing_values_percentage = (missing_values / len(df_cleaned_new)) * 100

# Side-by-side summary table: one row per column of df_cleaned_new
missing_values_df = (
    missing_values
    .to_frame('Missing Values')
    .assign(Percentage=missing_values_percentage)
)
missing_values_df

Unnamed: 0,Missing Values,Percentage
userName,0,0.0
content,0,0.0
score,0,0.0
thumbsUpCount,0,0.0
reviewCreatedVersion,21553,21.570256
at,0,0.0
replyContent,22523,22.541033
repliedAt,22523,22.541033
appVersion,21553,21.570256
sortOrder,0,0.0
