https://huggingface.co/datasets/recmeapp/mobilerec

In [8]:
import pandas as pd
import os

# Define the path to your CSV file
data_path = '../data'  # adjust if needed
file_path = os.path.join(data_path, 'mobilerec_final.csv')

# Load the MobileRec dataset
mobile = pd.read_csv(file_path)

# Show shape and sample
print(f"✅ mobilerec_final.csv shape: {mobile.shape}\n")

print("🔍 Sample rows:")
mobile.head()

✅ mobilerec_final.csv shape: (19297019, 9)

🔍 Sample rows:


Unnamed: 0,app_package,review,rating,votes,date,uid,formated_date,unix_timestamp,app_category
0,com.cleverapps.heroes,It's really a fun game,5,1,"October 21, 2018",shqoc6X1fcJRLEmx,2018-10-21,1540094000.0,Casual
1,com.bodyfast,uninstalling. it was ok but felt like it was c...,2,0,"January 18, 2019",shqoc6X1fcJRLEmx,2019-01-18,1547788000.0,Health & Fitness
2,com.thrivegames.wordshapes,Love this game,4,1,"January 16, 2021",shqoc6X1fcJRLEmx,2021-01-16,1610773000.0,Word
3,com.affinity.rewarded_play,Doesn't update play time. Just downloaded it 3...,1,1,"November 4, 2021",shqoc6X1fcJRLEmx,2021-11-04,1635998000.0,Entertainment
4,dating.inmessage.net,app crashes every time I try to log in. what g...,1,0,"November 24, 2021",shqoc6X1fcJRLEmx,2021-11-24,1637730000.0,Dating


In [9]:
mobile.head()

Unnamed: 0,app_package,review,rating,votes,date,uid,formated_date,unix_timestamp,app_category
0,com.cleverapps.heroes,It's really a fun game,5,1,"October 21, 2018",shqoc6X1fcJRLEmx,2018-10-21,1540094000.0,Casual
1,com.bodyfast,uninstalling. it was ok but felt like it was c...,2,0,"January 18, 2019",shqoc6X1fcJRLEmx,2019-01-18,1547788000.0,Health & Fitness
2,com.thrivegames.wordshapes,Love this game,4,1,"January 16, 2021",shqoc6X1fcJRLEmx,2021-01-16,1610773000.0,Word
3,com.affinity.rewarded_play,Doesn't update play time. Just downloaded it 3...,1,1,"November 4, 2021",shqoc6X1fcJRLEmx,2021-11-04,1635998000.0,Entertainment
4,dating.inmessage.net,app crashes every time I try to log in. what g...,1,0,"November 24, 2021",shqoc6X1fcJRLEmx,2021-11-24,1637730000.0,Dating


In [10]:
mobile.columns

Index(['app_package', 'review', 'rating', 'votes', 'date', 'uid',
       'formated_date', 'unix_timestamp', 'app_category'],
      dtype='object')

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Copy mobile dataset
df = mobile.copy()

# ========== User & App Statistics ==========
unique_users = df['uid'].nunique()
unique_apps = df['app_package'].nunique()
ratings_per_user = df.groupby('uid').size()
average_ratings_per_user = ratings_per_user.mean()
min_ratings_per_user = ratings_per_user.min()
max_ratings_per_user = ratings_per_user.max()

# ========== Category Statistics ==========
unique_categories = df['app_category'].nunique()
apps_per_category = df.groupby('app_category')['app_package'].nunique().sort_values(ascending=False)

# ========== Plot: Apps per Category ==========
plt.figure(figsize=(12, 6))
apps_per_category.plot(kind='bar', color='skyblue')
plt.title('Number of Unique Apps per Category')
plt.xlabel('App Category')
plt.ylabel('Number of Unique Apps')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.grid(axis='y')
plt.show()

# ========== Missing Values ==========
missing_values = {
    "Missing ratings": df['rating'].isnull().sum(),
    "Missing user IDs": df['uid'].isnull().sum(),
    "Missing app categories": df['app_category'].isnull().sum(),
    "Missing app packages": df['app_package'].isnull().sum()
}

print("\n🔍 Missing Value Report:")
for key, val in missing_values.items():
    print(f"{key}: {val}")

# ========== Duplicate Rows ==========
duplicate_rows = df.duplicated().sum()
print(f"\n🧾 Number of duplicate rows: {duplicate_rows}")

# ========== Ratings Info ==========
total_rows = len(df)
non_null_ratings = df['rating'].notnull().sum()
print(f"\n📊 Total rows: {total_rows}")
print(f"✅ Non-null ratings: {non_null_ratings}")

if total_rows == non_null_ratings:
    print("✔️ All rows have a rating.")
else:
    print("⚠️ Some rows are missing a rating.")

# ========== Summary ==========
print("\n📌 Dataset Summary:")
print(f"Unique users: {unique_users}")
print(f"Unique apps: {unique_apps}")
print(f"Unique categories: {unique_categories}")
print(f"Average ratings per user: {average_ratings_per_user:.2f}")
print(f"Min ratings per user: {min_ratings_per_user}")
print(f"Max ratings per user: {max_ratings_per_user}")


## Saving the cleaned df to start working on

In [1]:
import pandas as pd

# Replace this path with the actual location of your CSV file
file_path = "../data/mobilerec_final.csv"

# Load the dataset
df = pd.read_csv(file_path)

# Keep only the 4 required columns
df = df[['rating', 'uid', 'app_category', 'app_package']]

# Save the updated DataFrame back to the original location
df.to_csv(file_path, index=False)

# Confirm the new shape
print(df.shape)


(19297019, 4)


In [None]:
unique_categories = df['app_category'].nunique()
print(f"Number of unique categories: {unique_categories}")
unique_users = df['uid'].nunique()
print(f"Number of unique users: {unique_users}")
# Check for duplicates
duplicates = df.duplicated().sum()
print(f"Number of duplicate rows: {duplicates}")


Number of unique categories: 48
Number of unique users: 700111
Number of duplicate rows: 35395


NameError: name 'mobile' is not defined

## computing user_entropy and clustering - initiate

In [None]:
mobile = mobile[mobile['app_package'],['user_rating', 'app_category']].drop_duplicates()

Unnamed: 0,app_package,review,rating,votes,date,uid,formated_date,unix_timestamp,app_category
0,com.cleverapps.heroes,It's really a fun game,5,1,"October 21, 2018",shqoc6X1fcJRLEmx,2018-10-21,1540094000.0,Casual
1,com.bodyfast,uninstalling. it was ok but felt like it was c...,2,0,"January 18, 2019",shqoc6X1fcJRLEmx,2019-01-18,1547788000.0,Health & Fitness
2,com.thrivegames.wordshapes,Love this game,4,1,"January 16, 2021",shqoc6X1fcJRLEmx,2021-01-16,1610773000.0,Word
3,com.affinity.rewarded_play,Doesn't update play time. Just downloaded it 3...,1,1,"November 4, 2021",shqoc6X1fcJRLEmx,2021-11-04,1635998000.0,Entertainment
4,dating.inmessage.net,app crashes every time I try to log in. what g...,1,0,"November 24, 2021",shqoc6X1fcJRLEmx,2021-11-24,1637730000.0,Dating


200
500
1000