# Imports

In [1]:
import pandas as pd
import numpy as np

# Load and Display

In [2]:
# Load the dataset
data = pd.read_csv('Playstore_final.csv')

# Display the first few rows of the dataset
print("Initial Data:")
print(data.head())

  data = pd.read_csv('Playstore_final.csv')


Initial Data:
                  App Name                                             App Id  \
0     Logistics Management              com.eniseistudio.logistics_management   
1  Estados Unidos Noticias               com.eniseistudio.news.estados_unidos   
2         Dental Assistant                  com.eniseistudio.dental_assistant   
3        Medical Assistant          com.eniseistudio.course.medical_assistant   
4  Business Administration  com.eniseistudio.majors.course.business_admini...   

           Category    Rating  Rating Count Installs  Minimum Installs  Free  \
0         Education  4.090909          66.0  10,000+           10000.0  True   
1  News & Magazines  4.000000           8.0   1,000+            1000.0  True   
2         Education  3.866667          15.0  10,000+           10000.0  True   
3         Education  4.000000          18.0   5,000+            5000.0  True   
4         Education  4.023256          86.0  50,000+           50000.0  True   

   Price Currency 

# Convert 'Installs' and 'Minimum Installs' columns to numeric

In [3]:
data['Installs'] = data['Installs'].str.replace('+', '').str.replace(',', '').astype(float)
data['Minimum Installs'] = data['Minimum Installs'].astype(float)

# Handle Missing Values

In [4]:
data.fillna({
    'Rating': data['Rating'].mean(),
    'Rating Count': 0,
    'Size': 'Varies with device',
    'Developer Website': 'N/A',
    'Privacy Policy': 'N/A'
}, inplace=True)

# Convert 'Released' and 'Last update' columns to datetime

In [5]:
data['Released'] = pd.to_datetime(data['Released'], format='%d-%b-%y', errors='coerce')
data['Last update'] = pd.to_datetime(data['Last update'], format='%d-%b-%y', errors='coerce')

# Remove leading and trailing whitespaces from string columns

In [6]:
data = data.applymap(lambda x: x.strip() if isinstance(x, str) else x)

  data = data.applymap(lambda x: x.strip() if isinstance(x, str) else x)


# Remove any duplicate rows

In [7]:
data.drop_duplicates(inplace=True)

# Replace 'N/A' with NaN for better handling in future analysis

In [8]:
data.replace('N/A', np.nan, inplace=True)

# Drop columns with all missing values

In [9]:
data.dropna(how='all', axis=1, inplace=True)

In [10]:
def convert_size(size_str):
    if ',' in size_str:
        size_str = size_str.replace(',', '.')
    if 'M' in size_str:
        return float(size_str.replace('M', '')) * 1e6
    elif 'k' in size_str:
        return float(size_str.replace('k', '')) * 1e3
    else:
        return np.nan

data['Size'] = data['Size'].apply(convert_size)
data = data.dropna(subset=[
    'Rating', 'Rating Count', 'Installs', 'Size', 'Content Rating', 'In app purchases', 'Category', 'Free', 'Ad Supported'
])

In [11]:
try:
    data = data[data.columns.drop(list(data.filter(regex='Unnamed:')))]
except Exception as e:
    print(f"An unexpected error occurred: {str(e)}")

# Display Info

In [12]:
print("\nCleaned Data Info:")
print(data.info())


Cleaned Data Info:
<class 'pandas.core.frame.DataFrame'>
Index: 254002 entries, 0 to 450791
Data columns (total 29 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   App Name               254002 non-null  object        
 1   App Id                 254002 non-null  object        
 2   Category               254002 non-null  object        
 3   Rating                 254002 non-null  float64       
 4   Rating Count           254002 non-null  float64       
 5   Installs               254002 non-null  float64       
 6   Minimum Installs       254002 non-null  float64       
 7   Free                   254002 non-null  object        
 8   Price                  254002 non-null  float64       
 9   Currency               254002 non-null  object        
 10  Size                   254002 non-null  float64       
 11  Minimum Android        253568 non-null  object        
 12  Developer Id           254002

# Save cleaned data to a new CSV file

In [13]:
data.to_csv('cleaned_dataset.csv', index=False)