In [224]:
import pandas as pd
import numpy as np
import seaborn as sns
import missingno as msno

df = pd.read_csv('googleplaystore.csv')
df.sample(5)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
6918,BW-IVMS,PRODUCTIVITY,,0,17M,100+,Free,0,Everyone,Productivity,26-Jul-18,12.0.18071400,4.1 and up
9994,EW Gate,BUSINESS,,0,12M,50+,Free,0,Everyone,Business,16-Apr-18,3.07,4.0 and up
1912,Shoot Bubble - Fruit Splash,GAME,4.6,29445,29M,"5,000,000+",Free,0,Everyone,Casual,11-May-18,14,4.0.3 and up
10162,EZ Screenshot,TOOLS,4.4,55,1.4M,"5,000+",Free,0,Everyone,Tools,12-Sep-17,1.1.4,5.0 and up
8680,DP Maker,PHOTOGRAPHY,4.2,784,18M,"100,000+",Free,0,Everyone,Photography,29-Apr-18,1.00.15,4.1 and up


In [225]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10841 entries, 0 to 10840
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10841 non-null  object 
 1   Category        10841 non-null  object 
 2   Rating          9367 non-null   float64
 3   Reviews         10841 non-null  object 
 4   Size            10841 non-null  object 
 5   Installs        10841 non-null  object 
 6   Type            10840 non-null  object 
 7   Price           10841 non-null  object 
 8   Content Rating  10840 non-null  object 
 9   Genres          10841 non-null  object 
 10  Last Updated    10841 non-null  object 
 11  Current Ver     10833 non-null  object 
 12  Android Ver     10838 non-null  object 
dtypes: float64(1), object(12)
memory usage: 1.1+ MB


###  Find columns with null values

In [226]:
df.isna().sum().sort_values(ascending=False)

Rating            1474
Current Ver          8
Android Ver          3
Content Rating       1
Type                 1
Size                 0
Reviews              0
Category             0
App                  0
Price                0
Installs             0
Last Updated         0
Genres               0
dtype: int64

### Clean the Rating column and the other columns containing null values

1. Remove the invalid values from Rating (if any). Just set them as NaN.
2. Fill the null values in the Rating column using the mean()
3. Clean any other non-numerical columns by just dropping the values.

All the ratings that are not in the range of 0 to 5 should be replaced with NaN.
For the other columns, just drop the rows that contain null values.

In [227]:
df[df['Rating'] > 5].head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
10472,Life Made WI-Fi Touchscreen Photo Frame,1.9,19.0,3.0M,"1,000+",Free,0,Everyone,,11-Feb-18,1.0.19,4.0 and up,


In [228]:
df.loc[df['Rating'] > 5, 'Rating'] = np.nan

In [229]:
df['Rating'] = df['Rating'].fillna(df['Rating'].mean())

In [230]:
df.dropna(inplace=True)   # Dropping the rows with any missing column values

### Clean the column Reviews and make it numeric

You'll notice that some columns from this dataframe which should be numeric, were parsed as object (string). That's because sometimes the numbers are expressed with M, or k to indicate Mega or kilo.

Clean the Reviews column by transforming the values to the correct numeric representation. For example, 5M should be 5000000

In [231]:
# {'ignore', 'raise', 'coerce'}, default 'raise'

# If 'raise', then invalid parsing will raise an exception.
# If 'coerce', then invalid parsing will be set as NaN.
# If 'ignore', then invalid parsing will return the input.

df['Reviews Numeric'] = pd.to_numeric(df['Reviews'], errors='coerce')

df[df['Reviews Numeric'].isna()]

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Reviews Numeric
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,2M,25M,"50,000,000+",Free,0,Teen,Art & Design,8-Jun-18,Varies with device,4.2 and up,
502,Find Real Love — YouLove Premium Dating,DATING,4.5,2M,11M,"10,000,000+",Free,0,Mature 17+,Dating,31-Jul-18,4.17.2,4.1 and up,
503,Once - Quality Matches Every day,DATING,4.4,2M,21M,"1,000,000+",Free,0,Mature 17+,Dating,17-Jul-18,2.45,4.1 and up,


In [232]:
# Convert 'Reviews' column to numeric by handling 'M' and 'k' suffixes
def convert_reviews(value):
    if 'M' in value:
        return float(value.replace('M', '')) * 1_000_000
    elif 'k' in value:
        return float(value.replace('k', '')) * 1_000
    else:
        return float(value)

In [233]:
# this code was applying the conversion function to all rows, even those that were already numeric, which was unnecessary work.
# df['Reviews'] = df['Reviews'].apply(convert_reviews)


# Only convert rows where Reviews Numeric is NaN (non-numeric values)
mask = df['Reviews Numeric'].isna()
df.loc[mask, 'Reviews'] = df.loc[mask, 'Reviews'].apply(convert_reviews)

# Drop the temporary Reviews Numeric column
df.drop('Reviews Numeric', axis=1, inplace=True)

df['Reviews'] = pd.to_numeric(df['Reviews'])
df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159.0,19M,"10,000+",Free,0,Everyone,Art & Design,7-Jan-18,1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967.0,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,15-Jan-18,2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510.0,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,1-Aug-18,1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,2000000.0,25M,"50,000,000+",Free,0,Teen,Art & Design,8-Jun-18,Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967.0,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,20-Jun-18,1.1,4.4 and up


In [234]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10829 entries, 0 to 10840
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10829 non-null  object 
 1   Category        10829 non-null  object 
 2   Rating          10829 non-null  float64
 3   Reviews         10829 non-null  float64
 4   Size            10829 non-null  object 
 5   Installs        10829 non-null  object 
 6   Type            10829 non-null  object 
 7   Price           10829 non-null  object 
 8   Content Rating  10829 non-null  object 
 9   Genres          10829 non-null  object 
 10  Last Updated    10829 non-null  object 
 11  Current Ver     10829 non-null  object 
 12  Android Ver     10829 non-null  object 
dtypes: float64(2), object(11)
memory usage: 1.2+ MB


### How many duplicated apps are there?

Count the number of duplicated rows. That is, if the app Twitter appears 2 times, that counts as 2.

In [235]:
df[df.duplicated(subset=['App'], keep= False)].sort_values(by='App').head(10)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
1393,10 Best Foods for You,HEALTH_AND_FITNESS,4.0,2490.0,3.8M,"500,000+",Free,0,Everyone 10+,Health & Fitness,17-Feb-17,1.9,2.3.3 and up
1407,10 Best Foods for You,HEALTH_AND_FITNESS,4.0,2490.0,3.8M,"500,000+",Free,0,Everyone 10+,Health & Fitness,17-Feb-17,1.9,2.3.3 and up
2543,1800 Contacts - Lens Store,MEDICAL,4.7,23160.0,26M,"1,000,000+",Free,0,Everyone,Medical,27-Jul-18,7.4.1,5.0 and up
2322,1800 Contacts - Lens Store,MEDICAL,4.7,23160.0,26M,"1,000,000+",Free,0,Everyone,Medical,27-Jul-18,7.4.1,5.0 and up
2385,2017 EMRA Antibiotic Guide,MEDICAL,4.4,12.0,3.8M,"1,000+",Paid,$16.99,Everyone,Medical,27-Jan-17,1.0.5,4.0.3 and up
2256,2017 EMRA Antibiotic Guide,MEDICAL,4.4,12.0,3.8M,"1,000+",Paid,$16.99,Everyone,Medical,27-Jan-17,1.0.5,4.0.3 and up
1337,21-Day Meditation Experience,HEALTH_AND_FITNESS,4.4,11506.0,15M,"100,000+",Free,0,Everyone,Health & Fitness,2-Aug-18,3.0.0,4.1 and up
1434,21-Day Meditation Experience,HEALTH_AND_FITNESS,4.4,11506.0,15M,"100,000+",Free,0,Everyone,Health & Fitness,2-Aug-18,3.0.0,4.1 and up
3083,365Scores - Live Scores,SPORTS,4.6,666521.0,25M,"10,000,000+",Free,0,Everyone,Sports,29-Jul-18,5.5.9,4.1 and up
5415,365Scores - Live Scores,SPORTS,4.6,666246.0,25M,"10,000,000+",Free,0,Everyone,Sports,29-Jul-18,5.5.9,4.1 and up


In [236]:
print(df.duplicated(subset=['App'], keep= False).sum())

1979


### Drop duplicated apps keeping only the ones with the greatest number of reviews

Drop duplicated apps, keeping just one copy of each, the one with the greatest number of reviews.

In [237]:
df[df.duplicated(subset=['App'], keep= False)].sort_values(by=['App', 'Reviews']).head(20)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
1393,10 Best Foods for You,HEALTH_AND_FITNESS,4.0,2490.0,3.8M,"500,000+",Free,0,Everyone 10+,Health & Fitness,17-Feb-17,1.9,2.3.3 and up
1407,10 Best Foods for You,HEALTH_AND_FITNESS,4.0,2490.0,3.8M,"500,000+",Free,0,Everyone 10+,Health & Fitness,17-Feb-17,1.9,2.3.3 and up
2322,1800 Contacts - Lens Store,MEDICAL,4.7,23160.0,26M,"1,000,000+",Free,0,Everyone,Medical,27-Jul-18,7.4.1,5.0 and up
2543,1800 Contacts - Lens Store,MEDICAL,4.7,23160.0,26M,"1,000,000+",Free,0,Everyone,Medical,27-Jul-18,7.4.1,5.0 and up
2256,2017 EMRA Antibiotic Guide,MEDICAL,4.4,12.0,3.8M,"1,000+",Paid,$16.99,Everyone,Medical,27-Jan-17,1.0.5,4.0.3 and up
2385,2017 EMRA Antibiotic Guide,MEDICAL,4.4,12.0,3.8M,"1,000+",Paid,$16.99,Everyone,Medical,27-Jan-17,1.0.5,4.0.3 and up
1337,21-Day Meditation Experience,HEALTH_AND_FITNESS,4.4,11506.0,15M,"100,000+",Free,0,Everyone,Health & Fitness,2-Aug-18,3.0.0,4.1 and up
1434,21-Day Meditation Experience,HEALTH_AND_FITNESS,4.4,11506.0,15M,"100,000+",Free,0,Everyone,Health & Fitness,2-Aug-18,3.0.0,4.1 and up
5415,365Scores - Live Scores,SPORTS,4.6,666246.0,25M,"10,000,000+",Free,0,Everyone,Sports,29-Jul-18,5.5.9,4.1 and up
3083,365Scores - Live Scores,SPORTS,4.6,666521.0,25M,"10,000,000+",Free,0,Everyone,Sports,29-Jul-18,5.5.9,4.1 and up


In [238]:
df.drop_duplicates(subset=['App'], keep='last', inplace=True)

### Format the Category column

Categories are all uppercase and words are separated using underscores. Instead, we want them with capitalized in the first character and the underscores transformed as whitespaces.

Example, the category AUTO_AND_VEHICLES should be transformed to: Auto and vehicles. Also, if you find any other wrong value transform it into an Unknown category.

In [239]:
df['Category'] = df['Category'].str.replace('_', ' ')
df['Category'] = df['Category'].str.title()

df['Category'].value_counts().head()

Category
Family      1902
Game         926
Tools        827
Business     419
Medical      396
Name: count, dtype: int64

### Clean and convert the Installs column to numeric type

Clean and transform Installs as a numeric type. Some values in Installs will have a + modifier. Just remove the string and honor the original number (for example +2,500 or 2,500+ should be transformed to the number 2500).

In [240]:
demo_df = pd.to_numeric(df['Installs'].str.replace('+', '').str.replace(',', ''))
df['Installs'] = demo_df
df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,Art And Design,4.1,159.0,19M,10000,Free,0,Everyone,Art & Design,7-Jan-18,1.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",Art And Design,4.7,87510.0,8.7M,5000000,Free,0,Everyone,Art & Design,1-Aug-18,1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,Art And Design,4.5,2000000.0,25M,50000000,Free,0,Teen,Art & Design,8-Jun-18,Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,Art And Design,4.3,967.0,2.8M,100000,Free,0,Everyone,Art & Design;Creativity,20-Jun-18,1.1,4.4 and up
5,Paper flowers instructions,Art And Design,4.4,167.0,5.6M,50000,Free,0,Everyone,Art & Design,26-Mar-17,1,2.3 and up


### Clean and convert the Size column to numeric (representing bytes)

The Size column is of type object. Some values contain either a M or a k that indicate Kilobytes (1024 bytes) or Megabytes (1024 kb). These values should be transformed to their corresponding value in bytes. For example, 898k will become 919552 (898 * 1024).

Some other values are completely invalid (there's no way to infer the numeric type from them). For these, just replace the value for 0.

Some other rules are related to + modifiers, apply the same rules as the previous task.

In [241]:
df['Size'] = df['Size'].str.replace('Varies with device', '0').str.replace('+', '').str.replace(',', '')


# Function to convert Size values to bytes
def convert_size(value):
    if 'M' in value:
        return float(value.replace('M', '')) * 1024 * 1024 
    elif 'k' in value:
        return float(value.replace('k', '')) * 1024  
    elif value.isdigit():
        return float(value)  # Already numeric
    else:
        return 0  # Invalid values are replaced with 0

df['Size'] = df['Size'].apply(convert_size)

df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,Art And Design,4.1,159.0,19922944.0,10000,Free,0,Everyone,Art & Design,7-Jan-18,1.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",Art And Design,4.7,87510.0,9122611.2,5000000,Free,0,Everyone,Art & Design,1-Aug-18,1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,Art And Design,4.5,2000000.0,26214400.0,50000000,Free,0,Teen,Art & Design,8-Jun-18,Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,Art And Design,4.3,967.0,2936012.8,100000,Free,0,Everyone,Art & Design;Creativity,20-Jun-18,1.1,4.4 and up
5,Paper flowers instructions,Art And Design,4.4,167.0,5872025.6,50000,Free,0,Everyone,Art & Design,26-Mar-17,1,2.3 and up


### Clean and convert the Price column to numeric

In [242]:
df['Price'] = df['Price'].str.replace('$', '').str.replace('Free', '0')
df['Price'] = pd.to_numeric(df['Price'])


### Paid or free?
Now that you have cleaned the Price column, let's create another auxiliary Distribution column.
This column should contain Free/Paid values depending on the app's price.

In [244]:
df['Distribution'] = 'Free'
df.loc[df['Price'] > 0, 'Distribution'] = 'Paid'

df['Distribution'].value_counts()

Distribution
Free    8897
Paid     751
Name: count, dtype: int64