# 读取数据

In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('googleplaystore.csv', usecols=(0, 1, 2, 3, 4, 5, 6))

In [3]:
df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free


In [4]:
df.describe()

Unnamed: 0,Rating
count,9367.0
mean,4.193338
std,0.537431
min,1.0
25%,4.0
50%,4.3
75%,4.5
max,19.0


# 因为别的总数10841，而有的不到这个数，说明数据有空值，属于脏数据

In [5]:
df.count()

App         10841
Category    10841
Rating       9367
Reviews     10841
Size        10841
Installs    10841
Type        10840
dtype: int64

## 判断app是否有重复

In [6]:
pd.unique(df['App']).size

9660

上边是10841，唯一的app名是9660，说明APP有重复值，1.9这个有问题

## 看分类

In [7]:
df['Category'].value_counts(dropna=False)  # category 分类

FAMILY                 1972
GAME                   1144
TOOLS                   843
MEDICAL                 463
BUSINESS                460
PRODUCTIVITY            424
PERSONALIZATION         392
COMMUNICATION           387
SPORTS                  384
LIFESTYLE               382
FINANCE                 366
HEALTH_AND_FITNESS      341
PHOTOGRAPHY             335
SOCIAL                  295
NEWS_AND_MAGAZINES      283
SHOPPING                260
TRAVEL_AND_LOCAL        258
DATING                  234
BOOKS_AND_REFERENCE     231
VIDEO_PLAYERS           175
EDUCATION               156
ENTERTAINMENT           149
MAPS_AND_NAVIGATION     137
FOOD_AND_DRINK          127
HOUSE_AND_HOME           88
AUTO_AND_VEHICLES        85
LIBRARIES_AND_DEMO       85
WEATHER                  82
ART_AND_DESIGN           65
EVENTS                   64
COMICS                   60
PARENTING                60
BEAUTY                   53
1.9                       1
Name: Category, dtype: int64

In [8]:
# 看看1.9的是什么问题
df[df['Category'] == '1.9']

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type
10472,Life Made WI-Fi Touchscreen Photo Frame,1.9,19.0,3.0M,"1,000+",Free,0


In [9]:
# 把1.9这个删掉
df.drop(index=10472, inplace=True)

In [10]:
df[df['Category'] == '1.9']

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type


## 看评分

In [11]:
df['Rating'].value_counts(dropna=False)

NaN    1474
4.4    1109
4.3    1076
4.5    1038
4.2     952
4.6     823
4.1     708
4.0     568
4.7     499
3.9     386
3.8     303
5.0     274
3.7     239
4.8     234
3.6     174
3.5     163
3.4     128
3.3     102
4.9      87
3.0      83
3.1      69
3.2      64
2.9      45
2.8      42
2.7      25
2.6      25
2.5      21
2.3      20
2.4      19
1.0      16
2.2      14
1.9      13
2.0      12
1.7       8
1.8       8
2.1       8
1.6       4
1.5       3
1.4       3
1.2       1
Name: Rating, dtype: int64

In [12]:
# 使用平均值填充nall的位置
df['Rating'].fillna(value=df['Rating'].mean(), inplace=True)

In [13]:
# 现在看就没啥问题了
df['Rating'].value_counts(dropna=False)

4.191757    1474
4.400000    1109
4.300000    1076
4.500000    1038
4.200000     952
4.600000     823
4.100000     708
4.000000     568
4.700000     499
3.900000     386
3.800000     303
5.000000     274
3.700000     239
4.800000     234
3.600000     174
3.500000     163
3.400000     128
3.300000     102
4.900000      87
3.000000      83
3.100000      69
3.200000      64
2.900000      45
2.800000      42
2.700000      25
2.600000      25
2.500000      21
2.300000      20
2.400000      19
1.000000      16
2.200000      14
1.900000      13
2.000000      12
1.800000       8
1.700000       8
2.100000       8
1.600000       4
1.500000       3
1.400000       3
1.200000       1
Name: Rating, dtype: int64

## 看Reviews

In [14]:
# 看数值型记录
df['Reviews'].value_counts(dropna=False)



0          596
1          272
2          214
3          175
4          137
          ... 
2537         1
7231017      1
155693       1
65119        1
16426        1
Name: Reviews, Length: 6001, dtype: int64

In [15]:
# 看reviews 非数值型的个数
df['Reviews'].str.isnumeric().sum()

10840

In [16]:
# 可以看到非数值型的没有了,因为1.9删除时候给删了
df[~df['Reviews'].str.isnumeric()]

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type


In [17]:
df.describe() # describe reviews没出来，所以类型有问题你

Unnamed: 0,Rating
count,10840.0
mean,4.191757
std,0.478907
min,1.0
25%,4.1
50%,4.2
75%,4.5
max,5.0


In [18]:
# 在这条做一个类型转换
df['Reviews'] = df['Reviews'].astype('i8')

In [19]:
df.describe()  #现在可以出来了

Unnamed: 0,Rating,Reviews
count,10840.0,10840.0
mean,4.191757,444152.9
std,0.478907,2927761.0
min,1.0,0.0
25%,4.1,38.0
50%,4.2,2094.0
75%,4.5,54775.5
max,5.0,78158310.0


## 看SIZE

> M 和K 两种单位没办法统一

In [20]:
df['Size'].value_counts(dropna=False)

Varies with device    1695
11M                    198
12M                    196
14M                    194
13M                    191
                      ... 
157k                     1
924k                     1
50k                      1
1020k                    1
460k                     1
Name: Size, Length: 461, dtype: int64

In [21]:
# 统一单位并替换
df['Size'] = df['Size'].str.replace('M', 'e+6')
df['Size'] = df['Size'].str.replace('k', 'e+3')

In [22]:
df['Size'].value_counts(dropna=False)

Varies with device    1695
11e+6                  198
12e+6                  196
14e+6                  194
13e+6                  191
                      ... 
41e+3                    1
853e+3                   1
160e+3                   1
525e+3                   1
459e+3                   1
Name: Size, Length: 461, dtype: int64

#### 处理上边 varies with device问题，自定义一个方法

In [23]:
# 处理上边 varies with device问题，自定义一个方法
def is_convertable(v):
    try:
        float(v)
        return True
    except:
        return False
tmp = df['Size'].apply(is_convertable)

In [24]:
df['Size'][~tmp].value_counts()   #不能转换的size 数量是多少

Varies with device    1695
Name: Size, dtype: int64

#### 思路：先把varies with device替换成0，再把0替换成平均值

In [25]:
# 替换成0
df['Size'] = df['Size'].str.replace('Varies with device', '0')

In [26]:
# 转换类型  先float 再int
df['Size'] = df['Size'].astype('f8').astype('i8')
df['Size']

0        19000000
1        14000000
2         8700000
3        25000000
4         2800000
           ...   
10836    53000000
10837     3600000
10838     9500000
10839           0
10840    19000000
Name: Size, Length: 10840, dtype: int64

In [27]:
# 替换平均值
df['Size'].replace(0, df['Size'].mean(), inplace=True)
df.describe()

Unnamed: 0,Rating,Reviews,Size
count,10840.0,10840.0,10840.0
mean,4.191757,444152.9,20990450.0
std,0.478907,2927761.0,20783450.0
min,1.0,0.0,8500.0
25%,4.1,38.0,5900000.0
50%,4.2,2094.0,18000000.0
75%,4.5,54775.5,26000000.0
max,5.0,78158310.0,100000000.0


## 看Installs 安装数量


In [28]:
df['Installs'].value_counts(dropna=False)

1,000,000+        1579
10,000,000+       1252
100,000+          1169
10,000+           1054
1,000+             907
5,000,000+         752
100+               719
500,000+           539
50,000+            479
5,000+             477
100,000,000+       409
10+                386
500+               330
50,000,000+        289
50+                205
5+                  82
500,000,000+        72
1+                  67
1,000,000,000+      58
0+                  14
0                    1
Name: Installs, dtype: int64

### 有➕，不方便计算，因此需要消除

In [32]:
df['Installs'] = df['Installs'].str.replace('+', '')
df['Installs'] = df['Installs'].str.replace(',', '')
df['Installs'].value_counts(dropna=False)
df['Installs'] = df['Installs'].astype('i8')

In [33]:
df.describe()   #现在就有installs 这列了

Unnamed: 0,Rating,Reviews,Size,Installs
count,10840.0,10840.0,10840.0,10840.0
mean,4.191757,444152.9,20990450.0,15464340.0
std,0.478907,2927761.0,20783450.0,85029360.0
min,1.0,0.0,8500.0,0.0
25%,4.1,38.0,5900000.0,1000.0
50%,4.2,2094.0,18000000.0,100000.0
75%,4.5,54775.5,26000000.0,5000000.0
max,5.0,78158310.0,100000000.0,1000000000.0


## 处理Type列

In [34]:
df['Type'].value_counts(dropna=False)   #看收费、免费、空值个数

Free    10039
Paid      800
NaN         1
Name: Type, dtype: int64

In [37]:
df[df['Type'].isnull()]  # type是null值的放到索引里，展示，如下表

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type
9148,Command & Conquer: Rivals,FAMILY,4.191757,0,18152090.0,0,


In [38]:
df.drop(index=9148, inplace=True)  # 可以查一下补进去，也可以删掉，这里选择删掉

In [39]:
df['Type'].value_counts(dropna=False) # 可以看到只剩free 和paid了

Free    10039
Paid      800
Name: Type, dtype: int64

## 回头看重复的

In [40]:
df.drop_duplicates('App', inplace=True)

In [41]:
pd.unique(df['App']).size

9658

In [42]:
df.describe()  #可以看到所有的count都是9658个了

Unnamed: 0,Rating,Reviews,Size,Installs
count,9658.0,9658.0,9658.0,9658.0
mean,4.176046,216615.0,20110530.0,7778312.0
std,0.494383,1831413.0,20408650.0,53761000.0
min,1.0,0.0,8500.0,0.0
25%,4.0,25.0,5300000.0,1000.0
50%,4.2,967.0,16000000.0,100000.0
75%,4.5,29408.0,25000000.0,1000000.0
max,5.0,78158310.0,100000000.0,1000000000.0


In [None]:
df.corr()

## 以上就是数据清洗的一项案例