#### The dataset used in this notebook is the data dumped by `Data Preparation Stage 2.ipynb`

In [1]:
import os

from glob import glob
import pandas as pd
import numpy as np

In [2]:
# Move into the data directory so the relative paths used below ('v2/...', 'v3/...') resolve.
# The kernel's starting directory is remembered on first run: a bare relative
# chdir would be resolved against the already-changed directory if this cell
# were executed a second time.
if '_NOTEBOOK_DIR' not in globals():
    _NOTEBOOK_DIR = os.getcwd()

os.chdir(os.path.join(_NOTEBOOK_DIR, os.pardir, 'data'))

In [3]:
# Show the entries of the current working directory (the data directory after the chdir above).
os.listdir()

['.ipynb_checkpoints', 'v1', 'source_data', 'v2']

In [4]:
# Collect the CSV files present in the v2 directory and display the matches.
csv_pattern = 'v2/*.csv'
files = glob(csv_pattern)

files

['v2/train.csv', 'v2/test.csv']

In [5]:
# Both splits are read with identical options; `last_updated_on` is parsed
# into datetimes at load time.
read_options = {'parse_dates': ['last_updated_on']}

df_train = pd.read_csv('v2/train.csv', **read_options)
df_test = pd.read_csv('v2/test.csv', **read_options)

In [6]:
# Preview the first rows of the training frame.
df_train.head()

Unnamed: 0,category,rating,reviews,size,price,content_rating,last_updated_on,release_version,os_version_required,downloads
0,Finance,4.18,1481,Varies with device,Free,Everyone,2020-05-05,Varies with device,Varies with device,"100,000+"
1,Music And Audio,4.81,302,10M,Free,Everyone,2020-03-26,3.9.18,4.1 and up,"5,000+"
2,Game Casual,4.27,374,27M,Free,Everyone,2020-05-01,1.10.1,4.1 and up,"10,000+"
3,Business,4.03,122058,Varies with device,Free,Teen,2020-05-02,Varies with device,Varies with device,"10,000,000+"
4,Medical,4.6,358,Varies with device,297.5742,Everyone,2018-11-29,Varies with device,Varies with device,"5,000+"


### Checking the possible set of values in the following columns:

* size
* price
* content_rating
* release_version
* os_version_required
* downloads

In [7]:
# Frequency of each distinct `size` value in the training frame.
df_train['size'].value_counts()

Varies with device    1884
11M                    395
12M                    340
15M                    323
14M                    321
                      ... 
793k                     1
602k                     1
611k                     1
99k                      1
63k                      1
Name: size, Length: 439, dtype: int64

In [8]:
# Frequency of each distinct `price` value in the training frame.
df_train['price'].value_counts()

Free         15367
222.9942       183
73.8342        162
148.4142       153
372.1542       120
             ...  
369.171          1
178.992          1
342.3222         1
1789.1742        1
319.9482         1
Name: price, Length: 83, dtype: int64

**Conclusion:**

* `Free` will be replaced with 0

In [9]:
# Frequency of each distinct `content_rating` value in the training frame.
df_train['content_rating'].value_counts()

Everyone           13836
Teen                1578
Everyone 10+         586
Mature 17+           422
Adults only 18+        4
Unrated                2
Name: content_rating, dtype: int64

In [10]:
# Frequency of each distinct `release_version` value in the training frame.
df_train['release_version'].value_counts()

Varies with device        1505
1                          769
1.1                        346
1.2                        276
1.3                        229
                          ... 
5.11.40                      1
8.10.0                       1
8.389                        1
Scriptedbreak (1.6.11)       1
V1.0.0-475-gca07a667         1
Name: release_version, Length: 4190, dtype: int64

In [11]:
# Frequency of each distinct `os_version_required` value in the training frame.
df_train['os_version_required'].value_counts()

4.1 and up            4348
4.0.3 and up          2543
4.0 and up            1885
4.4 and up            1816
Varies with device    1421
5.0 and up            1276
2.3 and up             751
4.2 and up             689
4.3 and up             347
2.3.3 and up           288
2.2 and up             250
3.0 and up             235
6.0 and up             140
2.1 and up             116
5.1 and up              64
1.6 and up              62
7.0 and up              48
1.5 and up              36
2.0 and up              29
3.2 and up              29
4.4w and up             15
8.0 and up              12
3.1 and up              12
2.0.1 and up             9
7.1 and up               5
1.0 and up               1
1.1 and up               1
Name: os_version_required, dtype: int64

In [12]:
# Frequency of each distinct `downloads` value in the training frame.
df_train['downloads'].value_counts()

100,000+          3156
1,000,000+        2693
10,000+           2235
500,000+          1353
10,000,000+       1255
50,000+           1242
1,000+            1218
5,000,000+         895
5,000+             763
100+               508
500+               363
100,000,000+       260
50,000,000+        240
10+                 91
50+                 84
500,000,000+        39
1,000,000,000+      32
5,000,000,000+       1
Name: downloads, dtype: int64

In [13]:
# Frequency of each distinct `size` value in the test frame.
df_test['size'].value_counts()

Varies with device    2912
11M                    618
12M                    528
13M                    454
14M                    436
                      ... 
28k                      1
998k                     1
729k                     1
500k                     1
459k                     1
Name: size, Length: 560, dtype: int64

## Transforming training and testing data

### a. Replacing the `Free` value of `price` with 0

In [14]:
# Build a float `price_transformed` column in which `Free` becomes 0.
# `Series.replace` matches whole cell values only, whereas `.str.replace`
# rewrites substrings, needs a string-typed column, and has a `regex` default
# that differs between pandas versions. Using it here also keeps the cell
# working if a split contains no `Free` rows and pandas parsed `price` as numeric.
for frame in (df_train, df_test):
    frame['price_transformed'] = frame['price'].replace('Free', '0').astype('float')

### b. Replacing `Varies with device` with NaN

In [15]:
# Raw column -> name of the derived column in which the
# 'Varies with device' placeholder is turned into NaN.
placeholder_columns = {
    'size': 'size_transformed',
    'release_version': 'release_transformed',
    'os_version_required': 'os_transformed',
}

# Apply the same substitution to both splits, one column at a time.
for frame in (df_train, df_test):
    for raw_col, clean_col in placeholder_columns.items():
        frame[clean_col] = frame[raw_col].replace('Varies with device', np.nan)

### c. Dropping the unnecessary columns 

In [16]:
# Raw columns that now have a *_transformed counterpart; removed from both
# frames in place.
to_drop = ['price', 'size', 'release_version', 'os_version_required']

for frame in (df_train, df_test):
    frame.drop(columns=to_drop, inplace=True)

### d. Ordering the columns

In [17]:
# Column layout shared by both splits, followed by the one column that is
# selected for the training frame only.
shared_cols = ['category', 'rating', 'reviews', 'content_rating',
               'last_updated_on', 'price_transformed', 'size_transformed',
               'release_transformed', 'os_transformed']
train_only_col = 'downloads'
training_data_cols = shared_cols + [train_only_col]

# Selecting by an explicit list both reorders the columns and discards any
# column that is not listed.
df_train = df_train[training_data_cols]
df_test = df_test[shared_cols]

### Dumping training and testing data in a new directory

In [18]:
# Create the output directory if it is missing. `exist_ok=True` folds the
# existence check into the creation call, so there is no gap between checking
# and creating, and re-running the cell is harmless.
os.makedirs('v3', exist_ok=True)

In [19]:
# Write each split to its CSV file in v3, leaving out the row index.
outputs = {'v3/train.csv': df_train, 'v3/test.csv': df_test}

for csv_path, frame in outputs.items():
    frame.to_csv(csv_path, index=False)