#### This notebook uses the dataset dumped by `Data Preparation Stage 1.ipynb`

In [1]:
import os

from glob import glob
import pandas as pd

In [2]:
# Work from the shared data directory so the relative paths used below resolve.
os.chdir(path='../data')

In [3]:
# Show what is available in the current working directory.
os.listdir('.')

['v1', 'source_data']

In [4]:
# Collect every CSV file sitting directly under v1/.
files = glob(pathname='v1/*.csv')

files

['v1/train.csv', 'v1/test.csv']

In [5]:
# Load the stage-1 train and test dumps into DataFrames.
df_train, df_test = map(pd.read_csv, ('v1/train.csv', 'v1/test.csv'))

In [6]:
# Preview the first five rows of the training data.
df_train.head(n=5)

Unnamed: 0,offered_by,category,rating,reviews,size,price,content_rating,last_updated_on,release_version,os_version_required,downloads
0,ps_id-24654,Finance,4.18,1481,Varies with device,Free,Everyone,May 05 2020,Varies with device,Varies with device,"100,000+"
1,ps_id-35329,Music And Audio,4.81,302,10M,Free,Everyone,Mar 26 2020,3.9.18,4.1 and up,"5,000+"
2,ps_id-11044,Game Casual,4.27,374,27M,Free,Everyone,May 01 2020,1.10.1,4.1 and up,"10,000+"
3,ps_id-36068,Business,4.03,122058,Varies with device,Free,Teen,May 02 2020,Varies with device,Varies with device,"10,000,000+"
4,ps_id-35831,Medical,4.6,358,Varies with device,297.5742,Everyone,Nov 29 2018,Varies with device,Varies with device,"5,000+"


In [7]:
def print_unique_count(df):
    """Print each column name of ``df`` next to its number of distinct values.

    One line is written to stdout per column, in column order, formatted as
    ``<column> <count>``. Counts come from ``nunique()`` called with its
    default arguments. Nothing is returned.
    """
    for column_name in df.columns:
        distinct_count = df[column_name].nunique()
        print(column_name, distinct_count)

### Checking unique value count of columns in training data

In [8]:
# Distinct-value count for every column of the training data.
print_unique_count(df=df_train)

offered_by 15520
category 51
rating 319
reviews 9124
size 439
price 83
content_rating 6
last_updated_on 1583
release_version 4190
os_version_required 27
downloads 18


### Checking unique value count of columns in testing data

In [9]:
# Distinct-value count for every column of the testing data.
print_unique_count(df=df_test)

offered_by 22794
category 51
rating 327
reviews 12507
size 560
price 95
content_rating 6
last_updated_on 1783
release_version 5681
os_version_required 37


#### Observation(s):

* The values of the `offered_by` column are masked and are available only in the form of ids

* The number of unique `offered_by` values in the test data (22794) is greater than in the training data (15520)

#### Conclusion(s):

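* This suggests that the download range cannot be reliably tied to the client (i.e. `offered_by`): the ids are masked, and because the test data contains more distinct ids than the training data, many test ids cannot appear in the training data at all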

* The column `offered_by` can be dropped from the training and test data

In [10]:
# Remove the masked `offered_by` id column from both frames.
# The names are rebound instead of mutating with inplace=True, and
# errors='ignore' skips labels that are already absent, so re-running this
# cell is a no-op rather than a KeyError.
df_train = df_train.drop(columns=['offered_by'], errors='ignore')
df_test = df_test.drop(columns=['offered_by'], errors='ignore')

In [11]:
# Preview the first five rows of the training data after the column drop.
df_train.head(n=5)

Unnamed: 0,category,rating,reviews,size,price,content_rating,last_updated_on,release_version,os_version_required,downloads
0,Finance,4.18,1481,Varies with device,Free,Everyone,May 05 2020,Varies with device,Varies with device,"100,000+"
1,Music And Audio,4.81,302,10M,Free,Everyone,Mar 26 2020,3.9.18,4.1 and up,"5,000+"
2,Game Casual,4.27,374,27M,Free,Everyone,May 01 2020,1.10.1,4.1 and up,"10,000+"
3,Business,4.03,122058,Varies with device,Free,Teen,May 02 2020,Varies with device,Varies with device,"10,000,000+"
4,Medical,4.6,358,Varies with device,297.5742,Everyone,Nov 29 2018,Varies with device,Varies with device,"5,000+"


In [12]:
# Preview the first five rows of the testing data after the column drop.
df_test.head(n=5)

Unnamed: 0,category,rating,reviews,size,price,content_rating,last_updated_on,release_version,os_version_required
0,Game Puzzle,4.52,1362,36M,Free,Everyone,May 05 2020,1.4.1,4.0.3 and up
1,Shopping,4.29,3353,Varies with device,Free,Teen,May 05 2020,Varies with device,Varies with device
2,Photography,4.55,161855,Varies with device,Free,Everyone,May 05 2020,Varies with device,Varies with device
3,Game Action,3.97,545,16M,Free,Everyone,Dec 07 2016,2,4.1 and up
4,Photography,4.65,1672,33M,Free,Everyone,Jan 25 2020,1,4.4 and up


### Dumping training data and testing data in a new directory

In [13]:
# Make sure the output directory for this stage exists. exist_ok=True turns
# the call into a no-op when the directory is already there, so no separate
# existence check (and no gap between check and creation) is needed.
os.makedirs('v2', exist_ok=True)

In [14]:
# Write both frames to the v2 directory, leaving the index out of the files.
df_train.to_csv(path_or_buf='v2/train.csv', index=False)
df_test.to_csv(path_or_buf='v2/test.csv', index=False)