## Machine Learning to predict App Ratings

### Data Cleaning

In [129]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn import metrics
from sklearn.model_selection import train_test_split
import random
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [130]:
# Load the raw Play Store CSV; the path is relative to the notebook's working directory.
df = pd.read_csv('google_play_store_train.csv')

In [131]:
# DataFrame.info() writes its report itself and returns None, so it is called
# bare: wrapping it in print() only appended a stray "None" line to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10474 entries, 0 to 10473
Data columns (total 15 columns):
App_ID            10474 non-null int64
App               10474 non-null object
Category          10474 non-null object
Rating            9040 non-null float64
Reviews           10474 non-null object
Size              10474 non-null object
Installs          10474 non-null object
Type              10473 non-null object
Price             10474 non-null object
Content Rating    10473 non-null object
Genres            10474 non-null object
Last Updated      10474 non-null object
Current Ver       10467 non-null object
Android Ver       10471 non-null object
Unnamed: 14       479 non-null object
dtypes: float64(1), int64(1), object(13)
memory usage: 1.2+ MB
None


In [132]:
# Number of rows in the raw frame.
df.shape[0]

10474

In [133]:
# Remove the mostly-empty trailing 'Unnamed: 14' column by name rather than by
# position. A positional "drop the last column" removes a real feature every
# time the cell is re-run; errors='ignore' makes a re-run a harmless no-op.
df = df.drop(columns=['Unnamed: 14'], errors='ignore')

In [134]:
# Confirm the trailing column was removed (14 columns should remain).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10474 entries, 0 to 10473
Data columns (total 14 columns):
App_ID            10474 non-null int64
App               10474 non-null object
Category          10474 non-null object
Rating            9040 non-null float64
Reviews           10474 non-null object
Size              10474 non-null object
Installs          10474 non-null object
Type              10473 non-null object
Price             10474 non-null object
Content Rating    10473 non-null object
Genres            10474 non-null object
Last Updated      10474 non-null object
Current Ver       10467 non-null object
Android Ver       10471 non-null object
dtypes: float64(1), int64(1), object(12)
memory usage: 1.1+ MB


In [135]:
# Keep only rows with no missing value in any column; this also removes the
# apps that have no Rating.
df = df.dropna()

In [136]:
# Confirm that no missing values remain after dropping incomplete rows.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9034 entries, 0 to 10473
Data columns (total 14 columns):
App_ID            9034 non-null int64
App               9034 non-null object
Category          9034 non-null object
Rating            9034 non-null float64
Reviews           9034 non-null object
Size              9034 non-null object
Installs          9034 non-null object
Type              9034 non-null object
Price             9034 non-null object
Content Rating    9034 non-null object
Genres            9034 non-null object
Last Updated      9034 non-null object
Current Ver       9034 non-null object
Android Ver       9034 non-null object
dtypes: float64(1), int64(1), object(12)
memory usage: 1.0+ MB


In [137]:
# One-hot encode the app category and append the indicator columns to df.
# The original 'Category' column stays in place for now.
df['Category'] = pd.Categorical(df['Category'])
df_dummies = pd.get_dummies(data=df['Category'], prefix='category')
df = pd.concat(objs=[df, df_dummies], axis='columns')

In [138]:
# One-hot encode the content rating and append the indicator columns to df.
# NOTE(review): this reuses the 'category' prefix that the Category dummies
# also use, so the two groups are only distinguishable by their value labels.
df_dummies = pd.get_dummies(data=df['Content Rating'], prefix='category')
df = pd.concat(objs=[df, df_dummies], axis='columns')

In [139]:
# One-hot encode the genre label and append the indicator columns to df.
# NOTE(review): the prefix is again 'category', the same one used for the
# Category dummies.
df_dummies = pd.get_dummies(data=df['Genres'], prefix='category')
df = pd.concat(objs=[df, df_dummies], axis='columns')

In [140]:
def size_refiner(size):
    """Convert a textual app size such as '19M' or '201k' into a float.

    Parameters
    ----------
    size : str or number
        Size label. A trailing 'M' multiplies the numeric part by 1,000,000
        and a trailing 'k' multiplies it by 1,000. A value that is already
        numeric is passed through unchanged, so applying the function twice
        does not wipe the column.

    Returns
    -------
    float or None
        The converted size, or None when the size is unknown: a label
        without a recognised suffix (e.g. 'Varies with device'), None or NaN.

    Raises
    ------
    ValueError
        If the label ends in 'M' or 'k' but the part before it is not numeric.
    """
    if not isinstance(size, str):
        # `size == size` is False only for NaN, which counts as unknown.
        if isinstance(size, (int, float)) and size == size:
            return float(size)
        return None

    label = size.strip()
    # Only the final character is a unit; an 'M' or 'k' elsewhere is not.
    for suffix, multiplier in (('M', 1000000), ('k', 1000)):
        if label.endswith(suffix):
            return float(label[:-1]) * multiplier
    return None


# Convert every textual size with size_refiner; labels it cannot interpret
# come back as missing values.
df["Size"] = df["Size"].map(size_refiner)

In [141]:
# Forward-fill unknown sizes with the size of the preceding row. Series.ffill()
# replaces fillna(method='ffill'), whose `method` argument is deprecated in
# current pandas. A missing value in the very first row would stay missing.
df['Size'] = df['Size'].ffill()

In [142]:
# Install counts arrive as text with thousands separators and a trailing '+'
# (presumably of the form "10,000+"). Strip only an actual '+' instead of
# blindly cutting the last character, so a label without the suffix keeps all
# of its digits. astype(str) makes a re-run on already converted integers safe.
df['Installs'] = (
    df['Installs']
    .astype(str)
    .str.rstrip('+')
    .str.replace(',', '', regex=False)
    .astype('int64')
)

In [143]:
def to_free_or_not_to_free(type):
    """Encode an app's Type label as a flag: 1 for 'Free', 0 for anything else."""
    return 1 if type == 'Free' else 0
    
# Replace the textual Type label with the 1/0 flag produced by to_free_or_not_to_free.
df['Type'] = df['Type'].map(to_free_or_not_to_free)

In [144]:
# Discard the columns that are not used as features. 'Category' and
# 'Content Rating' have already been one-hot encoded above; 'Genres' is not
# in this list and stays in the frame.
columns_to_discard = ['Last Updated', 'Current Ver', 'Android Ver', 'App', 'Category', 'Content Rating']
df = df.drop(columns=columns_to_discard)

In [145]:
def price_clean(price):
    """Convert a price label such as '$4.99' or '0' into a float.

    Parameters
    ----------
    price : str or number
        Price label, optionally prefixed with '$'. A value that is already
        numeric is returned as a float.

    Returns
    -------
    float
        The numeric price; '0' yields 0.0.

    Raises
    ------
    ValueError
        If the text left after removing the currency sign is not a number.
    """
    if isinstance(price, (int, float)):
        return float(price)
    # Strip only a real leading '$'. Dropping the first character
    # unconditionally would turn '4.99' into 0.99.
    return float(price.strip().lstrip('$'))

# Convert every price label with price_clean, then cast so the whole column is float.
df['Price'] = df['Price'].map(price_clean).astype(float)

In [146]:
# Review counts are read as text; cast them to integers.
# NOTE(review): plain `int` is platform-dependent - the recorded run shows
# int32 for this column.
df['Reviews'] = df['Reviews'].astype(int)

In [147]:
# Re-inspect the frame after cleaning: the listing should show numeric dtypes
# for the converted columns, with 'Genres' still present as an object column.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9034 entries, 0 to 10473
Data columns (total 95 columns):
App_ID                              9034 non-null int64
Rating                              9034 non-null float64
Reviews                             9034 non-null int32
Size                                9034 non-null float64
Installs                            9034 non-null int64
Type                                9034 non-null int64
Price                               9034 non-null float64
Genres                              9034 non-null object
category_ART_AND_DESIGN             9034 non-null uint8
category_AUTO_AND_VEHICLES          9034 non-null uint8
category_BEAUTY                     9034 non-null uint8
category_BOOKS_AND_REFERENCE        9034 non-null uint8
category_BUSINESS                   9034 non-null uint8
category_COMICS                     9034 non-null uint8
category_COMMUNICATION              9034 non-null uint8
category_DATING                     9034 non-