In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn import metrics
from sklearn.model_selection import train_test_split
import random
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the raw Google Play Store export (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer='./googleplaystore.csv')

In [3]:
# Discard every row that contains at least one missing value.
df.dropna(axis=0, how='any', inplace=True)

# Encode each Category label as an integer code, numbered in the order the
# labels first appear in the column; the codes go into a new Category_c column.
CategoryString = df["Category"]
categoryVal = CategoryString.unique()
categoryValCount = len(categoryVal)
category_dict = {label: code for code, label in enumerate(categoryVal)}
df["Category_c"] = CategoryString.map(category_dict).astype(int)

#scaling and cleaning size of installation
def change_size(size):
    """Convert a size label such as ``'19M'`` or ``'201k'`` into a plain number.

    Parameters
    ----------
    size : str
        Size text ending in ``M`` (value is multiplied by 1,000,000) or in
        ``k`` (value is multiplied by 1,000).

    Returns
    -------
    float or None
        The scaled value, or ``None`` when ``size`` is not a string or carries
        neither suffix, so the caller can treat the entry as missing.

    Raises
    ------
    ValueError
        If the text in front of the suffix is not a valid number.
    """
    if not isinstance(size, str):
        # Missing cells (e.g. NaN) have nothing to parse.
        return None

    text = size.strip()
    # Both units are recognised by suffix only, so a stray 'M' elsewhere in the
    # text is no longer mistaken for a megabyte marker.
    if text.endswith('M'):
        return float(text[:-1]) * 1000000
    if text.endswith('k'):
        return float(text[:-1]) * 1000
    return None

# Convert the textual sizes to numbers; entries that change_size cannot parse
# come back as missing values.
df["Size"] = df["Size"].map(change_size)

# Fill the missing sizes with the closest preceding valid size (forward fill).
# The result is assigned back to the column: the `method=` argument of fillna is
# deprecated in recent pandas, and an in-place call on an attribute-accessed
# column is not guaranteed to write through to the frame.
# NOTE(review): a run of missing values at the very top would survive a forward fill.
df["Size"] = df["Size"].ffill()

# Install counts carry thousands separators and a trailing "+": remove both
# explicitly (not by slicing off the last character, which would eat a
# digit whenever the "+" is absent) and store the result as integers.
df['Installs'] = (
    df['Installs']
    .str.replace(',', '', regex=False)
    .str.rstrip('+')
    .astype('int64')
)

#Converting Type classification into binary
def type_cat(types):
    """Encode an app type as a binary flag: 0 for ``'Free'``, 1 for any other value."""
    return 0 if types == 'Free' else 1

# Replace the Type labels with the binary flag produced by type_cat.
df['Type'] = df['Type'].apply(type_cat)

# Swap each Content Rating label for an integer code, numbered in the order
# the labels first appear in the column.
RatingL = df['Content Rating'].unique()
RatingDict = dict(zip(RatingL, range(len(RatingL))))
df['Content Rating'] = df['Content Rating'].map(RatingDict).astype(int)

# Remove the columns that are not used in the rest of the analysis.
df.drop(columns=['Last Updated', 'Current Ver', 'Android Ver', 'App'], inplace=True)

# Give every distinct Genres string an integer code (order of first appearance)
# and keep the codes in a new Genres_c column next to the original text.
GenresL = df['Genres'].unique()
GenresDict = {genre: code for code, genre in enumerate(GenresL)}
df['Genres_c'] = df['Genres'].map(GenresDict).astype(int)

#Cleaning prices
def price_clean(price):
    """Convert a price label such as ``'$4.99'`` or ``'0'`` into a float.

    Parameters
    ----------
    price : str or number
        Price text, optionally prefixed with ``$``. Values that are already
        numeric are accepted too, so re-applying the function is harmless.

    Returns
    -------
    float
        The numeric price (``0.0`` for ``'0'``).

    Raises
    ------
    ValueError
        If the remaining text is not a valid number.
    """
    text = str(price).strip()
    # Strip the currency sign only when it is actually present; dropping the
    # first character unconditionally would turn '4.99' into 0.99.
    if text.startswith('$'):
        text = text[1:]
    return float(text)

# Parse every price with price_clean, then store the column as floats.
cleaned_prices = df['Price'].apply(price_clean)
df['Price'] = cleaned_prices.astype(float)

# Cast the review counts to integers.
df['Reviews'] = df['Reviews'].astype(int)

In [4]:
# Summary statistics for the numeric columns of the cleaned frame.
df.describe()

Unnamed: 0,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Category_c,Genres_c
count,9360.0,9360.0,9360.0,9360.0,9360.0,9360.0,9360.0,9360.0,9360.0
mean,4.191838,514376.7,23143470.0,17908750.0,0.06891,0.961279,0.350214,17.782799,49.806731
std,0.515263,3145023.0,23245150.0,91266370.0,0.253315,15.82164,0.783552,7.329874,34.100336
min,1.0,1.0,8500.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,4.0,186.75,5500000.0,10000.0,0.0,0.0,0.0,14.0,19.0
50%,4.3,5955.0,15000000.0,500000.0,0.0,0.0,0.0,18.0,38.0
75%,4.5,81627.5,33000000.0,5000000.0,0.0,0.0,0.0,23.0,88.0
max,5.0,78158310.0,100000000.0,1000000000.0,1.0,400.0,5.0,32.0,114.0


In [5]:
# Column dtypes and non-null counts of the cleaned frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9360 entries, 0 to 10840
Data columns (total 11 columns):
Category          9360 non-null object
Rating            9360 non-null float64
Reviews           9360 non-null int32
Size              9360 non-null float64
Installs          9360 non-null int64
Type              9360 non-null int64
Price             9360 non-null float64
Content Rating    9360 non-null int32
Genres            9360 non-null object
Category_c        9360 non-null int32
Genres_c          9360 non-null int32
dtypes: float64(3), int32(4), int64(2), object(2)
memory usage: 731.2+ KB


In [6]:
# Preview the first rows of the cleaned frame.
df.head()

Unnamed: 0,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Category_c,Genres_c
0,ART_AND_DESIGN,4.1,159,19000000.0,10000,0,0.0,0,Art & Design,0,0
1,ART_AND_DESIGN,3.9,967,14000000.0,500000,0,0.0,0,Art & Design;Pretend Play,0,1
2,ART_AND_DESIGN,4.7,87510,8700000.0,5000000,0,0.0,0,Art & Design,0,0
3,ART_AND_DESIGN,4.5,215644,25000000.0,50000000,0,0.0,1,Art & Design,0,0
4,ART_AND_DESIGN,4.3,967,2800000.0,100000,0,0.0,0,Art & Design;Creativity,0,2


In [7]:
# One-hot encode Category: every category value gets its own indicator column
# (Category_<VALUE>) and the original Category column is replaced by them.
# NOTE(review): df2 is not referenced by the modelling cells visible in this
# notebook, which build X from df — confirm whether it is still needed.
df2 = pd.get_dummies(df, columns=['Category'])

In [8]:
# Preview the first rows of the one-hot encoded frame.
df2.head()

Unnamed: 0,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Category_c,Genres_c,...,Category_PERSONALIZATION,Category_PHOTOGRAPHY,Category_PRODUCTIVITY,Category_SHOPPING,Category_SOCIAL,Category_SPORTS,Category_TOOLS,Category_TRAVEL_AND_LOCAL,Category_VIDEO_PLAYERS,Category_WEATHER
0,4.1,159,19000000.0,10000,0,0.0,0,Art & Design,0,0,...,0,0,0,0,0,0,0,0,0,0
1,3.9,967,14000000.0,500000,0,0.0,0,Art & Design;Pretend Play,0,1,...,0,0,0,0,0,0,0,0,0,0
2,4.7,87510,8700000.0,5000000,0,0.0,0,Art & Design,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4.5,215644,25000000.0,50000000,0,0.0,1,Art & Design,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4.3,967,2800000.0,100000,0,0.0,0,Art & Design;Creativity,0,2,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Print the regression error metrics (MSE, MAE, MSLE) for a set of predictions
def Evaluationmatrix(y_true, y_predict):
    """Print three regression error metrics for ``y_predict`` against ``y_true``.

    The mean squared error, mean absolute error and mean squared log error are
    printed in that order, one per line, each prefixed with its label.
    """
    report = (
        ('Mean Squared Error: ', metrics.mean_squared_error),
        ('Mean absolute Error: ', metrics.mean_absolute_error),
        ('Mean squared Log Error: ', metrics.mean_squared_log_error),
    )
    for label, scorer in report:
        print(label + str(scorer(y_true, y_predict)))

In [10]:
# Collect the error metrics in a dict so they can be added to results_index
def Evaluationmatrix_dict(y_true, y_predict, name = 'Linear - Integer'):
    """Return the regression error metrics for one model run as a dict.

    Parameters
    ----------
    y_true, y_predict
        True and predicted target values, passed straight to the sklearn metrics.
    name : str
        Label stored under ``'Series Name'`` to identify the run.

    Returns
    -------
    dict
        Keys ``'Series Name'``, ``'Mean Squared Error'``,
        ``'Mean Absolute Error'`` and ``'Mean Squared Log Error'``.
    """
    return {
        'Series Name': name,
        'Mean Squared Error': metrics.mean_squared_error(y_true, y_predict),
        'Mean Absolute Error': metrics.mean_absolute_error(y_true, y_predict),
        'Mean Squared Log Error': metrics.mean_squared_log_error(y_true, y_predict),
    }

In [11]:
# Features: every column except the target (Rating) and the raw text columns;
# the integer encodings Category_c and Genres_c stay in X.
X = df.drop(columns=['Rating', 'Category', 'Genres'])
# Target: the app rating.
y = df['Rating']
# Hold out 20% of the rows for testing. The fixed random_state makes the split,
# and every number computed from it, repeatable across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
X_train.shape, X_test.shape

((7488, 8), (1872, 8))

## Train a deep neural network

In [12]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Conv2D, Activation, MaxPooling2D, BatchNormalization, Dropout, Flatten, Dense
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import load_model
from tensorflow.keras.optimizers import RMSprop, Adam, Adagrad, SGD

# File the ModelCheckpoint callback writes the best model to, and from which
# that model is reloaded after training.
model_path = './best_model.h5'
# Filename template with epoch / val_loss placeholders.
# NOTE(review): not referenced in the visible cells — confirm it is still needed.
checkpoint_name = 'Weights-{epoch:03d}--{val_loss:.5f}.hdf5'

In [13]:
# Fully connected regression network, declared as an ordered list of layers:
#   - input layer: 128 ReLU units, sized to the number of feature columns
#   - three hidden layers of 256 ReLU units each
#   - output layer: a single linear unit
model = Sequential([
    Dense(128, input_dim=X_train.shape[1], kernel_initializer='normal', activation='relu'),
    Dense(256, kernel_initializer='normal', activation='relu'),
    Dense(256, kernel_initializer='normal', activation='relu'),
    Dense(256, kernel_initializer='normal', activation='relu'),
    Dense(1, kernel_initializer='normal', activation='linear'),
])

W0421 19:14:15.597040 13320 deprecation.py:506] From C:\Users\ruji-\Anaconda3\lib\site-packages\tensorflow_core\python\keras\initializers.py:143: calling RandomNormal.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0421 19:14:15.613992 13320 deprecation.py:506] From C:\Users\ruji-\Anaconda3\lib\site-packages\tensorflow_core\python\ops\resource_variable_ops.py:1630: calling BaseResourceVariable.__init__ (from tensorflow.python.ops.resource_variable_ops) with constraint is deprecated and will be removed in a future version.
Instructions for updating:
If using Keras pass *_constraint arguments to layers.


In [14]:
# Compile the network: Adam optimiser minimising mean squared error, with the
# same quantity also reported as a metric.
# `learning_rate` is the documented argument name; `lr` is only a deprecated alias.
# NOTE(review): `decay` is a legacy argument — newer Keras releases expect a
# learning-rate schedule in its place; confirm against the installed version.
opt = Adam(learning_rate=0.01, decay=1e-6)
model.compile(loss='mean_squared_error', optimizer=opt, metrics=['mean_squared_error'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 128)               1152      
_________________________________________________________________
dense_1 (Dense)              (None, 256)               33024     
_________________________________________________________________
dense_2 (Dense)              (None, 256)               65792     
_________________________________________________________________
dense_3 (Dense)              (None, 256)               65792     
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 257       
Total params: 166,017
Trainable params: 166,017
Non-trainable params: 0
_________________________________________________________________


In [15]:
# Train Model
batch_size = 2
# Save the model to model_path only when the validation MSE improves; the
# monitored quantity is an error, so lower is better (mode='min').
# NOTE(review): the test split doubles as the validation set that selects the
# checkpoint, so the score reported below is optimistically biased — consider
# carving a separate validation split out of the training data.
checkpoint = ModelCheckpoint(model_path, verbose=1, monitor='val_mean_squared_error', save_best_only=True, mode='min')
model.fit(X_train, y_train, batch_size=batch_size, epochs=50, callbacks=[checkpoint], validation_data=(X_test, y_test), shuffle=True)

# Load Model: reload the best checkpoint written during training and score it.
model = load_model(model_path)
scores = model.evaluate(X_test, y_test, verbose=1)
print('Test loss:', scores[0])
# scores[1] is the mean_squared_error metric the model was compiled with, not an
# accuracy, so it is labelled as such.
print('Test MSE:', scores[1])

Train on 7488 samples, validate on 1872 samples
Epoch 1/50
Epoch 00001: val_mean_squared_error improved from inf to 821.20306, saving model to ./best_model.h5
Epoch 2/50
Epoch 00002: val_mean_squared_error improved from 821.20306 to 17.61646, saving model to ./best_model.h5
Epoch 3/50
Epoch 00003: val_mean_squared_error improved from 17.61646 to 16.90409, saving model to ./best_model.h5
Epoch 4/50
Epoch 00004: val_mean_squared_error improved from 16.90409 to 11.87542, saving model to ./best_model.h5
Epoch 5/50
Epoch 00005: val_mean_squared_error improved from 11.87542 to 1.47914, saving model to ./best_model.h5
Epoch 6/50
Epoch 00006: val_mean_squared_error improved from 1.47914 to 0.22545, saving model to ./best_model.h5
Epoch 7/50
Epoch 00007: val_mean_squared_error did not improve from 0.22545
Epoch 8/50
Epoch 00008: val_mean_squared_error did not improve from 0.22545
Epoch 9/50
Epoch 00009: val_mean_squared_error improved from 0.22545 to 0.22541, saving model to ./best_model.h5
Epo

Epoch 25/50
Epoch 00025: val_mean_squared_error did not improve from 0.22541
Epoch 26/50
Epoch 00026: val_mean_squared_error did not improve from 0.22541
Epoch 27/50
Epoch 00027: val_mean_squared_error improved from 0.22541 to 0.22541, saving model to ./best_model.h5
Epoch 28/50
Epoch 00028: val_mean_squared_error improved from 0.22541 to 0.22540, saving model to ./best_model.h5
Epoch 29/50
Epoch 00029: val_mean_squared_error did not improve from 0.22540
Epoch 30/50
Epoch 00030: val_mean_squared_error did not improve from 0.22540
Epoch 31/50
Epoch 00031: val_mean_squared_error did not improve from 0.22540
Epoch 32/50
Epoch 00032: val_mean_squared_error did not improve from 0.22540
Epoch 33/50
Epoch 00033: val_mean_squared_error did not improve from 0.22540
Epoch 34/50
Epoch 00034: val_mean_squared_error did not improve from 0.22540
Epoch 35/50
Epoch 00035: val_mean_squared_error did not improve from 0.22540
Epoch 36/50
Epoch 00036: val_mean_squared_error did not improve from 0.22540
Ep

Epoch 50/50
Epoch 00050: val_mean_squared_error did not improve from 0.22540


W0421 19:19:40.907205 13320 deprecation.py:506] From C:\Users\ruji-\Anaconda3\lib\site-packages\tensorflow_core\python\ops\init_ops.py:97: calling Zeros.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Test loss: 0.2254050099569508
Test accuracy: 0.22540502
