# Google Play Data Analysis

## Read Data

In [None]:
import pandas as pd
import numpy as np
# Path of the raw Google Play Store export read by this notebook.
datafile = "googleplaystore.csv"
# Load the CSV into the DataFrame that the following cells clean and model.
df = pd.read_csv(filepath_or_buffer=datafile)

- only the Rating column is numerical

## Data Preprocessing

#### 1. convert number of reviews to int

In [None]:
# def one_hot_encoder(array):
#     from sklearn.preprocessing import LabelEncoder
#     encoder = LabelEncoder()
#     encoder.fit(array)
#     coded_array = encoder.transform(array)
#     n = len(coded_array)
#     n_labels = len(np.unique(coded_array))
#     one_hot = np.zeros((n,n_labels))
#     one_hot[np.arange(n), coded_array] = 1
#     return one_hot

# category_df = pd.DataFrame(one_hot_encoder(df['Category']))

In [None]:
# df['Reviews'].astype(int) #error in direct conversion

- since there is an error in direct conversion, there must be entries with non-integer values

In [None]:
import re

# Flag every row whose 'Reviews' value contains at least one letter, i.e. a
# value that cannot be parsed as a plain integer count.
p = re.compile(r'[a-zA-Z]+')
t = df['Reviews'].apply(lambda x: p.search(str(x)) is not None)
# Positional indices of the flagged rows (displayed as the cell output).
np.where(t)

- there is one entry with a non-integer value at row 10472

In [None]:
# Show every column of the row at position 10472 to see what is wrong with it.
df.iloc[10472, :]

- we see that there is no category for this app, and all cells have been shifted one column left
- we went to https://play.google.com/store/apps/details?id=com.lifemade.internetPhotoframe to find the actual category and updated the original data

In [None]:
# Rewrite row 10472 by hand with its values moved back into the right columns
# and the category looked up on the Play Store page.
# The tenth field (presumably 'Genres' -- confirm against df.columns) is
# unknown, so it is stored as a real missing value (np.nan). The literal
# string 'NaN' would be treated by pandas as ordinary text and would be
# invisible to isna()/dropna().
df.loc[10472] = ['Life Made WI-Fi Touchscreen Photo Frame', 'LIFESTYLE', 1.9, '19', '3.0M', '1000+',
                 'Free', '0', 'Everyone', np.nan, 'February 11, 2018', '1.0.19', '4.0 and up']

- now we can convert number of reviews to int successfully

In [None]:
# Cast the review counts to integers, then print the frame summary so the
# new dtype can be checked.
reviews_as_int = df['Reviews'].astype(int)
df['Reviews'] = reviews_as_int
df.info()

#### 2. convert size to float

- remove ',', 'M' and 'k' from the app size and convert everything to a common unit, megabytes (sizes given in k are divided by 1000)

In [None]:
def _strip_size_units(raw):
    """Remove ',' and 'M' from a size; turn a 'k' size into megabytes.

    Values given in 'k' are returned as a float divided by 1000. Every other
    value is returned as text with ',' and 'M' removed (non-numeric text is
    left as is).
    """
    text = str(raw).replace(',', '').replace('M', '')
    if 'k' in text:
        return float(text.replace('k', '')) / 1000
    return text


df['Size'] = df['Size'].apply(_strip_size_units)

- remove apps with variable size and save in new df for future use

In [None]:
# A size that contains no digit (or dot) at all is not a number; those rows
# are collected in their own frame for later use.
p = re.compile(r'[0-9.]+')
non_num_df = df['Size'].apply(lambda x: p.search(str(x)) is None)
apps_with_var_size = df[non_num_df]
# Report how many such apps there are and preview the first few.
print(len(apps_with_var_size))
apps_with_var_size.head()

- exclude apps with variable size, and convert app size to float

In [None]:
# Keep only the rows whose size is numeric. The explicit copy makes `df` an
# independent frame, so the column assignments that follow do not raise
# SettingWithCopyWarning or write to a temporary slice.
df = df[~non_num_df].copy()
df['Size'] = df['Size'].astype(float)
df.info()

#### 3. convert num of installs to int

In [None]:
# Strip the thousands separators and the trailing '+' (e.g. '1,000+' -> '1000')
# before casting the install counts to integers. The raw-string character
# class avoids the invalid '\+' escape of a plain string literal.
df['Installs'] = df['Installs'].apply(lambda x: re.sub(r'[,+]', '', str(x)))
df['Installs'] = df['Installs'].astype(int)
df.info()

#### 4. convert price to float

In [None]:
# Drop the '$' sign around each price and cast the result to float in one
# chained expression.
df['Price'] = df['Price'].map(lambda price: str(price).strip('$')).astype(float)
df.info()

# Prediction Model

<li>Data Preprocessing

In [None]:
# Discard every row that has a missing value in any column
# (dropna's defaults are axis=0, how='any').
df = df.dropna()

<li>convert category to int

In [None]:
# Map every distinct category to a running integer code, in order of first
# appearance in the data.
category_dict = {
    category: code
    for code, category in enumerate(df['Category'].unique())
}
# Number of distinct categories (the value the counter ends on).
category_num = len(category_dict)

# Store the code as text so that .str.get_dummies() can be applied later.
df['Category_num'] = df['Category'].apply(lambda x: str(category_dict[x]))

In [None]:
# Turn 'A;B' style genre strings into lists of genre names.
df['Genres'] = df['Genres'].astype(str).str.split(';')
# Mask of apps that list more than two genres.
t2 = df['Genres'].str.len() > 2
# Bare expression kept from the original cell; it is not the last statement,
# so the notebook does not display it.
df[t2]
# The first listed genre is used as the app's primary genre.
df['Genre1'] = df['Genres'].str[0]

In [None]:
# Map every distinct primary genre to a running integer code, in order of
# first appearance in the data.
genre_dict = {
    genre: code
    for code, genre in enumerate(df['Genre1'].unique())
}
# Number of distinct primary genres (the value the counter ends on).
genre_num = len(genre_dict)

# Store the code as text so that .str.get_dummies() can be applied later.
df['Genre_num'] = df['Genre1'].apply(lambda x: str(genre_dict[x]))

<li>get dummies

In [None]:
# One 0/1 indicator column per category code ('|' is the default separator,
# spelled out here).
category_df = df['Category_num'].str.get_dummies(sep='|')

In [None]:
# One 0/1 indicator column per primary-genre code ('|' is the default
# separator, spelled out here).
genre_df = df['Genre_num'].str.get_dummies(sep='|')

<li>log-transform installs and reviews

In [None]:
# Natural log of the install counts.
# NOTE(review): a count of 0 would yield -inf here -- confirm none remain.
install_counts = df['Installs'].values
df['Install_log'] = np.log(install_counts)

In [None]:
# Natural log of the review counts.
# NOTE(review): a count of 0 would yield -inf here -- confirm none remain.
review_counts = df['Reviews'].values
df['Reviews_log'] = np.log(review_counts)

<li>identify input and output

In [None]:
# Inputs: the two numeric columns plus the category indicator columns,
# joined on the row index. Output: the app rating.
numeric_features = df[['Size', 'Price']]
X = numeric_features.join(category_df)
Y = df['Rating']

<li>split into train set and test set

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y,
    test_size=0.25,
    random_state=1,
)

<li>build a model

In [None]:
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold, cross_val_score

def build_model():
    """Build and compile the feed-forward regression network.

    The network has two hidden Dense layers of 32 ReLU units each, followed
    by a single linear output unit. The input width is read from the
    module-level ``X_train``, which must exist before this is called.

    Returns:
        A compiled ``keras.Sequential`` model using mean squared error as
        the loss, RMSprop (learning rate 0.001) as the optimizer and mean
        absolute error as an extra metric.
    """
    model = keras.Sequential([
        keras.layers.Dense(32, activation=tf.nn.relu,
                           input_shape=(X_train.shape[1],)),
        keras.layers.Dense(32, activation=tf.nn.relu),
        keras.layers.Dense(1, activation='linear'),
    ])
    # tf.train.RMSPropOptimizer only exists in TensorFlow 1.x; the Keras
    # optimizer is available from tf.keras in both 1.x and 2.x. The learning
    # rate is passed positionally because its keyword name differs between
    # versions.
    optimizer = keras.optimizers.RMSprop(0.001)
    model.compile(loss='mse',
                  optimizer=optimizer,
                  metrics=['mae'])
    return model

# Train a fresh network on the training split, then evaluate it on the
# held-out split; `score` holds the loss followed by the compiled metric.
model = build_model()
model.fit(X_train, Y_train, batch_size=16, epochs=80, verbose=0)
score = model.evaluate(X_test, Y_test)
print('score: ', score)

In [None]:
# Scatter the network's predictions (x axis) against the true test ratings
# (y axis).
pre = model.predict(X_test)
plt.scatter(x=pre, y=Y_test)
plt.xlabel('pred')
plt.ylabel('Y_test')
plt.show()

In [None]:
from sklearn.linear_model import LinearRegression

# Fit an ordinary least-squares baseline on the training split and print
# its intercept and coefficients.
linreg = LinearRegression().fit(X_train, Y_train)
print(linreg.intercept_)
print(linreg.coef_)

In [None]:
from sklearn import metrics

# Predict the held-out ratings with the linear model and report the mean
# squared error against the true values.
y_pred = linreg.predict(X_test)
linreg_test_mse = metrics.mean_squared_error(Y_test, y_pred)
print("MSE:", linreg_test_mse)

In [None]:
# Put the true and predicted test ratings side by side, indexed like Y_test.
pre_df = Y_test.to_frame('Actual').assign(Predicted=y_pred)
pre_df

In [None]:
# Predicted vs. actual test ratings; the dashed diagonal marks a perfect
# prediction.
fig, ax = plt.subplots()
lowest, highest = Y_test.min(), Y_test.max()
ax.plot([lowest, highest], [lowest, highest], 'k--', lw=4)
ax.scatter(y_pred, Y_test)
ax.set_xlabel('pred')
ax.set_ylabel('Y_test')
plt.show()

In [None]:
from sklearn.model_selection import cross_val_predict
import matplotlib.pyplot as plt

# Out-of-fold predictions of the linear model from 10-fold cross-validation
# over the full data set.
predicted = cross_val_predict(linreg, X, Y, cv=10)

# Predicted vs. actual ratings; the dashed diagonal marks a perfect
# prediction.
fig, ax = plt.subplots()
lowest, highest = Y.min(), Y.max()
ax.plot([lowest, highest], [lowest, highest], 'k--', lw=4)
ax.scatter(predicted, Y)
ax.set_xlabel('pred')
ax.set_ylabel('Y')
plt.show()

In [None]:
# Mean squared error of the cross-validated predictions over the full data.
cv_mse = metrics.mean_squared_error(Y, predicted)
print("MSE:", cv_mse)

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Fit a decision-tree regressor on the training split and predict the
# held-out ratings. The seed is fixed (matching the train/test split)
# because the tree permutes features randomly at each split, so an unseeded
# tree can give different results from run to run.
regressor = DecisionTreeRegressor(random_state=1)
regressor.fit(X_train, Y_train)
y_pred = regressor.predict(X_test)

In [None]:
from sklearn import metrics

# Mean squared error of the current test-set predictions in y_pred.
test_mse = metrics.mean_squared_error(Y_test, y_pred)
print("MSE:", test_mse)

In [None]:
# Predicted vs. actual test ratings; the dashed diagonal marks a perfect
# prediction.
fig, ax = plt.subplots()
lowest, highest = Y_test.min(), Y_test.max()
ax.plot([lowest, highest], [lowest, highest], 'k--', lw=4)
ax.scatter(y_pred, Y_test)
ax.set_xlabel('pred')
ax.set_ylabel('Y_test')
plt.show()

In [None]:
# Put the true and predicted test ratings side by side, indexed like Y_test.
tree_pre_df = Y_test.to_frame('Actual').assign(Predicted=y_pred)

In [None]:
from sklearn.model_selection import cross_val_predict
import matplotlib.pyplot as plt

# Out-of-fold predictions of the tree regressor from 10-fold
# cross-validation over the full data set.
predicted = cross_val_predict(regressor, X, Y, cv=10)

# Predicted vs. actual ratings; the dashed diagonal marks a perfect
# prediction.
fig, ax = plt.subplots()
lowest, highest = Y.min(), Y.max()
ax.plot([lowest, highest], [lowest, highest], 'k--', lw=4)
ax.scatter(predicted, Y)
ax.set_xlabel('pred')
ax.set_ylabel('Y')
plt.show()

In [None]:
# Mean squared error of the cross-validated predictions over the full data.
cv_mse = metrics.mean_squared_error(Y, predicted)
print("MSE:", cv_mse)