In [458]:
import pandas as pd
import numpy as np
import os
import glob
import sys
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

# User rating for the current application verrsion
TARGET_COLUMN = "user_rating_ver"
TEST_SIZE = 50

In [459]:
# Read the CSV from the project root directory
csvs = glob.glob(os.getcwd()+"/*.csv")
df_appstore = pd.read_csv(csvs[0])
df_appstore_description = pd.read_csv(csvs[1])

In [460]:
# Merge the 2 dataframes based on the common column values
df_apple = pd.merge(df_appstore, df_appstore_description, on=['id','track_name', 'size_bytes'], how='left')
df_apple  = df_apple[list(df_apple.columns.values)[1:]]

In [461]:
# Enumerate the genre 
genre_dict = {'prime_genre':[], 'genre_id':[]}
for x, y in enumerate(list(df_apple['prime_genre'].unique())):
    genre_dict['genre_id'].append(x)
    genre_dict['prime_genre'].append(y)
df_genre = pd.DataFrame().from_dict(genre_dict)
df_apple = pd.merge(df_apple, df_genre, on='prime_genre', how='left')

In [462]:
# Count the number of words in the description, outside chance that app description can correlate to predictor
desc_count = []
for idx, data in df_apple.iterrows():
    desc_count.append(len(list(data['app_desc'])))
df_apple['desc_count'] = pd.Series(desc_count)

In [463]:
# Enumerate the content rating column of the dataset
cont_rating_dict = {'cont_rating':[], 'cont_rating_id':[]}
for x, y in enumerate(list(df_apple['cont_rating'].unique())):
    cont_rating_dict['cont_rating_id'].append(x)
    cont_rating_dict['cont_rating'].append(y)
df_cont_rating = pd.DataFrame().from_dict(cont_rating_dict)
df_apple = pd.merge(df_apple, df_cont_rating, on='cont_rating', how='left')

In [464]:
# Remove string columns and unwanted categorical variable columnns
df_apple_feat = df_apple.drop(['id', 'track_name','currency', 'ver', 
                               'cont_rating', 'prime_genre', 'vpp_lic', 
                               'app_desc'] , 1)
# shuffle the dataframe
df_apple_feat = df_apple_feat.reindex(np.random.permutation(df_apple_feat.index))

In [465]:
print("spliting the dataset into test and train...")

# Split the dataset into test and train set
total_len = len(df_apple_feat.index)
train_size = total_len - TEST_SIZE
test_size = TEST_SIZE

all_features_column = list(df_train.columns.values)
target_column = TARGET_COLUMN
training_columns = all_features_column
del training_columns[training_columns.index(target_column)]


df_x = df_apple_feat[training_columns]
df_y = df_apple_feat[target_column]

X = df_x.as_matrix()
Y = df_y.as_matrix()

print("normalizing the dataset...")

# Normalize the training dataset
scaler = StandardScaler()
# fit the data
scaler.fit(X)
# find the transformed data
X_transformed = scaler.transform(X)

# train dataframe
X_train = X_transformed[0:train_size, :]
Y_train = Y[0:train_size]

# test dataframe
X_test = X_transformed[train_size:, :]
Y_test = Y[train_size:]

print("training column:",training_columns)
print("target columns:", target_column)

print("train set:", X_train.shape[0], "test set:", X_test.shape[0])

spliting the dataset into test and train...
normalizing the dataset...
training column: ['size_bytes', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'genre_id', 'desc_count', 'cont_rating_id']
target columns: user_rating_ver
train set: 7147 test set: 50


In [466]:
print("training...")
reg = linear_model.Lasso(alpha = 0.1)
reg.fit(X_train, Y_train)

print("testing...")
Y_predicted = reg.predict(X_test)
mse_error = mean_squared_error(Y_test, Y_predicted.round(1))

print("Mean Squared Error value:", round(mse_error, 3))

training...
testing...
Mean Squared Error value: 1.43


In [467]:
for yt, yp in list(zip(Y_test, Y_predicted)):
    print("Target:", yt, "Predicted:", yp.round(1))

Target: 4.5 Predicted: 4.1
Target: 0.0 Predicted: 2.3
Target: 0.0 Predicted: 0.3
Target: 5.0 Predicted: 4.1
Target: 4.5 Predicted: 4.0
Target: 5.0 Predicted: 3.6
Target: 0.0 Predicted: 0.3
Target: 4.0 Predicted: 4.1
Target: 0.0 Predicted: 0.2
Target: 4.0 Predicted: 3.3
Target: 4.5 Predicted: 4.1
Target: 4.5 Predicted: 4.1
Target: 3.0 Predicted: 3.3
Target: 3.0 Predicted: 3.6
Target: 1.5 Predicted: 3.1
Target: 4.0 Predicted: 4.1
Target: 5.0 Predicted: 4.1
Target: 5.0 Predicted: 4.1
Target: 4.0 Predicted: 3.6
Target: 4.0 Predicted: 3.7
Target: 0.0 Predicted: 3.6
Target: 4.5 Predicted: 4.1
Target: 0.0 Predicted: 4.5
Target: 5.0 Predicted: 4.5
Target: 3.0 Predicted: 3.3
Target: 3.5 Predicted: 3.2
Target: 4.5 Predicted: 4.0
Target: 4.5 Predicted: 4.0
Target: 3.5 Predicted: 3.3
Target: 4.5 Predicted: 4.0
Target: 4.5 Predicted: 4.1
Target: 4.5 Predicted: 3.7
Target: 5.0 Predicted: 4.1
Target: 4.0 Predicted: 4.0
Target: 4.5 Predicted: 3.7
Target: 4.0 Predicted: 3.1
Target: 1.0 Predicted: 4.5
T