## Importing Libraries

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import warnings
warnings.filterwarnings('ignore')

## Data Cleaning

In [None]:
# Load the raw Google Play Store listings from the Kaggle input directory
data = pd.read_csv(filepath_or_buffer='../input/google-play-store-apps/googleplaystore.csv')

In [None]:
# Preview the first ten raw rows
data.head(n=10)

In [None]:
# Discard columns that are not used as model features
data = data.drop(columns=['App', 'Last Updated', 'Current Ver', 'Android Ver'])

In [None]:
# Preview the first ten rows of the remaining columns
data.head(n=10)

In [None]:
# Count missing values per column
data.isnull().sum()

In [None]:
# Discard every record that has a missing value in at least one column
data = data.dropna(axis='index', how='any')

In [None]:
# (rows, columns) remaining after the rows with missing values were dropped
data.shape

In [None]:
# Re-count missing values per column after dropping incomplete rows
data.isnull().sum()

In [None]:
# Inspect the dtype of each column before the type conversions below
data.dtypes

In [None]:
# Convert the Reviews column from object (string) to integer
data = data.assign(Reviews=data['Reviews'].astype('int'))

In [None]:
# Most frequent Size values
data['Size'].value_counts().head()

In [None]:
# Least frequent Size values
data['Size'].value_counts().tail()

In [None]:
# Replace the 'Varies with device' placeholder with NaN.
# Assign the result back instead of calling replace(..., inplace=True) on the
# column selection: under pandas Copy-on-Write the chained in-place call only
# modifies a temporary object and leaves `data` unchanged.
data['Size'] = data['Size'].replace('Varies with device', np.nan)

In [None]:
# Strip the unit suffix (k or M) and express every size in bytes:
# a 'k' value is multiplied by 1,000 and an 'M' value by 1,000,000.
# The suffix pattern must use a lowercase 'k' to match the data (and the strip
# pattern below); otherwise kilobyte sizes are silently left unscaled.
size_number = data.Size.replace(r'[kM]+$', '', regex=True).astype(float)
size_multiplier = (data.Size.str.extract(r'[\d\.]+([kM])', expand=False)
                   .map({'k': 10**3, 'M': 10**6})
                   .fillna(1))  # rows without a suffix (including NaN sizes) keep a factor of 1
data.Size = size_number * size_multiplier

In [None]:
# Fill the missing sizes (former "Varies with device" rows) with the mean size
# of the app's Category.
# Assign the result back rather than using fillna(..., inplace=True) on the
# column selection, which does not update `data` under pandas Copy-on-Write.
category_mean_size = data.groupby('Category')['Size'].transform('mean')
data['Size'] = data['Size'].fillna(category_mean_size)

In [None]:
# Strip the thousands separators (,) and the trailing plus sign (+)
data.Installs = (data.Installs
                 .str.replace(',', '', regex=False)
                 .str.replace('+', '', regex=False))

In [None]:
# Convert the cleaned Installs strings from object to integer
data = data.assign(Installs=data['Installs'].astype('int'))

In [None]:
# Distribution of the raw Price strings
data['Price'].value_counts()

In [None]:
# Strip the dollar sign ($) and convert the remaining text to float
data['Price'] = data['Price'].str.replace('$', '', regex=False).astype(float)

In [None]:
# Rarest Genres values
data['Genres'].value_counts().tail()

Many genres contain only a few records, which may introduce bias.
I therefore group them into the broader genre by ignoring the sub-genre (the part after the ";" sign).

In [None]:
# Keep only the primary genre, i.e. the text before the first ';'
data['Genres'] = data['Genres'].str.split(';').str.get(0)

In [None]:
# Genre distribution after collapsing sub-genres
data['Genres'].value_counts()

In [None]:
# Group 'Music & Audio' under 'Music'.
# Assign the result back rather than using replace(..., inplace=True) on the
# column selection, which does not update `data` under pandas Copy-on-Write.
data['Genres'] = data['Genres'].replace('Music & Audio', 'Music')

In [None]:
# Distribution of the content-rating labels
data.loc[:, 'Content Rating'].value_counts()

In [None]:
# Drop rows whose content rating is 'Unrated' (only a single such row exists)
data = data.loc[data['Content Rating'].ne('Unrated')]

In [None]:
# Re-check the column dtypes now that the cleaning steps above have run
data.dtypes

## Data Preprocessing

In [None]:
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler

# Preprocessing: one-hot encode the categorical columns and standardise the
# numeric ones; any other column is passed through unchanged.
# handle_unknown='ignore' encodes a category that was not seen during fit as an
# all-zero vector. Without it, a rare category that only appears in a
# validation fold or in the test set makes transform() raise, which shows up as
# NaN cross-validation scores or a failing predict().
column_trans = make_column_transformer(
                (OneHotEncoder(handle_unknown='ignore'),
                 ['Category','Installs','Type','Content Rating','Genres']),
                (StandardScaler(),['Reviews','Size','Price']),
                remainder = 'passthrough')

## Train Test Split

In [None]:
# Target is the app rating; every other remaining column is a feature
y = data['Rating']
X = data.drop(columns='Rating')

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows as the test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=0,
)

In [None]:
# Fit the column transformer on the training features and display the
# transformed output as a quick check that the preprocessing runs
column_trans.fit_transform(X_train)

## Regression Models

### 1. Linear Regression

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression

# Chain the preprocessing step with an ordinary least-squares regressor
linreg = LinearRegression()
pipe = make_pipeline(column_trans, linreg)

In [None]:
from sklearn.model_selection import cross_validate

# 10-fold cross-validation of the linear-regression pipeline on the training split
linreg_score = cross_validate(pipe, X_train, y_train, cv=10,
                              scoring=['neg_mean_squared_error','neg_mean_absolute_error'],
                              return_train_score=False)

# The 'neg_*' scorers return negated errors (higher is better), so flip the
# sign back before reporting them as errors.
linreg_cv_mae = -linreg_score['test_neg_mean_absolute_error'].mean()
linreg_cv_mse = -linreg_score['test_neg_mean_squared_error'].mean()
print('Mean Absolute Error: {}'.format(linreg_cv_mae))
print('Mean Squared Error: {}'.format(linreg_cv_mse))
print('Root Mean Squared Error: {}'.format(np.sqrt(linreg_cv_mse)))

### 2. Support Vector Regressor (SVR)

In [None]:
from sklearn.svm import SVR

# 10-fold cross-validation of the support-vector-regression pipeline on the training split
svr = SVR()
pipe = make_pipeline(column_trans,svr)
svr_score = cross_validate(pipe, X_train, y_train, cv=10,
                           scoring=['neg_mean_squared_error','neg_mean_absolute_error'],
                           return_train_score=False)

# The 'neg_*' scorers return negated errors (higher is better), so flip the
# sign back before reporting them as errors.
svr_cv_mae = -svr_score['test_neg_mean_absolute_error'].mean()
svr_cv_mse = -svr_score['test_neg_mean_squared_error'].mean()
print('Mean Absolute Error: {}'.format(svr_cv_mae))
print('Mean Squared Error: {}'.format(svr_cv_mse))
print('Root Mean Squared Error: {}'.format(np.sqrt(svr_cv_mse)))

### 3. Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

# 10-fold cross-validation of the random-forest pipeline on the training split.
# random_state pins the forest's bootstrap/feature sampling so the reported
# scores are reproducible across runs.
forest_model = RandomForestRegressor(n_estimators=100, max_features=3,
                                     min_samples_leaf=10, random_state=0)
pipe = make_pipeline(column_trans,forest_model)
rfr_score = cross_validate(pipe, X_train, y_train, cv=10,
                           scoring=['neg_mean_squared_error','neg_mean_absolute_error'],
                           return_train_score=False)

# The 'neg_*' scorers return negated errors (higher is better), so flip the
# sign back before reporting them as errors.
rfr_cv_mae = -rfr_score['test_neg_mean_absolute_error'].mean()
rfr_cv_mse = -rfr_score['test_neg_mean_squared_error'].mean()
print('Mean Absolute Error: {}'.format(rfr_cv_mae))
print('Mean Squared Error: {}'.format(rfr_cv_mse))
print('Root Mean Squared Error: {}'.format(np.sqrt(rfr_cv_mse)))

## Testing on Test Set

### 1. Linear Regression

In [None]:
# Refit the linear-regression pipeline on the whole training split,
# then predict ratings for the held-out test split
pipe = make_pipeline(column_trans, linreg).fit(X_train, y_train)
y_pred = pipe.predict(X_test)

In [None]:
from sklearn.metrics import mean_squared_error,mean_absolute_error

# Test-set errors for the linear-regression predictions.
# Arguments follow sklearn's (y_true, y_pred) convention, and RMSE is the
# square root of the mean *squared* error (not of the mean absolute error).
test_mae = mean_absolute_error(y_test, y_pred)
test_mse = mean_squared_error(y_test, y_pred)
print('Mean Absolute Error: {}'.format(test_mae))
print('Mean Squared Error: {}'.format(test_mse))
print('Root Mean Squared Error: {}'.format(np.sqrt(test_mse)))

### 2. Support Vector Regressor

In [None]:
# Refit the SVR pipeline on the whole training split,
# then predict ratings for the held-out test split
pipe = make_pipeline(column_trans, svr).fit(X_train, y_train)
y_pred = pipe.predict(X_test)

In [None]:
# Test-set errors for the SVR predictions.
# Arguments follow sklearn's (y_true, y_pred) convention, and RMSE is the
# square root of the mean *squared* error (not of the mean absolute error).
test_mae = mean_absolute_error(y_test, y_pred)
test_mse = mean_squared_error(y_test, y_pred)
print('Mean Absolute Error: {}'.format(test_mae))
print('Mean Squared Error: {}'.format(test_mse))
print('Root Mean Squared Error: {}'.format(np.sqrt(test_mse)))

### 3. Random Forest Regressor

In [None]:
# Refit the random-forest pipeline on the whole training split,
# then predict ratings for the held-out test split
pipe = make_pipeline(column_trans, forest_model).fit(X_train, y_train)
y_pred = pipe.predict(X_test)

In [None]:
# Test-set errors for the random-forest predictions.
# Arguments follow sklearn's (y_true, y_pred) convention, and RMSE is the
# square root of the mean *squared* error (not of the mean absolute error).
test_mae = mean_absolute_error(y_test, y_pred)
test_mse = mean_squared_error(y_test, y_pred)
print('Mean Absolute Error: {}'.format(test_mae))
print('Mean Squared Error: {}'.format(test_mse))
print('Root Mean Squared Error: {}'.format(np.sqrt(test_mse)))