In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
%matplotlib inline
import os

# Read the Google Play Store apps CSV from the Kaggle input directory.
df = pd.read_csv(
    os.path.join("/kaggle/input/", "google-play-store-apps/googleplaystore.csv")
)

In [None]:
#######################################################################################################################################################
######                                                        DATA CLEANING                                                                      ######
#######################################################################################################################################################

# Function to convert file sizes into float values expressed in megabytes ('M').
def convertFileSize(data):
    """Convert a Play Store size string such as '19M' or '500k' to megabytes.

    Parameters
    ----------
    data : str
        Size string carrying either an 'M' or a 'k' suffix.

    Returns
    -------
    float
        The size in megabytes. Kilobyte values are returned as a fraction of a
        megabyte (e.g. '500k' -> 0.5) rather than being floored to 0.0, so small
        apps keep their relative size information.

    Raises
    ------
    ValueError
        If the text left after removing the suffix is not a valid number.
    """
    if "M" in data:
        return float(data.replace('M', ''))
    # Presuming 'k' is Kilobytes and 'M' is Megabytes.
    # True division (not floor division) keeps sub-megabyte sizes distinguishable.
    return float(data.replace('k', '')) / 1000
    
# Build the cleaned frame: drop incomplete rows, drop 'Varies with device' rows,
# convert the text columns to numeric/boolean values, then drop unused columns.
no_null_df = df.dropna() # Removes all rows which have at least one NaN value.

# Removes all rows which have a 'Varies with device' entry.
# .copy() makes this an independent frame, so the column writes below never
# target a slice of no_null_df (avoids SettingWithCopyWarning / lost writes).
no_varies_df = no_null_df[
    (no_null_df['Size'] != 'Varies with device')
    & (no_null_df['Current Ver'] != 'Varies with device')
    & (no_null_df['Android Ver'] != 'Varies with device')
].copy()

# Plain column assignment (instead of `.loc[:, col] = ...`) replaces the column,
# so the converted values get a numeric dtype rather than possibly staying
# `object`, which pd.get_dummies would later treat as categorical.
no_varies_df['Size'] = no_varies_df['Size'].map(convertFileSize) # Every size becomes a numerical value.

# Strip every non-digit character ('+' and ',') from Installs and convert to an integer.
# regex=True is explicit because the default of Series.str.replace differs between pandas versions.
no_varies_df['Installs'] = no_varies_df['Installs'].str.replace(r'\D', '', regex=True).astype('int64')

# NOTE: 'Android Ver', 'Genres' and 'Last Updated' are cleaned here but dropped further down.
no_varies_df['Android Ver'] = no_varies_df['Android Ver'].str.replace(' and up', '', regex=False) # Removes the ' and up' suffix.
no_varies_df['Genres'] = no_varies_df['Genres'].str.replace('Education;Education', 'Education', regex=False) # Removes the duplicate education tag.

no_varies_df['Reviews'] = no_varies_df['Reviews'].astype('int64') # Convert reviews to int64
no_varies_df['Last Updated'] = pd.to_datetime(no_varies_df['Last Updated']).astype('int64') // 1e9 # Convert last updated string to a numeric timestamp
no_varies_df['Type'] = no_varies_df['Type'] == 'Paid' # Convert to boolean (True for paid apps)
no_varies_df['Price'] = no_varies_df['Price'].replace(r'[\$,]', '', regex=True).astype('float64') # Strip '$' and ',' then convert Price to float

no_varies_df = no_varies_df.drop(columns=['App', 'Current Ver','Android Ver', 'Genres', 'Last Updated']) # Drop unrelated items

clean_df = no_varies_df # Final data frame to be manipulated.

In [None]:
# Print the column dtypes and non-null counts of the cleaned frame as a sanity check.
clean_df.info()

In [None]:
# Preview the first rows of the cleaned frame.
clean_df.head()

In [None]:
random_state = 42 # seed passed to the randomized steps below (train/test split, PCA) for reproducibility

In [None]:
# Separate the predictors from the regression target ('Rating').
X = clean_df.drop(columns=['Rating'])
y = clean_df['Rating']

In [None]:
# Hold out 25% of the rows for testing; the split is seeded by random_state.
X_train_valid, X_test, y_train_valid, y_test = train_test_split(
    X, y, test_size=0.25, random_state=random_state
)

In [None]:
# Display the training/validation feature frame.
X_train_valid

In [None]:
# One-hot encode the categorical columns of each split separately.
enc_X_train_valid = pd.get_dummies(X_train_valid) # encode categorical variables into binary values
enc_X_test = pd.get_dummies(X_test) # to be used in testing

# Align the test columns with the training columns. A category seen only in
# training becomes an all-zero dummy column in the test set (fill_value=0)
# instead of NaN; a category seen only in the test set is dropped.
enc_X_test = enc_X_test.reindex(columns=enc_X_train_valid.columns, fill_value=0)
enc_X_train_valid

In [None]:
# Standardize the encoded features; the scaler is fitted on the training split only.
std = StandardScaler()
std.fit(enc_X_train_valid)
std_X_train_valid = std.transform(enc_X_train_valid)
# Scale the test split with the fitted scaler, then turn any NaN into 0.
std_X_test = np.nan_to_num(std.transform(enc_X_test))

In [None]:
# PCA without an n_components argument, fitted on the standardized training
# features; its explained_variance_ratio_ is plotted in the following cells.
pca = PCA(random_state=random_state) # seeded PCA estimator
pca_X_train_valid = pca.fit_transform(std_X_train_valid)  # fit on training data and project it
pca_X_test = pca.transform(std_X_test)  # project test data with the fitted components

In [None]:
# Running total of the explained variance ratio across the fitted components.
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.title('Cumulative explained variance')
plt.xlabel('Principal component index (0-based)')
plt.ylabel('Cumulative explained variance ratio')
plt.show()

In [None]:
# Explained variance ratio of each individual fitted component.
plt.plot(pca.explained_variance_ratio_)
plt.title('Explained variance per principal component')
plt.xlabel('Principal component index (0-based)')
plt.ylabel('Explained variance ratio')
plt.show()

In [None]:
# Refit PCA keeping only `dimension` components; this rebinds `pca`,
# `pca_X_train_valid` and `pca_X_test`, replacing any earlier values.
dimension = 40  # number of principal components to keep
pca = PCA(n_components=dimension, random_state=random_state) # seeded PCA limited to `dimension` components
pca_X_train_valid = pca.fit_transform(std_X_train_valid)  # fit on training data and project it
pca_X_test = pca.transform(std_X_test)  # project test data with the fitted components

In [None]:
# Explained variance ratio of each component of the currently fitted PCA.
plt.plot(pca.explained_variance_ratio_)
plt.title('Explained variance per retained principal component')
plt.xlabel('Principal component index (0-based)')
plt.ylabel('Explained variance ratio')
plt.show()

In [None]:
# Display the per-component explained variance ratios of the fitted PCA.
pca.explained_variance_ratio_

In [None]:
# Standardize the target. StandardScaler expects 2-D input, so each Series is
# reshaped into a single-column array first.
y_train_col, y_test_col = (
    series.to_numpy().reshape(-1, 1) for series in (y_train_valid, y_test)
)
std_y = StandardScaler().fit(y_train_col)  # statistics come from the training target only
std_y_train_valid = std_y.transform(y_train_col)
std_y_test = std_y.transform(y_test_col)

In [None]:
from sklearn.linear_model import LinearRegression 
from sklearn import linear_model
import seaborn as sns

# Fit a linear regression on the PCA-reduced training features against the
# standardized ratings, then predict the held-out test set.
reg = LinearRegression()
reg.fit(pca_X_train_valid, std_y_train_valid)
results = reg.predict(pca_X_test)

plt.figure(figsize=(12,7))
# x and y are passed by keyword because recent seaborn releases no longer accept
# them positionally. ravel() flattens the single-column arrays (the target was
# reshaped to (-1, 1) before fitting) into the 1-D vectors regplot works with.
sns.regplot(x=results.ravel(), y=std_y_test.ravel(), color='teal', marker='x')
plt.title('Linear regression')
plt.xlabel('Predicted Ratings')
plt.ylabel('Actual Ratings')
plt.show()

In [None]:
# Coefficient of determination (R^2) of the fitted model on the test split.
r2_score(std_y_test, reg.predict(pca_X_test))