## This Project is to develop a model suitable for predicting the approximate number of installs/downloads of the app.
Features used include:
* Category of App
* Size of App
* Type of App
* Price of App
* Catent_rating of the App

By: `Selorm Komla Darkey`

In [70]:
import pandas as pd
import pickle 
import mlflow
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split

In [71]:
# mlflow details
# set the tracking uri (location to store the tracking data)
mlflow.set_tracking_uri("sqlite:///apps_backend.db")

# create / name the experiment 
mlflow.set_experiment("andriod-apps-ml-exp")


<Experiment: artifact_location='file:///d:/STUDY_RESOURCES_BASE/MLOps/MLOps_Zoomcamp/FROM_CODESPACE/andriod_app_project/mlruns/1', creation_time=1719179488244, experiment_id='1', last_update_time=1719179488244, lifecycle_stage='active', name='andriod-apps-ml-exp', tags={}>

In [72]:
data = r"D:\STUDY_RESOURCES_BASE\MLOps\MLOps_Zoomcamp\FROM_CODESPACE\andriod_app_project\Google_Play_Store_Apps_Analytics.csv"

df_apps = pd.read_csv(data)

In [73]:
# droping columns with NaN Values in the data set.
df_clean = df_apps.drop_duplicates(subset=['App', 'Type', 'Price'])

print(df_clean.head())


                                            App         Category  Rating  \
0                       Ak Parti Yardım Toplama           SOCIAL     NaN   
1                    Ain Arabic Kids Alif Ba ta           FAMILY     NaN   
2  Popsicle Launcher for Android P 9.0 launcher  PERSONALIZATION     NaN   
3                     Command & Conquer: Rivals           FAMILY     NaN   
4                                    CX Network         BUSINESS     NaN   

   Reviews  Size_MBs Installs  Type   Price Content_Rating           Genres  \
0        0       8.7        0  Paid  $13.99           Teen           Social   
1        0      33.0        0  Paid   $2.99       Everyone        Education   
2        0       5.5        0  Paid   $1.49       Everyone  Personalization   
3        0      19.0        0   NaN       0   Everyone 10+         Strategy   
4        0      10.0        0  Free       0       Everyone         Business   

     Last_Updated         Android_Ver  
0   July 28, 2017          4

In [74]:
print(len(df_clean))

9662


In [75]:
df_clean = df_clean.dropna()

print(len(df_clean))

8196


In [76]:
df_clean.columns

Index(['App', 'Category', 'Rating', 'Reviews', 'Size_MBs', 'Installs', 'Type',
       'Price', 'Content_Rating', 'Genres', 'Last_Updated', 'Android_Ver'],
      dtype='object')

In [77]:
# create train and test data
X = df_clean.drop(labels=['App', 'Installs', 'Genres', 'Rating', 'Reviews', 'Last_Updated', 'Android_Ver'], axis=1)
y = df_clean['Installs']


In [78]:
X.head()

Unnamed: 0,Category,Size_MBs,Type,Price,Content_Rating
21,MEDICAL,25.0,Free,0,Everyone
28,GAME,20.0,Paid,$1.49,Everyone
47,GAME,16.0,Paid,$0.99,Everyone
82,GAME,19.0,Free,0,Everyone
99,MEDICAL,4.6,Free,0,Everyone


In [79]:
y.head()

21    1
28    1
47    1
82    5
99    5
Name: Installs, dtype: object

In [80]:
# create train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=42 )


In [81]:
X_train.head()

Unnamed: 0,Category,Size_MBs,Type,Price,Content_Rating
2680,FAMILY,69.0,Free,0,Everyone
9524,FAMILY,5.7,Free,0,Everyone
3752,PARENTING,18.0,Free,0,Everyone
9887,FINANCE,53.0,Free,0,Everyone
6762,FINANCE,6.1,Free,0,Everyone


In [82]:
y_train = y_train.astype(str).str.replace(',','')
y_train = pd.to_numeric(y_train)
y_train

2680        1000
9524    10000000
3752       10000
9887    10000000
6762     1000000
          ...   
5333      100000
6873     1000000
7099     1000000
2087        1000
9395    10000000
Name: Installs, Length: 5737, dtype: int64

In [83]:
# fields to encode 
categorical = ['Category', 'Type', 'Content_Rating']
numerical = ['Size_MBs', 'Price']

In [84]:
X_train[categorical] = X_train[categorical].astype(str)

In [85]:
# pipeline with 2 steps to avoid saving dv separate
pipeline = make_pipeline(
    DictVectorizer(),
    LinearRegression()
)


In [86]:
# start tracking the model in a new run
with mlflow.start_run():
    # add info to log
    mlflow.set_tag("developer", "selorm")

    # tracking dataset (basic approach)
    mlflow.log_param("train_data-path", ".Google_Play_Store_Apps_Analytics.csv")
    
    # training 
    train_dicts = X_train[categorical + numerical].to_dict(orient='records')

    y_train_ = y_train.values

    X_train = pipeline.fit(train_dicts, y_train_)  # use this instead of model.fit

    y_pred = pipeline.predict(train_dicts)

    rmse = mean_squared_error(y_train_, y_pred, squared=False) 
    mlflow.log_metric("rmse", rmse)    # logging the metrics

    with open('./models/numinstalls_lin_reg2.bin', 'wb') as f_out:
        pickle.dump(pipeline, f_out)

    # saving the model as an artifact
    mlflow.log_artifact(local_path="./models/numinstalls_lin_reg2.bin", 
                        artifact_path="models_linreg") # location of the model, and where the artefact would be stored
    
    # return {'loss': rmse, 'status': STATUS_OK}




## Adding a RandomForest Model to compare.

In [98]:
mlflow.set_experiment("random-forest-hyperopt")

<Experiment: artifact_location='file:///d:/STUDY_RESOURCES_BASE/MLOps/MLOps_Zoomcamp/FROM_CODESPACE/andriod_app_project/mlruns/2', creation_time=1719445223758, experiment_id='2', last_update_time=1719445223758, lifecycle_stage='active', name='random-forest-hyperopt', tags={}>

In [99]:
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from hyperopt.pyll import scope
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

In [100]:
# create train and test data
X_train_rf, X_test_rf, y_train_rf, y_test_rf = train_test_split(X, y, train_size=0.7, random_state=42 )

In [101]:
# remove commas and turn to numeric
y_train_rf = y_train_rf.astype(str).str.replace(',','')
y_train_rf = pd.to_numeric(y_train_rf)

y_test_rf = y_test_rf.astype(str).str.replace(',','')
y_test_rf = pd.to_numeric(y_test_rf)

In [102]:
# turn categorical to object
X_train_rf[categorical] = X_train_rf[categorical].astype(str)
X_test_rf[categorical] = X_test_rf[categorical].astype(str)


In [103]:
# creating the training and val datasets and one-hot encode with dv

dv = DictVectorizer()

X_train_dicts = X_train_rf[categorical + numerical].to_dict(orient='records')
X_train_rft = dv.fit_transform(train_dicts)

val_dicts = X_test_rf[categorical + numerical].to_dict(orient='records')
X_val_rft = dv.transform(val_dicts)

y_train_rf = y_train_rf.values

y_val_rf = y_test_rf.values



In [104]:
# hyperparameter searching for randomforest


def run_optimization(X_train, y_train, X_val, y_val, num_trials: int):

    def objective(params):
        with mlflow.start_run():
            # log the parameters
            mlflow.log_params(params)

            rf = RandomForestRegressor(**params)
            rf.fit(X_train, y_train)
            y_pred = rf.predict(X_val)
            rmse = mean_squared_error(y_val, y_pred, squared=False)

            # log the metric with mlflow
            mlflow.log_metric("rmse", rmse)

            return {'loss': rmse, 'status': STATUS_OK}

    search_space = {
        'max_depth': scope.int(hp.quniform('max_depth', 1, 20, 1)),
        'n_estimators': scope.int(hp.quniform('n_estimators', 10, 50, 1)),
        'min_samples_split': scope.int(hp.quniform('min_samples_split', 2, 10, 1)),
        'min_samples_leaf': scope.int(hp.quniform('min_samples_leaf', 1, 4, 1)),
        'random_state': 42
    }

    rstate = np.random.default_rng(42)  # for reproducible results
    fmin(
        fn=objective,
        space=search_space,
        algo=tpe.suggest,
        max_evals=num_trials,
        trials=Trials(),
        rstate=rstate
    )

In [None]:
# run the optimization algorithm
run_optimization(X_train=X_train_rft, 
                 y_train=y_train_rf, 
                 X_val=X_val_rft, 
                 y_val=y_val_rf, 
                 num_trials=50)

In [108]:
# best parameters logged from the parameter search
best_params = {
    "max_depth" : 12,
    "min_samples_leaf": 1,
    "min_samples_split" : 8,
    "n_estimators" : 39,
    "random_state" : 42}

Training a random forest model on the best parameters

In [109]:
# Creating a pipeline with dv and RandomForestRegressor 
pipeline_rf = make_pipeline(
    DictVectorizer(),
    RandomForestRegressor(**best_params, n_jobs=-1)
)

In [110]:
# defining the training function
def train_rf(X_train, y_train, X_val, y_val):
    with mlflow.start_run():

        mlflow.sklearn.autolog()
    
        pipeline.fit(X_train, y_train)  # use this instead of model.fit

        y_pred = pipeline.predict(X_val)

        rmse = mean_squared_error(y_val, y_pred, squared=False)


In [113]:
# train rf model with the best params
train_dicts = X_train_rf[categorical + numerical].to_dict(orient='records')
val_dicts = X_test_rf[categorical + numerical].to_dict(orient='records')

# y_train_rf = y_train_rf.values

# y_val_rf = y_test_rf.values

In [114]:
# training the rf model

train_rf(
    X_train=train_dicts,
    y_train=y_train_rf,
    X_val=val_dicts,
    y_val=y_val_rf,
)





* full path to model:
`file:///d:/STUDY_RESOURCES_BASE/MLOps/MLOps_Zoomcamp/FROM_CODESPACE/andriod_app_project/mlruns/2/f16f087a2f7b4bf8b1b5ec662cda23d4/artifacts/model`

* using the pyfunc inferencing method:

``import mlflow``
``logged_model = 'runs:/f16f087a2f7b4bf8b1b5ec662cda23d4/model'``

* Load model as a PyFuncModel.
``loaded_model = mlflow.pyfunc.load_model(logged_model)``

* Predict on a Pandas DataFrame.
``import pandas as ``pd``
``loaded_model.predict(pd.DataFrame(data))``