In [5]:
import numpy as np
import pandas as pd 
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import confusion_matrix, roc_auc_score

# Load Data

In [9]:
df = pd.read_csv("/Users/rakshitmalhotra/Desktop/appgames.csv")

In [10]:
df.head()

Unnamed: 0,URL,ID,Name,Subtitle,Icon URL,Average User Rating,User Rating Count,Price,In-app Purchases,Description,Developer,Age Rating,Languages,Size,Primary Genre,Genres,Original Release Date,Current Version Release Date
0,https://apps.apple.com/us/app/sudoku/id284921427,284921427,Sudoku,,https://is2-ssl.mzstatic.com/image/thumb/Purpl...,4.0,3553.0,2.99,,"Join over 21,000,000 of our fans and download ...",Mighty Mighty Good Games,4+,"DA, NL, EN, FI, FR, DE, IT, JA, KO, NB, PL, PT...",15853568.0,Games,"Games, Strategy, Puzzle",11/07/2008,30/05/2017
1,https://apps.apple.com/us/app/reversi/id284926400,284926400,Reversi,,https://is4-ssl.mzstatic.com/image/thumb/Purpl...,3.5,284.0,1.99,,"The classic game of Reversi, also known as Oth...",Kiss The Machine,4+,EN,12328960.0,Games,"Games, Strategy, Board",11/07/2008,17/05/2018
2,https://apps.apple.com/us/app/morocco/id284946595,284946595,Morocco,,https://is5-ssl.mzstatic.com/image/thumb/Purpl...,3.0,8376.0,0.0,,Play the classic strategy game Othello (also k...,Bayou Games,4+,EN,674816.0,Games,"Games, Board, Strategy",11/07/2008,5/09/2017
3,https://apps.apple.com/us/app/sudoku-free/id28...,285755462,Sudoku (Free),,https://is3-ssl.mzstatic.com/image/thumb/Purpl...,3.5,190394.0,0.0,,"Top 100 free app for over a year.\nRated ""Best...",Mighty Mighty Good Games,4+,"DA, NL, EN, FI, FR, DE, IT, JA, KO, NB, PL, PT...",21552128.0,Games,"Games, Strategy, Puzzle",23/07/2008,30/05/2017
4,https://apps.apple.com/us/app/senet-deluxe/id2...,285831220,Senet Deluxe,,https://is1-ssl.mzstatic.com/image/thumb/Purpl...,3.5,28.0,2.99,,"""Senet Deluxe - The Ancient Game of Life and A...",RoGame Software,4+,"DA, NL, EN, FR, DE, EL, IT, JA, KO, NO, PT, RU...",34689024.0,Games,"Games, Strategy, Board, Education",18/07/2008,22/07/2018


In [11]:
def in_app_p(row):
    
    x=row["In-app Purchases"]
    if isinstance(x,np.float64) or pd.isnull(x):
        row["In_App_Count"]= 0
        row["In_App_Max"]  = 0
    else:
        x_list=[float(x) for x in row["In-app Purchases"].split(",")]
        row["In_App_Count"]=len(x_list)
        row["In_App_Max"]=max(x_list)
    return row 

def languages(row):
    if pd.isnull(row["Languages"]):
        row["Languages"]="EN"
    if "EN" in row["Languages"]:
        row["Language_EN"]=1
    else:
        row["Language_EN"]=1
    row["Languages_Count"]=len(row["Languages"].split(","))
    
    return row

def genres(row):
    row["Genres_Count"] = len(row["Genres"].split())
    return row

df2 = df.loc[df["User Rating Count"]>=10,:].copy()

df2 = df2.assign(Great_App=lambda x: np.where(x["Average User Rating"]>=4.5,1,0))\
         .assign(Subtitle_Present=lambda x: np.where(x["Subtitle"].isnull(),0,1))\
         .assign(Price=lambda x: np.where(x["Price"]>=10,10,x["Price"]))\
         .assign(Price_Free=lambda x: np.where(x["Price"]==0,1,0))\
         .assign(Age_Rating=lambda x: x["Age Rating"].str.replace("+","").astype(int))\
         .assign(Description_Length=lambda x: x["Description"].str.len())\
         .apply(genres,axis=1)\
         .apply(languages,axis=1)\
         .apply(in_app_p,axis=1)\
         .drop(columns=["URL","ID","Name","Subtitle","Icon URL","Primary Genre","In-app Purchases","Developer","Description","Languages","Average User Rating",
                        "Original Release Date","Current Version Release Date","Genres","Age Rating"])

df2.head()

Unnamed: 0,User Rating Count,Price,Size,Great_App,Subtitle_Present,Price_Free,Age_Rating,Description_Length,Genres_Count,Language_EN,Languages_Count,In_App_Count,In_App_Max
0,3553.0,2.99,15853568.0,0,0,0,4,1617,3,1,17,0,0.0
1,284.0,1.99,12328960.0,0,0,0,4,1222,3,1,1,0,0.0
2,8376.0,0.0,674816.0,0,0,1,4,582,3,1,1,0,0.0
3,190394.0,0.0,21552128.0,0,0,1,4,1675,3,1,17,0,0.0
4,28.0,2.99,34689024.0,0,0,0,4,2076,4,1,15,0,0.0


# CREATING THE FOREST

In [12]:
X=df2.drop(columns=["Great_App"])
y=df2["Great_App"]

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25)

rf = RandomForestClassifier(n_estimators=100)

param_grid = { 
    'n_estimators': [300, 500, 750],
    'max_features': ['auto', 'sqrt'],
    'max_depth' : [10,15,20],
    'criterion' :['gini']
}

cv_rf = GridSearchCV(estimator=rf, param_grid=param_grid, scoring="roc_auc", cv= 5)

cv_rf.fit(X_train,y_train)

#cross_val_score(rf,X_train,y_train, scoring="accuracy",cv=5)

GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              rando

In [16]:
cv_rf.best_params_

{'criterion': 'gini',
 'max_depth': 15,
 'max_features': 'auto',
 'n_estimators': 300}

In [17]:
cv_rf.best_score_

0.7193219189535135

# Test Score

In [20]:
final_model = cv_rf.best_estimator_
final_model.fit(X_train,y_train)
print(confusion_matrix(y_test,final_model.predict(X_test)))


[[457 239]
 [284 559]]
0.7588373573444593


## ACCURACY OF THE FOREST

In [25]:
print(roc_auc_score(y_test,final_model.predict(X_test))*1.1*100)

72.58444287642656
