In [167]:
# Load Required Libraries
%matplotlib inline 
import matplotlib.pyplot as plt
import pandas as pd 
import numpy as np 

import seaborn as sns   # Heatmap plot
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# Algorithm Models to import
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler

# All performance Metrics
from sklearn.metrics import roc_auc_score,roc_curve
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_recall_curve
from imblearn.over_sampling import SMOTE


In [168]:
# Get Error Type
def get_error_type(pred, label):
    # return the type of error: tp,fp,tn,fn
    if pred == label:
        return "tp" if pred == 1 else "tn"
    return "fp" if pred == 1 else "fn"  

# Custom function to get confusion Matrix     
def get_custom_confusionMatrix(predicted, truelabel):
    confusion_mat = np.zeros((2, 2))
    
    for k in range(0,len(predicted)):
        val = get_error_type(predicted[k], truelabel[k])
        if val == "tp":     
            confusion_mat[0][0] = confusion_mat[0][0] + 1
        elif val == "fp":     
            confusion_mat[0][1] = confusion_mat[0][1] + 1
        elif val == "fn":         
            confusion_mat[1][0] = confusion_mat[1][0] + 1
        else:
            confusion_mat[1][1] = confusion_mat[1][1] + 1
            
    return(confusion_mat)             

In [169]:
# Read Dataset csv file 
rawdataset = pd.read_excel('Algodevelop\CleanupAlgoDevData.xlsx', sheetname='Sheet1')

rawdataset.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price
0,Photo_Editor_Candy_Camera_Grid_ScrapBook,ART_AND_DESIGN,4.1,2.201397,19.0,4.0,Free,0.0
1,Coloring_book_moana,ART_AND_DESIGN,3.9,2.985426,14.0,5.69897,Free,0.0
2,U_Launcher_Lite_FREE_Live_Cool_Themes_Hide_Apps,ART_AND_DESIGN,4.7,4.942058,8.7,6.69897,Free,0.0
3,Sketch_Draw_Paint,ART_AND_DESIGN,4.5,5.333737,25.0,7.69897,Free,0.0
4,Pixel_Draw_Number_Art_Coloring_Book,ART_AND_DESIGN,4.3,2.985426,2.8,5.0,Free,0.0


In [170]:
# Handle imbalanced data
subsetdf_Target0 = rawdataset[rawdataset['Type'] == 'Free']
subsetdf_Target1 = rawdataset[rawdataset['Type'] == 'Paid']

# Print Lenghts to determine downsampling
print(subsetdf_Target0.shape)
print(subsetdf_Target1.shape)

itemstodrop = 4400
indextodrop = np.random.permutation(itemstodrop)
subsetdf_Target0 = subsetdf_Target0.drop(subsetdf_Target0.index[indextodrop],axis='rows')
frames = [subsetdf_Target0, subsetdf_Target1]
subsetdf_NormalizedFinal = pd.concat(frames)

(8902, 8)
(734, 8)


In [171]:
subsetdf_NormalizedFinal['Category'].iloc[1:10]

5731                  TOOLS
5732          COMMUNICATION
5733                 FAMILY
5734        PERSONALIZATION
5735          VIDEO_PLAYERS
5736         FOOD_AND_DRINK
5737            PHOTOGRAPHY
5738    BOOKS_AND_REFERENCE
5740                  TOOLS
Name: Category, dtype: object

In [172]:
# Bin Category of "Category"
for i in range(0, len(subsetdf_NormalizedFinal)):
    if ('FAMILY' in subsetdf_NormalizedFinal['Category'].iloc[i]) or  ('GAME' in subsetdf_NormalizedFinal['Category'].iloc[i]) or ('BUSINESS' in subsetdf_NormalizedFinal['Category'].iloc[i]) or ('PERSONAL' in subsetdf_NormalizedFinal['Category'].iloc[i]) or ('MEDICAL' in subsetdf_NormalizedFinal['Category'].iloc[i]) :
        xx = 1
    else:
        subsetdf_NormalizedFinal['Category'].iloc[i] = "All OTHER"

subsetdf_NormalizedFinal.mean()        

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


Rating       4.137024
Reviews      1.479924
Size        18.135017
Installs     3.804699
Price        0.633753
dtype: float64

In [173]:
# Add Categorization Variables here
subsetdf_NormalizedFinal = pd.get_dummies(subsetdf_NormalizedFinal,columns=['Category'])

In [174]:
subsetdf_NormalizedFinal    = subsetdf_NormalizedFinal[subsetdf_NormalizedFinal["Reviews"] > 0]
subsetdf_NormalizedFinal    = subsetdf_NormalizedFinal[subsetdf_NormalizedFinal["Installs"] > 0]
subsetdf_NormalizedFinal.describe()


Unnamed: 0,Rating,Reviews,Size,Installs,Price,Category_All OTHER,Category_BUSINESS,Category_FAMILY,Category_GAME,Category_MEDICAL,Category_PERSONALIZATION
count,4770.0,4770.0,4770.0,4770.0,4770.0,4770.0,4770.0,4770.0,4770.0,4770.0,4770.0
mean,4.134273,2.406055,18.564294,4.070573,0.625449,0.512788,0.050734,0.254717,0.110482,0.032285,0.038994
std,0.572699,1.582758,20.366195,1.626136,2.673604,0.499889,0.219477,0.435748,0.313523,0.176775,0.1936
min,1.0,4.342945e-09,0.014,4.342945e-09,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,4.0,1.113943,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,4.2,2.174639,11.0,4.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
75%,4.5,3.574494,26.0,5.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
max,5.0,7.396217,100.0,8.69897,74.99,1.0,1.0,1.0,1.0,1.0,1.0


In [175]:
# Bin Size Variable
#subsetdf_NormalizedFinal['SizBucket'] = pd.cut(subsetdf_NormalizedFinal['Size'], bins= [0,20,40,80,100], labels=["LightWt","MidWt","Heavy","Bulky"]) 
#subsetdf_NormalizedFinal = pd.get_dummies(subsetdf_NormalizedFinal,columns=['SizBucket'])

# Bin Rating Variable
#subsetdf_NormalizedFinal['RateBucket'] = pd.cut(subsetdf_NormalizedFinal['Rating'], bins= [0.01,2.5,3.5,4.5,5], labels=["POOR","AVERAGE","GOOD","EXCELLENT"])   
#subsetdf_NormalizedFinal = pd.get_dummies(subsetdf_NormalizedFinal,columns=['RateBucket'])

# Bin Review Variable
#subsetdf_NormalizedFinal['ReviewBucket'] = pd.cut(subsetdf_NormalizedFinal['Reviews'], bins= [0,3,5,6,10], labels=["LOW","AVERAGE","HIGH","MATURE"])   
#subsetdf_NormalizedFinal = pd.get_dummies(subsetdf_NormalizedFinal,columns=['ReviewBucket'])

# Bin installs 
#subsetdf_NormalizedFinal['InstallsBucket'] = pd.cut(subsetdf_NormalizedFinal['Installs'], bins= [0,3,5,6,10], labels=["LOW","AVERAGE","HIGH","MATURE"])   
#subsetdf_NormalizedFinal = pd.get_dummies(subsetdf_NormalizedFinal,columns=['InstallsBucket'])



In [176]:
# Create a Binary target Variable
TargetVar = []
for k in range(0,len(subsetdf_NormalizedFinal)):
    tempvar = 1
    if subsetdf_NormalizedFinal.iloc[k].Type == 'Free' :   #Free Apps
        tempvar = 0
    TargetVar.append(tempvar)
    
# Drop Price and App Name
subsetdf_NormalizedFinal = subsetdf_NormalizedFinal.drop(['App','Price','Type'],axis=1)    

# Add Target Variable
target_series = pd.Series(TargetVar)
target_series.index = subsetdf_NormalizedFinal.index
subsetdf_NormalizedFinal['TARGET'] = target_series

In [177]:
# Prepare Training and Testing Split Randomnly
y = subsetdf_NormalizedFinal['TARGET'].values     # Output Target
subsetdf_NormalizedFinal = subsetdf_NormalizedFinal.drop(['TARGET'],axis=1)
train_x, test_x,train_y, test_y = train_test_split(subsetdf_NormalizedFinal,y,test_size = 0.20, random_state  = 42)

In [178]:
sm = SMOTE(random_state=42, ratio = 1.0)
x_train_res, y_train_res = sm.fit_sample(train_x, train_y)

In [179]:
print(subsetdf_NormalizedFinal.shape)

print(train_x.shape)
print(np.sum(train_y))

print(test_y.shape)
print(np.sum(test_y))


print(x_train_res.shape)
print(np.sum(y_train_res))

print(test_y.shape)
print(np.sum(test_y))


(4770, 10)
(3816, 10)
529
(954,)
134
(6574, 10)
3287
(954,)
134


In [180]:
# Try Random Forest Algorithm and Pick the best Parameter based on dev dataset
cs = np.arange(100, 200, 10)

train_accuracy_list = list()
dev_accuracy_list = list()

dev_recall_list = list()
dev_precision_list = list()
dev_f1score_list = list()

for c in cs:
    model = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',n_estimators = c)
    model.fit(x_train_res,y_train_res)
    train_preds = model.predict(x_train_res)
    dev_preds   = model.predict(test_x)

    (train_score, dev_score) = (accuracy_score(y_train_res, train_preds), accuracy_score(test_y, dev_preds))
    
    # Find Recall and Precision Scores  
    train_score_r = recall_score(y_train_res, train_preds, pos_label=1, average="binary")
    train_score_p = precision_score(y_train_res, train_preds, pos_label=1, average="binary")
    
    dev_score_r = recall_score(test_y, dev_preds, pos_label=1, average="binary")
    dev_score_p = precision_score(test_y, dev_preds, pos_label=1, average="binary")
    dev_f1score = f1_score(test_y, dev_preds, pos_label=1, average="binary")
    
    # Collect Train and Dev Accuracy
    train_accuracy_list.append(train_score)
    dev_accuracy_list.append(dev_score)
    
    # Collect dev Precision and Recall 
    dev_recall_list.append(dev_score_r)
    dev_precision_list.append(dev_score_p)
    dev_f1score_list.append(dev_f1score)
    
    print("Dev Recall:", dev_score_r, ", Dev Precision:", dev_score_p, ",param:", c)




Dev Recall: 0.5671641791044776 , Dev Precision: 0.5507246376811594 ,param: 100
Dev Recall: 0.5671641791044776 , Dev Precision: 0.5547445255474452 ,param: 110
Dev Recall: 0.5671641791044776 , Dev Precision: 0.5588235294117647 ,param: 120
Dev Recall: 0.5522388059701493 , Dev Precision: 0.5362318840579711 ,param: 130
Dev Recall: 0.5895522388059702 , Dev Precision: 0.5642857142857143 ,param: 140
Dev Recall: 0.582089552238806 , Dev Precision: 0.5531914893617021 ,param: 150
Dev Recall: 0.5522388059701493 , Dev Precision: 0.5323741007194245 ,param: 160
Dev Recall: 0.5597014925373134 , Dev Precision: 0.5357142857142857 ,param: 170
Dev Recall: 0.5597014925373134 , Dev Precision: 0.5514705882352942 ,param: 180
Dev Recall: 0.5671641791044776 , Dev Precision: 0.5547445255474452 ,param: 190


In [181]:
clf = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy', n_estimators =150)
clf = clf.fit(x_train_res, y_train_res)

In [182]:
# Run the Model with default settings
train_preds = clf.predict(x_train_res)
test_preds  = clf.predict(test_x)

In [183]:
# Find Recall and Precision Scores  
train_score_r = recall_score(y_train_res, train_preds, pos_label=1, average="binary")
train_score_p = precision_score(y_train_res, train_preds, pos_label=1, average="binary")
    
dev_score_r = recall_score(test_y, test_preds, pos_label=1, average="binary")
dev_score_p = precision_score(test_y, test_preds, pos_label=1, average="binary")
    
#print("Train Accuracy:", train_score, ", Dev Accuracy:", dev_score)
print("Train Recall:", train_score_r, ", Train Precision:", train_score_p)
print("Dev Recall:", dev_score_r, ", Dev Precision:", dev_score_p)

#print("AUC Score")
#print (roc_auc_score(test_y,dev_preds_GBM_Proba[:,1]))
#AUC = roc_auc_score(test_y,dev_preds_GBM_Proba[:,1])
#print(AUC)


# Print Confusion Matrix for Performance Analysis
cfmatrix = get_custom_confusionMatrix(test_preds,test_y.ravel())
print(cfmatrix)
print("tp:", int(cfmatrix[0][0]), ", fp:", int(cfmatrix[0][1]),", fn:", int(cfmatrix[1][0]),", tn:", int(cfmatrix[1][1]))

Train Recall: 1.0 , Train Precision: 1.0
Dev Recall: 0.5746268656716418 , Dev Precision: 0.5539568345323741
[[ 77.  62.]
 [ 57. 758.]]
tp: 77 , fp: 62 , fn: 57 , tn: 758


In [184]:
from sklearn.naive_bayes import GaussianNB


gnb = GaussianNB()
gnb.fit(x_train_res, y_train_res)

# Run the Model with default settings
train_preds = gnb.predict(x_train_res)
test_preds  = gnb.predict(test_x)


In [185]:
# Find Recall and Precision Scores  
train_score_r = recall_score(y_train_res, train_preds, pos_label=1, average="binary")
train_score_p = precision_score(y_train_res, train_preds, pos_label=1, average="binary")
    
dev_score_r = recall_score(test_y, test_preds, pos_label=1, average="binary")
dev_score_p = precision_score(test_y, test_preds, pos_label=1, average="binary")
    
#print("Train Accuracy:", train_score, ", Dev Accuracy:", dev_score)
print("Train Recall:", train_score_r, ", Train Precision:", train_score_p)
print("Dev Recall:", dev_score_r, ", Dev Precision:", dev_score_p)

#print("AUC Score")
#print (roc_auc_score(test_y,dev_preds_GBM_Proba[:,1]))
#AUC = roc_auc_score(test_y,dev_preds_GBM_Proba[:,1])
#print(AUC)


# Print Confusion Matrix for Performance Analysis
cfmatrix = get_custom_confusionMatrix(test_preds,test_y.ravel())
print(cfmatrix)
print("tp:", int(cfmatrix[0][0]), ", fp:", int(cfmatrix[0][1]),", fn:", int(cfmatrix[1][0]),", tn:", int(cfmatrix[1][1]))

Train Recall: 0.6586553087922118 , Train Precision: 0.6288120824862039
Dev Recall: 0.5149253731343284 , Dev Precision: 0.1751269035532995
[[ 69. 325.]
 [ 65. 495.]]
tp: 69 , fp: 325 , fn: 65 , tn: 495
