<a href="https://colab.research.google.com/github/sie21/ExamenBigData/blob/master/score_lowcost_cor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os

# Best-effort removal of stale lock files under metastore_db/ (presumably left
# behind by a previous Spark session -- TODO confirm). Each file is handled on
# its own so that a missing first lock does not skip the second one.
for lock_file in ("metastore_db/db.lck", "metastore_db/dbex.lck"):
    try:
        os.remove(lock_file)
    except OSError:
        # Missing or non-removable lock: nothing more to do here.
        pass


from pyspark.sql import SparkSession

def build_spark_session(app_name, memory='4g', executors=4):
    """Return a SparkSession named ``app_name`` (via ``getOrCreate``).

    Args:
        app_name: Application name given to the session builder.
        memory: Value set for ``spark.executor.memory``.
        executors: Value set for ``spark.executor.instances``.

    Returns:
        The object returned by ``SparkSession.builder...getOrCreate()``.
    """
    builder = SparkSession.builder.appName(app_name)
    executor_settings = (
        ('spark.executor.memory', memory),
        ('spark.executor.instances', executors),
    )
    for key, value in executor_settings:
        builder = builder.config(key, value)
    return builder.getOrCreate()

# Notebook-wide Spark session, built with the default memory/executor settings.
spark_session = build_spark_session(app_name='ok-google')

from pyspark.sql import functions as f



#### 1 - lire les fichiers de données

In [None]:
# Load the five source tables; the first line of each CSV is used as header.
# No schema is supplied here, which is why the columns are cast explicitly
# further down in the notebook.
perimetre = spark_session.read.csv("data_clients/sample_perimetre.csv", header=True)
histo_client_raw = spark_session.read.csv("data_clients/sample_histo_client.csv", header=True)
histo_train_raw = spark_session.read.csv("data_clients/sample_histo_train.csv", header=True)
histo_lowcost_raw = spark_session.read.csv("data_clients/sample_histo_lowcost.csv", header=True)
visites_raw = spark_session.read.csv("data_clients/sample_visites.csv", header=True)

#### 2 - identifier les variables continues et transformer leurs modalités en double.

In [None]:
# Row count of the perimeter table, then a preview of its first five rows.
print("Nombre d'individu : {}".format(perimetre.count()))
perimetre.show(5)

Nombre d'individu : 1084217
+--------------------+
|           ID_CLIENT|
+--------------------+
|0023d2b0a410eb572...|
|0026decd53a30d9b3...|
|002f0b8e5d2236008...|
|00352dc1e7e43436f...|
|005a10c0d3a94096c...|
+--------------------+
only showing top 5 rows



Ce dataframe n'est composé que des identifiants des clients.

In [None]:
# List the column names of the client history table.
histo_client_raw.columns

['ID_CLIENT',
 'anciennete',
 'recence_cmd',
 'AGE',
 'LBL_STATUT_CLT',
 'LBL_GEO_AIR',
 'LBL_GRP_SEGMENT_NL',
 'LBL_SEG_COMPORTEMENTAL',
 'LBL_GEO_TRAIN',
 'LBL_SEGMENT_ANTICIPATION',
 'FLG_CMD_CARTE_1225']

In [None]:
# Preview ten rows of the three numeric-looking client columns.
histo_client_raw.select('AGE','anciennete','recence_cmd').show(10)

+----+----------+-----------+
| AGE|anciennete|recence_cmd|
+----+----------+-----------+
|null|      1550|         36|
|35.0|      1667|         25|
|25.0|       395|         15|
|31.0|      2188|         20|
|32.0|      3005|         15|
|25.0|      2094|          6|
|21.0|       153|         31|
|25.0|      1329|         33|
|20.0|      1236|          3|
|null|      3591|         13|
+----+----------+-----------+
only showing top 10 rows



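Les trois colonnes affichées (AGE, anciennete, recence_cmd) sont continues ; les autres colonnes de ce dataframe (LBL_*, FLG_CMD_CARTE_1225) sont qualitatives.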

In [None]:
# List the column names of the train history table.
histo_train_raw.columns

['ID_CLIENT',
 'nb_od',
 'mean_nb_passagers',
 'mean_duree_voyage',
 'mean_mt_voyage',
 'mean_tarif_loisir',
 'mean_classe_1',
 'mean_pointe',
 'mean_depart_we']

Toutes les colonnes constituant ce dataframe (hors ID_CLIENT) sont également continues.

In [None]:
# Preview the first five rows of the low-cost history table.
histo_lowcost_raw.show(5)

+--------------------+---------------+--------------------+------------+
|           ID_CLIENT|flg_cmd_lowcost|flg_track_nl_lowcost|flg_track_nl|
+--------------------+---------------+--------------------+------------+
|003fb9dca8de37438...|              1|                   0|           1|
|0225a0a30f58ab70d...|              1|                   0|           0|
|024110078fb4581a7...|              1|                   0|           1|
|028fd9538c6857cad...|              1|                   0|           0|
|04a57657f50047cf8...|              1|                   0|           0|
+--------------------+---------------+--------------------+------------+
only showing top 5 rows



In [None]:
# Row count per value of flg_cmd_lowcost in the low-cost table.
histo_lowcost_raw.groupby('flg_cmd_lowcost').count().show()

+---------------+------+
|flg_cmd_lowcost| count|
+---------------+------+
|              1|104306|
+---------------+------+



In [None]:
# Row count per value of flg_track_nl_lowcost in the low-cost table.
histo_lowcost_raw.groupby('flg_track_nl_lowcost').count().show()

+--------------------+-----+
|flg_track_nl_lowcost|count|
+--------------------+-----+
|                   0|90641|
|                   1|13665|
+--------------------+-----+



In [None]:
# Row count per value of flg_track_nl in the low-cost table.
histo_lowcost_raw.groupby('flg_track_nl').count().show()

+------------+-----+
|flg_track_nl|count|
+------------+-----+
|           0|36443|
|           1|67863|
+------------+-----+



Les colonnes de ce dataframe (hors ID_CLIENT) sont des indicateurs binaires (0/1) ; ce sont donc des variables qualitatives.

In [None]:
# Preview the first five rows of the visits table.
visites_raw.show(5)

+--------------------+---------------------+-------------------+
|           ID_CLIENT|days_since_last_visit|      tx_conversion|
+--------------------+---------------------+-------------------+
|000843db32fbaecfb...|                    8| 0.1111111111111111|
|001338752ea32d9de...|                    3|0.13043478260869565|
|003fb9dca8de37438...|                   15|                1.0|
|004efa6652e570ef6...|                   17|              0.125|
|005dd0b718a8f4598...|                   15| 0.3333333333333333|
+--------------------+---------------------+-------------------+
only showing top 5 rows



Les colonnes de ce dataframe représentent des variables continues

In [None]:
## ecrire une fonction pour transformer les features quantitatives ("anciennete", "recence_cmd", "AGE", etc..) en float

In [None]:
def cast_columns_of_df(df, cols_to_cast, col_to_keep, cast_type='double'):
    """Select ``col_to_keep`` as-is plus ``cols_to_cast`` cast to ``cast_type``.

    Any name in ``cols_to_cast`` that contains 'ID_CLIENT' is skipped, so the
    full column list of a table can be passed without casting the identifier.

    Args:
        df: Object supporting ``df[name].cast(type)`` and ``df.select(list)``.
        cols_to_cast: Names of the columns to cast.
        col_to_keep: List of columns selected without modification (placed first).
        cast_type: Target type handed to ``cast`` (default 'double').

    Returns:
        The result of ``df.select`` on the kept columns followed by the cast ones.
    """
    casted = []
    for feature in cols_to_cast:
        if 'ID_CLIENT' in feature:
            continue
        casted.append(df[feature].cast(cast_type))
    return df.select(col_to_keep + casted)

In [None]:
# Client columns carried through both passes without any cast.
client_label_cols = ["ID_CLIENT", 'LBL_STATUT_CLT','LBL_GEO_AIR',
                     'LBL_SEG_COMPORTEMENTAL','LBL_GEO_TRAIN','LBL_GRP_SEGMENT_NL',
                     'LBL_SEGMENT_ANTICIPATION','FLG_CMD_CARTE_1225']

# Pass 1: anciennete and AGE become int; recence_cmd is kept untouched.
client_cols_to_keep = client_label_cols + ['recence_cmd']
histo_client = cast_columns_of_df(histo_client_raw,
                                  ["anciennete","AGE"],
                                  client_cols_to_keep,
                                  cast_type='int')

# Pass 2: recence_cmd becomes double; the two int columns are kept as they are.
client_cols_to_keep = client_label_cols + ['anciennete','AGE']
histo_client = cast_columns_of_df(histo_client,
                                  ["recence_cmd"],
                                  client_cols_to_keep,
                                  cast_type='double')

In [None]:

# Pass 1: cast every train-history column to double (the helper itself skips
# ID_CLIENT, which is only kept).
histo_train = cast_columns_of_df(histo_train_raw, histo_train_raw.columns,
                                 ["ID_CLIENT"], cast_type='double')

train_to_keep = ["ID_CLIENT","mean_nb_passagers", "mean_duree_voyage", "mean_mt_voyage", 
                 "mean_tarif_loisir" ,"mean_classe_1" ,"mean_pointe", "mean_depart_we"]

# Pass 2: nb_od is re-cast to int; the other columns keep their double type.
histo_train = cast_columns_of_df(histo_train, ["nb_od"],
                                 train_to_keep, cast_type = 'int')

In [None]:
# Cast every low-cost flag to int (the helper skips ID_CLIENT, which is only kept).
histo_lowcost = cast_columns_of_df(histo_lowcost_raw, histo_lowcost_raw.columns,
                                 ["ID_CLIENT"], cast_type='int')


In [None]:
# Pass 1: days_since_last_visit becomes int; tx_conversion is kept untouched.
visites = cast_columns_of_df(visites_raw, ["days_since_last_visit"],
                             ["ID_CLIENT","tx_conversion"],
                             cast_type='int')

# Pass 2: tx_conversion becomes double. This pass must start from the result
# of pass 1 (`visites`), not from `visites_raw`; otherwise the int cast above
# is silently discarded and days_since_last_visit stays a string column.
visites = cast_columns_of_df(visites, ["tx_conversion"],
                             ["ID_CLIENT","days_since_last_visit"],
                             cast_type='double')

Faire une jointure entre les informations des différentes tables.
NB : on conservera tous les clients de la table perimetre.
    En effet, ce sont les clients qu'on souhaite scorer.

In [None]:
# Left-join every source table onto the perimeter so that each client of the
# perimeter is kept even when a table has no row for it. Parentheses replace
# the backslash continuations (the last one was dangling after the final join).
df = (
    perimetre
    .join(histo_client, on = 'ID_CLIENT', how = 'left_outer')
    .join(histo_train, on = 'ID_CLIENT', how = 'left_outer')
    .join(histo_lowcost, on = 'ID_CLIENT', how = 'left_outer')
    .join(visites, on = 'ID_CLIENT', how = 'left_outer')
)

Combien a-t-on de features quantitatives et qualitatives ?

In [None]:
# Count the qualitative and quantitative columns of a DataFrame.
def count_types_col(df, ignored_col):
    """Count the quantitative and qualitative columns of ``df``.

    A column is qualitative when its dtype is 'string' and quantitative
    otherwise. Columns listed in ``ignored_col`` are skipped entirely: they
    are counted in neither group (they used to be counted as quantitative).

    Args:
        df: Object exposing ``dtypes`` as a list of ``(name, dtype)`` pairs.
        ignored_col: Collection of column names to leave out of the count.

    Returns:
        A ``(quanti, quali)`` tuple.
    """
    quali = 0
    quanti = 0
    # Read all dtypes once instead of issuing one select() per column.
    for name, dtype in df.dtypes:
        if name in ignored_col:
            continue
        if dtype == 'string':
            quali += 1
        else:
            quanti += 1
    return quanti,quali

# count_types_col returns (quantitative, qualitative), hence the index order below.
compt = count_types_col(df,['ID_CLIENT'])
print("Variables qualitatives : {}".format(compt[1]))
print("Variables quantitatives : {}".format(compt[0]))

Variables qualitatives : 8
Variables quantitatives : 16


Quelles sont les différentes modalités de la feature LBL_STATUT_CLT ?

In [None]:
# Distinct values (modalities) of LBL_STATUT_CLT, collected to the driver.
df.select('LBL_STATUT_CLT').distinct().collect()

[Row(LBL_STATUT_CLT='Moyen moins'),
 Row(LBL_STATUT_CLT='Non present dans la base a cette date'),
 Row(LBL_STATUT_CLT='Nouveau prospect'),
 Row(LBL_STATUT_CLT='Prospect'),
 Row(LBL_STATUT_CLT='Tres petit'),
 Row(LBL_STATUT_CLT=None),
 Row(LBL_STATUT_CLT='Petit'),
 Row(LBL_STATUT_CLT='Inactif'),
 Row(LBL_STATUT_CLT='Nouveau actif'),
 Row(LBL_STATUT_CLT='Grand'),
 Row(LBL_STATUT_CLT='Tres grand'),
 Row(LBL_STATUT_CLT='Moyen plus')]

quelles sont les features avec valeurs manquantes
remplacer les valeurs manquantes par -1 pour toutes les features

In [None]:
# Count the missing (null) values of every column.
def count_nan_values(df):
    """Return a pandas DataFrame with, for each column of ``df``, its number of nulls.

    Despite the function name, the check is ``isNull`` only: NaN values are
    not counted.
    """
    # Aliased so the Spark aggregate does not shadow the builtin `sum`.
    from pyspark.sql.functions import col as spark_col, sum as spark_sum

    null_counters = [
        spark_sum(spark_col(name).isNull().cast('int')).alias(name)
        for name in df.columns
    ]
    return df.select(*null_counters).toPandas()
    
# Null count per column of the joined table.
count_nan_values(df)

Unnamed: 0,ID_CLIENT,LBL_STATUT_CLT,LBL_GEO_AIR,LBL_SEG_COMPORTEMENTAL,LBL_GEO_TRAIN,LBL_GRP_SEGMENT_NL,LBL_SEGMENT_ANTICIPATION,FLG_CMD_CARTE_1225,anciennete,AGE,...,mean_tarif_loisir,mean_classe_1,mean_pointe,mean_depart_we,nb_od,flg_cmd_lowcost,flg_track_nl_lowcost,flg_track_nl,days_since_last_visit,tx_conversion
0,0,78998,162977,155160,163010,79522,157822,10283,55,169311,...,62370,49927,49927,49927,49927,979911,979911,979911,64142,64142


In [None]:
# Replace missing values: '-1' for qualitative columns, the column mean for
# quantitative ones.
def replace_missing_val(df,qualifies_columns, continuous_columns):
    """Return ``df`` with nulls filled, keeping only the listed columns.

    Args:
        df: Spark DataFrame to impute.
        qualifies_columns: Qualitative columns; nulls become the string '-1'.
        continuous_columns: Quantitative columns; nulls become the column mean.

    Returns:
        A DataFrame holding the qualitative columns followed by the continuous
        ones (in the order given), each under its original name.
    """
    qualifies_columns = list(qualifies_columns)
    continuous_columns = list(continuous_columns)

    # Compute every mean in a single aggregation (one Spark job) instead of
    # running one collect() per continuous column.
    if continuous_columns:
        mean_row = df.agg(*[f.mean(df[feature])
                            for feature in continuous_columns]).collect()[0]
        means = dict(zip(continuous_columns, mean_row))
    else:
        means = {}

    filled_quali = [f.when(df[feature].isNotNull(), df[feature])
                     .otherwise('-1').alias(feature)
                    for feature in qualifies_columns]
    filled_quanti = [f.when(df[feature].isNotNull(), df[feature])
                      .otherwise(means[feature]).alias(feature)
                     for feature in continuous_columns]
    return df.select(filled_quali + filled_quanti)

In [None]:
#def replace_missing_val2(df,qualifies_columns, continuous_columns):
#    dict_mean = {feat: df.select(f.mean(feat)).collect()[0][0] 
#                 for feat in continuous_columns}
#    return df.select([f.when(df[feature].isNotNull(), df[feature])\
#                      .otherwise('-1').alias(feature) for feature in qualifies_columns]\
#                     +[f.when(df[feature].isNotNull(), df[feature])\
#                       .otherwise(dict_mean[feature]).alias(feature) 
#                       for feature in continuous_columns])

In [None]:
qualifies_columns = ["ID_CLIENT", 'LBL_STATUT_CLT','LBL_GEO_AIR',
            'LBL_SEG_COMPORTEMENTAL','LBL_GEO_TRAIN','LBL_GRP_SEGMENT_NL',
            'LBL_SEGMENT_ANTICIPATION','FLG_CMD_CARTE_1225','flg_cmd_lowcost']

# Every other column is treated as continuous. The list is built in the
# DataFrame's own column order: a set difference would give an arbitrary
# order that can change from one run to the next.
# NOTE(review): flg_track_nl_lowcost and flg_track_nl are not listed above, so
# these binary flags are imputed with their mean -- confirm this is intended.
continuous_columns = [name for name in df.columns if name not in qualifies_columns]


df = replace_missing_val(df, qualifies_columns,continuous_columns)

In [None]:
def input_df(df):
    """Recode the raw columns of ``df`` into the features used for modelling.

    * Categorical columns keep a value only when it belongs to an explicit
      list of known modalities; any other value becomes 'na'.
    * Numeric columns keep a value only when it passes a validity check; any
      other value becomes -1.
    * Flags are recoded to the strings '1' / '0'.

    The known modalities of LBL_STATUT_CLT are now listed as 'Prospect' and
    'Petit'; they used to carry a leading space (' Prospect', ' Petit') and
    therefore never matched, sending those clients to 'na'.

    Args:
        df: Spark DataFrame exposing the raw columns referenced below.

    Returns:
        A DataFrame with ID_CLIENT followed by the recoded columns, in the
        order in which they are listed in the ``select`` below.
    """
    def _known_or_na(source, allowed, alias):
        # Keep the value when it is one of `allowed`, otherwise use 'na'.
        column = df[source]
        return f.when(column.isin(allowed), column).otherwise('na').alias(alias)

    def _valid_or_minus_one(source, is_valid, alias):
        # Keep the value when `is_valid(column)` holds, otherwise use -1.
        column = df[source]
        return f.when(is_valid(column), column).otherwise(-1).alias(alias)

    def _flag(condition, alias):
        # Recode a boolean condition to the strings '1' / '0'.
        return f.when(condition, '1').otherwise('0').alias(alias)

    def _non_negative(column):
        return column >= 0

    def _positive(column):
        return column > 0

    ds = df.select(
        'ID_CLIENT',
        _known_or_na('LBL_GEO_TRAIN',
                     ['Toulouse', 'Lille', 'Dijon',
                      'Lyon', 'Marseille', 'Paris',
                      'Nice', 'Limoges','Rouen','Rennes',
                      'Montpellier', 'Bordeaux', 'Metz',
                      'Strasbourg'],
                     'geo_train'),
        _known_or_na('LBL_GEO_AIR',
                     ['Aéroports de Paris Orly',
                      'Aéroport de Bâle-Mulhouse / Bassel',
                      'Aéroport Lille Lesquin', 'Aéroport de Rennes',
                      'Aéroport de Nantes Atlantique',
                      'Aéroport de Marseille Provence  (MRS)',
                      'Aéroport de Bordeaux Mérignac',
                      'Aéroports de Paris Roissy-Charles-de Gaulle',
                      "Aéroport de Nice Côte d'Azur",
                      'Aéroport de Strasbourg',
                      'Aéroport de Lyon - Saint Exupéry',
                      'Aéroport de Toulouse Blagnac'],
                     'geo_air'),
        _flag(df['FLG_CMD_CARTE_1225'] == '1', 'cc_jeunes'),
        _known_or_na('LBL_STATUT_CLT',
                     ['Tres grand', 'Nouveau actif',
                      'Moyen moins', 'Prospect', 'Petit',
                      'Inactif', 'Tres petit',
                      'Nouveau prospect', 'Moyen plus',
                      'Grand'],
                     'segt_rfm'),
        _known_or_na('LBL_SEGMENT_ANTICIPATION',
                     ['Peu Anticipateur', 'Tres Anticipateur',
                      'Anticipateur', 'Mixte', 'Non Anticipateur',
                      'Non Defini'],
                     'segt_anticipation'),
        _known_or_na('LBL_SEG_COMPORTEMENTAL',
                     ['Mono-commande',
                      'Comportement Pro',
                      'Exclusifs Agence',
                      'Anticipateurs Methodiques',
                      'Chasseurs Bons Plans',
                      'Rythmes scolaires', 'Nouveaux',
                      'Sans contraintes'],
                     'segt_comportemental'),
        _known_or_na('LBL_GRP_SEGMENT_NL',
                     ['Endormi', 'Spectateur', 'Acteur',
                      'Eteint', 'Non defini'],
                     'segt_nl'),
        _valid_or_minus_one('AGE', lambda c: (c > 0) & (c < 100), 'age'),
        _valid_or_minus_one('recence_cmd', _non_negative, 'recence_cmd'),
        _valid_or_minus_one('mean_duree_voyage',
                            lambda c: (c > 0) & (c < 750), 'mean_duree_voyage'),
        _valid_or_minus_one('days_since_last_visit', _non_negative, 'recence_visite'),
        _valid_or_minus_one('mean_mt_voyage', _positive, 'mean_mt_voyage'),
        _valid_or_minus_one('anciennete', _non_negative, 'anciennete'),
        _valid_or_minus_one('nb_od', _positive, 'nb_od'),
        _valid_or_minus_one('mean_nb_passagers', _positive, 'mean_nb_passagers'),
        _valid_or_minus_one('mean_tarif_loisir', _non_negative, 'mean_tarif_loisir'),
        _valid_or_minus_one('mean_classe_1', _non_negative, 'mean_classe_1'),
        _valid_or_minus_one('mean_pointe', _non_negative, 'mean_pointe'),
        _valid_or_minus_one('mean_depart_we', _non_negative, 'mean_depart_we'),
        _valid_or_minus_one('tx_conversion', _non_negative, 'tx_conversion'),
        _flag(df['flg_cmd_lowcost'] == 1, 'flg_cmd_lowcost'),
        _flag(df['flg_track_nl_lowcost'] == 1, 'flg_track_nl_lowcost'),
        _flag(df['flg_track_nl'] == 1, 'flg_track_nl'))

    return ds
# Recoded feature table built from the imputed join.
df1 = input_df(df)

Quelles sont les differentes valeurs de notre label : flg_cmd_lowcost

In [None]:
# Row count per value of the label after recoding.
df1.groupby('flg_cmd_lowcost').count().show()

+---------------+------+
|flg_cmd_lowcost| count|
+---------------+------+
|              0|979911|
|              1|104306|
+---------------+------+



In [None]:
from pyspark.ml.feature import StringIndexer, VectorAssembler, VectorIndexer
from pyspark.ml.classification import RandomForestClassifier, LogisticRegression


##### features engineering et modélisation

In [None]:

def preprocessed_df(df, label="flg_cmd_lowcostIndex", id_col='ID_CLIENT',
                    max_categories=10, excluded_cols=None):
    """Index the string columns and assemble the feature vectors.

    Every string column except ``id_col`` is replaced by a ``<name>Index``
    column produced by a StringIndexer. All remaining columns other than
    ``id_col``, ``label`` and ``excluded_cols`` are assembled into
    ``features``, and a VectorIndexer adds ``indexedFeatures``.

    NOTE(review): by default the feature vector contains every non-id,
    non-label column. In this notebook that includes flg_track_nlIndex and
    flg_track_nl_lowcostIndex, which come from the same table as the label --
    this looks like target leakage; consider passing them in ``excluded_cols``.

    Args:
        df: Spark DataFrame to preprocess.
        label: Name of the label column *after* indexing.
        id_col: Identifier column, never indexed nor used as a feature.
        max_categories: ``maxCategories`` handed to the VectorIndexer.
        excluded_cols: Optional column names (as they appear after indexing)
            to leave out of the feature vector.

    Returns:
        ``(indexed, assembled)``: the DataFrame with ``features`` and
        ``indexedFeatures``, and the DataFrame with ``features`` only.

    Raises:
        ValueError: if ``label`` is not a column once indexing is done.
    """
    excluded = set(excluded_cols or ())

    # Filtering on the name avoids failing when id_col is not a string column.
    str_cols = [name for name, dtype in df.dtypes
                if dtype == 'string' and name != id_col]

    for c in str_cols:
        stringIndexer = StringIndexer(inputCol=c, outputCol=c+"Index")
        df = stringIndexer.fit(df).transform(df).drop(c)

    if label not in df.columns:
        raise ValueError("label column {!r} not found after indexing".format(label))

    input_cols = [c for c in df.columns
                  if c not in (id_col, label) and c not in excluded]

    assembler = VectorAssembler(inputCols=input_cols,
                            outputCol="features")
    df = assembler.transform(df)

    featureIndexer = VectorIndexer(inputCol="features",
                   outputCol="indexedFeatures",
                   maxCategories=max_categories).fit(df)
    return featureIndexer.transform(df), df


# `data` carries features + indexedFeatures, `dff` carries features only.
data, dff = preprocessed_df(df1)



In [None]:
# Inspect one fully preprocessed row.
data.take(1)

[Row(ID_CLIENT='000843db32fbaecfbb047ca0bb04b1f9f4d9425a', age=36.77269796022761, recence_cmd=36.0, mean_duree_voyage=274.6666666666667, mean_mt_voyage=58.666666666666664, anciennete=1550.0, nb_od=1.0, mean_nb_passagers=1.0, mean_tarif_loisir=0.0, mean_classe_1=0.0, mean_pointe=0.0, mean_depart_we=0.0, tx_conversion=0.1111111111111111, geo_trainIndex=0.0, geo_airIndex=2.0, cc_jeunesIndex=0.0, segt_rfmIndex=2.0, segt_anticipationIndex=4.0, segt_comportementalIndex=6.0, segt_nlIndex=1.0, recence_visiteIndex=7.0, flg_cmd_lowcostIndex=0.0, flg_track_nl_lowcostIndex=0.0, flg_track_nlIndex=0.0, features=DenseVector([36.7727, 36.0, 274.6667, 58.6667, 1550.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.1111, 0.0, 2.0, 0.0, 2.0, 4.0, 6.0, 1.0, 7.0, 0.0, 0.0]), indexedFeatures=DenseVector([36.7727, 36.0, 274.6667, 58.6667, 1550.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.1111, 0.0, 2.0, 0.0, 2.0, 4.0, 6.0, 1.0, 7.0, 0.0, 0.0]))]

In [None]:
# Inspect one row of the assembled (not vector-indexed) table.
dff.take(1)

[Row(ID_CLIENT='000843db32fbaecfbb047ca0bb04b1f9f4d9425a', age=36.77269796022761, recence_cmd=36.0, mean_duree_voyage=274.6666666666667, mean_mt_voyage=58.666666666666664, anciennete=1550.0, nb_od=1.0, mean_nb_passagers=1.0, mean_tarif_loisir=0.0, mean_classe_1=0.0, mean_pointe=0.0, mean_depart_we=0.0, tx_conversion=0.1111111111111111, geo_trainIndex=0.0, geo_airIndex=2.0, cc_jeunesIndex=0.0, segt_rfmIndex=2.0, segt_anticipationIndex=4.0, segt_comportementalIndex=6.0, segt_nlIndex=1.0, recence_visiteIndex=7.0, flg_cmd_lowcostIndex=0.0, flg_track_nl_lowcostIndex=0.0, flg_track_nlIndex=0.0, features=DenseVector([36.7727, 36.0, 274.6667, 58.6667, 1550.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.1111, 0.0, 2.0, 0.0, 2.0, 4.0, 6.0, 1.0, 7.0, 0.0, 0.0]))]

In [None]:
# Stratified sample on the label: fraction 0.1 for class 0 and 0.2 for class 1, fixed seed.
df_sample = data.sampleBy("flg_cmd_lowcostIndex",{0:0.1,1:0.2}, seed = 2)

In [None]:
# Class balance of the sample.
df_sample.groupby('flg_cmd_lowcostIndex').count().show()

+--------------------+-----+
|flg_cmd_lowcostIndex|count|
+--------------------+-----+
|                 0.0|98072|
|                 1.0|20922|
+--------------------+-----+



In [None]:
def compute_model(df,model,labelcol,featurescol,weightSplit,**kwargs):
    """Train a classifier on a random split of ``df`` and score the held-out part.

    Args:
        df: Spark DataFrame holding ``labelcol`` and ``featurescol``.
        model: 'lr' (logistic regression) or 'rf' (random forest).
        labelcol: Name of the label column.
        featurescol: Name of the feature-vector column.
        weightSplit: Share of ``df`` used as test set, strictly between 0 and 1.
        **kwargs: Optional hyper-parameters.
            * both models: ``seed`` (default 2) for the train/test split;
            * 'lr': ``elasticNetParam`` (default 0.5), ``regParam`` (default 0.0);
            * 'rf': ``numTrees`` (20), ``maxDepth`` (5),
              ``minInstancesPerNode`` (1), ``featureSubsetStrategy`` ('auto'),
              ``maxBins`` (63).

    Returns:
        dict with keys 'AUC', 'accuracy', 'confu_matrix', 'precision' and
        'recall' (the last two for label 1), computed on the test split.

    Raises:
        ValueError: if ``model`` is neither 'lr' nor 'rf', or if
            ``weightSplit`` is not strictly between 0 and 1.
    """
    # Validate the cheap arguments first: an unknown model used to print a
    # message and then fail with an UnboundLocalError on `return perf`.
    if model not in ('lr', 'rf'):
        raise ValueError(
            "Choix indisponible. Veuillez choisir 'lr' ou 'rf' (recu : {!r}).".format(model))
    if not 0 < weightSplit < 1:
        raise ValueError(
            "weightSplit must be strictly between 0 and 1, got {!r}".format(weightSplit))

    from pyspark.ml.classification import RandomForestClassifier, LogisticRegression
    from pyspark.mllib.evaluation import MulticlassMetrics
    from pyspark.ml.evaluation import BinaryClassificationEvaluator

    # Split the data, keeping the share `weightSplit` for the test set.
    train,test = df.randomSplit([1-weightSplit,weightSplit],
                                seed = kwargs.get('seed', 2))

    if model == 'lr':
        # `in`, not `is`: the identity test was always False, so a
        # user-supplied elasticNetParam was silently ignored.
        estimator = LogisticRegression(labelCol = labelcol,
                                       featuresCol = featurescol,
                                       elasticNetParam = kwargs.get('elasticNetParam', 0.5),
                                       regParam = kwargs.get('regParam', 0.0))
    else:
        # maxDepth was read from kwargs but never handed to the classifier.
        estimator = RandomForestClassifier(
            featuresCol = featurescol, labelCol = labelcol,
            numTrees = kwargs.get('numTrees', 20),
            maxDepth = kwargs.get('maxDepth', 5),
            minInstancesPerNode = kwargs.get('minInstancesPerNode', 1),
            featureSubsetStrategy = kwargs.get('featureSubsetStrategy', 'auto'),
            maxBins = kwargs.get('maxBins', 63))

    ourmodel = estimator.fit(train)

    # Predictions on the test set.
    pred = ourmodel.transform(test)

    perf = dict()
    # The ROC curve is built from the model scores ('rawPrediction'); using
    # the hard 0/1 'prediction' column reduces it to a single threshold.
    # The label column is the one given by the caller, not a hard-coded name.
    perf['AUC'] = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction',
                                       labelCol=labelcol,
                                      metricName = 'areaUnderROC').evaluate(pred)

    predictionAndLabels = pred.select('prediction',labelcol).rdd
    metrics = MulticlassMetrics(predictionAndLabels)
    perf['accuracy'] = metrics.accuracy
    perf['confu_matrix'] = metrics.confusionMatrix().toArray()

    # The label of interest is 1.
    perf['precision'] = metrics.precision(1.0)
    perf['recall'] = metrics.recall(1.0)

    return perf

In [None]:
# Logistic regression: train on 75% of the sample, evaluate on the remaining 25%.
performance = compute_model(df_sample,"lr", "flg_cmd_lowcostIndex","indexedFeatures",weightSplit=0.25)
# Title typo fixed ("Logitic" -> "Logistic").
print("Confusion Matrix for Logistic Regression")
print(performance['confu_matrix'])

Confusion Matrix for Logitic Regression
[[2.438e+04 3.000e+00]
 [1.819e+03 3.393e+03]]


In [None]:
print("Logistic Regression performance")
print("-------------------------------")
# Print every metric except the confusion matrix.
for metric_name, metric_value in performance.items():
    if metric_name == 'confu_matrix':
        continue
    print("{} : {}".format(metric_name, metric_value))

Logistic Regression performance
-------------------------------
AUC : 0.825437330539511
accuracy : 0.9384355465450245
precision : 0.9991166077738516
recall : 0.6509976976208749


In [None]:
# Random forest: train on 75% of the sample, evaluate on the remaining 25%.
# NOTE: this overwrites `performance` from the logistic-regression run.
performance = compute_model(df_sample,"rf", "flg_cmd_lowcostIndex","indexedFeatures",weightSplit=0.25)
print("Confusion Matrix for RandomForest")
print(performance['confu_matrix'])

Confusion Matrix for RandomForest
[[24383.     0.]
 [ 1819.  3393.]]


In [None]:
print("Random Forest performance")
print("-------------------------")
# Print every metric except the confusion matrix.
for metric_name, metric_value in performance.items():
    if metric_name == 'confu_matrix':
        continue
    print("{} : {}".format(metric_name, metric_value))

Random Forest performance
-------------------------
AUC : 0.8254988488104374
accuracy : 0.9385369150194289
precision : 1.0
recall : 0.6509976976208749
