<a href="https://colab.research.google.com/github/said-chafouai/E-Colocation-WebApplication/blob/main/Clustring_Sujet_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import KModes
from kmodes.kmodes import KModes

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# package for detecting a file encoding
import chardet

from sklearn.cluster import KMeans
# for converting categorical values to numerical ones
from sklearn.preprocessing import LabelEncoder

# for standardization
from sklearn.preprocessing import StandardScaler

# to compute NearestNeighbors for DBSCAN
from sklearn.neighbors import NearestNeighbors

# for DBSCAN
from sklearn.cluster import DBSCAN

# Mount Google Drive in Colab so the project CSV files can be read and written.
from google.colab import drive
drive.mount('/content/drive') # Mount point; the project folder is '/content/drive/MyDrive/Monext'
# Base folder prepended to the CSV paths used in the cells and helpers below.
gDrivePath='/content/drive/MyDrive/Monext'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
### Functions

########################################################################################################################
def read_csv_file_into_df(file_path):
    """
    Load a CSV file into a dataframe.

    The file is parsed with ';' as field delimiter and decoded as latin1.

    :param file_path: path of the CSV file to read
    :return: dataframe holding the file content
    """
    csv_options = {'delimiter': ';', 'encoding': 'latin1'}
    dataframe = pd.read_csv(file_path, **csv_options)
    return dataframe

########################################################################################################################
def write_df_into_csv_file(file_path, dataframe):
    """
    Write a dataframe to a CSV file.

    The output uses ';' as separator, '\\n' as line terminator, latin1 encoding
    and no index column, so it can be read back with read_csv_file_into_df.

    :param file_path: destination path (overwritten if it exists)
    :param dataframe: dataframe to serialize
    """
    csv_options = {'index': False, 'sep': ';'}
    # The encoding has to be set on the handle: pandas ignores its own
    # `encoding` argument when it is given an already opened text file.
    # newline='' stops the platform from translating the '\n' terminator.
    with open(file_path, mode='w', encoding='latin1', newline='') as file:
        try:
            dataframe.to_csv(file, lineterminator='\n', **csv_options)
        except TypeError:
            # pandas < 1.5 only knows the old keyword name.
            dataframe.to_csv(file, line_terminator='\n', **csv_options)

########################################################################################################################
def Kmeans_estimator_elbow_method(X: pd.DataFrame, cluster_range: []):
  """
  Fit one KMeans model per candidate cluster count and draw the elbow curve.

  X: dataset handed unchanged to KMeans.fit
  cluster_range: cluster counts to try, in plotting order
  Prints the inertia obtained for each count, then plots inertia against count.
  """
  inertia_list = []
  for n_clusters in cluster_range:
    fitted_model = KMeans(n_clusters=n_clusters, random_state=10).fit(X)
    inertia_list.append(fitted_model.inertia_)
    print(f'cluster {n_clusters}, Inertia {fitted_model.inertia_}\n')

  # Inertia per cluster count; the elbow is where the curve starts to flatten.
  plt.plot(cluster_range, inertia_list, 'bx-')
  plt.show()

########################################################################################################################
def preprocessing_convert_df_to_numerical_values(X: pd.DataFrame):
  """
  Label-encode every column of X independently.

  Each column is passed through its own LabelEncoder, so the distinct values
  of a column are replaced by the integer codes the encoder assigns.

  X: dataframe to encode (left untouched)
  return: a new dataframe with the same column names, holding the codes
  """
  # Work on an explicit copy: to_numpy() may return a view of the caller's data.
  X_numpy = X.to_numpy(copy=True)

  for column_position in range(X_numpy.shape[1]):
    codes = LabelEncoder().fit_transform(X_numpy[:, column_position])
    X_numpy[:, column_position] = codes

  X = pd.DataFrame(X_numpy, columns=X.columns)
  # fillna returns a new frame; the original code discarded the result.
  X = X.fillna(-1)
  return X

########################################################################################################################
# def standardaize_dataset(X: pd.DataFrame, estimator_type, ):
#   """
#   standardize the given dataset
#   X: the dataframe to standardize
#   estimator_type: string 'supervised ou unsupervised' to know if i should standardize 
#   the last column or not 
#   """
#   X_numpy
#   if estimator_type == 'supervised':

#   X_standarded =  StandardScaler().fit_transform(X)
#   return X_standarded
########################################################################################################################
def do_PCA(X, new_dimension):
  """
  Reduce X with a principal component analysis.

  X: numeric dataset accepted by sklearn's PCA
  new_dimension: number of principal components to keep
  return: the transformed dataset returned by PCA.fit_transform
  """
  from sklearn.decomposition import PCA
  # The requested dimension was previously ignored in favour of a fixed 2.
  pca = PCA(n_components=new_dimension)
  return pca.fit_transform(X)
########################################################################################################################
def filter_df(df, columns:[]):
  """
  Keep only the requested columns of a dataframe.

  df: dataframe to filter (not modified)
  columns: names of the columns to keep; names absent from df are ignored
  return: new dataframe restricted to those columns
  """
  return df.filter(items=columns)

########################################################################################################################
def do_kmeans(X: pd.DataFrame, n_clusters):
  """
  Cluster X with KMeans (random_state fixed to 10).

  X: dataset handed to KMeans
  n_clusters: number of clusters to build
  return: the cluster label of every sample of X
  """
  return KMeans(n_clusters=n_clusters, random_state=10).fit_predict(X)

def plot_clusters_2D(df_trans, cluster_labels, figsize:tuple):
  """
  Scatter-plot 2D points, one series per cluster.

  df_trans: 2D array indexed as [row_mask, column]; columns 0 and 1 are the
            x and y coordinates
  cluster_labels: cluster label of each row of df_trans
  figsize: size handed to plt.figure
  """
  distinct_labels = np.unique(cluster_labels)
  plt.figure(figsize=figsize)
  for current_label in distinct_labels:
    members = cluster_labels == current_label
    plt.scatter(df_trans[members, 0], df_trans[members, 1], label=current_label)
  plt.legend()
  plt.show()

########################################################################################################################
def plot_clusters_threshold(trans_file_path, columns, column_weight_file_path, threshold):
  """
  Load the transactions of a csv file and keep the requested columns.

  trans_file_path: csv file holding the transactions
  columns: columns to keep
  column_weight_file_path: not used yet
  threshold: not used yet
  return: the filtered dataframe

  NOTE(review): the function is unfinished - neither the threshold filtering
  nor the plotting announced by its name is implemented.
  """
  # Read from the path given as argument; the previous code read the global
  # `file_path` (twice) and ignored `trans_file_path`.
  df_trans = read_csv_file_into_df(trans_file_path)
  df_trans = filter_df(df_trans, columns)
  return df_trans

########################################################################################################################
def do_clustring_presonnalized_algo(df_trans, columns, threshold, similarity):
  """
  Cluster transactions by walking over them sorted on `columns`.

  Rows are sorted by grouping on `columns` plus a unique row number (so every
  row is kept and 'Counter' is 1 for each of them). A new cluster starts each
  time the compared key columns differ from the row that opened the current
  cluster.

  df_trans: transactions dataframe (not modified)
  columns: list of columns ordered by importance (not modified)
  threshold: not used by this function
  similarity: 1-based position, within `columns`, of the last compared column
              (must be between 1 and len(columns)). With similarity == 1 only
              the first column is compared; otherwise the columns at positions
              similarity-1 and similarity are compared.
  return: dataframe with `columns`, 'Counter' and 'Cluster'. 'Cluster' starts
          at 1; it stays 0 for every row when `similarity` is invalid.
  """
  # Local copies: neither the caller's list nor its dataframe is touched.
  group_keys = list(columns) + ['UniqueNumber']
  df_work = df_trans.assign(UniqueNumber=np.arange(df_trans.shape[0]), Counter=1)
  df_groupedBy = df_work.groupby(by=group_keys, as_index=False)['Counter'].count()
  df_groupedBy['Cluster'] = 0

  # Validate first; the check used to sit in a branch that was never reached
  # for values larger than 1.
  if similarity < 1 or similarity > len(columns):
    print('Wrong similarity')
  elif not df_groupedBy.empty:
    if similarity == 1:
      compared_positions = [0]
    else:
      compared_positions = [similarity - 2, similarity - 1]
    # Explicit positional selection; integer keys on a label-indexed row are
    # deprecated in pandas.
    key_values = df_groupedBy.iloc[:, compared_positions].to_numpy()

    nb_cluster = 1
    reference = key_values[0]
    labels = []
    for current in key_values:
      if (current != reference).any():
        reference = current
        nb_cluster += 1
      labels.append(nb_cluster)
    df_groupedBy['Cluster'] = labels

  df_groupedBy.drop(columns=['UniqueNumber'], inplace=True)
  return df_groupedBy

########################################################################################################################
def do_dbscan(df):
  """
  Run DBSCAN on a dataframe of categorical values.

  The columns are label-encoded, standardized, and clustered with
  min_samples = 2 * number_of_columns - 2 and eps set to the mean of the
  distances to the min_samples nearest neighbours.

  df: dataframe to cluster
  return: the DBSCAN label of every row
  """
  min_pts = 2*df.shape[1] - 2 # minus two for added columns UniqueNumbers and Counter
  # categorical -> numeric -> standardized
  X_standarded = StandardScaler().fit_transform(preprocessing_convert_df_to_numerical_values(df))

  # epsilon: mean distance over the min_pts nearest neighbours of every sample
  fitted_neighbors = NearestNeighbors(n_neighbors=min_pts).fit(X_standarded)
  distances, _ = fitted_neighbors.kneighbors(X_standarded)
  epsilon = np.mean(distances)

  return DBSCAN(eps=epsilon, min_samples=min_pts).fit(X_standarded).labels_

########################################################################################################################
def global_dbscan(df_trans, groupBy_columns):
  """
  Group the transactions on all their columns and run do_dbscan on runs of
  rows sharing the same value of the last group-by column.

  df_trans: transactions dataframe; 'UniqueNumber' and 'Counter' columns are
            added to it in place
  groupBy_columns: columns placed first in the grouping key; the list is
                   extended in place with the remaining columns

  Nothing is returned; the labels are stored in a local dataframe only.
  NOTE(review): this function looks unfinished - see the notes below before
  relying on it.
  """
  nb_groupBy_columns = len(groupBy_columns) 
  df_trans['UniqueNumber'] = np.arange(df_trans.shape[0])
  df_trans['Counter'] = 1
  df_trans_columns = list(df_trans.columns)

  # NOTE(review): list.remove() is given the whole list instead of one column
  # name at a time; this looks like it raises ValueError whenever it runs.
  if nb_groupBy_columns > 0 : 
    df_trans_columns.remove(groupBy_columns)
  
  # Mutates the caller's list. The key then also contains 'UniqueNumber' and
  # 'Counter', the latter being the aggregated column as well - TODO confirm.
  groupBy_columns.extend(df_trans_columns)

  print('Group by columns: ', groupBy_columns) 

  clusters = []


  df_groupedBy = df_trans.groupby(by=groupBy_columns, as_index=False)['Counter'].count()

  old_row = df_groupedBy.iloc[0, :]
  start_index = 0

  if nb_groupBy_columns > 0:
    for index, row in df_groupedBy.iterrows():
      # A change in the last group-by column closes the current run of rows.
      if old_row[nb_groupBy_columns - 1] != row[nb_groupBy_columns - 1]:
        old_row = row
        end_index = index
        # Run DBSCAN on rows start_index..end_index (end excluded), using the
        # columns from nb_groupBy_columns up to, but excluding, the last one.
        # NOTE(review): start_index is never advanced and the final run of rows
        # is never processed, so `clusters` may not match the number of rows.
        labels = do_dbscan(df_groupedBy.iloc[start_index:end_index, nb_groupBy_columns:-1])
        clusters.extend(labels)
  else: 
    # NOTE(review): append() stores the whole label array as a single element.
    clusters.append(do_dbscan(df_trans))
  df_groupedBy['Cluster'] = clusters
########################################################################################################################
  
def mapping_dataset_to_meaning_values(df_trans):
  """
  Replace the categorical values of the transactions by hand-picked integers.

  Steps, all applied to df_trans itself:
    - 'IS_DUPLICATED': the string '0.0' becomes 1
    - the placeholders 'vide' and '-' become 0 in every column
    - the values listed in `mapping_columns` are replaced column by column
    - 'EXTERNAL_RETURN_CODE': 0 becomes 1, every value other than 1 becomes 2
    - 'TRANSMITTER_COUNTRY' / 'BUYER_IP_COUNTRY_CODE': every value other than
      1 becomes 2
  Columns that are absent from df_trans are skipped.

  The result is also written to '<gDrivePath>/fdj_trans_mapped.csv'.

  df_trans: transactions dataframe (modified in place)
  return: the same dataframe object
  """
  mapping_columns = {
    'ACTION': {
      'AUTHOR+CAP':  1,
      'AUTHOR':  2,
      'RESET':  3,
      'REFUND':  4
    },
    'PAYMENT_CARD_CODE':{
      'CB': 1,
      'PAYPAL': 2,
      'PAYSAFECARD2':3,
      'SKRILL(MONEYBOOKERS)': 4
    },
    'SHORT_MESSAGE':{
      'REFUSED': 4,
      'ACCEPTED': 1,
      'ERROR': 4,
      'CANCELLED': 3,
      'ONHOLD_PARTNER': 2
    },
    'FRAUD_LIST':{
      'StandardList': 1,
      'NewCustomerList': 1,
      'BlackList': 2
    },
    'FRAUD_RULE':{
      'Cartes Business': 1,
      'Cartes Corporate': 1,
      'Montant 3DS-v2-FLS': 2,
      'Cumul7j 3DS-v2-NCH': 3,
      'OneCardNCustomers': 4,
      'ListCountryIssuer': 5,
      'Unicite du compte Paypal par compte joueur': 6,
      'Paysafecard - Cumul client > 1000 EUR par jour calendaire': 7,
      'Unicite du compte Skrill par compte joueur': 8,
      'Paysafecard - Cumul client > 10000 EUR par mois': 9
    },
    'FRAUD_EXPLANATION': {
      'AFF': 1,
      '3DS': 1,
      'CUC': 1,
      'UNI': 1,
      'CTY': 1,
      'PSF': 1
    },
    'FRAUD_ACTION': {
      'BLOCK': 5,
      'FRICTIONLESS': 3,
      'NONE': 1,
      'NO_PREFERENCE': 4,
      'noAction': 2
    },
    'ENROLLED_3D_SECURE':{
      'Y': 1,
      'N': 3
    },
    'AUTHENTICATED_3D_SECURE': {
      'Y': 1,
      'R': 2,
      'N': 6,
      'A': 3,
      'U': 5
    },
    'SECURITY_LEVEL': {
      'CVV': 1,
      '3DS': 2,
      'Aucun': 3
    },
    'CHARGE_BACK': {
        'N': 1
    },
    'TRANSMITTER_COUNTRY':{
        'FRA': 1
    },
    'BUYER_IP_COUNTRY_CODE':{
        'FR': 1
    }
  }
  # Column-level changes are assigned back instead of using inplace=True on a
  # selected column: the chained form is not guaranteed to reach the dataframe
  # (pandas Copy-on-Write).
  if 'IS_DUPLICATED' in df_trans.columns:
    df_trans['IS_DUPLICATED'] = df_trans['IS_DUPLICATED'].replace(to_replace='0.0', value=1)
  df_trans.replace(to_replace='vide', value=0, inplace=True)
  df_trans.replace(to_replace='-', value=0, inplace=True) # Caution: to be reviewed
  df_trans.replace(mapping_columns, inplace=True)
  if 'EXTERNAL_RETURN_CODE' in df_trans.columns:
    df_trans['EXTERNAL_RETURN_CODE'] = df_trans['EXTERNAL_RETURN_CODE'].replace(to_replace=0, value=1)
  # Binary columns: keep the value 1, send everything else to 2.
  for column in ('EXTERNAL_RETURN_CODE', 'TRANSMITTER_COUNTRY', 'BUYER_IP_COUNTRY_CODE'):
    if column in df_trans.columns:
      df_trans[column] = df_trans[column].where(df_trans[column] == 1, other=2)
  write_df_into_csv_file(gDrivePath+'/fdj_trans_mapped.csv', df_trans)
  return df_trans

########################################################################################################################
def do_kmodes_and_plot_cost(file_path, clustring_columns, groupby_columns, k):
  """
  Run KModes for a range of cluster counts and plot the cost of each run.

  Parameters:
    file_path : path of the csv file that contains the transactions
    clustring_columns : columns used as clustering features
    groupby_columns : not used by this function
    k : sequence where k[0] is the first cluster count tried and k[1] the
        upper bound; counts are taken from range(k[0], k[1]), so k[1] itself
        is not evaluated
  Return :
    (df, count) where df holds the distinct combinations of the clustering
    columns as strings, and count the number of transactions behind each
    combination (same row order)
  """
  # load the csv file and keep the clustering columns only
  df_trans = read_csv_file_into_df(file_path)
  df_trans = df_trans.filter(items = clustring_columns)
  df_trans.fillna('empty', inplace=True)

  # collapse identical rows, remembering how many transactions each one stands for
  df_trans = df_trans.groupby(by=clustring_columns).size().reset_index(name='Count')
  count = df_trans['Count']
  df_trans.drop(columns=['Count'], inplace=True)
  # KModes is fed string categories
  df_trans = df_trans.astype('str')

  # elbow method: one KModes run per candidate cluster count
  cost = []
  rangeK = range(k[0],k[1])
  for ith_k in rangeK:
    try:
      kpro_estimator = KModes(n_clusters=ith_k, n_jobs=-1, random_state=0)
      kpro_estimator.fit_predict(df_trans)
      cost.append(kpro_estimator.cost_)
      print('cluster : ', ith_k)
    except Exception as error:
      # Best effort: report the failure and leave a gap in the curve instead
      # of plotting a misleading cost of 0.
      print(ith_k, ' error ', error)
      cost.append(np.nan)

  # Plot the cost for each K; the best K is where the curve starts to be linear
  plt.plot(rangeK, cost, 'bx-')
  plt.show()
  return df_trans.copy(), count

########################################################################################################################
def kmodes_with_chosen_k(df_trans, count, groupby_columns, chosen_k):
  """
  Cluster the rows with KModes, then split clusters that span several groups.

  Parameters:
    df_trans : dataframe returned by do_kmodes_and_plot_cost; a 'Cluster' and
               a 'Count' column are added to it
    count : number of transactions per row, stored in the 'Count' column
    groupby_columns : only its length is used - the first len(groupby_columns)
                      columns of df_trans define the group of a row
    chosen_k : number of clusters requested from KModes
  Return :
    nothing; the labelled dataframe is written to
    '<gDrivePath>/trans_clustring_final.csv'
  """
  model = KModes(n_clusters=chosen_k, n_jobs=-1, random_state=0)
  model.fit_predict(df_trans)
  df_trans['Cluster'] = model.labels_

  next_cluster_id = chosen_k
  n_group_columns = len(groupby_columns)
  current_group = df_trans.iloc[0, :n_group_columns]

  # Per KModes label: already met on any earlier row / already met in the
  # current group / identifier currently written out for it.
  seen_before = [False] * chosen_k
  seen_in_group = [False] * chosen_k
  assigned_id = list(range(chosen_k))
  if n_group_columns > 0:
    for index, _ in df_trans.iterrows():
      candidate_group = df_trans.iloc[index, :n_group_columns]
      # A differing group column means a new group starts on this row.
      if candidate_group.eq(current_group).sum() < n_group_columns:
        current_group = candidate_group
        seen_in_group = [False] * chosen_k

      cluster = df_trans.at[index, 'Cluster']
      # Label met before but not yet in this group: give it a fresh identifier.
      if seen_before[cluster] and not seen_in_group[cluster]:
        assigned_id[cluster] = next_cluster_id
        next_cluster_id += 1

      seen_before[cluster] = True
      seen_in_group[cluster] = True
      df_trans.at[index, 'Cluster'] = assigned_id[cluster]

  df_trans['Count'] = count
  write_df_into_csv_file(gDrivePath+'/trans_clustring_final.csv', df_trans)

########################################################################################################################
def mode_func(x):
  """
  Build an aggregation function returning the most frequent value of a Series.

  x: ignored; the returned function works on the Series it is called with
  return: callable mapping a Series to the first entry of its value_counts()
  """
  def _most_frequent(values):
    return values.value_counts().index[0]
  return _most_frequent
########################################################################################################################
def get_centroids(file_path):
  """
  Summarize every cluster of a clustered transactions file by one row.

  For each value of 'Cluster', every string-named column is reduced to its most
  frequent value and 'Count' is summed. The result is written to
  '<gDrivePath>/df_trans_cluster_centroids.csv'.

  file_path: csv file holding the clustered transactions
  """
  clustered = read_csv_file_into_df(file_path)
  aggregations = {
    column: mode_func(column)
    for column in clustered.columns
    if type(column) is str
  }
  aggregations['Count'] = 'sum'

  centroids = clustered.groupby(by=['Cluster'], as_index=False).agg(aggregations)
  write_df_into_csv_file(gDrivePath+'/df_trans_cluster_centroids.csv', centroids)

########################################################################################################################
def compute_coverage(file_path_centroids, file_path_tests):
  """
  Score every test transaction against every cluster centroid.

  Both files are restricted to the columns they have in common. For each
  centroid row a column named after its row index is added to the tests,
  holding the number of shared columns on which the test equals the centroid.
  The result is written to '<gDrivePath>/trans_coverage.csv'.

  file_path_centroids: csv file holding one row per centroid
  file_path_tests: csv file holding the test transactions
  """
  df_tests = read_csv_file_into_df(file_path_tests)
  df_centroids = read_csv_file_into_df(file_path_centroids)

  # Keep the column order of the tests file: a plain set intersection gives a
  # different column order from one run to the next.
  centroid_columns = set(df_centroids.columns)
  columns_intersection = [column for column in df_tests.columns if column in centroid_columns]
  df_tests = df_tests.filter(items=columns_intersection)
  df_tests.fillna('empty', inplace=True)
  df_centroids = df_centroids.filter(items=columns_intersection)
  df_coverage = df_tests.copy()

  # One vectorized comparison per centroid instead of one per (test, centroid)
  # pair; the scores are stored as integers.
  for jIndex, jRow in df_centroids.iterrows():
    matches = df_tests.eq(jRow, axis='columns').sum(axis=1)
    df_coverage[str(jIndex)] = matches.astype(int)

  write_df_into_csv_file(gDrivePath+'/trans_coverage.csv', df_coverage)


In [None]:
### KModes implementation: search the number of clusters with the elbow method

# INPUT 
file_path = gDrivePath+'/fdj_transactions.csv'          # transaction file name
# Columns used as clustering features
clustring_columns = ['ACTION', 'PAYMENT_CARD_CODE','AUTHENTICATED_3D_SECURE','FRAUD_ACTION','ENROLLED_3D_SECURE',
                      'SECURITY_LEVEL','TRANSMITTER_COUNTRY',
                      'BUYER_IP_COUNTRY_CODE','SHORT_MESSAGE',
                      'FRAUD_LIST','FRAUD_EXPLANATION','CHARGE_BACK']  # - Columns that we are going to use for the clustering
groupby_columns = ['ACTION', 'PAYMENT_CARD_CODE']    # - the columns to group by
k = [1, 30]          # K : first cluster count tried and upper bound (range(k[0], k[1]) is used)

# df_trans: distinct feature rows; count: number of transactions behind each row
df_trans, count = do_kmodes_and_plot_cost(file_path, clustring_columns, groupby_columns, k)


In [None]:
# Run KModes with the chosen K (16 here); the labelled rows are written to
# trans_clustring_final.csv by kmodes_with_chosen_k
kmodes_with_chosen_k(df_trans, count,  groupby_columns, 16)

In [None]:
# Get the centroids of the clustered data written by the previous cell;
# get_centroids writes them to df_trans_cluster_centroids.csv
file_path = gDrivePath+'/trans_clustring_final.csv'
get_centroids(file_path)


In [None]:
# Compute the coverage of the test transactions against the cluster centroids;
# compute_coverage writes the result to trans_coverage.csv
file_path_tests = gDrivePath+'/fdj_test.csv'
file_path_centroids = gDrivePath+'/df_trans_cluster_centroids.csv'
compute_coverage(file_path_centroids, file_path_tests)

In [None]:
# Notebook magic, not Python: run this cell before the cell that imports kmodes.
%pip install kmodes

In [None]:
# Don't delete that
# Builds the numeric, standardized matrix used by the DBSCAN cells below.
groupBy_columns = []
nb_groupBy_columns = len(groupBy_columns) 
# df_trans['UniqueNumber'] = np.arange(df_trans.shape[0])
# df_trans['Counter'] = 1
df_trans_columns = list(df_trans.columns)

# Skipped while groupBy_columns is empty.
# NOTE(review): list.remove() receives the whole list here rather than one
# column name at a time - confirm before using a non-empty groupBy_columns.
if nb_groupBy_columns > 0 : 
  df_trans_columns.remove(groupBy_columns)

groupBy_columns.extend(df_trans_columns)

print('Group by columns: ', groupBy_columns) 

clusters = []

# df_groupedBy = df_trans.groupby(by=groupBy_columns, as_index=False)['Counter'].count()
# df_groupedBy.drop(columns=['Counter', 'UniqueNumber'], inplace=True)

# df_numeric = preprocessing_convert_df_to_numerical_values(df_groupedBy) # categorical to numeric
# The mapping helper modifies df_trans in place and returns that same object,
# so df_numeric and df_trans refer to the same dataframe afterwards.
df_numeric = mapping_dataset_to_meaning_values(df_trans)
X_standarded =  StandardScaler().fit_transform(df_numeric) # Standardization

In [None]:
# Finding epsilon: mean of the distances to the min_pts nearest neighbours
# min_pts is set to twice the number of columns of df_numeric
min_pts = 2*df_numeric.shape[1]
neighbors_algo = NearestNeighbors(n_neighbors=min_pts).fit(X_standarded)
distances, index = neighbors_algo.kneighbors(X_standarded)
mean_distance = np.mean(distances)
epsilon = mean_distance

In [None]:
# Run DBSCAN on the standardized data with the epsilon and min_pts computed above
dbscan_estimator = DBSCAN(eps=epsilon, min_samples=min_pts).fit(X_standarded)

In [None]:
# df_groupedBy['Cluster'] = dbscan_estimator.labels_
# Store the DBSCAN label of every row next to the transaction data
df_trans['Cluster'] = dbscan_estimator.labels_ 
# np.min(distances)

In [None]:
# Save the transactions together with their DBSCAN cluster label
write_df_into_csv_file(gDrivePath+'/df_trans_dbscan_clusterd.csv', df_trans)


In [None]:
# Project the numeric features on two principal components and colour the
# points by DBSCAN label.
# df_numeric is the same object as df_trans, which received a 'Cluster' column
# in a previous cell: keep that column out of the PCA input.
pca_input = df_numeric.drop(columns=['Cluster'], errors='ignore')
df_reduced = do_PCA(pca_input, 2)
# The labels come from the DBSCAN estimator; df_groupedBy is not defined by the
# DBSCAN cells.
plot_clusters_2D(df_reduced, dbscan_estimator.labels_, (10, 20))

In [None]:
### Clustering / grouping with the custom algorithm
file_path = gDrivePath+'/fdj_trans_filterd_columns.csv'
df_trans = read_csv_file_into_df(file_path)
# Grouping columns, ordered by importance
columns = ['ACTION','PAYMENT_CARD_CODE','AUTHENTICATED_3D_SECURE',
            'FRAUD_ACTION','ENROLLED_3D_SECURE','FRAUD_RULE',
            'SECURITY_LEVEL','EXTERNAL_RETURN_CODE','TRANSMITTER_COUNTRY',
            'BUYER_IP_COUNTRY_CODE','IS_DUPLICATED','SHORT_MESSAGE',
            'FRAUD_LIST','FRAUD_EXPLANATION','CHARGE_BACK','IS_CVD']
threshold = 100
similarity = 3
df_groupedBy = do_clustring_presonnalized_algo(df_trans, columns, threshold, similarity)
write_df_into_csv_file(gDrivePath+'/trans_clustred.csv', df_groupedBy)
# NOTE(review): 'Counter' is both a grouping key and the counted column here -
# confirm this is the intended per-cluster summary.
df_groupedByCluster = df_groupedBy.groupby(by=['Cluster', 'Counter'], as_index=False)['Counter'].count()
write_df_into_csv_file(gDrivePath+'/nb_clusters.csv', df_groupedByCluster)




In [None]:
# Bar chart of the 'Counter' value of every row of df_groupedBy
plt.figure(figsize=(20, 15))
plt.bar(np.arange(df_groupedBy.shape[0]), df_groupedBy['Counter'])

In [None]:
### Extract the appropriate columns and save them to fdj_trans_filterd_columns.csv
file_path = gDrivePath+'/fdj_transactions.csv'
df_trans = read_csv_file_into_df(file_path)
columns=['IS_DUPLICATED', 'ACTION','PAYMENT_CARD_CODE', 'SHORT_MESSAGE',
          'EXTERNAL_RETURN_CODE', 'FRAUD_LIST', 'FRAUD_RULE',
          'FRAUD_EXPLANATION', 'FRAUD_ACTION', 'ENROLLED_3D_SECURE',
          'AUTHENTICATED_3D_SECURE', 'SECURITY_LEVEL', 'CHARGE_BACK',
          'IS_CVD', 'TRANSMITTER_COUNTRY', 'BUYER_IP_COUNTRY_CODE',
          'OPERATING_SYSTEM', 'BROWSER']

# columns=['IS_DUPLICATED', 'ACTION', 'PAYMENT_CARD_CODE' 'SHORT_MESSAGE',
#           'EXTERNAL_RETURN_CODE', 'FRAUD_LIST',
#           'FRAUD_EXPLANATION', 'FRAUD_ACTION', 'ENROLLED_3D_SECURE',
#           'AUTHENTICATED_3D_SECURE', 'SECURITY_LEVEL', 'CHARGE_BACK',
#           'IS_CVD']

df_trans = filter_df(df_trans, columns)
# Missing values are stored as the placeholder string 'vide'
df_trans.fillna('vide', inplace=True)
write_df_into_csv_file(gDrivePath+'/fdj_trans_filterd_columns.csv', df_trans)


# df_trans = preprocessing_convert_df_to_numerical_values(df_trans)
# cluster_labels = do_kmeans(df_trans, 5)
# df_trans = do_PCA(df_trans, 2)
# plot_clusters_2D(df_trans, cluster_labels, (20, 15))



In [None]:
### Apply Density based clustring : DBSCAN


In [None]:
## Statistics: number of transactions per EXTERNAL_RETURN_CODE value
df_trans = read_csv_file_into_df(gDrivePath+'/fdj_transactions.csv')

# Helper column so that a groupby count gives the number of rows per value
df_trans['Counter'] = 1

# NOTE(review): the variable is named after OPERATING_SYSTEM but the grouping
# is done on EXTERNAL_RETURN_CODE.
df_stat_OPERATING_SYSTEM = df_trans.groupby(by=['EXTERNAL_RETURN_CODE'], as_index=False)['Counter'].count()

print(df_stat_OPERATING_SYSTEM)
plt.figure(figsize=(20,30))
plt.scatter(df_stat_OPERATING_SYSTEM.index, df_stat_OPERATING_SYSTEM['Counter'], c=df_stat_OPERATING_SYSTEM.index)
# NOTE(review): the scatter call above sets no label, so the legend may be empty.
plt.legend()



In [None]:
# Save the per-value counts computed in the statistics cell
write_df_into_csv_file(gDrivePath+'/stat_EXTERNAL_RETURN_CODE.csv', df_stat_OPERATING_SYSTEM)

In [None]:
# Display the per-value counts in the notebook output
df_stat_OPERATING_SYSTEM

In [None]:
### KModes for clustering categorical data: clustering the transactions
### Creates one dataset with the centroids and their cluster, and one with the
### transactions and their cluster
# training
kmodes_estimator = KModes(n_clusters=4, init='Huang')
cluster_labels = kmodes_estimator.fit_predict(X)

# centroids, and the cluster each centroid is assigned to
centroids = kmodes_estimator.cluster_centroids_
centroids_cluster = kmodes_estimator.predict(centroids)

# append the cluster as last column of the centroids
centroid_dataset = np.concatenate((centroids, centroids_cluster.reshape(-1, 1)), axis=1)

# Column names shared by both outputs: the features of X plus 'Cluster'.
# (list.append returns None, so the names must not be built with
# `list(...).append(...)` in a single expression.)
columns = list(X.columns) + ['Cluster']
df_centroid = pd.DataFrame(data=centroid_dataset, columns=columns)
write_df_into_csv_file('trans_centroids.csv', df_centroid )

# transactions with their cluster label as last column
fdj_transactions_with_clusters = np.concatenate((X, cluster_labels.reshape(-1, 1)), axis=1)
fdj_transactions_with_clusters = pd.DataFrame(fdj_transactions_with_clusters, columns=columns)
write_df_into_csv_file('fdj_transaction_with_cluster.csv', fdj_transactions_with_clusters)




In [None]:
### I can't apply KNN because i have categorical data and that's gonna take
### a lot of time from me, instead I am gonna define a similarity function
### to get the distance between two rows


def calculate_similarity(a, b):
  """
  Count the entries on which two rows agree.

  a, b: pandas Series compared label by label
  return: number of labels holding equal values in a and b
  """
  return a.eq(b).sum()


def get_coverage(df_centroids, df_trans_test):
  """
  Accumulate, per centroid, its similarity with every test transaction.

  df_centroids: one row per centroid
  df_trans_test: test transactions, with the same columns as df_centroids
  return: (coverage, non_matches) where coverage[i] is the sum of the
          similarities between the i-th centroid row and all test rows, and
          non_matches the number of (test, centroid) pairs sharing no value
  """
  coverage = [0] * df_centroids.shape[0]
  non_matches = 0
  for _, test_row in df_trans_test.iterrows():
    # Iterate the dataframe received as argument (not a global), and index the
    # result by row position so any dataframe index is supported.
    for position, (_, centroid_row) in enumerate(df_centroids.iterrows()):
      similarity = int(calculate_similarity(test_row, centroid_row))
      if similarity == 0:
        non_matches += 1
      else:
        coverage[position] += similarity
  return coverage, non_matches




# Load the test transactions and the saved centroids
df_trans_test = read_csv_file_into_df('fdj_trans_test.csv')
df_centroids = read_csv_file_into_df('trans_centroids.csv')
# Keep the columns of the test file only. The filter is applied to the
# centroids just read, not to the in-memory `df_centroid` of another cell.
df_centroids = df_centroids.filter(items=df_trans_test.columns)



coverage, non_matches = get_coverage(df_centroids, df_trans_test)

print(f'Coverage : {coverage}, \n Non matches {non_matches}')





In [None]:
### Silhouette metric for KModes (personal implementation)
# you should call the function like that (see below)
# calculate_dissimilarity(X[0:1].reset_index(drop=True), X[2:3].reset_index(drop=True)) 
def calculate_dissimilarity(a, b):
  """
  Count the positions on which two rows differ.

  a : first row (vector)
  b : second row (vector)
  return : result of np.sum over the element-wise `a != b` comparison
  """
  differing = a != b
  return np.sum(differing)

def kmodes_silhouette_sample_score(X, cluster_labels):
  """
  Silhouette coefficient of every sample for categorical data.

  The distance between two samples is the number of columns on which they
  differ. For a sample, `a` is its mean distance to the other samples of its
  own cluster and `b` the smallest mean distance to the samples of another
  cluster; the score is (b - a) / max(a, b).

  X : 2D dataset (dataframe or array), one sample per row
  cluster_labels : cluster label of every row of X
  return : numpy array with one score per sample. The score is 0 for a sample
           alone in its cluster, when a and b are both 0, and for every sample
           when fewer than two clusters exist.
  """
  values = np.asarray(X)
  labels = np.asarray(cluster_labels)
  distinct_labels = np.unique(labels)
  scores = np.zeros(values.shape[0], dtype=float)
  if distinct_labels.size < 2:
    return scores

  for position in range(values.shape[0]):
    # Mismatch count between this sample and every sample (itself included: 0).
    distances = np.sum(values != values[position], axis=1)
    own_cluster = labels == labels[position]
    own_size = own_cluster.sum()
    if own_size == 1:
      continue  # singleton cluster: score stays 0
    a = distances[own_cluster].sum() / (own_size - 1)
    b = min(distances[labels == other].mean()
            for other in distinct_labels if other != labels[position])
    largest = max(a, b)
    if largest > 0:
      scores[position] = (b - a) / largest
  return scores

def kmodes_silhouette_samples_mean(X, cluster_labels):
  """
  Average, per cluster, the scores given by kmodes_silhouette_sample_score.

  X : dataset handed to kmodes_silhouette_sample_score
  cluster_labels : cluster label of every sample
  return : list with one mean score per distinct label, in the iteration order
           of set(cluster_labels)
  """
  scores_list = kmodes_silhouette_sample_score(X, cluster_labels)

  def _cluster_mean(label):
    # scores of the samples belonging to this cluster
    cluster_scores = scores_list[label == cluster_labels]
    return cluster_scores.sum() / cluster_scores.size

  return [_cluster_mean(label) for label in set(cluster_labels)]




In [None]:
## don't run it
# Draft of a silhouette plot for KModes, adapted from a KMeans example.
# X is expected to be defined by another cell.

# The silhouette technique from sklearn is only usable for KMeans, not KModes,
# because it needs numeric values
# Find the best K for KMeans: example for test
# make_blobs generates samples for testing KMeans
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt
# to use the silhouette coefficient (the higher the coefficient, the better
# the clusters)
from sklearn.metrics import silhouette_samples, silhouette_score
# for choosing the color
import matplotlib.cm as cm


# NOTE(review): this estimator is not used in the rest of the cell.
kmeans_estimator = KMeans(n_clusters=5)


n_clusters_range = [2, 3]

# NOTE(review): y_start is set once for all figures, so the bands of a figure
# start where those of the previous figure ended - confirm this is intended.
y_start = 10
for n_cluster in n_clusters_range:
    fig, (ax) = plt.subplots(1, 1)
    fig.set_size_inches(18, 7)

    ax.set_xlim([-0.1, 1])
    

    kmode_estimator = KModes(n_clusters=n_cluster, random_state=10)
    # get the labels (the cluster for each sample)
    cluster_labels = kmode_estimator.fit_predict(X)
    # test something
    centroids = kmode_estimator.cluster_centroids_
    perdiction_centroids = kmode_estimator.predict(centroids)
    # calculate the silhouette score for each sample
    samples_silhouette_score = kmodes_silhouette_sample_score(X, cluster_labels)
    print('type samples output: ', type(samples_silhouette_score))

    # calculate the mean score of each cluster
    # NOTE(review): kmodes_silhouette_samples_mean returns a list (one mean per
    # cluster), which is later given to axvline as a single x - confirm.
    silhouette_avg = kmodes_silhouette_samples_mean(X, cluster_labels)

    for i in range(n_cluster):
      # get the silhouette score values for the cluster number i
      ith_cluster_samples_silhouette_score = samples_silhouette_score[cluster_labels == i]
      ith_cluster_samples_silhouette_score.sort()

      y_end = y_start + ith_cluster_samples_silhouette_score.size
      color = cm.nipy_spectral(float(i) / n_cluster)
      ax.fill_betweenx(np.arange(y_start, y_end),
                          0, ith_cluster_samples_silhouette_score,
                          facecolor=color, edgecolor=color, alpha=0.7)
      
      # Label the silhouette plots with their cluster numbers at the middle
      size_cluster_i = ith_cluster_samples_silhouette_score.size
      ax.text(-0.05, y_start + 0.5 * size_cluster_i, str(i))
      ax.set_title("The silhouette plot for the various clusters.")
      ax.set_xlabel("The silhouette coefficient values")
      ax.set_ylabel("Cluster label")

      # The vertical line for average silhouette score of all the values
      ax.axvline(x=silhouette_avg, color="red", linestyle="--")

      ax.set_yticks([])  # Clear the yaxis labels / ticks
      ax.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

      # leave a gap of 10 before the band of the next cluster
      y_start = y_end + 10

      plt.suptitle(("Silhouette analysis for KMeans clustering on sample data "
                  "with n_clusters = %d" % n_cluster),
                 fontsize=14, fontweight='bold')

plt.show()

In [None]:
## Scripts for test: tiny categorical dataset to try KModes on

X = pd.DataFrame([['a', 'b', 'c', 'd'],
                  ['a', 'b', 'c', 'g'],
                  ['a', 'b', 'd', 'k'],
                  ['a', 'b', 'c', 'd'],
                  ['a', 'b', 'c', 'g'],
                  ['a', 'b', 'd', 'k']], columns=['A', 'B', 'C', 'D'])

kmodes = KModes(n_clusters=2, random_state=10)
cluster_labels = kmodes.fit_predict(X)

# Append the cluster label as last column
new_one = np.concatenate((X, cluster_labels.reshape(cluster_labels.shape[0], 1)), axis=1)
# Rows whose second-to-last column (column 'D' of X) equals 'd'
another = np.where(new_one[:,-2:-1] == 'd')
for i, j in zip(another[0], another[1]):
  print(new_one[i:i+1])
# print(kmodes_silhouette_sample_score(X, cluster_labels))

# print(kmodes_silhouette_samples_mean(X, cluster_labels))

In [None]:
# Typical example
# Silhouette analysis to pick K for KMeans (needs numeric values, so it is not
# usable for KModes) on the clustered transactions file.
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt
# silhouette coefficient: the higher the coefficient, the better the clusters
from sklearn.metrics import silhouette_samples, silhouette_score
# for choosing the color
import matplotlib.cm as cm

# for kmeans
from sklearn.cluster import KMeans

df_trans = read_csv_file_into_df('fdj_clustred_data.csv')
df_trans.drop(columns=['Cluster'], inplace=True)

n_clusters_range = [5, 10, 15, 20, 25, 30, 40]

for n_cluster in n_clusters_range:
    fig, (ax) = plt.subplots(1, 1)
    fig.set_size_inches(10, 7)

    ax.set_xlim([-0.1, 1])

    kmean_estimator = KMeans(n_clusters=n_cluster, random_state=10)
    # get the labels (the cluster for each sample)
    cluster_labels = kmean_estimator.fit_predict(df_trans)
    # calculate the silhouette score for each sample
    samples_silhouette_score = silhouette_samples(df_trans, cluster_labels)
    print('type samples output: ', type(samples_silhouette_score))

    # Mean silhouette score over all samples, computed on the dataset that was
    # clustered (df_trans), not on the unrelated X of another cell.
    silhouette_avg = silhouette_score(df_trans, cluster_labels)

    # Every figure starts its first band at the same height.
    y_start = 10
    for i in range(n_cluster):
      # get the silhouette score values for the cluster number i
      ith_cluster_samples_silhouette_score = samples_silhouette_score[cluster_labels == i]
      ith_cluster_samples_silhouette_score.sort()

      y_end = y_start + ith_cluster_samples_silhouette_score.size
      color = cm.nipy_spectral(float(i) / n_cluster)
      ax.fill_betweenx(np.arange(y_start, y_end),
                          0, ith_cluster_samples_silhouette_score,
                          facecolor=color, edgecolor=color, alpha=0.7)

      # Label the silhouette plots with their cluster numbers at the middle
      size_cluster_i = ith_cluster_samples_silhouette_score.size
      ax.text(-0.05, y_start + 0.5 * size_cluster_i, str(i))

      # leave a gap of 10 before the band of the next cluster
      y_start = y_end + 10

    # Figure-level decorations, set once per figure.
    ax.set_title("The silhouette plot for the various clusters.")
    ax.set_xlabel("The silhouette coefficient values")
    ax.set_ylabel("Cluster label")

    # The vertical line for average silhouette score of all the values
    ax.axvline(x=silhouette_avg, color="red", linestyle="--")

    ax.set_yticks([])  # Clear the yaxis labels / ticks
    ax.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

    plt.suptitle(("Silhouette analysis for KMeans clustering on sample data "
                  "with n_clusters = %d" % n_cluster),
                 fontsize=14, fontweight='bold')

plt.show()


In [None]:
# Typical example
# The silhouette technique is only useful for KMeans, not KModes, because it
# needs numeric values.
# Find the best K for KMeans on a synthetic dataset generated for testing.
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt
# silhouette coefficient: the higher the coefficient, the better the clusters
from sklearn.metrics import silhouette_samples, silhouette_score
# for choosing the colors
import matplotlib.cm as cm

# for kmeans
from sklearn.cluster import KMeans

# NOTE(review): the silhouette coefficient is built on pairwise distances, so
# 200000 samples may make this cell slow -- lower n_samples if it takes too long.
X, y = make_blobs(n_samples=200000,
                  n_features=2,
                  centers=4,
                  cluster_std=1,
                  center_box=(-10.0, 10.0),
                  shuffle=True,
                  random_state=1)  # For reproducibility

n_clusters_range = [2, 3, 4, 5, 6]

for n_cluster in n_clusters_range:
    fig, ax = plt.subplots(1, 1)
    fig.set_size_inches(10, 7)

    ax.set_xlim([-0.1, 1])

    kmean_estimator = KMeans(n_clusters=n_cluster, random_state=10)
    # get the labels (the cluster for each sample)
    cluster_labels = kmean_estimator.fit_predict(X)
    # calculate the silhouette score for each sample
    samples_silhouette_score = silhouette_samples(X, cluster_labels)
    print('type samples output: ', type(samples_silhouette_score))

    # Average silhouette over ALL samples. silhouette_score() is defined as the
    # mean of silhouette_samples(), so reuse the values computed above instead
    # of evaluating every pairwise distance a second time.
    silhouette_avg = np.mean(samples_silhouette_score)

    # Every figure starts stacking its cluster bands from the same offset;
    # without this reset the bands of later figures drift upwards.
    y_start = 10
    for i in range(n_cluster):
        # get the sorted silhouette values of the samples in cluster i
        ith_cluster_samples_silhouette_score = samples_silhouette_score[cluster_labels == i]
        ith_cluster_samples_silhouette_score.sort()

        size_cluster_i = ith_cluster_samples_silhouette_score.size
        y_end = y_start + size_cluster_i
        color = cm.nipy_spectral(float(i) / n_cluster)
        ax.fill_betweenx(np.arange(y_start, y_end),
                         0, ith_cluster_samples_silhouette_score,
                         facecolor=color, edgecolor=color, alpha=0.7)

        # Label the silhouette plots with their cluster numbers at the middle
        ax.text(-0.05, y_start + 0.5 * size_cluster_i, str(i))

        # leave a gap of 10 rows before the next cluster band
        y_start = y_end + 10

    # Figure-level decoration is applied once per figure, not once per cluster.
    ax.set_title("The silhouette plot for the various clusters.")
    ax.set_xlabel("The silhouette coefficient values")
    ax.set_ylabel("Cluster label")

    # The vertical line for average silhouette score of all the values
    ax.axvline(x=silhouette_avg, color="red", linestyle="--")

    ax.set_yticks([])  # Clear the yaxis labels / ticks
    ax.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

    plt.suptitle(("Silhouette analysis for KMeans clustering on sample data "
                  "with n_clusters = %d" % n_cluster),
                 fontsize=14, fontweight='bold')

plt.show()


In [None]:
### Plotting the clusters

# Modules needed by this cell
import numpy as np
from sklearn.cluster import KMeans
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA

# Feature matrix of the digits dataset
digits = load_digits()
data = digits.data

# PCA model keeping two components
pca = PCA(n_components=2)

# Fit the PCA on the data and project the data onto its two components
df = pca.fit_transform(data)
 


In [None]:
# Import required module
from sklearn.cluster import KMeans

# Initialize the estimator. random_state is pinned so that the cluster labels
# (and therefore the colors in the scatter plot) are the same on every run.
kmeans = KMeans(n_clusters=10, random_state=10)

# Fit on the PCA-transformed data and get the cluster index of each sample.
label = kmeans.fit_predict(df)

print(label)

In [None]:
import matplotlib.pyplot as plt

# Distinct cluster ids present in the predicted labels
u_labels = np.unique(label)

# One scatter series per cluster, drawn on a single wide figure
plt.figure(figsize=(20, 5))
for cluster_id in u_labels:
    # rows of the projection that belong to the current cluster
    members = df[label == cluster_id]
    plt.scatter(members[:, 0], members[:, 1], label=cluster_id)
plt.legend()
plt.show()

### Mapping the dataset

In [None]:
### Mapping categorical variables to numeric ones

# Caution: don't forget to replace all the values in external_return_code by
#          a value different from the one you assigned to 0
# Caution: remove IS_CVD
# Caution: for transmitter_country, replace everything except FRA by 2
# Caution: remove the Operating and the browser

# Outer keys are the categorical names being mapped; each inner dict maps a
# raw category value to its numeric code.
mapping_columns = {
    'ACTION': {'AUTHOR+CAP': 1, 'AUTHOR': 2, 'RESET': 3, 'REFUND': 4},
    'PAYMENT_CARD_CODE': {
        'CB': 1,
        'PAYPAL': 2,
        'PAYSAFECARD2': 3,
        'SKRILL(MONEYBOOKERS)': 4,
    },
    # REFUSED and ERROR share the same code
    'SHORT_MESSAGE': {
        'REFUSED': 4,
        'ACCEPTED': 1,
        'ERROR': 4,
        'CANCELLED': 3,
        'ONHOLD_PARTNER': 2,
    },
    'FRAUD_LIST': {'StandardList': 1, 'NewCustomerList': 1, 'BlackList': 2},
    'FRAUD_RULE': {
        'Cartes Business': 1,
        'Cartes Corporate': 1,
        'Montant 3DS-v2-FLS': 2,
        'Cumul7j 3DS-v2-NCH': 3,
        'OneCardNCustomers': 4,
        'ListCountryIssuer': 5,
        'Unicité du compte Paypal par compte joueur': 6,
        'Paysafecard - Cumul client > 1000 EUR par jour calendaire': 7,
        'Unicité du compte Skrill par compte joueur': 8,
        'Paysafecard - Cumul client > 10000 EUR par mois': 9,
    },
    # every listed explanation maps to the same code
    'FRAUD_EXPLANATION': {
        'AFF': 1, '3DS': 1, 'CUC': 1, 'UNI': 1, 'CTY': 1, 'PSF': 1,
    },
    'FRAUD_ACTION': {
        'BLOCK': 5,
        'FRICTIONLESS': 3,
        'NONE': 1,
        'NO_PREFERENCE': 4,
        'noAction': 2,
    },
    'ENROLLED_3D_SECURE': {'Y': 1, 'N': 3},
    'AUTHENTICATED_3D_SECURE': {'Y': 1, 'R': 2, 'N': 6, 'A': 3, 'U': 5},
    'SECURITY_LEVEL': {'CVV': 1, '3DS': 2, 'Aucun': 3},
    # only a single value is mapped for the three entries below
    'CHARGE_BACK': {'N': 1},
    'TRANSMITTER_COUNTRY': {'FRA': 1},
    'BUYER_IP_COUNTRY_CODE': {'FR': 1},
}
