In [18]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import category_encoders as ce

In [3]:
data = [["TransactionID", "ClientID", "Profession", "Bank_dep", "Risk", "Number of credits", "Revenue"],
[1, 231, "self-employed", "009", "high", 2, 30200],
[2, 765, "students", "005", "high", 3,12700],
[3, 453, "horeca", "007", "medium", 5, 89400],
[4, 231, "self-employed", "009", "high", 2, 30200],
[5, 892, "finance", "003", "low", 3, 740000]]

In [65]:
df = pd.DataFrame(data)
df.columns = df.iloc[0]
df.drop(df.index[0], inplace=True)

In [15]:
df

Unnamed: 0,TransactionID,ClientID,Profession,Bank_dep,Risk,Number of credits,Revenue
1,1,231,self-employed,9,high,2,30200
2,2,765,students,5,high,3,12700
3,3,453,horeca,7,medium,5,89400
4,4,231,self-employed,9,high,2,30200
5,5,892,finance,3,low,3,740000


#### *Dummy Encoder* 
Function:  
`pandas.get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, columns=None, sparse=False, drop_first=False, dtype=None)`

When to use:  
Convert categorical variable into dummy/indicator variables.

In [19]:
enc_columns = ["Profession", "Bank_dep", "Risk"]

In [20]:
df_dummy = pd.get_dummies(df, prefix="Dummmy", columns=enc_columns)

In [21]:
df_dummy

Unnamed: 0,TransactionID,ClientID,Number of credits,Revenue,Dummmy_finance,Dummmy_horeca,Dummmy_self-employed,Dummmy_students,Dummmy_003,Dummmy_005,Dummmy_007,Dummmy_009,Dummmy_high,Dummmy_low,Dummmy_medium
1,1,231,2,30200,0,0,1,0,0,0,0,1,1,0,0
2,2,765,3,12700,0,0,0,1,0,1,0,0,1,0,0
3,3,453,5,89400,0,1,0,0,0,0,1,0,0,0,1
4,4,231,2,30200,0,0,1,0,0,0,0,1,1,0,0
5,5,892,3,740000,1,0,0,0,1,0,0,0,0,1,0


#### *Label Encoder*
Function:  
`sklearn.preprocessing.LabelEncoder`

When to use:  
Encode target labels with value between 0 and n_classes-1.

In [42]:
label_enc = LabelEncoder()

In [48]:
label_enc_df = df
label_enc_df['Profession'] = label_enc.fit_transform(df['Profession'])
label_enc_df['Bank_dep'] = label_enc.fit_transform(df['Bank_dep'])
label_enc_df['Risk'] = label_enc.fit_transform(df['Risk'])
label_enc_df

Unnamed: 0,TransactionID,ClientID,Profession,Bank_dep,Risk,Number of credits,Revenue
1,1,231,2,3,0,2,30200
2,2,765,3,1,0,3,12700
3,3,453,1,2,2,5,89400
4,4,231,2,3,0,2,30200
5,5,892,0,0,1,3,740000


##### *Backward Difference Coding*
Function:  
`category_encoders.backward_difference.BackwardDifferenceEncoder(verbose=0, cols=None, mapping=None, drop_invariant=False, return_df=True, handle_unknown='value', handle_missing='value')`

When to use:  
The mean of the dependent variable for a level is compared with the mean of the dependent variable for the prior level.

In [41]:
ce_bdc = ce.BackwardDifferenceEncoder(cols=enc_columns)
ce_bdc_df = ce_bdc.fit_transform(df)
ce_bdc_df



Unnamed: 0,intercept,TransactionID,ClientID,Profession_0,Profession_1,Profession_2,Bank_dep_0,Bank_dep_1,Bank_dep_2,Risk_0,Risk_1,Number of credits,Revenue
1,1,1,231,-0.75,-0.5,-0.25,-0.75,-0.5,-0.25,-0.666667,-0.333333,2,30200
2,1,2,765,0.25,-0.5,-0.25,0.25,-0.5,-0.25,-0.666667,-0.333333,3,12700
3,1,3,453,0.25,0.5,-0.25,0.25,0.5,-0.25,0.333333,-0.333333,5,89400
4,1,4,231,-0.75,-0.5,-0.25,-0.75,-0.5,-0.25,-0.666667,-0.333333,2,30200
5,1,5,892,0.25,0.5,0.75,0.25,0.5,0.75,0.333333,0.666667,3,740000


#### *BaseN*

Function:  
`category_encoders.basen.BaseNEncoder(verbose=0, cols=None, mapping=None, drop_invariant=False, return_df=True, base=2, handle_unknown='value', handle_missing='value')`

When to use:  
Base-N encoder encodes the categories into arrays of their base-N representation. A base of 1 is equivalent to one-hot encoding (not really base-1, but useful), a base of 2 is equivalent to binary encoding. N=number of actual categories is equivalent to vanilla ordinal encoding.

In [49]:
ce_bn = ce.BaseNEncoder(cols=enc_columns)
ce_bn_df = ce_bn.fit_transform(df)
ce_bn_df

Unnamed: 0,TransactionID,ClientID,Profession_0,Profession_1,Profession_2,Bank_dep_0,Bank_dep_1,Bank_dep_2,Risk_0,Risk_1,Number of credits,Revenue
1,1,231,0,0,1,0,0,1,0,1,2,30200
2,2,765,0,1,0,0,1,0,0,1,3,12700
3,3,453,0,1,1,0,1,1,1,0,5,89400
4,4,231,0,0,1,0,0,1,0,1,2,30200
5,5,892,1,0,0,1,0,0,1,1,3,740000


#### *Binary*

Function:  
`category_encoders.binary.BinaryEncoder(verbose=0, cols=None, mapping=None, drop_invariant=False, return_df=True, *, base=2, handle_unknown='value', handle_missing='value')`

When to use:  
Binary encoding for categorical variables, similar to onehot, but stores categories as binary bitstrings.

In [50]:
ce_binary = ce.BinaryEncoder(cols=enc_columns)
ce_binary_df = ce_binary.fit_transform(df)
ce_binary_df

Unnamed: 0,TransactionID,ClientID,Profession_0,Profession_1,Profession_2,Bank_dep_0,Bank_dep_1,Bank_dep_2,Risk_0,Risk_1,Number of credits,Revenue
1,1,231,0,0,1,0,0,1,0,1,2,30200
2,2,765,0,1,0,0,1,0,0,1,3,12700
3,3,453,0,1,1,0,1,1,1,0,5,89400
4,4,231,0,0,1,0,0,1,0,1,2,30200
5,5,892,1,0,0,1,0,0,1,1,3,740000


#### *CatBoost Encoder*

Function:  
`category_encoders.cat_boost.CatBoostEncoder(verbose=0, cols=None, drop_invariant=False, return_df=True, handle_unknown='value', handle_missing='value', random_state=None, sigma=None, a=1)`

When to use:  
This is very similar to leave-one-out encoding, but calculates the values “on-the-fly”. Consequently, the values naturally vary during the training phase and it is not necessary to add random noise.

In [67]:
ce_cbe = ce.CatBoostEncoder(cols=['Risk'])
ce_cbe_df = ce_cbe.fit_transform(df, df['Revenue'])
ce_cbe_df

Unnamed: 0,TransactionID,ClientID,Profession,Bank_dep,Risk,Number of credits,Revenue
1,1,231,self-employed,9,180500.0,2,30200
2,2,765,students,5,105350.0,3,12700
3,3,453,horeca,7,180500.0,5,89400
4,4,231,self-employed,9,74466.666667,2,30200
5,5,892,finance,3,180500.0,3,740000


#### *Count Encoder*

Function:  
`category_encoders.count.CountEncoder(verbose=0, cols=None, drop_invariant=False, return_df=True, handle_unknown='value', handle_missing='value', min_group_size=None, combine_min_nan_groups=None, min_group_name=None, normalize=False)`

When to use:  
For a given categorical feature, replace the names of the groups with the group counts.

In [77]:
ce_ce = ce.CountEncoder(cols=enc_columns)
ce_ce_df = ce_ce.fit_transform(df)
ce_ce_df

Unnamed: 0,TransactionID,ClientID,Profession,Bank_dep,Risk,Number of credits,Revenue
1,1,231,2,2,3,2,30200
2,2,765,1,1,3,3,12700
3,3,453,1,1,1,5,89400
4,4,231,2,2,3,2,30200
5,5,892,1,1,1,3,740000


#### *Generalized Linear Mixed Model Encoder*

Function:  
`category_encoders.glmm.GLMMEncoder(verbose=0, cols=None, drop_invariant=False, return_df=True, handle_unknown='value', handle_missing='value', random_state=None, randomized=False, sigma=0.05, binomial_target=None)`

When to use:  
This is a supervised encoder similar to TargetEncoder or MEstimateEncoder, but more robust.

In [79]:
ce_glmme = ce.GLMMEncoder(cols=enc_columns)
ce_glmme_df = ce_glmme.fit_transform(df, df['Revenue'])
ce_glmme_df

Unnamed: 0,TransactionID,ClientID,Profession,Bank_dep,Risk,Number of credits,Revenue
1,1,231,-187875.010232,-187875.010232,-260127.928347,2,30200
2,2,765,-205375.01045,-205375.01045,-260127.928347,3,12700
3,3,453,-128675.010445,-128675.010445,-195023.921602,5,89400
4,4,231,-187875.010232,-187875.010232,-260127.928347,2,30200
5,5,892,521925.051956,521925.051956,455151.849949,3,740000


#### *Hashing*

Function:  
`category_encoders.hashing.HashingEncoder(max_process=0, max_sample=0, verbose=0, n_components=8, cols=None, drop_invariant=False, return_df=True, hash_method='md5')`

When to use:  
A multivariate hashing implementation with configurable dimensionality/precision.
The advantage of this encoder is that it does not maintain a dictionary of observed categories. Consequently, the encoder does not grow in size and accepts new values during data scoring by design.   
The number of dimensions will be far less than the number of dimensions with encoding like One Hot Encoding. This method is advantageous when the cardinality of categorical is very high.

In [82]:
ce_he = ce.HashingEncoder(cols=enc_columns)
ce_he_df = ce_he.fit_transform(df)
ce_he_df

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,TransactionID,ClientID,Number of credits,Revenue
1,0,0,0,1,0,1,0,1,1,231,2,30200
2,0,1,0,1,0,1,0,0,2,765,3,12700
3,0,1,0,1,0,1,0,0,3,453,5,89400
4,0,0,0,1,0,1,0,1,4,231,2,30200
5,1,0,0,0,0,1,0,1,5,892,3,740000


#### *Helmert Coding*

Function:  
`category_encoders.helmert.HelmertEncoder(verbose=0, cols=None, mapping=None, drop_invariant=False, return_df=True, handle_unknown='value', handle_missing='value')`

When to use:  
Helmert contrast coding for encoding categorical features. The mean of the dependent variable for a level is compared to the mean of the dependent variable over all previous levels.

In [85]:
ce_helmert = ce.HelmertEncoder(cols=enc_columns)
ce_helmert_df = ce_helmert.fit_transform(df)
ce_helmert_df



Unnamed: 0,intercept,TransactionID,ClientID,Profession_0,Profession_1,Profession_2,Bank_dep_0,Bank_dep_1,Bank_dep_2,Risk_0,Risk_1,Number of credits,Revenue
1,1,1,231,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,2,30200
2,1,2,765,1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,3,12700
3,1,3,453,0.0,2.0,-1.0,0.0,2.0,-1.0,1.0,-1.0,5,89400
4,1,4,231,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,2,30200
5,1,5,892,0.0,0.0,3.0,0.0,0.0,3.0,0.0,2.0,3,740000


#### *James-Stein Encoder*

Function:  
`category_encoders.james_stein.JamesSteinEncoder(verbose=0, cols=None, drop_invariant=False, return_df=True, handle_unknown='value', handle_missing='value', model='independent', random_state=None, randomized=False, sigma=0.05)`

When to use:  
For feature value, the James-Stein estimator returns a weighted average of:

The mean target value for the observed feature value.
The mean target value (regardless of the feature value).
  
The James-Stein encoder shrinks the average toward the overall average. It is a target based encoder. James-Stein estimator has, however, one practical limitation — it was defined only for normal distributions.

In [87]:
ce_jse = ce.JamesSteinEncoder(cols=enc_columns)
ce_jse_df = ce_jse.fit_transform(df, df['Revenue'])
ce_jse_df

Unnamed: 0,TransactionID,ClientID,Profession,Bank_dep,Risk,Number of credits,Revenue
1,1,231,30200.0,30200.0,24366.666667,2,30200
2,2,765,12700.0,12700.0,24366.666667,3,12700
3,3,453,89400.0,89400.0,89400.0,5,89400
4,4,231,30200.0,30200.0,24366.666667,2,30200
5,5,892,740000.0,740000.0,740000.0,3,740000


#### *Leave One Out*

Function:  
`category_encoders.leave_one_out.LeaveOneOutEncoder(verbose=0, cols=None, drop_invariant=False, return_df=True, handle_unknown='value', handle_missing='value', random_state=None, sigma=None)`

When to use:  
This is very similar to target encoding but excludes the current row’s target when calculating the mean target for a level to reduce the effect of outliers.

In [89]:
ce_loo = ce.LeaveOneOutEncoder(cols=enc_columns)
ce_loo_df = ce_loo.fit_transform(df, df['Revenue'])
ce_loo_df

Unnamed: 0,TransactionID,ClientID,Profession,Bank_dep,Risk,Number of credits,Revenue
1,1,231,30200.0,30200.0,21450.0,2,30200
2,2,765,180500.0,180500.0,30200.0,3,12700
3,3,453,180500.0,180500.0,180500.0,5,89400
4,4,231,30200.0,30200.0,21450.0,2,30200
5,5,892,180500.0,180500.0,180500.0,3,740000


#### *M-estimate*

Function:  
`category_encoders.m_estimate.MEstimateEncoder(verbose=0, cols=None, drop_invariant=False, return_df=True, handle_unknown='value', handle_missing='value', random_state=None, randomized=False, sigma=0.05, m=1.0)` 


When to use:  
M-probability estimate of likelihood.  
  

M-Estimate Encoder is a simplified version of Target Encoder. It has only one hyper-parameter — m, which represents the power of regularization. The higher the value of m results, into stronger the shrinking. Recommended values for m is in the range of 1 to 100.

In [91]:
ce_mee = ce.MEstimateEncoder(cols=enc_columns)
ce_mee_df = ce_mee.fit_transform(df, df['Revenue'])
ce_mee_df

Unnamed: 0,TransactionID,ClientID,Profession,Bank_dep,Risk,Number of credits,Revenue
1,1,231,80300.0,80300.0,63400.0,2,30200
2,2,765,96600.0,96600.0,63400.0,3,12700
3,3,453,134950.0,134950.0,134950.0,5,89400
4,4,231,80300.0,80300.0,63400.0,2,30200
5,5,892,460250.0,460250.0,460250.0,3,740000


#### *One Hot*

Function:  
`category_encoders.one_hot.OneHotEncoder(verbose=0, cols=None, drop_invariant=False, return_df=True, handle_missing='value', handle_unknown='value', use_cat_names=False)`

When to use:  
Onehot (or dummy) coding for categorical features, produces one feature per category, each binary.

In [94]:
ce_ohe = ce.OneHotEncoder(cols=enc_columns)
ce_ohe_df = ce_ohe.fit_transform(df)
ce_ohe_df

Unnamed: 0,TransactionID,ClientID,Profession_1,Profession_2,Profession_3,Profession_4,Bank_dep_1,Bank_dep_2,Bank_dep_3,Bank_dep_4,Risk_1,Risk_2,Risk_3,Number of credits,Revenue
1,1,231,1,0,0,0,1,0,0,0,1,0,0,2,30200
2,2,765,0,1,0,0,0,1,0,0,1,0,0,3,12700
3,3,453,0,0,1,0,0,0,1,0,0,1,0,5,89400
4,4,231,1,0,0,0,1,0,0,0,1,0,0,2,30200
5,5,892,0,0,0,1,0,0,0,1,0,0,1,3,740000


#### *Ordinal*

Function:   
`category_encoders.ordinal.OrdinalEncoder(verbose=0, mapping=None, cols=None, drop_invariant=False, return_df=True, handle_unknown='value', handle_missing='value')`

When to use:  
Encodes categorical features as ordinal, in one ordered feature.

Ordinal encoding uses a single column of integers to represent the classes. An optional mapping dict can be passed in; in this case, we use the knowledge that there is some true order to the classes themselves. Otherwise, the classes are assumed to have no true order and integers are selected at random.

In [95]:
ce_ode = ce.OrdinalEncoder(cols=enc_columns)
ce_ode_df = ce_ode.fit_transform(df)
ce_ode_df

Unnamed: 0,TransactionID,ClientID,Profession,Bank_dep,Risk,Number of credits,Revenue
1,1,231,1,1,1,2,30200
2,2,765,2,2,1,3,12700
3,3,453,3,3,2,5,89400
4,4,231,1,1,1,2,30200
5,5,892,4,4,3,3,740000


#### *Polynomial Coding*

Function:  
`category_encoders.polynomial.PolynomialEncoder(verbose=0, cols=None, mapping=None, drop_invariant=False, return_df=True, handle_unknown='value', handle_missing='value')`

When to use:  
The categorical variable here is assumed to be represented by an underlying, equally spaced numeric variable. Therefore, this type of encoding is used only for ordered categorical variables with equal spacing. In general, the polynomial contrast produces polynomials of order k-1.

In [98]:
ce_poly = ce.PolynomialEncoder(cols=enc_columns)
ce_poly_df = ce_poly.fit_transform(df,)
ce_poly_df



Unnamed: 0,intercept,TransactionID,ClientID,Profession_0,Profession_1,Profession_2,Bank_dep_0,Bank_dep_1,Bank_dep_2,Risk_0,Risk_1,Number of credits,Revenue
1,1,1,231,-0.67082,0.5,-0.223607,-0.67082,0.5,-0.223607,-0.7071068,0.408248,2,30200
2,1,2,765,-0.223607,-0.5,0.67082,-0.223607,-0.5,0.67082,-0.7071068,0.408248,3,12700
3,1,3,453,0.223607,-0.5,-0.67082,0.223607,-0.5,-0.67082,-5.5511150000000004e-17,-0.816497,5,89400
4,1,4,231,-0.67082,0.5,-0.223607,-0.67082,0.5,-0.223607,-0.7071068,0.408248,2,30200
5,1,5,892,0.67082,0.5,0.223607,0.67082,0.5,0.223607,0.7071068,0.408248,3,740000


#### *Sum Coding*

Function:  
`category_encoders.sum_coding.SumEncoder(verbose=0, cols=None, mapping=None, drop_invariant=False, return_df=True, handle_unknown='value', handle_missing='value')`

When to use:  
Sum encoding is similar to one-hot encoding but the difference is that in sum encoding we take one value as '-1' and it is not compared to other value. Whereas in one-hot encoding we create one column for each value to compare against all other values.

In [100]:
ce_sum = ce.SumEncoder(cols=enc_columns)
ce_sum_df = ce_sum.fit_transform(df)
ce_sum_df



Unnamed: 0,intercept,TransactionID,ClientID,Profession_0,Profession_1,Profession_2,Bank_dep_0,Bank_dep_1,Bank_dep_2,Risk_0,Risk_1,Number of credits,Revenue
1,1,1,231,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,2,30200
2,1,2,765,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,3,12700
3,1,3,453,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,5,89400
4,1,4,231,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,2,30200
5,1,5,892,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,3,740000


#### *Target Encoder*

Function:  
`category_encoders.target_encoder.TargetEncoder(verbose=0, cols=None, drop_invariant=False, return_df=True, handle_missing='value', handle_unknown='value', min_samples_leaf=1, smoothing=1.0, hierarchy=None)`

When to use:  
For the case of categorical target: features are replaced with a blend of posterior probability of the target given particular categorical value and the prior probability of the target over all the training data.  
  
For the case of continuous target: features are replaced with a blend of the expected value of the target given particular categorical value and the expected value of the target over all the training data.


In [103]:
ce_te = ce.TargetEncoder(cols=enc_columns)
ce_te_df = ce_te.fit_transform(df, df['Revenue'])
ce_te_df



Unnamed: 0,TransactionID,ClientID,Profession,Bank_dep,Risk,Number of credits,Revenue
1,1,231,70621.895632,70621.895632,42978.216225,2,30200
2,2,765,180500.0,180500.0,42978.216225,3,12700
3,3,453,180500.0,180500.0,180500.0,5,89400
4,4,231,70621.895632,70621.895632,42978.216225,2,30200
5,5,892,180500.0,180500.0,180500.0,3,740000


#### *Weight of Evidence*

Function:  
`category_encoders.woe.WOEEncoder(verbose=0, cols=None, drop_invariant=False, return_df=True, handle_unknown='value', handle_missing='value', random_state=None, randomized=False, sigma=0.05, regularization=1.0)`

When to use:  
Weight of Evidence (WoE) measures the “strength” of a grouping technique to separate good and bad. This method was developed primarily to build a predictive model to evaluate the risk of loan default in the credit and financial industry. Weight of evidence (WOE) measures how much the evidence supports or undermines a hypothesis.

In [106]:
ce_woe = ce.WOEEncoder(cols=enc_columns)
ce_woe_df = ce_woe.fit_transform(df, df['Revenue'])
ce_woe_df

ValueError: The target column y must be binary. But the target contains 4 unique value(s).

#### *Wrappers*

Function:  
`category_encoders.wrapper.PolynomialWrapper(feature_encoder)`

When to use:  
The label is first encoded into n-1 binary columns. Subsequently, the inner supervised encoder is executed for each binarized label.  
  
The names of the encoded features are suffixed with underscore and the corresponding class name (edge scenarios like ‘dog’+’cat_frog’ vs. ‘dog_cat’+’frog’ are not currently handled).  
  
The implementation is experimental and the API may change in the future. The order of the returned features may change in the future.

In [108]:
ce_pw = ce.PolynomialWrapper(cols=enc_columns)
ce_pw_df = ce_pw.fit_transform(df, df['Revenue'])
ce_pw_df

AttributeError: module 'category_encoders' has no attribute 'PolynomialWrapper'

#### *Quantile Encoder*

Function:  
`category_encoders.quantile_encoder.QuantileEncoder(verbose=0, cols=None, drop_invariant=False, return_df=True, handle_missing='value', handle_unknown='value', quantile=0.5, m=1.0)`

When to use:  
This a statistically modified version of target MEstimate encoder where selected features are replaced by the statistical quantile instead of the mean. Replacing with the median is a particular case where self.quantile = 0.5. In comparison to MEstimateEncoder it has two tunable parameter m and quantile

In [111]:
ce_qe = ce.QuantileEncoder(cols=enc_columns)
ce_qe_df = ce_qe.fit_transform(df, df['Revenue'])
ce_qe_df

Unnamed: 0,TransactionID,ClientID,Profession,Bank_dep,Risk,Number of credits,Revenue
1,1,231,30200.0,30200.0,30200.0,2,30200
2,2,765,21450.0,21450.0,30200.0,3,12700
3,3,453,59800.0,59800.0,59800.0,5,89400
4,4,231,30200.0,30200.0,30200.0,2,30200
5,5,892,385100.0,385100.0,385100.0,3,740000


#### *Summary Encoder*

Function:  
`category_encoders.quantile_encoder.SummaryEncoder(verbose=0, cols=None, drop_invariant=False, return_df=True, handle_missing='value', handle_unknown='value', quantiles=(0.25, 0.75), m=1.0)`

When to use:  
It’s an encoder designed for creating richer representations by applying quantile encoding for a set of quantiles.

In [113]:
ce_sume = ce.SummaryEncoder(cols=enc_columns)
ce_sume_df = ce_sume.fit_transform(df, df['Revenue'])
ce_sume_df

Unnamed: 0,TransactionID,ClientID,Profession_25,Profession_75,Bank_dep_25,Bank_dep_75,Risk_25,Risk_75,Number of credits,Revenue
1,1,231,30200.0,49933.333333,30200.0,49933.333333,23637.5,45000.0,2,30200
2,2,765,21450.0,51050.0,21450.0,51050.0,23637.5,45000.0,3,12700
3,3,453,59800.0,89400.0,59800.0,89400.0,59800.0,89400.0,5,89400
4,4,231,30200.0,49933.333333,30200.0,49933.333333,23637.5,45000.0,2,30200
5,5,892,385100.0,414700.0,385100.0,414700.0,385100.0,414700.0,3,740000


In [107]:
df

Unnamed: 0,TransactionID,ClientID,Profession,Bank_dep,Risk,Number of credits,Revenue
1,1,231,self-employed,9,high,2,30200
2,2,765,students,5,high,3,12700
3,3,453,horeca,7,medium,5,89400
4,4,231,self-employed,9,high,2,30200
5,5,892,finance,3,low,3,740000
