## IMPORTING THE REQUIRED LIBRARIES

In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split

from imblearn.over_sampling import SMOTE

from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

## SILENCING ALL PYTHON WARNINGS (NOT ONLY THE ONES RAISED BY PANDAS)

In [2]:
from warnings import simplefilter

simplefilter(action = 'ignore', category = Warning)

## IMPORTING THE DATA

In [3]:
# Read the three CSV files of the lab, in this order: numerical, categorical, target.
numerical, categorical, target = map(
    pd.read_csv,
    (
        'files_for_lab/numerical.csv',
        'files_for_lab/categorical.csv',
        'files_for_lab/target.csv',
    ),
)

## CASTING THE CATEGORICAL COLUMNS TO THE OBJECT DTYPE

In [4]:
# Cast every column of `categorical` to the object dtype so that the later
# `select_dtypes(object)` call recognises these columns as categorical.
categorical = categorical.astype(dtype = 'object')

## MERGING THE NUMERICAL, CATEGORICAL AND TARGET DATAFRAMES INTO A SINGLE ONE

In [5]:
# Put the three frames side by side (column-wise); rows are matched on their index.
customer_churn = pd.concat(
    objs = [numerical, categorical, target],
    axis = 'columns',
)

## SELECTING ONLY THE USERS WHOSE TARGET_B VALUE IS 1 (YES)

In [6]:
# Keep the rows whose TARGET_B equals 1, then discard that column: it is
# constant after the filter and carries no information for the regression.
target_b_is_one = customer_churn['TARGET_B'].eq(1.00)
customer_churn_yes = customer_churn.loc[target_b_is_one].drop(columns = 'TARGET_B')

## ENCODING EACH CATEGORICAL COLUMN AS INTEGERS FROM 0 TO THE NUMBER OF ITS UNIQUE VALUES MINUS 1

In [7]:
# Columns still stored with the object dtype are the categorical ones.
categorical_columns = list(customer_churn_yes.select_dtypes(include = object).columns)

# Replace each categorical column by integer codes; a fresh encoder is
# fitted for every column, so the codes are independent between columns.
for column in categorical_columns:
    encoder = LabelEncoder()
    customer_churn_yes[column] = encoder.fit_transform(customer_churn_yes[column])

## SPLITTING THE CUSTOMER DATAFRAME INTO FEATURES (X) AND TARGET (y), AND THEN INTO TRAIN AND TEST SETS

In [8]:
# TARGET_D is the value to predict; every other column is used as a feature.
y = customer_churn_yes['TARGET_D']
X = customer_churn_yes.drop(columns = 'TARGET_D')

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state = 42,
    test_size = 0.20,
)

## SCALING IS NOT NECESSARY FOR RANDOM FORESTS, BUT LET'S STANDARDIZE THE NUMERICAL COLUMNS ANYWAY

In [9]:
# Work on explicit copies so the column assignments below never write into the
# objects handed back by train_test_split (doing so can trigger pandas'
# chained-assignment warning).
X_train = X_train.copy()
X_test = X_test.copy()

# Only the non-categorical columns are standardised; the label-encoded
# columns are left untouched.
numeric_columns = [name for name in X_train.columns if name not in categorical_columns]

if numeric_columns:
    # StandardScaler standardises each column independently, so one fit over
    # all numeric columns yields the same values as one scaler per column.
    # The statistics are learned from the training rows only and then reused
    # for the test rows, so nothing from the test set leaks into the fit.
    scaler = StandardScaler().fit(X_train[numeric_columns].values)
    scaled_train = scaler.transform(X_train[numeric_columns].values)
    scaled_test = scaler.transform(X_test[numeric_columns].values)

    # Assign one column at a time: a single-column assignment replaces the
    # column outright, so an integer column can become a float column.
    for position, column in enumerate(numeric_columns):
        X_train[column] = scaled_train[:, position]
        X_test[column] = scaled_test[:, position]

## TRAINING RANDOM FOREST REGRESSOR MODEL TO PREDICT THE TARGET_D

In [10]:
# Seed the forest (same seed as the train/test split) so that its random
# choices, and therefore the reported metrics, are reproducible across runs.
# NOTE: despite the `_clf` suffix this is a regressor; the name is kept
# because the evaluation cell refers to it.
rf_clf = RandomForestRegressor(random_state = 42)
rf_clf.fit(X_train, y_train)

RandomForestRegressor()

## EXTRACTING AND EVALUATING THE METRICS OF THE MODEL

In [11]:
def model_metrics(model, testing_x, testing_y):
    """Score a fitted model on held-out data.

    Parameters
    ----------
    model : object exposing ``predict``
        The fitted estimator; ``model.predict(testing_x)`` is called once.
    testing_x : features passed to ``model.predict``.
    testing_y : true target values the predictions are compared against.

    Returns
    -------
    tuple
        ``(r2, mae, mse)``: the results of ``r2_score``,
        ``mean_absolute_error`` and ``mean_squared_error``, in that order,
        each called as ``metric(testing_y, predictions)``.
    """
    y_hat = model.predict(testing_x)

    # Order matters: callers unpack the result as (r2, mae, mse).
    scorers = (r2_score, mean_absolute_error, mean_squared_error)

    return tuple(scorer(testing_y, y_hat) for scorer in scorers)

# Evaluate the fitted forest on the held-out test set.
model_r2, model_mae, model_mse = model_metrics(rf_clf, X_test, y_test)

# Build the whole report first and emit it with one print call; joining with
# newlines gives exactly the text that one print per line would produce.
report_lines = [
    '\nRANDOM FOREST REGRESSOR METRICS',
    '-------------------------------\n',
    f'- R2: {round(model_r2, 2)}',
    f'- MAE: {round(model_mae, 2)}',
    f'- MSE: {round(model_mse, 2)}',
]
print('\n'.join(report_lines))


RANDOM FOREST REGRESSOR METRICS
-------------------------------

- R2: 0.44
- MAE: 4.49
- MSE: 82.74


## CONCLUSION
-------------

### THE RESULTS ARE NOT THE BEST, BUT THEY ARE NOT BAD EITHER. IT WOULD BE INTERESTING TO BUILD A TWO-STEP ALGORITHM THAT FIRST PREDICTS WHETHER A USER IS LIKELY TO DONATE AND, IF SO, THEN PREDICTS HOW MUCH THAT USER IS GOING TO DONATE.