## IMPORTING THE REQUIRED LIBRARIES

In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split

from imblearn.over_sampling import SMOTE

from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

## IGNORING THE WARNINGS FROM THE PANDAS LIBRARY

In [2]:
from warnings import simplefilter

simplefilter(action = 'ignore', category = Warning)

## IMPORTING THE DATA

In [3]:
numerical = pd.read_csv('files_for_lab/numerical.csv')
categorical = pd.read_csv('files_for_lab/categorical.csv')
target = pd.read_csv('files_for_lab/target.csv')['TARGET_B']

## CONVERTING THE CATEGORICAL VALUES IN REAL OBJECTS

In [4]:
categorical = categorical.astype('object')

## MERGING THE NUMERICAL, CATEGORICAL AND TARGET DATAFRAMES IN A UNIQUE ONE

In [5]:
customer_churn = pd.concat([numerical, categorical, target], axis = 1)

## ENCODING ALL THE CATEGORICAL COLUMNS INTO NUMBERS THAT GOES FROM 0 TO THE LENGTH OF THE UNIQUE VALUES

In [6]:
categorical_columns = customer_churn.select_dtypes(object).columns.tolist()

for column in categorical_columns:
    encoder = LabelEncoder().fit(customer_churn[column])
    customer_churn[column] = encoder.transform(customer_churn[column])

## PERFORMING THE X-Y TRAIN TEST SPLIT IN THE CUSTOMER DATAFRAME

In [7]:
X = customer_churn.drop('TARGET_B', axis = 1)
y = customer_churn['TARGET_B']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 42)

## WITH RANDOM FORESTS IS NOT NECESSARY, BUT LET'S SCALE THE DATA

In [8]:
numerical_columns = X_train.select_dtypes(include = np.number).columns.tolist()

scaler = StandardScaler().fit(X_train[numerical_columns])

X_train[numerical_columns] = scaler.transform(X_train[numerical_columns])
X_test[numerical_columns] = scaler.transform(X_test[numerical_columns])

## OVERSAMPLING THE MINORITY CLASS USING THE SMOTE ALGORITHM

In [9]:
def over_sampling(training_x, training_y):

    smote = SMOTE(random_state = 100, k_neighbors = 3)
    X_train_SMOTE, y_train_SMOTE = smote.fit_resample(training_x, training_y)

    return X_train_SMOTE, y_train_SMOTE    

X_over, y_over = over_sampling(X_train, y_train)

## TRAINING A RANDOM FOREST CLASSIFIER USING THE CROSS VALIDATION PROCESS

In [10]:
rf_clf = RandomForestClassifier()
rf_clf.fit(X_over, y_over)

RandomForestClassifier()

## GETTING AND EVALUATING THE METRICS IN THE VALIDATION DATA

In [11]:
def model_metrics(model, testing_x, testing_y):

    predictions = model.predict(testing_x)

    accuracy = accuracy_score(testing_y, predictions)
    precision = precision_score(testing_y, predictions)
    recall = recall_score(testing_y, predictions)
    f1 = f1_score(testing_y, predictions)

    return accuracy, precision, recall, f1

model_accuracy, model_precision, model_recall, model_f1 = model_metrics(rf_clf, X_test, y_test)

print('\nRANDOM FOREST METRICS - OVERSAMPLING')
print('------------------------------------\n')

print(f'- Accuracy: {round(model_accuracy, 2)}')
print(f'- Precision: {round(model_precision, 2)}')
print(f'- Recall: {round(model_recall, 2)}')
print(f'- F1: {round(model_f1, 2)}')


RANDOM FOREST METRICS - OVERSAMPLING
------------------------------------

- Accuracy: 0.95
- Precision: 0.2
- Recall: 0.0
- F1: 0.0
