In [1]:
# Put the notebook's parent directory on the import path so that the
# preprocessy module can be imported from inside this .ipynb file.
import os
import sys

module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [32]:
import numpy as np
import pandas as pd
import time
from sklearn.datasets import load_iris, load_boston, load_breast_cancer, load_diabetes
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, r2_score
from preprocessy.feature_selection import Correlation, SelectKBest
from preprocessy.resampling import Split

### Method to print the correlation statistics for the given dataset

In [23]:
def eval(X, threshold=0.8):
    """Print each column pair that ``Correlation().find`` reports for ``X``.

    Parameters
    ----------
    X :
        Dataset passed unchanged as the first argument of ``Correlation.find``.
    threshold : float, default 0.8
        Value forwarded as the second argument of ``Correlation.find``.

    Notes
    -----
    Defining this function shadows Python's built-in ``eval`` for the rest
    of the notebook session.
    """
    finder = Correlation()
    correlated_pairs = finder.find(X, threshold)
    # Each reported item unpacks into two column names, a value and a sign label.
    for col1, col2, value, sign in correlated_pairs:
        print(f'{col1} x {col2}\nCorrelation - {value:.2f}\nType - {sign}\n\n')

# Breast Cancer Dataset

The Breast Cancer dataset comprises `569 records` and `30 features`. Some of these features are highly correlated with each other. First we will list a few of those correlations, namely the ones with `values > 0.97`. The goal is to compare the results before and after dropping the highly correlated columns indicated by the `Correlation` class from the `preprocessy.feature_selection` module, while keeping all the other preprocessing thresholds the same. We will compare the accuracy and the time taken to get the results. We use the `Split` class from the `preprocessy.resampling` module to perform the train test split.

In [88]:
# Load the breast-cancer features and target as pandas objects, then print
# the correlated column pairs found with a 0.97 threshold.
print("Dataset - Breast Cancer")
X, y = load_breast_cancer(
    return_X_y=True,
    as_frame=True,
)
eval(X, threshold=0.97)

Dataset - Breast Cancer
mean radius x mean perimeter
Correlation - 1.00
Type - Positive Correlation


mean radius x mean area
Correlation - 0.99
Type - Positive Correlation


mean perimeter x mean area
Correlation - 0.99
Type - Positive Correlation


mean perimeter x worst perimeter
Correlation - 0.97
Type - Positive Correlation


radius error x perimeter error
Correlation - 0.97
Type - Positive Correlation


worst radius x worst perimeter
Correlation - 0.99
Type - Positive Correlation


worst radius x worst area
Correlation - 0.98
Type - Positive Correlation


worst perimeter x worst area
Correlation - 0.98
Type - Positive Correlation




### Building a model directly

We now train a random forest classifier on the dataset without dropping any correlated columns.

In [89]:
# Baseline: fit a random forest on every feature and time the whole cell
# (model construction, split, fit, predict and scoring).
# perf_counter() is a monotonic, high-resolution clock, which suits
# sub-second measurements better than the wall-clock time.time().
# NOTE(review): the forest is unseeded (and the split may be too — confirm
# Split's API), so the reported numbers can vary from run to run.
start = time.perf_counter()
model = RandomForestClassifier()
X_train, X_test, y_train, y_test = Split().train_test_split(X, y, test_size=0.1)
model.fit(X_train, y_train)
preds = model.predict(X_test)
accuracy_1 = classification_report(y_test, preds, output_dict=True)["accuracy"]
# Stop the clock before printing so console I/O is not part of the measurement.
elapsed = time.perf_counter() - start
print(f'Accuracy - {accuracy_1}')
print(f'Time taken - {elapsed:4f}')

Accuracy - 1.0
Time taken - 0.208392


### Building model post preprocessing

We now drop some of the columns that are correlated with `mean radius`, `worst radius` and `radius error`

In [90]:
# Drop the perimeter- and area-derived columns from X in place.
X.drop(
    columns=[
        'mean area',
        'mean perimeter',
        'worst area',
        'worst perimeter',
        'perimeter error',
        'area error',
    ],
    inplace=True,
)

In [91]:
# Same experiment as the baseline cell, run on whatever columns X holds now.
# perf_counter() is a monotonic, high-resolution clock, which suits
# sub-second measurements better than the wall-clock time.time().
# NOTE(review): the forest is unseeded (and the split may be too — confirm
# Split's API), so small differences from the baseline may just be noise.
start = time.perf_counter()
model = RandomForestClassifier()
X_train, X_test, y_train, y_test = Split().train_test_split(X, y, test_size=0.1)
model.fit(X_train, y_train)
preds = model.predict(X_test)
accuracy_2 = classification_report(y_test, preds, output_dict=True)["accuracy"]
# Stop the clock before printing so console I/O is not part of the measurement.
elapsed = time.perf_counter() - start
print(f'Accuracy - {accuracy_2}')
print(f'Time taken - {elapsed:4f}')

Accuracy - 1.0
Time taken - 0.192731


## Conclusion

For this particular dataset and thresholds, the accuracy of both approaches is `100%` but the time consumed after dropping the correlated columns is slightly less than without dropping them.

# Iris Dataset

The iris dataset consists of `150 records` and `4 features`. As the number of features is small, removing correlated columns will not be helpful.

In [None]:
# Load the iris features and target as pandas objects and print the
# correlated column pairs using eval's default threshold.
print("Dataset - Iris")
X, y = load_iris(
    return_X_y=True,
    as_frame=True,
)
eval(X)

# Boston Housing Dataset

The Boston housing dataset consists of `506 records` and `13 features`. We will apply the same comparison test as before but with a threshold of 0.7 this time.

In [159]:
# Build a feature frame and a target series from the Boston data, then
# print the correlated column pairs found with a 0.7 threshold.
# NOTE(review): load_boston was removed in scikit-learn 1.2 — confirm the
# pinned scikit-learn version before re-running this cell.
print("Dataset - Boston")
dataset = load_boston()
X = pd.DataFrame(
    data=dataset.data,
    columns=dataset.feature_names,
)
y = pd.Series(
    data=dataset.target,
    name="Target",
)
eval(X, threshold=0.7)

Dataset - Boston
INDUS x NOX
Correlation - 0.76
Type - Positive Correlation


INDUS x DIS
Correlation - -0.71
Type - Negative Correlation


INDUS x TAX
Correlation - 0.72
Type - Positive Correlation


NOX x AGE
Correlation - 0.73
Type - Positive Correlation


NOX x DIS
Correlation - -0.77
Type - Negative Correlation


AGE x DIS
Correlation - -0.75
Type - Negative Correlation


RAD x TAX
Correlation - 0.91
Type - Positive Correlation




### Building a model directly

We now train a linear regression model on the dataset without dropping any correlated columns.

In [160]:
# Baseline: fit a linear regression on every feature and time the whole cell.
# perf_counter() is a monotonic, high-resolution clock; the work here takes
# only milliseconds, where wall-clock time.time() can be too coarse.
# NOTE(review): `normalize=` was deprecated in scikit-learn 1.0 and removed
# in 1.2 — confirm the pinned scikit-learn version.
start = time.perf_counter()
model = LinearRegression(fit_intercept=True, normalize=True, copy_X=True)
X_train, X_test, y_train, y_test = Split().train_test_split(X, y)
model.fit(X_train, y_train)
preds = model.predict(X_test)
# The value printed under the "Accuracy" label is the R^2 score.
accuracy_1 = r2_score(y_test, preds)
# Stop the clock before printing so console I/O is not part of the measurement.
elapsed = time.perf_counter() - start
print(f'Accuracy - {accuracy_1}')
print(f'Time taken - {elapsed:4f}')

Accuracy - 0.6891093866231174
Time taken - 0.011102


### Building model post preprocessing

We now drop the `TAX`, `NOX` and `DIS` columns, each of which is highly correlated with other features (for example `RAD`, `INDUS` and `AGE`).

In [161]:
# Drop the TAX, NOX and DIS columns from X in place.
X.drop(
    columns=['TAX', 'NOX', 'DIS'],
    inplace=True,
)

In [162]:
# Same experiment as the baseline cell, run on whatever columns X holds now.
# perf_counter() is a monotonic, high-resolution clock; the work here takes
# only milliseconds, where wall-clock time.time() can be too coarse.
# NOTE(review): `normalize=` was deprecated in scikit-learn 1.0 and removed
# in 1.2 — confirm the pinned scikit-learn version.
start = time.perf_counter()
model = LinearRegression(fit_intercept=True, normalize=True, copy_X=True)
X_train, X_test, y_train, y_test = Split().train_test_split(X, y)
model.fit(X_train, y_train)
preds = model.predict(X_test)
# The value printed under the "Accuracy" label is the R^2 score.
accuracy_2 = r2_score(y_test, preds)
# Stop the clock before printing so console I/O is not part of the measurement.
elapsed = time.perf_counter() - start
print(f'Accuracy - {accuracy_2}')
print(f'Time taken - {elapsed:4f}')

Accuracy - 0.6933951730354478
Time taken - 0.010524


## Conclusion

For this particular dataset and thresholds, the accuracy improves slightly after dropping the correlated columns and the time consumed is slightly less than without dropping them.

# Diabetes Dataset

The diabetes dataset consists of `442 records` and `10 features`. We will apply the same comparison test as before, again with a threshold of 0.7.

In [200]:
# Load the diabetes features and target as pandas objects, then print the
# correlated column pairs found with a 0.7 threshold.
# The banner has no placeholders, so it is a plain string rather than an f-string.
print("Dataset - Diabetes")
X, y = load_diabetes(return_X_y=True, as_frame=True)
eval(X, threshold=0.7)

Dataset - Diabetes
s1 x s2
Correlation - 0.90
Type - Positive Correlation


s3 x s4
Correlation - -0.74
Type - Negative Correlation




### Building a model directly

We now train a linear regression model on the dataset without dropping any correlated columns.

In [201]:
# Baseline: fit a linear regression on every feature and time the whole cell.
# perf_counter() is a monotonic, high-resolution clock; the work here takes
# only milliseconds, where wall-clock time.time() can be too coarse.
# NOTE(review): `normalize=` was deprecated in scikit-learn 1.0 and removed
# in 1.2 — confirm the pinned scikit-learn version.
start = time.perf_counter()
model = LinearRegression(fit_intercept=True, normalize=True, copy_X=True)
X_train, X_test, y_train, y_test = Split().train_test_split(X, y, test_size=0.2)
model.fit(X_train, y_train)
preds = model.predict(X_test)
# The value printed under the "Accuracy" label is the R^2 score.
accuracy_1 = r2_score(y_test, preds)
# Stop the clock before printing so console I/O is not part of the measurement.
elapsed = time.perf_counter() - start
print(f'Accuracy - {accuracy_1}')
print(f'Time taken - {elapsed:4f}')

Accuracy - 0.5065368093248677
Time taken - 0.009568


### Building model post preprocessing

We now drop the `s2` column, which is highly correlated with `s1`.

In [202]:
# Drop the s2 column from X in place.
X.drop(
    columns=['s2'],
    inplace=True,
)

In [203]:
# Same experiment as the baseline cell, run on whatever columns X holds now.
# The score is stored in accuracy_2 so the baseline value in accuracy_1 is
# not overwritten and the two runs stay comparable.
# perf_counter() is a monotonic, high-resolution clock; the work here takes
# only milliseconds, where wall-clock time.time() can be too coarse.
# NOTE(review): `normalize=` was deprecated in scikit-learn 1.0 and removed
# in 1.2 — confirm the pinned scikit-learn version.
start = time.perf_counter()
model = LinearRegression(fit_intercept=True, normalize=True, copy_X=True)
X_train, X_test, y_train, y_test = Split().train_test_split(X, y, test_size=0.2)
model.fit(X_train, y_train)
preds = model.predict(X_test)
# The value printed under the "Accuracy" label is the R^2 score.
accuracy_2 = r2_score(y_test, preds)
# Stop the clock before printing so console I/O is not part of the measurement.
elapsed = time.perf_counter() - start
print(f'Accuracy - {accuracy_2}')
print(f'Time taken - {elapsed:4f}')

Accuracy - 0.5055602962093787
Time taken - 0.009409


## Conclusion

For this particular dataset and thresholds, the accuracy decreases slightly after dropping the correlated columns and the time consumed is slightly less than without dropping them.