In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read the anemia dataset from the working directory and preview the first rows.
df = pd.read_csv("anemia.csv")
df.head()

Unnamed: 0,Gender,Hemoglobin,MCH,MCHC,MCV,Result
0,1,14.9,22.7,29.1,83.7,0
1,0,15.9,25.4,28.3,72.0,0
2,0,9.0,21.5,29.6,71.2,1
3,0,14.9,16.0,31.4,87.5,0
4,1,14.7,22.0,28.2,99.5,0


In [None]:
# Recode the numeric codes into readable labels.
# The result is assigned back to the column instead of calling
# `df[col].replace(..., inplace=True)`: that form operates on an intermediate
# Series, which pandas warns about (2.x) and which has no effect on `df` under
# copy-on-write. `replace` (rather than `map`) keeps the cell idempotent:
# re-running it leaves already-recoded labels untouched.
df['Result'] = df['Result'].replace({0: 'no', 1: 'yes'})
df['Gender'] = df['Gender'].replace({0: 'male', 1: 'female'})

In [None]:
# Derive a `status` label from Hemoglobin using a gender-specific threshold.
# Rows whose Gender is not in the mapping (or whose Hemoglobin is missing)
# compare as False in both conditions and are left unset.
hemoglobin_threshold = df['Gender'].map({'male': 13.2, 'female': 11.6})

# The two conditions are mutually exclusive. A value exactly on the threshold
# is labelled "normal", which matches what the previous overlapping
# `<=` / `>=` assignments effectively produced (the later one won).
# NOTE(review): confirm that a boundary value should count as "normal".
df.loc[df['Hemoglobin'] < hemoglobin_threshold, "status"] = "anemia"
df.loc[df['Hemoglobin'] >= hemoglobin_threshold, "status"] = "normal"
df.head()

Unnamed: 0,Gender,Hemoglobin,MCH,MCHC,MCV,Result,status
0,female,14.9,22.7,29.1,83.7,no,normal
1,male,15.9,25.4,28.3,72.0,no,normal
2,male,9.0,21.5,29.6,71.2,yes,anemia
3,male,14.9,16.0,31.4,87.5,no,normal
4,female,14.7,22.0,28.2,99.5,no,normal


In [None]:
# Persist the recoded frame (including the derived `status` column) without the index.
df.to_csv('anemiadata.csv', index=False)

In [None]:
# Report the maximum, minimum and mean of the Hemoglobin column
hemoglobin = df['Hemoglobin']
summary_lines = (
    ('The highest hemoglobin was of:', hemoglobin.max()),
    ('The lowest hemoglobin was of:', hemoglobin.min()),
    ('The average hemoglobin in the data:', hemoglobin.mean()),
)
for label, value in summary_lines:
    print(label, value)

The highest hemoglobin was of: 16.9
The lowest hemoglobin was of: 6.6
The average hemoglobin in the data: 13.412737508796623


In [None]:
from sklearn.ensemble import RandomForestClassifier
import multiprocessing 
import time
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import warnings
# Silences every warning category for the rest of the session.
# NOTE(review): this also hides pandas SettingWithCopy / FutureWarning messages
# raised by later cells; consider narrowing the filter.
warnings.filterwarnings("ignore")

In [None]:
# Reload the labelled dataset written earlier by `df.to_csv('anemiadata.csv', ...)`
# and preview the first rows.
data = pd.read_csv("anemiadata.csv")
data.head()

Unnamed: 0,Gender,Hemoglobin,MCH,MCHC,MCV,Result,status
0,female,14.9,22.7,29.1,83.7,no,normal
1,male,15.9,25.4,28.3,72.0,no,normal
2,male,9.0,21.5,29.6,71.2,yes,anemia
3,male,14.9,16.0,31.4,87.5,no,normal
4,female,14.7,22.0,28.2,99.5,no,normal


In [None]:
# Cast the listed columns to strings, one column at a time.
# NOTE(review): Hemoglobin is numeric in the CSV; after this cast it is text.
# It is dropped before modelling further down, so confirm the cast is needed.
for column_name in ('Hemoglobin', 'Gender', 'Result'):
    data[column_name] = data[column_name].astype(str)

In [None]:
# Show column dtypes and non-null counts after the string casts.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1421 entries, 0 to 1420
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Gender      1421 non-null   object 
 1   Hemoglobin  1421 non-null   object 
 2   MCH         1421 non-null   float64
 3   MCHC        1421 non-null   float64
 4   MCV         1421 non-null   float64
 5   Result      1421 non-null   object 
 6   status      1421 non-null   object 
dtypes: float64(3), object(4)
memory usage: 77.8+ KB


In [None]:
# Count rows per `status` value; each value becomes one modelling group below.
data['status'].value_counts()

normal    957
anemia    464
Name: status, dtype: int64

In [None]:
# Module-level accumulator: `collect_results` appends prediction rows to this list.
results = []

def processData(data, outcome_filter, random_state=None):
    """Train and evaluate a random forest on the rows of one `status` group.

    The rows of ``data`` whose ``status`` equals ``outcome_filter`` are
    selected, ``Result`` is used as the label and every remaining column
    (except ``status``) as a feature. 10% of the group is held out, the model
    is fitted on the rest, and predictions for the held-out rows are returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``status`` column and a ``Result`` column.
    outcome_filter : object
        Value of ``status`` selecting the group to model.
    random_state : int or None, optional
        Seed forwarded to both ``train_test_split`` and
        ``RandomForestClassifier``. The default ``None`` keeps the original
        unseeded (non-reproducible) behaviour.

    Returns
    -------
    pandas.DataFrame
        One row per held-out sample with the feature columns followed by
        ``status``, ``Actual`` and ``Prediction``, on a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If no row of ``data`` has ``status == outcome_filter``.
    """
    print(outcome_filter)

    # `.drop` returns a new frame, so the caller's data (and the filtered
    # slice) are never modified in place.
    group = data.loc[data['status'] == outcome_filter].drop(columns='status')
    if group.empty:
        raise ValueError(
            "no rows with status == {!r}; nothing to train on".format(outcome_filter)
        )

    X = group.drop(columns='Result')  # Features
    y = group['Result']  # Labels

    # Split dataset into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.1, random_state=random_state
    )
    clf = RandomForestClassifier(n_estimators=100, random_state=random_state)
    clf.fit(X_train, y_train)
    y_out = clf.predict(X_test)

    # `assign` builds a new frame: `y_test` aligns on the shared index, while
    # `y_out` is a plain array in the same row order as `X_test`, so it is
    # attached positionally before the index is reset.
    final_out = X_test.assign(
        status=outcome_filter,
        Actual=y_test,
        Prediction=y_out,
    ).reset_index(drop=True)
    print(final_out.shape)
    return final_out

def collect_results(result):
    """Append every row of ``result`` to the module-level ``results`` list.

    Used as the ``apply_async`` callback; each row is stored as a plain list
    of cell values.
    """
    rows = result.values.tolist()
    for row in rows:
        results.append(row)

In [None]:
# Number of CPUs reported by the system; the pool size below is derived from it.
multiprocessing.cpu_count()

2

In [None]:
if __name__ == "__main__":
    start_time = time.time()

    # Start from an empty accumulator so re-running this cell does not append
    # a second copy of every prediction row.
    results.clear()

    # Hemoglobin and Gender (the columns `status` was derived from) are
    # dropped before training; computed once rather than per loop iteration.
    model_input = data.drop(['Hemoglobin', 'Gender'], axis=1)

    # Leave one CPU free, but never ask for zero workers: on a single-core
    # machine `cpu_count() - 1` is 0 and Pool raises ValueError.
    n_workers = max(1, multiprocessing.cpu_count() - 1)

    # NOTE(review): with the "spawn" start method, workers may be unable to
    # import functions defined in notebook cells; confirm the target platform
    # or move processData into an importable module.
    with multiprocessing.Pool(processes=n_workers) as pool:
        pending = [
            pool.apply_async(processData, args=(model_input, i), callback=collect_results)
            for i in data['status'].unique()
        ]
        pool.close()
        pool.join()
        # Retrieve each result so an exception raised in a worker is re-raised
        # here instead of silently producing an empty or partial table.
        for task in pending:
            task.get()

    # Converts list of lists to a data frame
    output_columns = list(model_input.drop(['Result'], axis=1).columns) + ['Actual', 'Prediction']
    dataFrame = pd.DataFrame(results, columns=output_columns)
    print("Dimensions in final test data {}".format(dataFrame.shape))
    print("--- %s seconds ---" % (time.time() - start_time))

normal
(96, 6)
anemia
(47, 6)
Dimensions in final test data (143, 6)
--- 0.49872660636901855 seconds ---


In [None]:
# Display the rows at positions 100 through 142 of the combined prediction table.
dataFrame[100:143]

Unnamed: 0,MCH,MCHC,MCV,status,Actual,Prediction
100,27.9,31.3,86.4,anemia,yes,yes
101,23.0,30.3,87.7,anemia,yes,yes
102,18.1,31.5,97.1,anemia,yes,yes
103,20.0,29.7,89.7,anemia,no,no
104,20.0,29.1,79.3,anemia,yes,yes
105,23.3,30.4,100.2,anemia,no,no
106,25.2,30.9,83.2,anemia,yes,yes
107,18.8,29.3,79.1,anemia,yes,yes
108,23.5,28.4,75.8,anemia,yes,yes
109,23.0,29.3,85.8,anemia,no,no
