# Importing required libraries

In [33]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import metrics
import joblib

# Loading and Training the dataset

In [34]:
##Step1: Load Dataset
dataframe = pd.read_csv("malaria_dataset.csv")
print(dataframe.head())

         Label  area_0  area_1   area_2  area_3  area_4
0  Parasitized   175.5   126.0    131.0  8902.5     0.0
1  Parasitized   222.0  9847.5      0.0     0.0     0.0
2  Parasitized   179.5   256.5  12413.0     0.0     0.0
3  Parasitized    18.0   187.5   9306.5     0.0     0.0
4  Parasitized   142.5   156.5   6669.5     0.0     0.0


In [35]:
##Step2: Split into training and test data
x = dataframe.drop(["Label"],axis=1)
y = dataframe["Label"]
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=42)

# Building models using various classifiers

In [54]:
# Implementing Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier
decTree_model = DecisionTreeClassifier()

# Train Decision Tree Classifer
decTree_model.fit(x_train,y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [53]:
decTree_predictions = decTree_model.predict(x_test)

print(metrics.classification_report(decTree_predictions, y_test))
print(decTree_model.score(x_test, y_test))

              precision    recall  f1-score   support

 Parasitized       0.85      0.85      0.85      2803
  Uninfected       0.84      0.84      0.84      2709

    accuracy                           0.84      5512
   macro avg       0.84      0.84      0.84      5512
weighted avg       0.84      0.84      0.84      5512

0.8443396226415094


In [55]:
# Implementing support vector machine classifier
from sklearn.svm import SVC
svm_model = SVC()

# Train the model
svm_model.fit(x_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [56]:
svm_predictions = svm_model.predict(x_test)

print(metrics.classification_report(svm_predictions, y_test))
print(svm_model.score(x_test, y_test))

              precision    recall  f1-score   support

 Parasitized       0.89      0.90      0.90      2770
  Uninfected       0.90      0.89      0.90      2742

    accuracy                           0.90      5512
   macro avg       0.90      0.90      0.90      5512
weighted avg       0.90      0.90      0.90      5512

0.8978592162554426


In [57]:
# Implementing Logistic Regression 
from sklearn.linear_model import LogisticRegression
logreg_model = LogisticRegression()

# Train the model
logreg_model.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [58]:
logreg_predictions = logreg_model.predict(x_test)

print(metrics.classification_report(logreg_predictions, y_test))
print(logreg_model.score(x_test, y_test))

              precision    recall  f1-score   support

 Parasitized       0.90      0.90      0.90      2786
  Uninfected       0.90      0.90      0.90      2726

    accuracy                           0.90      5512
   macro avg       0.90      0.90      0.90      5512
weighted avg       0.90      0.90      0.90      5512

0.89822206095791


In [39]:
# Implementing Ada Boost Classifier
from sklearn.ensemble import AdaBoostClassifier
abc_model = AdaBoostClassifier(n_estimators=50,
                         learning_rate=1)
# Train the model
abc_model.fit(x_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1,
                   n_estimators=50, random_state=None)

In [40]:
abc_predictions = abc_model.predict(x_test)

print(metrics.classification_report(abc_predictions,y_test))
print(abc_model.score(x_test, y_test))
cnf_matrix = metrics.confusion_matrix(y_test, abc_predictions)

              precision    recall  f1-score   support

 Parasitized       0.89      0.91      0.90      2743
  Uninfected       0.91      0.89      0.90      2769

    accuracy                           0.90      5512
   macro avg       0.90      0.90      0.90      5512
weighted avg       0.90      0.90      0.90      5512

0.8994920174165457


In [41]:
# Implementing Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
rfc_model = RandomForestClassifier(n_estimators=100,max_depth=5)

# Train the model
rfc_model.fit(x_train, y_train)
joblib.dump(model,"rf_malaria_100_5")       # to save the model for regular use

['rf_malaria_100_5']

In [43]:
# Predictions using rfc_model
rfc_predictions = model.predict(x_test)

print(metrics.classification_report(rfc_predictions, y_test))
print(rfc_model.score(x_test, y_test))

              precision    recall  f1-score   support

 Parasitized       0.90      0.90      0.90      2771
  Uninfected       0.90      0.89      0.90      2741

    accuracy                           0.90      5512
   macro avg       0.90      0.90      0.90      5512
weighted avg       0.90      0.90      0.90      5512

0.898766328011611


In [67]:
summary = {"Classifying_Model" : ["Decision Tree", "Support Vector Machine", "Logistic Regression", "Ada Boost", "Random Forest"], 
          "Accuracy" : [decTree_model.score(x_test, y_test), svm_model.score(x_test, y_test), logreg_model.score(x_test, y_test),
           abc_model.score(x_test, y_test), rfc_model.score(x_test, y_test)]}
result = pd.DataFrame(summary)
print(result)b

        Classifying_Model  Accuracy
0           Decision Tree  0.842888
1  Support Vector Machine  0.897859
2     Logistic Regression  0.898222
3               Ada Boost  0.899492
4           Random Forest  0.898766


# Summarizing the results

In [68]:
max_score = max(result.Accuracy)
print("Most Efficient Classifying algorithm is", result.Classifying_Model[result.Accuracy == max_score], "with an accuracy of",
     max_score)

Most Efficient Classifying algorithm is 3    Ada Boost
Name: Classifying_Model, dtype: object with an accuracy of 0.8994920174165457
