In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV

In [23]:
df = pd.read_csv('large_data.csv')
df['TYPE'] = df['TYPE'].astype('category')

### Cost-sensitive resampling

In [24]:
covid_df = df.loc[df['TYPE'] == 'COVID']
flu_df = df.loc[df['TYPE'] == 'FLU']
allergy_df = df.loc[df['TYPE'] == 'ALLERGY']
cold_df = df.loc[df['TYPE'] == 'COLD']

In [13]:
print("covid cases : ", covid_df.shape[0])
print("cold cases : ", cold_df.shape[0])
print("allergy cases : ", allergy_df.shape[0])
print("flu cases : ", flu_df.shape[0])

covid cases :  2048
cold cases :  1024
allergy cases :  16381
flu cases :  25000


#### Under-sampling

In [14]:
covid_df = covid_df.sample(n=1024, random_state=1)
flu_df = flu_df.sample(n=1024, random_state=1)
allergy_df = allergy_df.sample(n=1024, random_state=1)

undersampled_df = pd.concat([cold_df, covid_df, allergy_df, flu_df])

#### Over-sampling

In [25]:
covid_df = covid_df.sample(n=25000, replace=True, random_state=1)
cold_df = cold_df.sample(n=25000, replace=True, random_state=1)
allergy_df = allergy_df.sample(n=25000, replace=True, random_state=1)

oversampled_df = pd.concat([cold_df, covid_df, allergy_df, flu_df])

#### Train Test Split

In [26]:
input_names = list(oversampled_df.columns)
input_names.remove('TYPE')

all_inputs = oversampled_df[input_names].values
all_labels = oversampled_df['TYPE'].values

In [27]:
(training_inputs,
 testing_inputs,
 training_classes,
 testing_classes) = train_test_split(all_inputs, all_labels, test_size=0.25, random_state=1, stratify=all_labels)

### Cost-sensitive algorithms
The scikit-learn Python machine learning library provides examples of these cost-sensitive extensions via the class_weight argument on the following classifiers:
- SVC
- DecisionTreeClassifier

The scikit-learn library provides examples of these cost-sensitive extensions via the class_weight argument on the following classifiers:
- LogisticRegression
- RidgeClassifier

In [28]:
decision_tree_classifier = DecisionTreeClassifier()
decision_tree_classifier.fit(training_inputs, training_classes)
decision_tree_classifier.score(testing_inputs, testing_classes)

0.95968

In [46]:
decision_tree_classifier2 = DecisionTreeClassifier(class_weight={'ALLERGY':1, 'FLU':1, 'COLD':2, 'COVID':1})
decision_tree_classifier2.fit(training_inputs, training_classes)
decision_tree_classifier2.score(testing_inputs, testing_classes)

0.95968

In [30]:
predictions = decision_tree_classifier.predict(testing_inputs)
print(classification_report(testing_classes, predictions))

              precision    recall  f1-score   support

     ALLERGY       1.00      0.97      0.98      6250
        COLD       0.92      1.00      0.96      6250
       COVID       0.92      0.97      0.95      6250
         FLU       1.00      0.91      0.95      6250

    accuracy                           0.96     25000
   macro avg       0.96      0.96      0.96     25000
weighted avg       0.96      0.96      0.96     25000



In [47]:
predictions2 = decision_tree_classifier2.predict(testing_inputs)
print(classification_report(testing_classes, predictions2))

              precision    recall  f1-score   support

     ALLERGY       1.00      0.97      0.98      6250
        COLD       0.92      1.00      0.96      6250
       COVID       0.92      0.97      0.95      6250
         FLU       1.00      0.91      0.95      6250

    accuracy                           0.96     25000
   macro avg       0.96      0.96      0.96     25000
weighted avg       0.96      0.96      0.96     25000



In [16]:
parameter_grid = {'criterion': ['gini'], #gini, entropy
                  'splitter': ['random'], #best, random
                  'max_depth': [13, 14, 15, 16, 17],
                  'max_features': [14, 15, 16, 17]}

grid_search = GridSearchCV(DecisionTreeClassifier(),
                        param_grid=parameter_grid,
                        cv=10,
                        verbose=3)

grid_search.fit(all_inputs, all_labels)
print('Best score: {}'.format(grid_search.best_score_))
print('Best parameters: {}'.format(grid_search.best_params_))
print('Best estimator: {}'.format(grid_search.best_estimator_))

Fitting 10 folds for each of 20 candidates, totalling 200 fits
[CV 1/10] END criterion=gini, max_depth=13, max_features=14, splitter=random; total time=   0.2s
[CV 2/10] END criterion=gini, max_depth=13, max_features=14, splitter=random; total time=   0.2s
[CV 3/10] END criterion=gini, max_depth=13, max_features=14, splitter=random; total time=   0.2s
[CV 4/10] END criterion=gini, max_depth=13, max_features=14, splitter=random; total time=   0.2s
[CV 5/10] END criterion=gini, max_depth=13, max_features=14, splitter=random; total time=   0.2s
[CV 6/10] END criterion=gini, max_depth=13, max_features=14, splitter=random; total time=   0.2s
[CV 7/10] END criterion=gini, max_depth=13, max_features=14, splitter=random; total time=   0.2s
[CV 8/10] END criterion=gini, max_depth=13, max_features=14, splitter=random; total time=   0.2s
[CV 9/10] END criterion=gini, max_depth=13, max_features=14, splitter=random; total time=   0.2s
[CV 10/10] END criterion=gini, max_depth=13, max_features=14, sp

In [18]:
# For undersmapled: gini, 11, 15, random
# improved_clf = DecisionTreeClassifier(criterion='gini', max_depth=11, max_features=15, splitter='random')
# For oversmapled: gini, 14, 13, random
improved_clf = DecisionTreeClassifier(criterion='gini', max_depth=14, max_features=17, splitter='random')

# Train the classifier on the training set
improved_clf.fit(training_inputs, training_classes)
new_predictions = improved_clf.predict(testing_inputs)
print(classification_report(testing_classes, new_predictions))

              precision    recall  f1-score   support

     ALLERGY       1.00      0.97      0.98      6250
        COLD       0.92      1.00      0.96      6250
       COVID       0.93      0.97      0.95      6250
         FLU       1.00      0.91      0.95      6250

    accuracy                           0.96     25000
   macro avg       0.96      0.96      0.96     25000
weighted avg       0.96      0.96      0.96     25000

