In [1]:
import pandas as pd

# Load the training data; header=0 makes explicit that the CSV's first row supplies the column names.
dis_text = pd.read_csv('/content/train_final.csv', header=0)

# Preview the first ten records as plain text.
print(dis_text.head(n=10))


                                                text label
0  Prevent Malaria Attack by using Mosquito NetKe...     1
1  There's a malaria vaccine. The only approved v...     1
2  Hello doctor please how safe are fansider and ...     0
3  Much later in the future, it would be released...     1
4  signs of stroke...let her see a good doctor..r...     1
5          Possibly! It will also do same to malaria     1
6  go to my previous post where infolekan quoted ...     1
7  Bros, I no get malaria but I go like know the ...     1
8  Poor man suffered a fatal stroke. His face is ...     0
9  There are many causes to that but to mention b...     1


In [2]:
# Banner first, then every column label of the loaded frame on its own line.
print("Column names:")

column_names = dis_text.columns
for col in column_names:
    print(col)

Column names:
text
label


In [3]:
# Boolean mask: True for each record that is missing a value in any column.
has_nan_values = dis_text.isnull().any(axis="columns")

# Keep only the flagged records so they can be inspected before cleaning.
rows_with_nan = dis_text.loc[has_nan_values]

print("Rows with NaN values:")
print(rows_with_nan)


Rows with NaN values:
     text label
2596  NaN     1


In [4]:
# Build a cleaned copy without any record that has a missing field; the raw frame stays untouched.
dis_text_clean = dis_text.dropna(axis=0, how="any")

# Sanity check: prints False when no missing value is left in the cleaned frame.
print(dis_text_clean.isna().to_numpy().any())

False


In [9]:
from sklearn.model_selection import train_test_split
# Features are the raw text column; targets are the class labels.
X = dis_text_clean["text"]
y = dis_text_clean["label"]

# 80% of the rows go to training, the remainder to evaluation; the fixed seed keeps the split repeatable.
# NOTE(review): the support counts in the classification report further down (184 / 574 / 146)
# look imbalanced -- consider passing stratify=y here; confirm before changing the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=1,
)

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from joblib import dump, load

# Define the model pipeline: TF-IDF text features feeding a Decision Tree Classifier.
# random_state is pinned (same seed as the train/test split) because the tree permutes
# candidate features at every split; without a seed, repeated runs of this cell can
# produce a different tree and therefore different metrics.
model_pipeline_dt = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('dt', DecisionTreeClassifier(random_state=1))
])

# Train the vectorizer and the tree together on the training texts only,
# so no vocabulary or IDF statistics leak in from the held-out set.
model_pipeline_dt.fit(X_train, y_train)

# Predict on the held-out test set
y_pred_dt = model_pipeline_dt.predict(X_test)

# Evaluate the model.
# The scalar scores use support-weighted averaging across the classes; they are
# computed here but not printed by this cell -- only the per-class report is shown.
accuracy_dt = accuracy_score(y_test, y_pred_dt)
precision_dt = precision_score(y_test, y_pred_dt, average='weighted')
recall_dt = recall_score(y_test, y_pred_dt, average='weighted')
f1_score_dt = f1_score(y_test, y_pred_dt, average='weighted')
classification_report_dt = classification_report(y_test, y_pred_dt)

# Print evaluation metrics
print("Evaluation Metrics for Decision Tree Model")
print("------------------------------------------")
print(classification_report_dt)

Evaluation Metrics for Decision Tree Model
------------------------------------------
              precision    recall  f1-score   support

           0       0.46      0.42      0.44       184
           1       0.75      0.78      0.77       574
           2       0.52      0.49      0.51       146

    accuracy                           0.66       904
   macro avg       0.58      0.56      0.57       904
weighted avg       0.65      0.66      0.66       904



In [12]:
# Persist the fitted TF-IDF + decision-tree pipeline to disk so it can be reloaded later.
from joblib import dump
dump(value=model_pipeline_dt, filename='medication-classification_tf-idf_decision_tree.pkl')

['medication-classification_tf-idf_decision_tree.pkl']