In [1]:
import pandas as pd

# Read the CSV file with the first row as column names
dis_text = pd.read_csv('/content/train_final.csv')

# Now 'dis_text' DataFrame has the first row as column names
print(dis_text.head(10))


                                                text label
0  Prevent Malaria Attack by using Mosquito NetKe...     1
1  There's a malaria vaccine. The only approved v...     1
2  Hello doctor please how safe are fansider and ...     0
3  Much later in the future, it would be released...     1
4  signs of stroke...let her see a good doctor..r...     1
5          Possibly! It will also do same to malaria     1
6  go to my previous post where infolekan quoted ...     1
7  Bros, I no get malaria but I go like know the ...     1
8  Poor man suffered a fatal stroke. His face is ...     0
9  There are many causes to that but to mention b...     1


In [2]:
column_names = dis_text.columns

print("Column names:")
for col in column_names:
    print(col)

Column names:
text
label


In [3]:

num_rows, num_columns = dis_text.shape

print(f"Number of rows: {num_rows}")
print(f"Number of columns: {num_columns}")


Number of rows: 4521
Number of columns: 2


In [4]:

has_nan_values = dis_text.isna().any(axis=1)

# Select rows with at least one NaN value
rows_with_nan = dis_text[has_nan_values]

print("Rows with NaN values:")
print(rows_with_nan)


Rows with NaN values:
     text label
2596  NaN     1


In [5]:
dis_text_clean=dis_text.dropna()
print(dis_text_clean.head(10))

                                                text label
0  Prevent Malaria Attack by using Mosquito NetKe...     1
1  There's a malaria vaccine. The only approved v...     1
2  Hello doctor please how safe are fansider and ...     0
3  Much later in the future, it would be released...     1
4  signs of stroke...let her see a good doctor..r...     1
5          Possibly! It will also do same to malaria     1
6  go to my previous post where infolekan quoted ...     1
7  Bros, I no get malaria but I go like know the ...     1
8  Poor man suffered a fatal stroke. His face is ...     0
9  There are many causes to that but to mention b...     1


In [6]:
print(dis_text_clean.isnull().values.any())

False


In [7]:
dis_text_clean.label.unique()

array(['1', '0', '2', 'label'], dtype=object)

In [8]:

data_type = dis_text_clean['label'].dtype

print(f"Data type of 'label': {data_type}")


Data type of 'label': object


In [12]:
from sklearn.model_selection import train_test_split
X = dis_text_clean.text
y = dis_text_clean.label
#split data
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.6, random_state = 1)

In [13]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Define the model pipeline with Logistic Regression
model_pipeline_lr = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('lr', LogisticRegression())
])

# Train the model
model_pipeline_lr.fit(X_train, y_train)

# Predict on the test set
y_pred_lr = model_pipeline_lr.predict(X_test)

# Evaluate the model
accuracy_lr = accuracy_score(y_test, y_pred_lr)
precision_lr = precision_score(y_test, y_pred_lr, average='weighted')
recall_lr = recall_score(y_test, y_pred_lr, average='weighted')
f1_score_lr = f1_score(y_test, y_pred_lr, average='weighted')
classification_report_lr = classification_report(y_test, y_pred_lr)

# Print evaluation metrics
print("Evaluation Metrics for Logistic Regression Model")
print("------------------------------------------------")
print(classification_report_lr)


Evaluation Metrics for Logistic Regression Model
------------------------------------------------
              precision    recall  f1-score   support

           0       0.73      0.47      0.57       368
           1       0.75      0.91      0.82      1171
           2       0.60      0.33      0.43       269

    accuracy                           0.73      1808
   macro avg       0.69      0.57      0.61      1808
weighted avg       0.72      0.73      0.71      1808



In [14]:
#to save the model
from joblib import dump
dump(model_pipeline_lr, 'medication-classification_tf-idf.pkl')

['medication-classification_tf-idf.pkl']