In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# NOTE: this silences every warning for the rest of the session, including
# any library warnings raised while fitting the models below. Remove or narrow
# the filter when debugging unexpected results.
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the drug-classification CSV from the Kaggle input directory into `df`.
csv_path = '/kaggle/input/drug-classification/drug200.csv'
df = pd.read_csv(csv_path)

# Exploratory Data Analysis

With `.head()` we see that there are ordinal categorical features, which we can encode as integers using sklearn's `LabelEncoder`.

In [None]:
# Preview the first rows to inspect the column names and the kind of values
# each column holds.
df.head()

`.value_counts()` can be used to inspect the distribution of values that each categorical feature takes on. Here we use sklearn's `LabelEncoder` to encode all `object`-type features as integers.

In [None]:
# Encode every string-valued ('object' dtype) column as integer codes.
#
# A separate fitted LabelEncoder is stored per column in `label_encoders`, so
# the integer codes can be mapped back to the original labels later, e.g.
# `label_encoders['Drug'].inverse_transform(codes)`. Re-fitting one shared
# encoder for every column would discard all mappings except the last one.
#
# NOTE: LabelEncoder assigns codes in sorted label order, not according to any
# domain-specific ordering of the categories.
le = LabelEncoder()  # stays bound even if there is no object column to encode
label_encoders = {}
for col in list(df.columns):
    if df[col].dtype == 'object':
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        label_encoders[col] = le
df.head()

Using `.info()` we see that every column has 200 non-null values, so there is no need to fill in missing values.

In [None]:
# Summarize each column's dtype and non-null count to check for missing values.
df.info()

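From the heatmap below we see that Na_to_K has a very low correlation with the final drug prediction. Note that `Drug` is a categorical target represented by integer codes, so its linear correlation with other columns depends on how those codes were assigned and should be interpreted with caution.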

In [None]:
# Correlation matrix of all (now numeric) columns.
# A diverging colour map pinned to the full [-1, 1] range puts zero at the
# midpoint, so strong negative correlations are not mistaken for weak ones;
# the annotations let the exact coefficients be read off each cell.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(df.corr(), annot=True, fmt='.2f', cmap='coolwarm',
            vmin=-1, vmax=1, ax=ax)
ax.set_title('Correlation matrix');

Looking further at the pairplot below, we see that Na_to_K is heavily right-skewed, so we will apply a log transformation to it after first fitting a baseline Random Forest model on the untransformed values.

In [None]:
# Pairwise relationships between all columns, coloured by the 'Drug' column.
sns.pairplot(data=df, hue='Drug')

In [None]:
# Baseline: fit a Random Forest *regressor* on the integer-coded 'Drug' target.
# Features are every column except 'Drug'.
X = df.drop(columns='Drug')
y = df['Drug']

# Seed the split so the reported score is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=23
)

model1 = RandomForestRegressor(random_state=23)
model1.fit(X_train, y_train)
preds1 = model1.predict(X_test)

# The regressor returns continuous values. Round to the nearest class code
# before scoring: a bare astype(int) truncates (e.g. 2.9 -> 2) and would
# penalise predictions that are closest to the correct class.
accuracy_score(y_test, np.rint(preds1).astype(int))

Before we fit a Logistic Regression model, we log-transform the Na_to_K column, given that it is heavily right-skewed.

In [None]:
# Replace the 'Na_to_K' column with its log(1 + x) transform, 'log_Na_to_K'.
#
# The guard makes this cell safe to re-run: once 'Na_to_K' has been replaced,
# a second execution skips the transform instead of raising a KeyError.
# np.log1p(x) computes log(1 + x) directly and is more accurate for small x.
if 'Na_to_K' in df.columns:
    df = (
        df.assign(log_Na_to_K=np.log1p(df['Na_to_K']))
          .drop(columns='Na_to_K')
    )
sns.pairplot(df, hue='Drug')

In [None]:
# Rebuild the feature matrix from the current `df` (every column except the
# 'Drug' target) and fit a Logistic Regression classifier.
X = df.drop(columns='Drug')
y = df['Drug']

# Seed the split so the reported accuracy is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=23
)

# A large max_iter gives the solver room to converge.
model2 = LogisticRegression(max_iter=10000)
model2.fit(X_train, y_train)
preds2 = model2.predict(X_test)

# A classifier predicts class labels directly, so no integer cast is needed.
accuracy_score(y_test, preds2)

We can do better if we use the Random Forest Classifier, given that this is ultimately a classification problem. On this train/test split we end up with 100% accuracy; note that the exact score depends on how the data happens to be split.

In [None]:
# Fit a Random Forest classifier on the train/test split created in the
# Logistic Regression cell. Seeding the forest makes its own randomness
# repeatable between runs.
rfc = RandomForestClassifier(random_state=23)
rfc.fit(X_train, y_train)
pred_2 = rfc.predict(X_test)

# predict() already returns class labels, so they are scored as-is.
score_2 = accuracy_score(y_test, pred_2)

In [None]:
# Display the Random Forest classifier's accuracy on the held-out test split.
score_2