In [None]:
# Opening banner for the notebook.
greeting = "Hello World! This is my first submission. Let's begin!"
print(greeting)

# Importing Dataset and Libraries 

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
input_root = '/kaggle/input'
for folder, _subdirs, names in os.walk(input_root):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold

In [None]:
# Read the drug-classification CSV into a DataFrame and display it.
DATA_PATH = '/kaggle/input/drug-classification/drug200.csv'
dataset = pd.read_csv(DATA_PATH)
dataset

* The dataset has 5 features (Age, Sex, BP, Cholesterol, Na_to_K) and 1 label (Drug)
* The dataset only has 200 rows

In [None]:
# Show the dtype pandas inferred for each column of the loaded frame.
dataset.dtypes

4 columns will require categorical encoding before fitting a model to the data

In [None]:
# Count the missing values in each column.
dataset.isna().sum()

No null values are present in the dataset

# Visualizing the data

In [None]:
# Count of rows per drug (class balance of the target).
# The column is passed as x= by keyword: seaborn >= 0.12 accepts only `data`
# positionally, so a positional 'Drug' would collide with data=dataset.
plt.figure(figsize=(8,6))
dcount = sns.countplot(x='Drug', data=dataset)

* The target has 5 classes (DrugY, drugC, drugX, drugA, drugB).
* DrugY is used the most.

In [None]:
# Count of rows per drug, split by sex.
# x= is passed by keyword: seaborn >= 0.12 accepts only `data` positionally.
plt.figure(figsize=(8,6))
dcount = sns.countplot(x='Drug', hue='Sex', data=dataset)

In [None]:
# Age distribution for each sex, with the y-axis fixed to 0-80.
# x=/y= are passed by keyword: seaborn >= 0.12 accepts only `data` positionally.
# Note: `ax` receives the return value of .set(...), not the boxplot's Axes.
plt.figure(figsize=(8,6))
ax = sns.boxplot(x='Sex', y='Age', data=dataset).set(ylim=(0, 80))

In [None]:
# Count of rows per sex, with y ticks every 10 from 0 to 110.
# x= is passed by keyword: seaborn >= 0.12 accepts only `data` positionally.
sexcnt = sns.countplot(x='Sex', data=dataset).set_yticks([i*10 for i in range(12)])

* The gender of the individual is not a big deciding factor in the drug usage (i.e. there is no gender specific drug)
* The dataset has an almost equal distribution of males and females

In [None]:
# Age of each individual, grouped by the drug they were given.
# x=/y= are passed by keyword: seaborn >= 0.12 accepts only `data` positionally.
agevdrug = sns.catplot(x='Drug', y='Age', data=dataset)

* DrugY is used the most
* There seems to be an age limit for the usage of drugB, as it is only used by individuals above 50.
* Similarly, drugA seems to be used only by individuals below 50.

In [None]:
# Side-by-side violin plots of Age split by Sex: left by BP level, right by
# Cholesterol level. Both y-axes are fixed to 0-100.
# x=/y= are passed by keyword: seaborn >= 0.12 accepts only `data` positionally.
fig, ax = plt.subplots(1,2, figsize=(18, 7))
agevbp = sns.violinplot(x='BP', y='Age', data=dataset, hue='Sex', ax=ax[0]).set(ylim=(0,100))
agevch = sns.violinplot(x='Cholesterol', y='Age', data=dataset, hue='Sex', ax=ax[1]).set(ylim=(0,100))

In [None]:
def plot_drug_counts_for_bp(level, axis, order):
    """Draw the per-drug row count for one BP level on the given axes.

    Parameters
    ----------
    level : str
        Value of the 'BP' column to filter on (e.g. 'HIGH').
    axis : matplotlib Axes
        Axes to draw the bars on.
    order : list
        Drug names in the order the bars should appear, so that every panel
        uses the same x-axis layout.

    Returns
    -------
    The return value of ``Axes.set(...)`` for the drawn plot.
    """
    subset = dataset[dataset['BP'] == level]
    # x= is passed by keyword: seaborn >= 0.12 accepts only `data` positionally.
    bars = sns.countplot(x='Drug', data=subset, ax=axis, order=order)
    return bars.set(ylim=(0,40), xlabel=level + ' BP')


# One panel per BP level, all sharing the same drug order and y-range.
fig, ax = plt.subplots(1, 3, figsize=(22,5))
drug_order = list(dataset['Drug'].unique())
hbp = plot_drug_counts_for_bp('HIGH', ax[0], drug_order)
nbp = plot_drug_counts_for_bp('NORMAL', ax[1], drug_order)
lbp = plot_drug_counts_for_bp('LOW', ax[2], drug_order)

* DrugY is used for all BP levels
* drugC is used only if the BP levels are low
* drugB and drugA are used only if the BP level is high
* drugX can be used for normal or low BP levels.

In [None]:
# Count of rows per drug, split by cholesterol level.
# x= is passed by keyword: seaborn >= 0.12 accepts only `data` positionally.
plt.figure(figsize=(8,6))
chcount = sns.countplot(x='Drug', hue='Cholesterol', data=dataset)

* drugC isn't used if the cholesterol levels are normal.

In [None]:
# Na_to_K distribution for each BP level, with the y-axis fixed to 0-40.
# x=/y= are passed by keyword: seaborn >= 0.12 accepts only `data` positionally.
plt.figure(figsize=(8,6))
nakvsbp = sns.boxplot(x='BP', y='Na_to_K', data=dataset).set(ylim=(0,40))

In [None]:
# Na_to_K ratio of each individual, grouped by the drug they were given.
# x=/y= are passed by keyword: seaborn >= 0.12 accepts only `data` positionally.
nakvsdrug = sns.catplot(x='Drug', y='Na_to_K', data=dataset)

* DrugY is used only if the Na_to_K ratio is above 15.

# Preprocessing the dataset

### Separating the features and the labels

In [None]:
X = dataset.iloc[: :-1]
y = dataset.iloc[:, -1].values

### Encoding categorical features and scaling numerical features

In [None]:
ct = ColumnTransformer([
    ("onehot", OneHotEncoder(), [1]), 
    ("ordBP", OrdinalEncoder(categories = [['HIGH', 'NORMAL', 'LOW']]), [2]),
    ("ordChol", OrdinalEncoder(categories = [['HIGH', 'NORMAL']]), [3]),
    ("stdscl", StandardScaler(), [0, 4])], n_jobs = -1)
X = ct.fit_transform(X)[::-1]

### Encoding the labels

In [None]:
# Fit the label encoder on the target values, then show the classes it found.
le = LabelEncoder()
le.fit(y)
le.classes_

In [None]:
# Replace the label values in y with the codes assigned by the fitted encoder.
# NOTE(review): y is overwritten in place, so re-running this cell alone would
# feed the already-encoded values back into le.transform; re-run from the
# feature/label split cell instead.
y = le.transform(y)

# Classification using SVM with K-Fold Cross Validation

In [None]:
# Number of cross-validation folds.
num_splits = 5

# Default-configured support vector classifier and the K-Fold splitter.
model = SVC()
kfold = KFold(n_splits=num_splits)

In [None]:
# Per-fold accuracy (in percent) on the training part and on the held-out part.
train_accs = []
test_accs = []

for train_index, test_index in kfold.split(X):
    # Rows used for fitting in this fold
    X_train = X[train_index]
    y_train = y[train_index]
    # Rows held out for evaluation in this fold
    X_test = X[test_index]
    y_test = y[test_index]

    # Refit the classifier on this fold's training rows
    model.fit(X_train, y_train)

    # Score the fitted model on both parts
    train_predictions = model.predict(X_train)
    train_accs.append(accuracy_score(y_train, train_predictions) * 100)
    test_predictions = model.predict(X_test)
    test_accs.append(accuracy_score(y_test, test_predictions) * 100)

In [None]:
# Print one row per fold: fold number, training accuracy, test accuracy.
print("\tTraining \t Test")
for fold in range(num_splits):
    print(fold + 1, "\t", train_accs[fold], "\t", test_accs[fold])

In [None]:
# Mean accuracy across the folds, shown to two decimal places.
avg_train_acc = sum(train_accs) / num_splits
avg_test_acc = sum(test_accs) / num_splits
print(f"Average training set accuracy: {avg_train_acc:.2f}")
print(f"Average test set accuracy: {avg_test_acc:.2f}")

<hr>


# Conclusion

* The useful insights obtained after doing exploratory data analysis on the data are:
    1. DrugY is only used if the Na_to_K ratio is above 15, yet it is still the most used drug.
    2. drugA is only used if the age is below 50 and drugB is used if the age is above 50.
    3. drugC is not used if the cholesterol levels are normal.
    4. drugC is used only if the BP level is low, drugB and drugA are used only if the BP level is high, and drugX can be used for normal or low BP levels.
 <br>
 <br>
* Classification using **SVM with K-Fold Cross Validation** was done on the dataset which gave a test accuracy of **97%** 

<hr>
    
**Kindly upvote if you like the work. If you have any suggestions or queries, leave a comment.** <br>
**Thank you!**
