In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Dimension reduction
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Metrics
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV


# Models
from sklearn.svm import SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

In [None]:
# Load the drug classification data set.
drugs = pd.read_csv("/kaggle/input/drug-classification/drug200.csv")

# One human-readable description per column, in the same order as drugs.columns.
# The DataFrame constructor below raises if the two lengths ever differ.
description = ["Age of patient","Gender of patients","Blood Pressure Levels",
       "Cholesterol levels","Sodium to potassium Ratio in Blood","Drug Type"]
drugs_descript = pd.DataFrame({"Column":drugs.columns,
                              "Description":description})
print(drugs_descript)
print(drugs.head())

# Show the size and the number of missing values so the observation below
# can be checked against the loaded file.
print("Shape:", drugs.shape)
print("Missing values:", int(drugs.isna().sum().sum()))

# Author's observation: drugs data has 200 rows and 6 columns with no missing values

In [None]:
# Summary statistics plot for the variables
# A 3x2 grid: histograms for the two numeric columns, then one bar chart of
# value counts for each categorical column.
fig, axes = plt.subplots(3, 2, figsize=(14, 13))
axes = axes.ravel()

# (column, panel title) pairs for the numeric variables
numeric_panels = [("Age", "Age range"), ("Na_to_K", "Na_to_K range")]
for ax, (column, title) in zip(axes[:2], numeric_panels):
    drugs[column].plot(kind="hist", ec="black", color="orange", ax=ax)
    ax.set_title(title)
    ax.set_xlabel(column)
# Author's observation: Age is ranged from 15 to 74 with majority of participants about 45 to 50 years old

# (column, panel title) pairs for the categorical variables
categorical_panels = [("Sex", "Sex"), ("BP", "Blood Pressure"),
                      ("Cholesterol", "Cholesterol"), ("Drug", "Drug")]
for ax, (column, title) in zip(axes[2:], categorical_panels):
    counts = drugs[column].value_counts()
    print(counts)  # the exact counts are printed alongside the chart
    counts.plot(kind="bar", rot=0, ax=ax)
    ax.set_title(title)
# Author's observation: There are 96 Females and 104 Males in this dataset

# Keep titles and tick labels of neighbouring panels from overlapping
fig.tight_layout()

In [None]:
# Correlation Matrix
# Restrict to numeric columns before correlating: the frame also holds string
# columns, and DataFrame.corr() raises on those in pandas >= 2.0 (older
# versions silently dropped them, so the plotted matrix is unchanged there).
numeric_drugs = drugs.select_dtypes(include="number")
sns.heatmap(numeric_drugs.corr(), annot=True)

In [None]:
# Boxplot of drug types
# DataFrame.boxplot(by=...) opens its own figure, so the size is passed to it
# directly; a preceding plt.figure(...) would only leave an empty figure behind.
drugs.boxplot(column="Age", by="Drug", figsize=(8, 10))
plt.tight_layout()

# It seems like from the boxplots, patients of older ages of approximately 60 years old tend to be given drug B

In [None]:
# We try to aggregate age into age groups
# Bin edges are inclusive on the right; include_lowest makes the first bin
# [15, 25], so the intervals are [15,25], (25,35], (35,45], (45,55], (55,65], (65,75].
age_bin_edges = [15, 25, 35, 45, 55, 65, 75]
age_bin_labels = ["15 to 25", "26 to 35", "36 to 45", "46 to 55", "56 to 65", "66 to 74"]

age_bins = pd.cut(drugs["Age"], bins=age_bin_edges, labels=age_bin_labels,
                  include_lowest=True)

# An age outside 15-75 (or a missing age) has no group; fail with a clear
# message instead of a length-mismatch error on assignment.
if age_bins.isna().any():
  raise ValueError("Age values outside the 15-75 range cannot be assigned to an age group")

# Plain list of label strings, one per row of drugs
AgeGroup = age_bins.astype(object).tolist()

drugs["AgeGroup"] = AgeGroup
drugs.head()

In [None]:
# Empty table that collects one row of metrics per evaluated model
result_columns = ["Models", "Accuracy Score", "F1 Score"]
model_results = pd.DataFrame(columns=result_columns)

# Create a function that takes in different model and compute the metrics for comparison
def model_accuracy(X, Y, model, model_name, model_results=None):
  """Fit ``model`` on an 80/20 split of (X, Y) and report accuracy and macro F1.

  Parameters
  ----------
  X : features, passed unchanged to ``train_test_split``.
  Y : target labels, passed unchanged to ``train_test_split``.
  model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
  model_name : label stored in the "Models" column of the result row.
  model_results : optional DataFrame of earlier results to extend.

  Returns
  -------
  DataFrame
      ``model_results`` with one extra row holding "Models", "Accuracy Score"
      and "F1 Score". The input frame is not modified. When ``model_results``
      is None, a one-row frame with just this model's metrics is returned.
  """
  # Fixed seed so every model is scored on the same held-out 20%
  X_train, X_test, ytrain, ytest = train_test_split(X, Y, train_size=0.8, random_state=10)
  model = model.fit(X_train, ytrain)
  predictions = model.predict(X_test)

  # scikit-learn metrics take (y_true, y_pred) in that order
  score = accuracy_score(ytest, predictions)  # accuracy score
  f1 = f1_score(ytest, predictions, average="macro")  # F1 score

  new_row = pd.DataFrame({"Models": [model_name],
                          "Accuracy Score": [score],
                          "F1 Score": [f1]})

  if model_results is None:
    return new_row

  if len(model_results) == 0:
    # Nothing to concatenate yet: return the new row laid out in the existing
    # column order (plus any metric column the empty table lacks).
    columns = list(model_results.columns)
    columns += [name for name in new_row.columns if name not in columns]
    return new_row.reindex(columns=columns)

  # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement
  return pd.concat([model_results, new_row], ignore_index=True)

In [None]:
# Scale the data first
# Target is the prescribed drug; raw Age is dropped from the features while the
# AgeGroup column stays in and is one-hot encoded with the other categoricals.
Y = drugs.Drug
X = drugs.drop(["Drug","Age"], axis=1)
X_dummy = pd.get_dummies(X)
# NOTE(review): the scaler is fitted on all rows before model_accuracy splits
# them, so test-set statistics influence the scaling — consider fitting on the
# training part only.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_dummy)

# (display name, estimator) pairs, evaluated in this order.
# The tree-based models are seeded (same value as the split seed used in
# model_accuracy) so their scores are reproducible between runs.
classifiers = [
    ("Support Vector Machine", SVC()),
    ("Linear Discriminant Analysis", LinearDiscriminantAnalysis()),
    ("Logistic Regression", LogisticRegression()),
    ("Decision Tree", DecisionTreeClassifier(random_state=10)),
    ("Random Forest", RandomForestClassifier(random_state=10)),
    ("Naive Bayes", GaussianNB()),
]

for model_name, classifier in classifiers:
    model_results = model_accuracy(X_scaled, Y, classifier, model_name, model_results)
print(model_results)