In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Drug Classification

<font color = "blue">
Content:  

1. [Dataset](#1)
2. [Visualization](#2)
3. [Missing Value and Outlier](#3)
4. [Train/Test Split](#4)
5. [Standardize](#5)
6. [Classification](#6)
    * [KNN](#7)
    * [Naive Bayes](#8)
    * [Decision Tree](#9)
    * [Support Vector Machine](#10)
    * [Voting Classifier](#11)

<a id="1"></a><br>
# Dataset

* Age -> min=15, max=74
* Sex -> F, M
* BP -> HIGH, NORMAL, LOW
* Cholesterol -> HIGH, NORMAL
* Na_to_K (sodium-to-potassium ratio)
* Drug -> DrugY, drugX, drugA, drugB, drugC

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter

In [None]:
# Read the drug-classification CSV from the Kaggle input directory into a DataFrame.
DATA_PATH = "../input/drug-classification/drug200.csv"
data = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows of the loaded frame.
data.head()

In [None]:
# Print the column names, dtypes and non-null counts.
data.info()

In [None]:
# Summary statistics for the numeric columns.
data.describe()

<a id="2"></a><br>
# Visualization

In [None]:
# Class balance of the target column.
# The column is passed by keyword: in seaborn >= 0.12 the first positional parameter
# of countplot is `data`, not `x`, so a positionally passed Series is not read as
# the x variable.
sns.countplot(x="Drug", data=data)
print(data.Drug.value_counts())

In [None]:
# Na_to_K against the row index, one line per drug class.
# Author's observation: DrugY separates easily from the other classes.
# The label must be "drugX" (capital X) to match the dataset; the lower-case
# "drugx" selected no rows, so that class was missing from the figure.
for drug in ("DrugY", "drugX", "drugA", "drugB", "drugC"):
    plt.plot(data.Na_to_K[data.Drug == drug], label=drug)

plt.legend()  # without a legend the five lines cannot be told apart
plt.show()

In [None]:
# Distribution of the Cholesterol categories.
# Passed by keyword because seaborn >= 0.12 treats the first positional argument
# as `data`, not as the x variable.
sns.countplot(x="Cholesterol", data=data)
print(data.Cholesterol.value_counts())

In [None]:
# Number of patients per distinct Age value.
# Passed by keyword because seaborn >= 0.12 treats the first positional argument
# as `data`, not as the x variable.
sns.countplot(x="Age", data=data)
plt.show()

In [None]:
# Distribution of the blood-pressure (BP) categories.
# Passed by keyword because seaborn >= 0.12 treats the first positional argument
# as `data`, not as the x variable.
sns.countplot(x="BP", data=data)
print(data.BP.value_counts())

<a id="3"></a><br>
# Missing Value and Outlier

In [None]:
# Count the missing values in each column.
data.isnull().sum()

In [None]:
# Encode the categorical feature columns as integers.
# Each mapping is an explicit {value: code} dict. The previous version passed two
# *sets* to replace(); sets are unordered, so which label received which code was
# not defined. The codes below follow the order in which the labels and numbers
# were originally written.
CATEGORY_CODES = {
    'Sex': {'M': 1, 'F': 0},
    'BP': {'HIGH': 1, 'LOW': 2, 'NORMAL': 3},
    'Cholesterol': {'HIGH': 1, 'NORMAL': 0},
}

for column, codes in CATEGORY_CODES.items():
    # Assign the result back instead of using inplace=True on a column selection,
    # which does not update the frame under pandas copy-on-write.
    # astype(int) fails loudly if a label outside the mapping is present.
    data[column] = data[column].replace(codes).astype(int)

In [None]:
# Box plot of Na_to_K, drawn before the outlier rows are dropped.
data.boxplot(column=["Na_to_K"])
plt.show()

In [None]:
# Summary statistics of the frame; keep the Na_to_K column and display it.
describe = data.describe()
Na_to_K_desc = describe.loc[:, "Na_to_K"]
Na_to_K_desc

In [None]:
# Remove, in place, every row whose Na_to_K is above 31.
# NOTE(review): the threshold 31 is hard-coded; presumably read off the box plot — confirm.
outlier_mask = data["Na_to_K"] > 31
data.drop(index=data.index[outlier_mask], inplace=True)

In [None]:
# Box plot of Na_to_K again, after rows with Na_to_K > 31 were dropped.
data.boxplot(column=["Na_to_K"])
plt.show()

<a id="4"></a><br>
# Train/Test Split

In [None]:
# `plot_confusion_matrix` is no longer imported here: it was removed in
# scikit-learn 1.2, so importing it raises ImportError on current versions, and
# nothing in this notebook calls it. Use
# sklearn.metrics.ConfusionMatrixDisplay.from_estimator if a confusion-matrix
# plot is needed.
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Features are every column except the target; the target is the Drug label.
x = data.drop(columns=['Drug'])
y = data['Drug']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [None]:
# Report the shape of each part of the train/test split.
for label, part in (
    ("X_train", x_train),
    ("X_test", x_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"{label} shape:", part.shape)

<a id="5"></a><br>
# Standardize

In [None]:
from sklearn import preprocessing

# Standardize every feature column of `x` and show the first rows of the result.
# NOTE(review): the scaler is fit on all of `x` (train and test rows together), and
# `data_stan` is not referenced by any model cell visible in this notebook — confirm
# whether the classifiers were meant to be trained on standardized features.
scaler = preprocessing.StandardScaler()
scaler.fit(x)
data_stan = pd.DataFrame(scaler.transform(x), columns=x.columns)
data_stan.head()

<a id="6"></a><br>
# Classification 

<a id="7"></a><br>
## KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Baseline KNN model with k = 3.
knn = KNeighborsClassifier(n_neighbors=3)  # n_neighbors = k
knn.fit(x_train, y_train)
prediction = knn.predict(x_test)
print(" {} nn score: {} ".format(3, knn.score(x_test, y_test)))

# Sweep k and record the test accuracy for each value.
# The range is defined once so the sweep, the plot axis and the final choice
# cannot drift apart.
k_values = range(1, 15)
score_list = []
for each in k_values:
    knn2 = KNeighborsClassifier(n_neighbors=each)
    knn2.fit(x_train, y_train)
    score_list.append(knn2.score(x_test, y_test))

plt.plot(k_values, score_list)
plt.xlabel("k values")
plt.ylabel("accuracy")
plt.show()

# Refit with the k that scored best in the sweep instead of a hard-coded value.
# list.index returns the first maximum, so ties resolve to the smallest k.
# NOTE(review): k is selected on the test split, so the reported score is
# optimistic; a validation split or cross-validation would be more reliable.
best_k = k_values[score_list.index(max(score_list))]
knn = KNeighborsClassifier(n_neighbors=best_k)  # n_neighbors = k
knn.fit(x_train, y_train)
prediction = knn.predict(x_test)
print(" {} nn score: {} ".format(best_k, knn.score(x_test, y_test)))

<a id="8"></a><br>
## Naive Bayes 

In [None]:
from sklearn.naive_bayes import GaussianNB

# Train a Gaussian Naive Bayes classifier on the training split
# (fit returns the estimator itself) and report its accuracy on the test split.
gnb = GaussianNB().fit(x_train, y_train)

print("score: ", gnb.score(x_test,y_test))

<a id="9"></a><br>
## Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Train a decision tree on the training split and report its test accuracy.
# The tree is seeded because scikit-learn permutes the features randomly at each
# split; without random_state the score can differ between runs. The seed value
# matches the one used for the SVC in this notebook.
dt = DecisionTreeClassifier(random_state=1)
dt.fit(x_train, y_train)

print("score: ", dt.score(x_test,y_test))

<a id="10"></a><br>
## Support Vector Machine

In [None]:
from sklearn.svm import SVC

# Train a support vector classifier with a fixed seed on the training split
# (fit returns the estimator itself) and report its accuracy on the test split.
svm = SVC(random_state=1).fit(x_train, y_train)

print("score: ", svm.score(x_test,y_test))

<a id="11"></a><br>
## Voting Classifier

In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score

# Ensemble that combines the four classifiers trained in the earlier cells.
voting = VotingClassifier(estimators=[("dt", dt), ("knn", knn), ("svm", svm), ("gnb", gnb)])

# Fit each base model and the ensemble on the training split and print its test
# accuracy. `gnb` is included so that every member of the ensemble is reported
# next to the ensemble itself; it was previously left out of the comparison.
for model in (dt, knn, svm, gnb, voting):
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    print(model, "= ", accuracy_score(y_test, y_pred))