#### Drug Classification

This database contains information about certain drug types.

#### <b> <u> Context: </u> </b>

For a beginner in machine learning, this dataset is a good opportunity to try out some techniques for predicting 
which drug is likely to be appropriate for a patient.

#### <b> <u> Content: </u> </b>

The target feature is
Drug type<br>

The feature sets are:
1. Age<br>
2. Sex<br>
3. Blood Pressure Levels (BP)<br>
4. Cholesterol Levels<br>
5. Sodium-to-Potassium Ratio (Na_to_K)<br>
#### <b> <u> Inspiration: </u> </b>

The main problem here is not just the feature sets and target sets but also the approach that is taken in <br>
solving these types of problems as a beginner. So best of luck.<br>

In [None]:
# Suppressing Warnings
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Importing Pandas and NumPy
import pandas as pd, numpy as np

In [None]:
# Importing Pandas and NumPy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import datetime as dt

import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

from scipy.cluster.hierarchy import linkage
from scipy.cluster.hierarchy import dendrogram
from scipy.cluster.hierarchy import cut_tree

In [None]:
# Importing all datasets
drug_classification = pd.read_csv("/kaggle/input/drug-classification/drug200.csv")
drug_classification.head(2)

In [None]:
# Median of every numeric feature within each drug class.
# numeric_only=True restricts the aggregation to numeric columns; without it
# pandas 2.x raises a TypeError when it reaches a non-numeric column.
drug_classification.groupby(by="Drug").median(numeric_only=True)

#### `PIE-CHART` CATEGORIES

In [None]:
# Pie chart of how the samples are distributed over the drug classes.
# Counting once and sorting by class name yields the labels and the slice
# sizes in the same (alphabetical) order.
counts = drug_classification['Drug'].value_counts().sort_index()
labels = counts.index.tolist()
sizes = counts.tolist()

fig1, ax1 = plt.subplots()
# autopct prints each slice's percentage on the chart
ax1.pie(sizes, labels=labels, autopct='%1.1f%%', shadow=True)
ax1.axis('equal')  # equal aspect ratio keeps the pie circular
plt.show()

In [None]:
drug_classification.shape

### Inspecting the Null Values 

In [None]:
drug_classification.isnull().sum()

In [None]:
# missing values
round(100*(drug_classification.isnull().sum())/len(drug_classification), 2)

In [None]:
drug_classification.dtypes

### Data Analysis - Univariate And Bivariate Analysis

### <u> <b> Univariate Analysis </b> </u>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use("dark_background")

In [None]:
## Distribution of the target column: how often each drug class occurs
plt.figure(figsize=(15, 4))
plt.xticks(rotation=90)
sns.histplot(data=drug_classification, x="Drug", stat="frequency", color="red")
plt.show()

In [None]:
## Distribution of the Cholesterol column: number of samples per level
plt.figure(figsize=(10, 4))
plt.xticks(rotation=90)
sns.histplot(data=drug_classification, x="Cholesterol", stat="count", color="red")
plt.show()

### Box - Plot

In [None]:
# Box plot of the Na_to_K ratio, used to eyeball potential outliers.
plt.figure(figsize=[15, 4])
# Pass the column as x= explicitly: a bare positional argument is read as `x`
# by older seaborn releases and as `data` by newer ones, which flips the
# orientation of the plot. The keyword keeps it horizontal everywhere.
sns.boxplot(x=drug_classification["Na_to_K"])
plt.show()

#### IQR - Inter Quartile Range

In [None]:
# Inter Quartile Range of Na_to_K: the spread of the middle 50% of the data,
# i.e. third quartile (75th percentile) minus first quartile (25th percentile).
Q1 = drug_classification["Na_to_K"].quantile(0.25)
Q3 = drug_classification["Na_to_K"].quantile(0.75)
IQR = Q3 - Q1
print("The interQuartile Range :",IQR)

In [None]:
drug_classification.columns

### <u> Outlier Treatment </u>

In [None]:
# Checking for outliers in the continuous variables
num_grouped_df = drug_classification.groupby(by = "Drug").sum()

In [None]:
# Checking outliers at 25%, 50%, 75%, 90%, 95% and 99%
num_grouped_df.describe(percentiles=[.25, .5, .75, .90, .95, .99])

### Label Encoding

In [None]:
# import preprocessing from sklearn
from sklearn import preprocessing

# 1. INSTANTIATE
# encode labels with value between 0 and n_classes-1.
le = preprocessing.LabelEncoder()


# 2/3. FIT AND TRANSFORM
# Encode only the non-numeric (categorical) columns. Numeric columns are left
# untouched: label-encoding a continuous feature would replace its values by
# their rank and discard the real magnitudes.
drug_classification_2 = drug_classification.copy()
categorical_columns = drug_classification_2.select_dtypes(exclude="number").columns
for column in categorical_columns:
    # fit_transform refits the encoder for each column, so `le` ends up fitted
    # on the last categorical column that was processed.
    drug_classification_2[column] = le.fit_transform(drug_classification_2[column])
drug_classification_2.head(5)

In [None]:
# Checking for outliers in the continuous variables
num_grouped_df = drug_classification_2.groupby(by = "Drug").sum()

In [None]:
# Checking outliers at 25%, 50%, 75%, 90%, 95% and 99%
num_grouped_df.describe(percentiles=[.25, .5, .75, .90, .95, .99])

### Removing the `Outliers` 

In [None]:
# Remove (statistical) outliers feature by feature.
# For every column the 5th and 95th percentiles are taken on the rows that are
# still present, and only rows within 1.5 * (Q3 - Q1) of those percentiles are
# kept. The columns are processed in a fixed order because each pass filters
# the frame that the next pass measures.
for feature in ["Age", "Sex", "BP", "Cholesterol", "Na_to_K"]:
    Q1 = drug_classification_2[feature].quantile(0.05)
    Q3 = drug_classification_2[feature].quantile(0.95)
    IQR = Q3 - Q1
    lower_fence = Q1 - 1.5*IQR
    upper_fence = Q3 + 1.5*IQR
    # between() is inclusive on both ends, matching the >= / <= comparison
    within_fences = drug_classification_2[feature].between(lower_fence, upper_fence)
    drug_classification_2 = drug_classification_2[within_fences]



### Test-Train Split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Putting feature variable to X
X = drug_classification_2.drop(['Drug'], axis=1)

X.head()

In [None]:
# Putting response variable to y
y = drug_classification_2['Drug']

y.head()

In [None]:
# Splitting the data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, test_size=0.3, random_state=100)

### Dimensions Of The Datasets

In [None]:
X_train.shape

In [None]:
y_train.shape

In [None]:
X_test.shape

In [None]:
X_test.head(5)

In [None]:
# Remember the row labels of the held-out samples before X_test is scaled
# (scaling later replaces X_test with a plain array).
# NOTE(review): `id` shadows the Python builtin of the same name; the name is
# kept because later cells read this variable.
id = X_test.index

In [None]:
X_test.columns

In [None]:
y_test.shape

In [None]:
y_test.value_counts()

### Feature Scaling

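- Scaling puts the features on a comparable range. Many ML algorithms (for example regularised logistic regression and PCA) are sensitive to the scale of their inputs, so unscaled features can dominate the fit or slow down convergence.
  There are `two types of Scaling` commonly used in industry: <br>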
  1) Standard Scaling <br>
  2) Min - Max Scaling <br>
- Here, we are using Standard Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
X_train.columns

In [None]:
scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)


In [None]:
X_test = scaler.transform(X_test)

#### Pipeline Creation

In [None]:
## Pipelines Creation
## 1. Data Preprocessing by using Standard Scaler
## 2. Reduce Dimension using PCA
## 3. Apply  Classifier

In [None]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
pipeline_lr=Pipeline([('scalar1',StandardScaler()),
                     ('pca1',PCA(n_components=2)),
                     ('lr_classifier',LogisticRegression(random_state=0))])

In [None]:
pipeline_dt=Pipeline([('scalar2',StandardScaler()),
                     ('pca2',PCA(n_components=2)),
                     ('dt_classifier',DecisionTreeClassifier())])

In [None]:
pipeline_randomforest=Pipeline([('scalar3',StandardScaler()),
                     ('pca3',PCA(n_components=2)),
                     ('rf_classifier',RandomForestClassifier())])

In [None]:
pipeline_gradientboosting=Pipeline([('scalar4',StandardScaler()),
                     ('pca4',PCA(n_components=2)),
                     ('gb_classifier',GradientBoostingClassifier())])

In [None]:
## LEts make the list of pipelines
pipelines = [pipeline_lr, pipeline_dt, pipeline_randomforest, pipeline_gradientboosting]

In [None]:
best_accuracy=0.0
best_classifier=0
best_pipeline=""

In [None]:
# Dictionary of pipelines and classifier types for ease of reference
pipe_dict = {0: 'Logistic Regression', 1: 'Decision Tree', 2: 'RandomForest', 3:'GradientBoosting'}

# Fit the pipelines
for pipe in pipelines:
	pipe.fit(X_train, y_train)

In [None]:
for i,model in enumerate(pipelines):
    print("{} Test Accuracy: {}".format(pipe_dict[i],model.score(X_test,y_test)))

In [None]:
# Pick the pipeline with the highest accuracy on the test set.
# Each pipeline is scored once and the value reused, rather than calling
# score() a second time just to store the result.
for i, model in enumerate(pipelines):
    accuracy = model.score(X_test, y_test)
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_pipeline = model
        best_classifier = i
print('Classifier with best accuracy:{}'.format(pipe_dict[best_classifier]))

### <b> Model Building </b>

In [None]:
from sklearn.linear_model import LogisticRegression 
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
import warnings

In [None]:

# Building a classification model using one vs rest method

# Fitting the model with training data
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
LR = LogisticRegression()
oneVsrest = OneVsRestClassifier(LR)
oneVsrest.fit(X_train, y_train)

### <b> Model Prediction </b>

In [None]:
# Making a prediction on the test set
prediction_oneVsRest = oneVsrest.predict(X_test)
   
# Evaluating the model

print(f"Test Set Accuracy: {accuracy_score(y_test, prediction_oneVsRest) * 100} %\n\n")
print(f"Classification Report: \n\n{classification_report(y_test, prediction_oneVsRest)}")

<b> Accuracy : </b> Accuracy is the most intuitive performance measure and it is simply a ratio of correctly predicted observation to the total observations.The formula is given as: <br>
<b> *Accuracy = (True Positives + True Negatives) / (True Positives + True Negatives + False Positives + False Negatives)* </b> <br> <br>
<b> Precision : </b> The fraction of the samples predicted as positive that are actually positive, i.e. how trustworthy a positive prediction is. The formula is : <br>
<b> *Precision = True Positives / (True Positives + False Positives)* </b> <br> <br>
<b> Recall : </b> It is calculated as the number of true positives divided by the total number of true positives and false negatives. The result is a value between 0.0 for no recall and 1.0 for full or perfect recall. The formula is : <br>
<b> *Recall = True Positives / (True Positives + False Negatives)* </b> <br> <br>
<b> F1 score : </b> F1 Score is the harmonic mean of Precision and Recall. Therefore, this score takes both false positives and false negatives into account. The highest possible value of an F-score is 1.0, indicating perfect precision and recall, and the lowest possible value is 0, if either the precision or the recall is zero. The formula is : <br>
<b> *F1 score = 2\*((precision\*recall)/(precision+recall))* </b> <br> <br>

### <b>Analysing the probabilities and classification values </b>

In [None]:
# Classes for which individual models are created
print(oneVsrest.classes_)

# Coefficient matrix for all the models created.
# Recent scikit-learn releases no longer expose coef_ / intercept_ on the
# OneVsRestClassifier wrapper, so they are assembled from the fitted per-class
# estimators: one row per class.
Coeff_array = np.vstack([estimator.coef_ for estimator in oneVsrest.estimators_])
Intercept_array = np.vstack([estimator.intercept_ for estimator in oneVsrest.estimators_])

print(Coeff_array.shape)
print("\n Intercept Values")
print(Intercept_array)
print("\n Coefficient Values")
print(Coeff_array)

In [None]:
prediction_oneVsRest

In [None]:
print(f"Classification Report: \n\n{classification_report(y_test, prediction_oneVsRest)}")

### `Hyper Parameter` Tuning

In [None]:
### Manual Hyperparameter Tuning
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Baseline logistic regression with its hyperparameters spelled out so they
# can be edited by hand. `multi_class` is intentionally not passed: 'auto' is
# what the estimator does anyway, and the argument is deprecated in newer
# scikit-learn releases.
model = LogisticRegression(
    penalty='l2',
    dual=False,
    tol=0.0001,
    C=1.0,
    fit_intercept=True,
    intercept_scaling=1,
    class_weight=None,
    random_state=None,
    solver='lbfgs',
    max_iter=100,
    verbose=0,
    warm_start=False,
    n_jobs=None,
    l1_ratio=None,
).fit(X_train, y_train)

# Evaluate on the held-out test set
predictions = model.predict(X_test)
print(confusion_matrix(y_test,predictions))
print(accuracy_score(y_test,predictions))
print(classification_report(y_test,predictions))

##### `Randomized` Search Cv

In [None]:
import numpy as np
from sklearn.model_selection import RandomizedSearchCV

# Candidate values for the search. "No penalty" is spelled None; the string
# 'none' is rejected by newer scikit-learn releases.
solver = ['newton-cg', 'lbfgs', 'liblinear']
penalty = [None, 'l1', 'l2']
C = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100]

# Only pair each solver with a penalty it implements, so no search iteration
# is spent on a combination that is guaranteed to fail:
#   - newton-cg / lbfgs support an L2 penalty or no penalty
#   - liblinear supports L1 or L2
# ('elasticnet' is left out because none of these solvers implements it.)
random_grid = [
    {'solver': ['newton-cg', 'lbfgs'], 'penalty': [None, 'l2'], 'C': C},
    {'solver': ['liblinear'], 'penalty': ['l1', 'l2'], 'C': C},
]
print(random_grid)

In [None]:
rf=LogisticRegression()
rf_randomcv=RandomizedSearchCV(estimator=rf, param_distributions=random_grid, n_iter=500, scoring='accuracy', n_jobs=-1, random_state=1)
### fit the randomized model
rf_randomcv.fit(X_train,y_train)

In [None]:
rf_randomcv.best_params_

In [None]:
rf_randomcv

In [None]:
best_random_grid=rf_randomcv.best_estimator_

In [None]:
from sklearn.metrics import accuracy_score
y_pred=best_random_grid.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print("Accuracy Score {}".format(accuracy_score(y_test,y_pred)))
print("Classification report: {}".format(classification_report(y_test,y_pred)))

#### GridSearch CV

In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'solver': [rf_randomcv.best_params_['solver']],
    'penalty': [rf_randomcv.best_params_['penalty']],
    'C': [rf_randomcv.best_params_['C'], 
          rf_randomcv.best_params_['C']+2, 
          rf_randomcv.best_params_['C'] + 4]
}

print(param_grid)

In [None]:
#### Fit the grid_search to the data
rf=LogisticRegression()
grid_search=GridSearchCV(estimator=rf,param_grid=param_grid,n_jobs=-1,verbose=2)
grid_search.fit(X_train,y_train)

In [None]:
grid_search.best_estimator_

In [None]:
best_grid=grid_search.best_estimator_

In [None]:
best_grid

In [None]:
y_pred=best_grid.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print("Accuracy Score {}".format(accuracy_score(y_test,y_pred)))


In [None]:
print("Classification Report:\n {} ".format(classification_report(y_test,y_pred)))

In [None]:
# Report the accuracy the tuned model actually achieved on the test set,
# computed from the predictions, so the message cannot drift from the results.
print("Accuracy Of :", "`{:.2f} %` achieved".format(accuracy_score(y_test, y_pred) * 100))

In [None]:
y_pred

In [None]:
X_test = pd.DataFrame(X_test)

In [None]:
# Restore the feature names: X_test was rebuilt from the scaler's array output,
# so its columns are positional integers. Only the five feature columns exist;
# the target "Drug" was dropped from X before the split, so it is not mapped.
X_test = X_test.rename(columns={
    0: "Age",
    1: "Sex",
    2: "BP",
    3: "Cholesterol",
    4: "Na_to_K",
})

In [None]:
X_test.head()

In [None]:
drug_classification = drug_classification.reset_index()

In [None]:
drug_classification.head(4)

In [None]:
id = pd.DataFrame(id)

In [None]:
# Pull the held-out rows out of the original data, in test order.
# `id` holds the test-row labels as *values* (column 0) under a fresh 0..n-1
# index, so joining on the index would simply return the first n rows of the
# dataset. Select by label instead, then renumber the rows 0..n-1 so that the
# predictions (also numbered 0..n-1) line up with them positionally.
test_rows = drug_classification.loc[id[0]].reset_index(drop=True)
# Column 0 (the original row label) is still attached here because a later
# cell drops it.
drug_classification2 = test_rows.join(id, how="inner")

In [None]:
drug_classification2.head(3)

In [None]:
drug_classification2 = drug_classification2.drop(columns = 0)

In [None]:
y_pred = pd.DataFrame(y_pred)

In [None]:
y_pred = y_pred.reset_index()

In [None]:
y_pred.head(3)

In [None]:
drug_classification2 = drug_classification2.join(y_pred,lsuffix='xindex',rsuffix='yindex')

In [None]:
drug_classification2.head(3)

In [None]:
drug_classification2 = drug_classification2.drop(columns = ["indexxindex","indexyindex"])

In [None]:
drug_classification2 = drug_classification2.rename(columns ={0: "Pred"} )

In [None]:
drug_classification2.head(6)

In [None]:
# Pie chart of the predicted (still numerically encoded) classes.
# Counting once and sorting by class yields labels and slice sizes in the
# same order.
counts = drug_classification2['Pred'].value_counts().sort_index()
labels = counts.index.tolist()
sizes = counts.tolist()

fig1, ax1 = plt.subplots()
ax1.set_title("Categorical Pedictions")
# autopct prints each slice's percentage on the chart
ax1.pie(sizes, labels=labels, autopct='%1.1f%%', shadow=True)
ax1.axis('equal')  # equal aspect ratio keeps the pie circular
plt.show()


In [None]:
# List of variables to map
varlist = ['Pred']

# LabelEncoder numbers the classes in sorted order of the distinct labels, so
# the code -> name table is rebuilt from the original (un-encoded) Drug column
# rather than hard-coded in a different order.
drug_names = dict(enumerate(sorted(drug_classification["Drug"].unique())))


# Defining the map function
def binary_map(x):
    """Translate a Series of encoded drug codes back into drug names."""
    return x.map(drug_names)


# Applying the function to the prediction column(s)
drug_classification2[varlist] = drug_classification2[varlist].apply(binary_map)

In [None]:
drug_classification2.head(5)

In [None]:
# Pie chart of the predicted drug names.
# Counting once and sorting by name yields labels and slice sizes in the
# same order.
counts = drug_classification2['Pred'].value_counts().sort_index()
labels = counts.index.tolist()
sizes = counts.tolist()

fig1, ax1 = plt.subplots()
ax1.set_title("Categorical PedictionsOn Test DataSet")
# autopct prints each slice's percentage on the chart
ax1.pie(sizes, labels=labels, autopct='%1.1f%%', shadow=True)
ax1.axis('equal')  # equal aspect ratio keeps the pie circular
plt.show()

In [None]:
# Pie chart of the drug labels given in the data, for comparison with the
# predictions. Counting once and sorting by name yields labels and slice sizes
# in the same order.
counts = drug_classification2['Drug'].value_counts().sort_index()
labels = counts.index.tolist()
sizes = counts.tolist()

fig1, ax1 = plt.subplots()
ax1.set_title("Categorical Given On Test Dataset")
# autopct prints each slice's percentage on the chart
ax1.pie(sizes, labels=labels, autopct='%1.1f%%', shadow=True)
ax1.axis('equal')  # equal aspect ratio keeps the pie circular
plt.show()

In [None]:
drug_classification2.to_csv(r'/kaggle/working/submission.csv', index=False)