# Importing Libraries

In [None]:
#Importing Libraries


#Graphic Libraries
import seaborn as sns
import plotly.graph_objects as go

#Statistical Inference Analysis
import statsmodels.api as sm
import scipy.stats as stats

#Loading Dataset
import numpy as np 
import pandas as pd

#Data Processing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score

#Feature Selection
from sklearn.model_selection import train_test_split

#Model
from sklearn.svm import SVC


#Validation 
from sklearn.metrics import f1_score

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in files):
        print(path)

import warnings
# Install an 'ignore' filter with no category/module restriction, so warnings
# raised after this point are not shown.
warnings.filterwarnings(action='ignore')

# Loading the Dataset

In [None]:
# Read the drug-classification CSV, report its (rows, columns) shape and
# preview the first rows.
DATA_PATH = '../input/drug-classification/drug200.csv'
df = pd.read_csv(DATA_PATH)
print(df.shape)
df.head()

In [None]:
# Donut chart of how often each drug occurs.
# Labels and values are both taken from the SAME value_counts() result so that
# every slice is paired with its own count. `unique()` returns values in order
# of first appearance while `value_counts()` sorts by frequency, so mixing the
# two would attach counts to the wrong drug names.
drug_counts = df['Drug'].value_counts()
labels = drug_counts.index.tolist()
values = drug_counts.tolist()


fig = go.Figure(data=[go.Pie(labels=labels, values=values, hole=.5)])
fig.show()

The dataset is imbalanced: DrugA and DrugB make up a much smaller proportion of the rows than DrugY.

In [None]:
# Preview the first five rows of the raw data.
df.head(n=5)

# Content
1. The target feature is
* Drug type
2. The feature sets are:
* Age
* Sex
* Blood Pressure Levels (BP)
* Cholesterol Levels
* Na to Potassium Ratio

# EDA

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# For every object-dtype column, report how many distinct values it holds and
# list them, followed by a separator line.
for name in df.select_dtypes('O').columns:
    distinct = df[name].unique()
    print('We have {} unique values in {} column : {}'.format(len(distinct), name, distinct))
    print('__'*30)


In [None]:
# Relative frequency of each drug class as a bar chart.
# One colour per bar: the notes under this cell list five drug classes, and a
# four-colour list would be cycled so the first and last bars share a colour.
df['Drug'].value_counts(normalize=True).plot.bar(
    color=['green', 'blue', 'purple', 'red', 'orange'],
    edgecolor='black',
    title='target variable',
)

* Around 12% of people consume DrugA
* Around 8% of people consume DrugB
* Around 8% of people consume DrugC
* Around 27% of people consume DrugX
* Around 45% of people consume DrugY



In [None]:
# Share of each value of the Sex column, drawn as a bar chart.
sex_share = df['Sex'].value_counts(normalize=True)
sex_share.plot.bar(color=['cyan', 'magenta'], edgecolor='black', title='sex variable')

* Around 52% are Male
* Around 48% are Female

In [None]:
# Share of each value of the Cholesterol column, drawn as a bar chart.
cholesterol_share = df['Cholesterol'].value_counts(normalize=True)
cholesterol_share.plot.bar(color=['orange', 'yellow'], edgecolor='black', title='Cholestrol variable')

* Around 52% people have High Cholesterol
* Around 48% people have Normal Cholesterol

In [None]:
# Share of each value of the BP column, drawn as a bar chart.
bp_share = df['BP'].value_counts(normalize=True)
bp_share.plot.bar(color=['orange', 'yellow', 'brown'], edgecolor='black', title='Blood Pressure variable')

* Around 38% people have High blood pressure
* Around 32% people have Low blood pressure
* Around 30% people have Normal blood pressure

# Features Conversion

In [None]:
# Encode the three categorical predictors as numeric codes. Each list fixes the
# category -> code mapping by position (first entry -> 0.0, second -> 1.0, ...).
# NOTE(review): the BP list is alphabetical, so HIGH=0, LOW=1, NORMAL=2 rather
# than the natural LOW < NORMAL < HIGH order -- confirm this is intended before
# relying on the codes as ordered quantities.
bp = ['HIGH','LOW','NORMAL']
cholestrol = ['HIGH','NORMAL']
sex = ['M','F']
ordi = OrdinalEncoder(categories=[bp,cholestrol,sex])
# Carry over df's index so a column-wise concat with df aligns row-for-row even
# when df's index is not the default 0..n-1 range (otherwise rows would be
# matched by label and filled with NaN where the labels differ).
DF = pd.DataFrame(
    ordi.fit_transform(df[['BP','Cholesterol','Sex']]),
    columns=['BP_','Cholesterol_','Sex_'],
    index=df.index,
)

In [None]:
# Append the encoded columns side by side, then discard the original text
# columns they replace.
df = pd.concat([df, DF], axis=1).drop(columns=['Sex', 'BP', 'Cholesterol'])

In [None]:
# Convert the drug names to integer class codes in a new 'Drug_' column and
# remove the original 'Drug' column.
LE = LabelEncoder()
df = df.assign(Drug_=LE.fit_transform(df['Drug'])).drop(columns='Drug')

In [None]:
# Preview the first five rows after encoding.
df.head(n=5)

# Splitting the data into Training & Testing Set

In [None]:
# Separate the predictors (every column except the target) from the encoded
# target column.
target_col = 'Drug_'
X = df.drop(columns=target_col)
y = df[target_col]

In [None]:
# Hold out 30% of the rows for testing.
# - random_state pins the shuffle so the split, and every score computed from
#   it, is the same on each run.
# - stratify=y keeps the class proportions equal in the train and test parts,
#   which matters because the drug classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, shuffle=True, random_state=42, stratify=y)

# Statistical Inference

In [None]:
# Fit an ordinary-least-squares regression of the encoded target on the
# predictors and print the statsmodels summary report.
# NOTE(review): X is passed as-is (no constant column is added in this cell),
# so the fit has no intercept term unless X already contains one -- confirm.
ols_spec = sm.OLS(endog=y, exog=X)
log_reg = ols_spec.fit()
print(log_reg.summary())

In [None]:
# Show the fitted OLS coefficients.
log_reg.params

Note :
These are the model parameters

In [None]:
# Show the R-squared of the fitted OLS model.
log_reg.rsquared

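The OLS model has an R-squared of about 0.68, i.e. it explains roughly 68% of the variation in the encoded target. Note that R-squared is a goodness-of-fit measure, not a classification accuracy.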

# **Model Selection**

 **Support Vector Machine Model**

In [None]:
# Linear-kernel support vector classifier, assessed by cross-validation on the
# training split.
clf = SVC(kernel='linear')
# Fit on the whole training split so `clf` itself is a trained model afterwards.
clf.fit(X_train, y_train)
# Per-fold scores, using the estimator's default scorer and the default folds.
clf_SVC_score = cross_val_score(clf, X_train, y_train)
# Cross-validated prediction for every training row.
clf_SVC_predict = cross_val_predict(clf, X_train, y_train)
# Micro-averaged F1 of the cross-validated predictions against the true labels.
score = f1_score(y_train, clf_SVC_predict, average='micro')
# np.round_ was removed in NumPy 2.0; np.round is the supported spelling.
# Scale to percent BEFORE rounding so the printed whole-percent value carries
# no floating-point artefact (e.g. 56.99999999999999) from multiplying after.
print(f"The accuracy of the model is {np.round(score * 100)}")