In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Print the full path of every file found under the Kaggle input directory.
input_root = '/kaggle/input'
for folder, _subdirs, names in os.walk(input_root):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

## Import libraries to graph

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Use seaborn's 'darkgrid' style for the plots drawn after this call.
sns.set_style('darkgrid')

## Data Characteristics

In [None]:
# Read the drug-classification CSV into `drug` and preview the first rows.
DRUG_CSV = '/kaggle/input/drug-classification/drug200.csv'
drug = pd.read_csv(DRUG_CSV)

drug.head()

In [None]:
# Count missing values per column. `isnull` is only an alias of `isna`,
# so one call gives the complete picture.
print(drug.isna().sum())

In [None]:
# `DataFrame.info()` prints its report itself and returns None, so it is
# called directly; wrapping it in print() only adds a stray "None" line.
drug.info()
print('===========================')
print(drug.describe())

## Graph Drug->Count

In [None]:
# Show the class frequencies (a bare expression in the middle of a cell is
# not displayed, so it has to be printed).
print(drug.Drug.value_counts())

plt.figure(figsize=(9,5))

# Pass the column by keyword: newer seaborn releases treat the first
# positional argument as `data`, not as `x`.
sns.countplot(x='Drug', data=drug)
plt.title('Count-Drug')
plt.show()

## Graph Sex->Count

In [None]:
plt.figure(figsize=(9,5))

# Pass the column by keyword: newer seaborn releases treat the first
# positional argument as `data`, not as `x`.
sns.countplot(x='Sex', data=drug)
plt.title('Count-Sex')
plt.show()

## Graph Age->Dist

In [None]:
plt.figure(figsize=(9,5))

print('Age-min: {}'.format(drug.Age.min()))
print('Age-max: {}'.format(drug.Age.max()))

# `distplot` is deprecated in seaborn. `histplot` with a KDE overlay and a
# density-normalised y-axis draws the equivalent histogram + density curve.
sns.histplot(data=drug, x='Age', kde=True, stat='density')
plt.title('Dist-Age')
plt.show()

## Graph BP->Count

In [None]:
print(drug.BP.value_counts())

plt.figure(figsize=(9,5))

# Pass the column by keyword: newer seaborn releases treat the first
# positional argument as `data`, not as `x`.
sns.countplot(x='BP', data=drug)
plt.title('BP-Count')
plt.show()

## Graph Na_to_K->Dist

In [None]:
plt.figure(figsize=(9,5))

print('Na_to_K min: {}'.format(drug.Na_to_K.min()))
print('Na_to_K max: {}'.format(drug.Na_to_K.max()))

# `distplot` is deprecated in seaborn. `histplot` with a KDE overlay and a
# density-normalised y-axis draws the equivalent histogram + density curve.
sns.histplot(data=drug, x='Na_to_K', kde=True, stat='density')
plt.title('Dist-Na_to_K')
plt.show()

## Graph Cholesterol->Count

In [None]:
print(drug.Cholesterol.value_counts())

plt.figure(figsize=(9,5))

# Pass the column by keyword: newer seaborn releases treat the first
# positional argument as `data`, not as `x`.
sns.countplot(x='Cholesterol', data=drug)
plt.title('Count-Cholesterol')
plt.show()


# Analysis of each feature against Drug

## Graph Drug->Na_to_K

In [None]:
# Swarm plot of the Na_to_K ratio for each drug class.
fig, ax = plt.subplots(figsize=(9, 5))
sns.swarmplot(data=drug, x='Drug', y='Na_to_K', ax=ax)
ax.set_title('Drug-Na_to_K')
plt.show()

# Smallest Na_to_K value among the patients who received DrugY.
drug_y_ratio = drug.loc[drug['Drug'] == 'DrugY', 'Na_to_K']
print('min DrugY: {}'.format(drug_y_ratio.min()))


# An important characteristic for DrugY

## Graph Drug->Age

In [None]:
# Swarm plot of patient age for each drug class.
fig, ax = plt.subplots(figsize=(9, 5))
sns.swarmplot(data=drug, x='Drug', y='Age', ax=ax)
ax.set_title('Drug-Age')
plt.show()

#less important

## Graph Drug->Sex

In [None]:
# Number of patients for every (Drug, Sex) pair present in the data.
DrugSex = (
    drug.groupby(['Drug', 'Sex'])
        .size()
        .reset_index(name='Count')
)

fig, ax = plt.subplots(figsize=(9, 5))
sns.barplot(data=DrugSex, x='Drug', y='Count', hue='Sex', ax=ax)
ax.set_title('Drug-Sex')
plt.show()

# Nothing important

## Graph Drug->BP

In [None]:
# Number of patients for every (Drug, BP) pair present in the data.
DrugBP = (
    drug.groupby(['Drug', 'BP'])
        .size()
        .reset_index(name='Count')
)

fig, ax = plt.subplots(figsize=(9, 5))
sns.barplot(data=DrugBP, x='Drug', y='Count', hue='BP', ax=ax)
ax.set_title('Drug-BP')
plt.show()


# Drugs A and B are only used by people with high blood pressure.
# Drug C is only used by people with low blood pressure.
# Drug X is not used for patients with high blood pressure.


## Graph Drug->Cholesterol

In [None]:
# Number of patients for every (Drug, Cholesterol) pair present in the data.
DrugChol = (
    drug.groupby(['Drug', 'Cholesterol'])
        .size()
        .reset_index(name='Count')
)

fig, ax = plt.subplots(figsize=(9, 5))
sns.barplot(data=DrugChol, x='Drug', y='Count', hue='Cholesterol', ax=ax)
ax.set_title('Drug-Cholesterol')
plt.show()

# Drug C is only used for patients with high cholesterol.
# The drugs (A, B, and Y) appear to be balanced.
# Drug X is used in both cases, but more in patients with high cholesterol.

# WE LOOK FOR RELATIONSHIPS BETWEEN VARIABLES

## Figure 1. Graph Drug->Na_to_K with Cholesterol
## Figure 2. Graph Drug->Na_to_K with BP

In [None]:
# Two stacked panels: Na_to_K per drug, coloured by Cholesterol (top)
# and by BP (bottom).
fig, (ax_chol, ax_bp) = plt.subplots(2, 1, figsize=(13, 10))

sns.swarmplot(data=drug, x='Drug', y='Na_to_K', hue='Cholesterol', ax=ax_chol)
ax_chol.set_title('Drug-Na_to_K-Cholesterol')

sns.swarmplot(data=drug, x='Drug', y='Na_to_K', hue='BP', ax=ax_bp)
ax_bp.set_title('Drug-Na_to_K-BP')

plt.show()


# Drug C is used only for people with high cholesterol and low blood pressure.
# We create a new column whose value is 0 if Na_to_K is less than 15.015 and 1 if it is greater than or equal to that value.
# Na_to_k is very important


## We can create a new variable with the information from the Na_to_K column

### 0 <- if Na_to_K is less than 15.015
### 1 <- if Na_to_K is greater than or equal to 15.015

In [None]:
# Binary flag: 1 where Na_to_K is at least 15.015, 0 otherwise.
drug['GreaterNa15'] = drug['Na_to_K'].ge(15.015).astype(int)

drug.head()

## Graph New Column Drug->GreaterNa15

In [None]:
DrugGreater = drug.groupby(['Drug','GreaterNa15']).size().reset_index(name='Count')

plt.figure(figsize=(9,5))

sns.barplot(x='Drug', y='Count', hue='GreaterNa15', data = DrugGreater)
plt.title('Drug-GreaterNa15')
plt.show()

# Encode categorical data as integers

In [None]:
from sklearn.impute import SimpleImputer

def impute(value, imp, dta):
    """Replace categorical labels in ``drug[dta]`` with integer codes, in place.

    Parameters
    ----------
    value : sequence
        Labels currently stored in the column, e.g. ``['LOW', 'NORMAL', 'HIGH']``.
    imp : sequence of int
        Integer code for each label, paired with ``value`` by position.
    dta : str
        Name of the column of the global ``drug`` frame to encode.

    Raises
    ------
    ValueError
        If the column still holds a value that cannot be cast to ``int``
        after the mapping (i.e. a label missing from ``value``).

    Notes
    -----
    A plain label -> code mapping is used instead of running ``SimpleImputer``
    once per label on ``drug[dta][:, np.newaxis]``; multi-dimensional indexing
    of a Series is no longer supported by pandas. Values that are not in
    ``value`` are left untouched, so calling the function again on an already
    encoded column is a no-op.
    """
    codes = dict(zip(value, imp))

    # Unknown values map to themselves, which keeps re-runs harmless.
    drug[dta] = drug[dta].map(lambda label: codes.get(label, label)).astype(int)
    
    

### Values we use to impute

### run only once

In [None]:
# Ordinal code of every label, keyed by column name. The three parallel
# lists consumed further down are derived from this single table.
ordinal_codes = {
    'BP': {'LOW': 0, 'NORMAL': 1, 'HIGH': 2},
    'Cholesterol': {'NORMAL': 0, 'HIGH': 1},
}

feature = list(ordinal_codes)
Vimp = [list(codes.values()) for codes in ordinal_codes.values()]
Val = [list(codes) for codes in ordinal_codes.values()]

def imp(value, imput, feature):
    """Call ``impute`` once per column in ``feature``.

    ``value[k]`` and ``imput[k]`` are the labels and the replacement codes
    for the k-th column name in ``feature``.
    """
    for idx, column in enumerate(feature):
        labels, codes = value[idx], imput[idx]
        impute(labels, codes, column)
    
# Encode the BP and Cholesterol columns of `drug` with the integer codes above.
imp(Val, Vimp, feature)

In [None]:
# Preview the frame to check the integer-encoded BP and Cholesterol columns.
drug.head()

# Label Encoder

In [None]:
from sklearn.preprocessing import LabelEncoder

def Encoder(feature):
    """Label-encode ``drug[feature]`` in place and return the fitted encoder.

    Parameters
    ----------
    feature : str
        Name of the column of the global ``drug`` frame to encode.

    Returns
    -------
    LabelEncoder
        The fitted encoder. Keeping it lets the integer codes be mapped back
        to the original labels later; callers that ignore the return value
        behave exactly as before.
    """
    le = LabelEncoder()

    drug[feature] = le.fit_transform(drug[feature])

    return le


### Encoder Drug

In [None]:
# Replace the Drug labels with LabelEncoder integer codes, then preview the result.
Encoder('Drug')

drug.head()

In [None]:
# Print the frame summary again to check the columns after encoding.
drug.info()

# Graph Boxplot
### we look for outliers

In [None]:
def boxplot(dta):
    """Draw a horizontal box plot of the column ``dta`` of the global ``drug`` frame.

    Parameters
    ----------
    dta : str
        Column name; it is also used as the figure title.
    """
    plt.figure(figsize=(9,5))

    # Pass the values by keyword: newer seaborn releases treat the first
    # positional argument as `data`, not as `x`.
    sns.boxplot(x=drug[dta])
    plt.title(dta)
    plt.show()

boxplot('Na_to_K')

# Outliers

## Removing outliers
#### repeat the process as many times as necessary

In [None]:
# Drop Na_to_K outliers with the 1.5 * IQR rule, applied in two successive passes.
for outlier_pass in range(2):

    na_to_k = drug['Na_to_K']

    # Summary statistics of the column as it stands at the start of this pass.
    mn = na_to_k.min()
    mx = na_to_k.max()
    Q1 = na_to_k.quantile(0.25)
    Q3 = na_to_k.quantile(0.75)
    median = na_to_k.median()
    IQR = Q3 - Q1

    # Fences, clamped so they never fall outside the observed range.
    Qinf = max(Q1 - 1.5*IQR, mn)
    Qsup = min(Q3 + 1.5*IQR, mx)

    # Keep only the rows whose ratio lies inside [Qinf, Qsup] (both ends included).
    drug = drug[na_to_k.between(Qinf, Qsup)]

    print('Min: {}'.format(mn))
    print('Max: {}'.format(mx))
    print('Quantile 25%: {}'.format(Q1))
    print('Median: {}'.format(median))
    print('Quantile 75%: {}'.format(Q3))
    print('Interquartile range: {}'.format(IQR))
    print('Lower quantile: {}'.format(Qinf))
    print('Upper quantile: {}'.format(Qsup))
    print('Shape: {}'.format(drug.shape))

    boxplot('Na_to_K')

# We prepare data for model training
#### -without the sex column because it does not have important information

In [None]:
# Feature matrix (every predictor except Sex) and the target vector.
feature_columns = ['Age', 'BP', 'Cholesterol', 'Na_to_K', 'GreaterNa15']
frame = [drug[column] for column in feature_columns]

X_data = pd.concat(frame, axis=1)
y = drug['Drug'].copy()


# Scaling Data
## data = data / mean(data)

In [None]:
# Data scaling: divide every column by its own mean.
# `X_data.mean()` is used rather than `np.mean(X_data)` because the latter
# forwards `axis=None` to pandas, which in pandas 2.x collapses the whole
# frame to a single scalar instead of returning one mean per column.
X = X_data / X_data.mean()

X.head()

# train_test_split and LogisticRegression(parameters)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Hold out 40% of the rows for testing (fixed seed), then fit the classifier
# on the remaining rows.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=23
)

lr_params = dict(
    penalty='l2',
    C=0.67,
    solver='newton-cg',
    multi_class='multinomial',
    max_iter=100,
    random_state=34,
)
lr = LogisticRegression(**lr_params)
lr.fit(X_train, y_train)


## Classification metric functions

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

def ClasReport(model, y, X):
    """Print the classification report of ``model`` and plot its confusion matrix.

    Parameters
    ----------
    model : fitted estimator
        Anything exposing ``predict(X)``.
    y : array-like
        True labels for ``X``.
    X : array-like
        Samples to predict.
    """
    pred = model.predict(X)

    print(classification_report(y, pred))

    cm_lr = confusion_matrix(y, pred)

    # fmt='d' keeps the cell annotations as plain integers instead of the
    # default '.2g' format, which switches to scientific notation for large counts.
    ax = sns.heatmap(cm_lr, annot=True, fmt='d', cbar=False, cmap='Blues')
    # confusion_matrix puts the true classes on rows and predictions on columns.
    ax.set_xlabel('Predicted label')
    ax.set_ylabel('True label')

    plt.show()

## Print metrics

In [None]:
# Accuracy, classification report and confusion matrix on the held-out test
# split, then the same on the training split for comparison.
print('\t\t\tScore TEST: {}'.format(lr.score(X_test, y_test)))
ClasReport(lr, y_test, X_test)

print('\t\t======================================')

print()
# The training score is printed once; the original repeated it on a second line.
print('\t\t\tScore TRAIN: {}'.format(lr.score(X_train, y_train)))
ClasReport(lr, y_train, X_train)

# Cross_val_score

In [None]:
from sklearn.model_selection import cross_val_score

def val_cross(model, _X, _y, folds=(5, 4, 3)):
    """Print per-fold cross-validation scores and their mean for several fold counts.

    Parameters
    ----------
    model : estimator
        Estimator passed to ``cross_val_score``.
    _X, _y : array-like
        Features and target to cross-validate on.
    folds : iterable of int, optional
        Fold counts to evaluate, in order. Defaults to ``(5, 4, 3)``.

    Notes
    -----
    Every figure is computed from ``_X`` and ``_y``. The earlier version
    evaluated the 4-fold "Total" on the module-level ``X`` and ``y``, so it
    reported the wrong mean whenever other data (e.g. a resampled set) was
    passed in. Each cross-validation is also run once instead of twice.
    """
    for k in folds:
        scores = cross_val_score(model, _X, _y, cv=k)

        print('{} CV:\n{}'.format(k, scores))
        # "Total" is the mean of the k fold scores.
        print('Total: {}'.format(sum(scores) / k))
    
# Cross-validate the logistic-regression model on the scaled, unbalanced data.
val_cross(lr, X, y)

# We balance the data

1. Balancing with SMOTETomek

In [None]:
from imblearn.combine import SMOTETomek

# Balance the classes with SMOTETomek.
# `fit_resample` is the current imbalanced-learn API; the old `fit_sample`
# alias is no longer available in recent releases.
SmoteT = SMOTETomek(sampling_strategy='auto', random_state=0)

X_res, y_res = SmoteT.fit_resample(X, y)

print('X-X_res shape: {}-{}'.format(X.shape, X_res.shape))
print('y-y_res shape: {}-{}'.format(y.shape, y_res.shape))

### cross_val_score with SMOTETomek

In [None]:
# Cross-validate the same model on the SMOTETomek-resampled data.
val_cross(lr, X_res, y_res)

2. Balancing with SMOTEENN

In [None]:
from imblearn.combine import SMOTEENN

# Balance the classes with SMOTEENN.
# `fit_resample` is the current imbalanced-learn API; the old `fit_sample`
# alias is no longer available in recent releases.
SmoteENN = SMOTEENN(sampling_strategy='auto', random_state=0)

Xres, yres = SmoteENN.fit_resample(X, y)

print('X-Xres shape: {}-{}'.format(X.shape, Xres.shape))
print('y-yres shape: {}-{}'.format(y.shape, yres.shape))

### cross_val_score with SMOTEENN

In [None]:
# Cross-validate the same model on the SMOTEENN-resampled data.
val_cross(lr, Xres, yres)

# Conclusion

> As we can see, logistic regression (a probabilistic model) performs very well on this dataset, provided suitable hyperparameters are chosen.

> We can rule out overfitting because the training and test metrics differ only slightly. The model also keeps its performance after the classes are balanced, as the cross-validation scores on the resampled data show.