In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
data = pd.read_csv("../input/drug-classification/drug200.csv")

In [None]:
data

# Data Cleaning

In [None]:
data.isna().sum()

In [None]:
data.duplicated().sum()

In [None]:
# Shorten the drug labels. The result is assigned back to the column because
# calling replace(..., inplace=True) on data["Drug"] operates on an intermediate
# Series: pandas warns about that pattern and, with Copy-on-Write enabled,
# `data` itself is left unchanged.
data["Drug"] = data["Drug"].replace({"DrugY": "Y", "drugX": "X","drugA":"A","drugC":"C","drugB":"B"})

# EDA

In [None]:
def count_values(dataframe):
    """Print the value counts of every column of ``dataframe``.

    Each column produces one header line followed by the ``value_counts()``
    output and a blank separator. Nothing is returned.
    """
    for column_name in dataframe.columns.values.tolist():
        counts = dataframe[column_name].value_counts()
        print(f"Value Counts in {column_name} is: \n {counts}")
        print("\n")
count_values(data)

In [None]:
import matplotlib.pyplot as plt
# Pie chart of the (at most five) most frequent drug labels; keep the Axes
# returned by pandas so the aspect ratio can be set on it directly.
drug_pie_ax = data['Drug'].value_counts().head(5).plot.pie()

data_feyn=data  # second name bound to the same DataFrame object (not a copy)
drug_pie_ax.set_aspect('equal')

In [None]:
def var_distribution2(dataframe):
    """Plot histograms of ``dataframe`` and return the variance of its numeric columns.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to summarise. Non-numeric columns are allowed; they are left out
        of the returned variances.

    Returns
    -------
    pandas.Series
        Variance per numeric column, indexed by column name.
    """
    import matplotlib.pyplot as plt

    # DataFrame.hist picks the plottable columns itself, so no column
    # re-selection is needed before calling it.
    dataframe.hist(figsize=(14,14))
    plt.show()
    # numeric_only=True is required: without it recent pandas raises a
    # TypeError as soon as the frame contains string columns.
    return dataframe.var(numeric_only=True)
var_distribution2(data)

In [None]:
sns.catplot(x="Cholesterol", kind="count", palette="ch:.25", data=data)

In [None]:
sns.catplot(x="BP", kind="count", palette="ch:.25", data=data)

In [None]:
sns.catplot(x="Sex", kind="count", palette="ch:.25", data=data)

In [None]:
# Mean of the numeric columns per drug. numeric_only=True is needed because
# recent pandas raises a TypeError when asked to average string columns.
data.groupby("Drug").mean(numeric_only=True)

In [None]:
sns.catplot(x="Sex", y="Age", hue="Drug", kind="point", data=data)


In [None]:
sns.catplot(x="Sex", y="Na_to_K", hue="Drug", kind="point", data=data)

In [None]:
sns.violinplot(x=data.Drug, y=data.Age)

In [None]:
sns.violinplot(x=data.Drug, y=data.Na_to_K)

In [None]:
sns.FacetGrid(data, hue="Drug", height=6) \
   .map(sns.histplot, "Age") \
   .add_legend();
plt.suptitle("Age",size=28)
plt.show();

In [None]:
sns.FacetGrid(data, hue="Drug", height=6) \
   .map(sns.histplot, "Na_to_K") \
   .add_legend();
plt.suptitle("Na_to_K",size=28)
plt.show();

In [None]:
def corr(dataframe,target_variable):
    """Draw a correlation heatmap and rank columns by correlation with a target.

    Non-numeric columns (the target included) are converted to integer
    category codes on a copy of the frame so they can take part in the
    correlation; ``dataframe`` itself is not modified.
    NOTE(review): codes for unordered categories impose an arbitrary order, so
    their correlation values are only a rough indication.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Data to analyse.
    target_variable : str
        Name of the column the other columns are ranked against.

    Returns
    -------
    pandas.Series
        Absolute correlation of every column with ``target_variable``,
        sorted in descending order.
    """
    import seaborn as sns
    import matplotlib.pyplot as plt

    encoded = dataframe.copy()
    for column in encoded.select_dtypes(exclude="number").columns:
        # Missing values become code -1.
        encoded[column] = encoded[column].astype("category").cat.codes

    # Compute the matrix once, from the frame that was passed in (the previous
    # version read the notebook-level `data` variable for the ranking).
    correlation_matrix = encoded.corr()

    fig, ax = plt.subplots(figsize=(15,15))
    sns.heatmap(data=correlation_matrix.round(2), annot=True, ax=ax)

    correlation = correlation_matrix[target_variable].abs().sort_values(ascending = False)
    return correlation
corr(data,"Drug")

In [None]:
def Outliers(dataframe,cols):
    """Show one box plot per column in ``cols``, side by side in a single row.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Source of the plotted values.
    cols : list
        Column names to plot; each gets its own subplot, in the given order.

    Returns
    -------
    Whatever ``fig.show()`` returns.
    """
    import plotly.graph_objects as go
    from plotly.subplots import make_subplots

    column_names = list(cols)
    fig = make_subplots(rows=1, cols=len(cols))

    # Subplot columns are 1-based in plotly.
    for position, column_name in enumerate(column_names, start=1):
        series = dataframe[column_name]
        fig.add_trace(go.Box(y=series.values, name=series.name), row=1, col=position)

    return fig.show()

cols=data.columns.values.tolist()
Outliers(data,cols)

# Data Preprocessing

In [None]:
from sklearn import preprocessing

# Label-encode every column that is not listed in `subset`, then rebuild `data`
# as: the original frame without Sex/BP/Cholesterol + the encoded columns.
subset=["Age","Na_to_K","Drug"]
data2=data.drop(columns=subset).apply(preprocessing.LabelEncoder().fit_transform)
data=pd.concat([data.drop(columns=["Sex","BP","Cholesterol"]),data2],axis=1)

data

In [None]:
def feature_selector(dataframe,feature_number,target_variable):
    """Select ``feature_number`` predictors with recursive feature elimination.

    A liblinear logistic regression is used as the RFE estimator. For every
    predictor one line ``<ranking>: <selected>----> <column>`` is printed.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Predictors plus the target column.
    feature_number : int
        Number of predictors to keep.
    target_variable : str
        Name of the target column; it is excluded from the candidates.

    Returns
    -------
    list or None
        Names of the selected columns in their original order, or ``None``
        (after printing an error message) when the number of selected columns
        differs from ``feature_number``.
    """
    from sklearn.feature_selection import RFE
    from sklearn.linear_model import LogisticRegression

    target = dataframe[target_variable]
    features = dataframe.drop([target_variable],axis=1)

    # NOTE(review): recent scikit-learn releases may warn about liblinear on
    # multiclass targets - confirm against the installed version.
    estimator = LogisticRegression(solver="liblinear")
    # n_features_to_select must be passed by keyword; current scikit-learn
    # rejects it as a positional argument.
    rfe = RFE(estimator, n_features_to_select=feature_number)
    rfe = rfe.fit(features, target)

    column_names = features.columns.values.tolist()
    for rank, is_selected, name in zip(rfe.ranking_, rfe.support_, column_names):
        print(f"{rank}: {is_selected}----> {name}")

    cols_selected = [name for name, is_selected in zip(column_names, rfe.support_) if is_selected]

    if len(cols_selected)==feature_number:
        return cols_selected
    print("ERROR!. Cols_Selected does not meet feature_number requirements")
    return None

        
feature_selector(data,3,"Drug")


In [None]:
def VIF(dataframe,chosen_cols):
    from statsmodels.stats.outliers_influence import variance_inflation_factor as VIF
    from statsmodels.tools.tools import add_constant
    X=dataframe[chosen_cols]
    X=add_constant(X)
    vif_data=pd.DataFrame()
    vif_data["feature"]=X.columns
    vif_data["VIF"]=[VIF(X.values, i) for i in range(len(X.columns))]
    return vif_data

In [None]:
chosen_cols=['Na_to_K', 'BP', 'Cholesterol']
VIF(data,chosen_cols)

# Predicting

**LOGISTIC REGRESSION**

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import linear_model
import pandas as pd
from sklearn import metrics
import numpy as np

# Fit a logistic regression on the chosen predictors with a fixed 80/20 split.
X=data[chosen_cols]
Y=data["Drug"]
X_train,X_test,Y_train,Y_test= train_test_split(X,Y,test_size=0.2,random_state=0)
lm=linear_model.LogisticRegression(max_iter=1000)
lm.fit(X_train,Y_train)
predict=lm.predict(X_test)
probs=lm.predict_proba(X_test)
# Column 1 of predict_proba is the probability of lm.classes_[1] only.
# NOTE(review): "Drug" appears to have more than two classes, so the
# "positive" flag below refers to that single class, not to a binary outcome.
prob=probs[:,1]
prob_df=pd.DataFrame(prob)

treshold = 0.25

print(f"Prediction Accuracy (Test): {metrics.accuracy_score(Y_test,predict)}")
print()
prob_df["prediction"]= np.where(prob_df[0]> treshold,1,0)
print(prob_df)
print(f"\nTreshold: {treshold}")
print()
con_tab=pd.crosstab(prob_df["prediction"],columns="Count")
# Take the share from the 0/1 flags directly: indexing con_tab.values[1]
# raises IndexError when no row exceeds the threshold, reports the wrong row
# when every row does, and prints an array instead of a number.
positive_share=prob_df["prediction"].mean()*100
print(f"Number of Positive Cases: {positive_share}%")
con_tab
    

**DECISION TREE**

In [None]:
def sklearn_decision_tree_clasiffier(dataframe,chosen_cols,target_variable,max_depth):
    """Fit an entropy-based decision tree and print train/test accuracy.

    The data is split 80/20 with ``random_state=0``. ``min_samples_split`` is
    one sixteenth of the number of rows, but never less than 2.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Predictors plus the target column.
    chosen_cols : list
        Names of the predictor columns.
    target_variable : str
        Name of the target column.
    max_depth : int or None
        Maximum depth handed to ``DecisionTreeClassifier``.

    Returns
    -------
    None
        The two accuracies are printed, not returned.
    """
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.model_selection import train_test_split
    from sklearn import metrics

    X=dataframe[chosen_cols]
    Y=dataframe[target_variable]
    X_train,X_test,Y_train,Y_test= train_test_split(X,Y,test_size=0.2,random_state=0)

    # scikit-learn rejects an integer min_samples_split below 2, which the
    # plain len/16 rule produced for frames with fewer than 32 rows.
    min_split=max(2,len(dataframe)//16)
    # random_state fixes the tree's internal tie-breaking so results repeat.
    tree=DecisionTreeClassifier(criterion="entropy",min_samples_split=min_split,max_depth=max_depth,random_state=0)
    tree.fit(X_train,Y_train)
    predict=tree.predict(X_test)
    predict2=tree.predict(X_train)
    print(f"Prediction Accuracy (Train): {metrics.accuracy_score(Y_train,predict2)}")
    print(f"Prediction Accuracy (Test): {metrics.accuracy_score(Y_test,predict)}")
    

In [None]:
sklearn_decision_tree_clasiffier(data,chosen_cols,"Drug",5)

**RANDOM FOREST**

In [None]:
def sklearn_Random_Forest_Clasiffier(dataframe,chosen_cols,target_variable,n_estimators=10,random_state=0):
    """Fit a random forest and print train/test accuracy.

    The data is split 80/20 with ``random_state=0``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Predictors plus the target column.
    chosen_cols : list
        Names of the predictor columns.
    target_variable : str
        Name of the target column.
    n_estimators : int, default 10
        Number of trees in the forest.
    random_state : int or None, default 0
        Seed for the forest, so repeated runs print the same accuracies.
        Pass ``None`` for an unseeded forest.

    Returns
    -------
    None
        The two accuracies are printed, not returned.
    """
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import train_test_split
    from sklearn import metrics

    X=dataframe[chosen_cols]
    Y=dataframe[target_variable]
    X_train,X_test,Y_train,Y_test= train_test_split(X,Y,test_size=0.2,random_state=0)
    forest=RandomForestClassifier(n_jobs=2,n_estimators=n_estimators,random_state=random_state)
    forest.fit(X_train,Y_train)
    predict=forest.predict(X_test)
    predict2=forest.predict(X_train)
    print(f"Prediction Accuracy (Train): {metrics.accuracy_score(Y_train,predict2)}")
    print(f"Prediction Accuracy (Test): {metrics.accuracy_score(Y_test,predict)}")

In [None]:
sklearn_Random_Forest_Clasiffier(data,chosen_cols,"Drug")

**KNN**

In [None]:
def Sklearn_KNMC(dataframe,chosen_cols,target_variable):
    """Fit a k-nearest-neighbours classifier and print train/test accuracy.

    Uses ``KNeighborsClassifier`` with its default settings on an 80/20 split
    made with ``random_state=0``. The accuracies are printed; nothing is
    returned.
    """
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import accuracy_score

    features = dataframe[chosen_cols]
    labels = dataframe[target_variable]
    features_train, features_test, labels_train, labels_test = train_test_split(
        features, labels, test_size=0.2, random_state=0
    )

    knn = KNeighborsClassifier()
    knn.fit(features_train, labels_train)

    test_predictions = knn.predict(features_test)
    train_predictions = knn.predict(features_train)
    print(f"Prediction Accuracy (Train): {accuracy_score(labels_train,train_predictions)}")
    print(f"Prediction Accuracy (Test): {accuracy_score(labels_test,test_predictions)}")

In [None]:
Sklearn_KNMC(data,chosen_cols,"Drug")

**Our best models are logistic regression and random forest, both with 95% test accuracy.**