##Loading and reading of dataset

In [None]:
import pandas as pd
import numpy as np
import io
import matplotlib.pyplot as plt

In [None]:
# Read the CSV (path is relative to the notebook's working directory) into the
# DataFrame `dataf`, which the rest of the notebook works from.
dataf = pd.read_csv('../input/persistent-vs-nonpersistent/Persistent_vs_NonPersistent.csv')

In [None]:
# Preview the first rows of the loaded frame (pandas shows 5 by default).
dataf.head()

In [None]:
# Preview the last rows of the loaded frame (pandas shows 5 by default).
dataf.tail()

##Descriptive Analysis of dataset

In [None]:
# Summary statistics for the frame. `describe` is a method and must be called;
# without the parentheses the cell only displays the bound-method repr.
dataf.describe()

In [None]:
# Print the column names, non-null counts and dtypes of the frame.
dataf.info()

In [None]:
# Report the dataset's dimensions followed by the list of column names.
n_rows, n_cols = dataf.shape
feature_names = dataf.columns.tolist()
print ("Rows     : " , n_rows)
print ("Columns  : " , n_cols)
print ("\nFeatures : \n" , feature_names)

In [None]:
# Per-column boolean: True where the column contains at least one null value.
print ("\nMissing values :  ", dataf.isnull().any())

In [None]:
# Number of distinct values in each column.
print ("\nUnique values :  \n",dataf.nunique())

In [None]:
# Number of null values in each column.
dataf.isnull().sum()

In [None]:
# Row count for each distinct value of the target column, to check class balance.
dataf.value_counts('Persistency_Flag')

##Exploratory Data Analysis

In [None]:
import seaborn as sns

In [None]:
# Bar chart of the number of rows for each Persistency_Flag value.
sns.countplot(x="Persistency_Flag",data=dataf, dodge=True)

In [None]:
# Row counts per Persistency_Flag value, split by Tscore_Bucket_Prior_Ntm.
sns.countplot(x="Persistency_Flag",hue='Tscore_Bucket_Prior_Ntm', data=dataf)

In [None]:
# Row counts per Persistency_Flag value, split by Adherent_Flag.
sns.countplot(x="Persistency_Flag",hue='Adherent_Flag', data=dataf)

In [None]:
# Row counts per Persistency_Flag value, split by Injectable_Experience_During_Rx.
sns.countplot(x="Persistency_Flag",hue='Injectable_Experience_During_Rx', data=dataf)

In [None]:
# Row counts per Persistency_Flag value, split by Age_Bucket.
sns.countplot(x="Persistency_Flag", hue='Age_Bucket', data=dataf)

In [None]:
# Row counts per Persistency_Flag value, split by Gender.
sns.countplot(x="Persistency_Flag", hue='Gender', data=dataf)

In [None]:
# Row counts per Persistency_Flag value, split by Count_Of_Risks.
sns.countplot(x="Persistency_Flag", hue='Count_Of_Risks', data=dataf)

##Separating input variables and target variables

In [None]:
# Pull the label column out as `y`; every remaining column becomes a feature in `X`.
target_column = 'Persistency_Flag'
y = dataf[target_column]
X = dataf.drop(columns=target_column)

In [None]:
# Preview the feature frame (target column removed).
X.head()

In [None]:
# Preview the raw (not yet encoded) target values.
y.head()

In [None]:
# Print the shapes of the feature frame and the target series.
print(X.shape)
print(y.shape)

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Encode the target labels as integers. `le` is kept so the mapping back to
# the original labels (le.classes_) stays available.
y_flat = np.ravel(y)
le = LabelEncoder()
le.fit(y_flat)
target = le.transform(y_flat)

##Encoded target variable

In [None]:
# Display the integer-encoded target array.
target

##Target Class
Class "1" is Persistent\
Class "0" is Non-Persistent

##Label Encoder for Non-Numeric columns

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline

class MultiColumnLabelEncoder:
    """Label-encode several DataFrame columns in one call.

    Exposes the fit / transform / fit_transform method trio. No state is
    learned in `fit`: every call to `transform` fits a fresh `LabelEncoder`
    per column on the data it is given, so the integer codes are only
    meaningful within a single call.

    Parameters
    ----------
    columns : iterable of column names, or None
        Columns to encode. When None, every column of the frame is encoded.
    """

    def __init__(self,columns = None):
        self.columns = columns # array of column names to encode

    def fit(self,X,y=None):
        """No-op; returns self so calls can be chained."""
        return self # not relevant here

    def transform(self,X):
        """Return a copy of X with the selected columns label-encoded.

        Encodes the columns named in self.columns, or all columns of X when
        self.columns is None. X itself is not modified.
        """
        output = X.copy()
        if self.columns is not None:
            for col in self.columns:
                output[col] = LabelEncoder().fit_transform(output[col])
        else:
            # DataFrame.iteritems() was removed in pandas 2.0; items() yields
            # the same (column name, Series) pairs and exists in older
            # versions too.
            for colname,col in output.items():
                output[colname] = LabelEncoder().fit_transform(col)
        return output

    def fit_transform(self,X,y=None):
        """Equivalent to fit(X, y) followed by transform(X)."""
        return self.fit(X,y).transform(X)

In [None]:
# Columns of X to replace with integer label codes.
# NOTE(review): 'Ptid' looks like a per-patient identifier; confirm it is
# meant to be encoded and used as a model feature.
columns_to_encode = [
    'Ptid',
    'Gender',
    'Race',
    'Ethnicity',
    'Region',
    'Age_Bucket',
    'Ntm_Speciality',
    'Ntm_Specialist_Flag',
    'Ntm_Speciality_Bucket',
    'Gluco_Record_Prior_Ntm',
    'Gluco_Record_During_Rx',
    'Dexa_During_Rx',
    'Frag_Frac_Prior_Ntm',
    'Frag_Frac_During_Rx',
    'Risk_Segment_Prior_Ntm',
    'Tscore_Bucket_Prior_Ntm',
    'Risk_Segment_During_Rx',
    'Tscore_Bucket_During_Rx',
    'Change_T_Score',
    'Change_Risk_Segment',
    'Adherent_Flag',
    'Idn_Indicator',
    'Injectable_Experience_During_Rx',
    'Comorb_Encounter_For_Screening_For_Malignant_Neoplasms',
    'Comorb_Encounter_For_Immunization',
    'Comorb_Encntr_For_General_Exam_W_O_Complaint,_Susp_Or_Reprtd_Dx',
    'Comorb_Vitamin_D_Deficiency',
    'Comorb_Other_Joint_Disorder_Not_Elsewhere_Classified',
    'Comorb_Encntr_For_Oth_Sp_Exam_W_O_Complaint_Suspected_Or_Reprtd_Dx',
    'Comorb_Long_Term_Current_Drug_Therapy',
    'Comorb_Dorsalgia',
    'Comorb_Personal_History_Of_Other_Diseases_And_Conditions',
    'Comorb_Other_Disorders_Of_Bone_Density_And_Structure',
    'Comorb_Disorders_of_lipoprotein_metabolism_and_other_lipidemias',
    'Comorb_Osteoporosis_without_current_pathological_fracture',
    'Comorb_Personal_history_of_malignant_neoplasm',
    'Comorb_Gastro_esophageal_reflux_disease',
    'Concom_Cholesterol_And_Triglyceride_Regulating_Preparations',
    'Concom_Narcotics',
    'Concom_Systemic_Corticosteroids_Plain',
    'Concom_Anti_Depressants_And_Mood_Stabilisers',
    'Concom_Fluoroquinolones',
    'Concom_Cephalosporins',
    'Concom_Macrolides_And_Similar_Types',
    'Concom_Broad_Spectrum_Penicillins',
    'Concom_Anaesthetics_General',
    'Concom_Viral_Vaccines',
    'Risk_Type_1_Insulin_Dependent_Diabetes',
    'Risk_Osteogenesis_Imperfecta',
    'Risk_Rheumatoid_Arthritis',
    'Risk_Untreated_Chronic_Hyperthyroidism',
    'Risk_Untreated_Chronic_Hypogonadism',
    'Risk_Untreated_Early_Menopause',
    'Risk_Patient_Parent_Fractured_Their_Hip',
    'Risk_Smoking_Tobacco',
    'Risk_Chronic_Malnutrition_Or_Malabsorption',
    'Risk_Chronic_Liver_Disease',
    'Risk_Family_History_Of_Osteoporosis',
    'Risk_Low_Calcium_Intake',
    'Risk_Vitamin_D_Insufficiency',
    'Risk_Poor_Health_Frailty',
    'Risk_Excessive_Thinness',
    'Risk_Hysterectomy_Oophorectomy',
    'Risk_Estrogen_Deficiency',
    'Risk_Immobilization',
    'Risk_Recurring_Falls',
]

# Encode those columns on a copy of X; X itself is left untouched.
column_encoder = MultiColumnLabelEncoder(columns=columns_to_encode)
New_dataF = column_encoder.fit_transform(X)

##New dataframe after label encoding

In [None]:
# First 10 rows of the label-encoded feature frame.
New_dataF.head(10)

In [None]:
# Last 10 rows of the label-encoded feature frame.
New_dataF.tail(10)

In [None]:
# Number of distinct encoded 'Ptid' values.
New_dataF['Ptid'].nunique()

##Splitting of data into Training and Testing dataset for ML Models

In [None]:
##Important packages importing
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, roc_auc_score
from matplotlib import pyplot
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
train_X, test_X, train_y, test_y = train_test_split(
    New_dataF,
    target,
    test_size=0.3,
    random_state=42,
)

##Hypothesis Testing for Classification Models
###Steps:
The first step is to state the null hypothesis.\
H0: Both models have the same performance on the dataset.\
H1: The two models do not have the same performance on the dataset.\
Level of significance is 0.05

##Logistic Regression Model

In [None]:
# Logistic Regression
# fit a model
# Fit a logistic-regression classifier on the training split using the
# 'newton-cg' solver; every other hyperparameter keeps its default.
model = LogisticRegression(solver='newton-cg')
model.fit(train_X, train_y)

In [None]:
# Accuracy on Testing Dataset using Logistic Regression
# Mean accuracy of the logistic-regression model on the held-out test split,
# printed as a percentage.
score_LR = model.score(test_X, test_y)
print("Using Logistic Regression Model- Accuracy on Test Dataset is", score_LR*100, "%")

##Support Vector Machine Model

In [None]:
#Import Library
from sklearn import svm
import pandas as pd
from sklearn.metrics import accuracy_score
import warnings
# Suppress all warnings from this point on.
# NOTE(review): this presumably also hides solver convergence warnings from
# the models fitted below; confirm that is intended.
warnings.filterwarnings('ignore')

In [None]:
# Create Linear SVM object
# Linear support-vector classifier with a fixed seed for repeatable results.
support = svm.LinearSVC(random_state=20)

# Fit on the training split, predict the test split, and report test accuracy
# as a percentage.
support.fit(train_X, train_y)
predicted= support.predict(test_X)
score=accuracy_score(test_y,predicted)
print("Using Support Vector Machine Model- Accuracy on Test Dataset is", score*100, "%")

##Hypothesis Results
It is clear that the accuracy of the **Logistic Regression Model** is much better than that of the Support Vector Machine on the same dataset.

In [None]:
from mlxtend.evaluate import paired_ttest_5x2cv
# Compare the two classifiers with the 5x2cv paired t-test on the full encoded
# dataset, scored by accuracy. Returns the t statistic and the p-value.
t, p = paired_ttest_5x2cv(estimator1=model, 
                          estimator2=support, 
                          X=New_dataF, 
                          y=target, 
                          scoring='accuracy', 
                          random_seed=1)
# Report both statistics to three decimal places.
print(f'The P-value is = {p:.3f}')
print(f'The t-statistics is = {t:.3f}')
# Interpret the result at the 0.05 significance level: reject H0 (equal
# performance) only when p <= 0.05.
if p <= 0.05:
    print('Since p<0.05, We can reject the null-hypothesis that both models perform equally well on this dataset. \
    \nWe may conclude that the two algorithms are significantly different.')
else:
    print('Since p>0.05, we cannot reject the null hypothesis. \
    \nWe may conclude that the performance of the two algorithms is not significantly different.')

So, here our samples do not provide enough evidence to conclude whether the assumed effect exists or not.

Here, I choose **Logistic Regression** over SVM to calculate all the required details.

##ROC Curve and Area Under Curve

In [None]:
# predict probabilities
# Class probabilities on the test split; column 1 is the positive class.
yhat = model.predict_proba(test_X)
pos_probs = yhat[:, 1]

# ROC points for the logistic model (the thresholds are not needed here).
fpr, tpr, _ = roc_curve(test_y, pos_probs)

# Draw the diagonal no-skill baseline first, then the model's curve.
pyplot.plot([0, 1], [0, 1], linestyle='--', label='No Skill')
pyplot.plot(fpr, tpr, marker='.', label='Logistic')

# Label the axes, add the legend, and render the figure.
pyplot.xlabel('False Positive Rate')
pyplot.ylabel('True Positive Rate')
pyplot.legend()
pyplot.show()

In [None]:
#Calculating ROC Area Under Curve
# Area under the ROC curve on the test split, from the positive-class
# probabilities (column 1 of predict_proba).
yhat = model.predict_proba(test_X)
pos_probs = yhat[:, 1]
roc_auc = roc_auc_score(test_y, pos_probs)
print('Logistic ROC Area Under Curve %.3f' % roc_auc)

##Coefficients of variables

In [None]:
# Fitted coefficients, one per feature column of the training frame.
print('Coefficients of all variables : ', model.coef_)

##Intercept of Model

In [None]:
# Fitted intercept (bias) term of the logistic-regression model.
print('Intercept of the model : ', model.intercept_)

##Predicted Classes

In [None]:
# Class labels the model learned during fit (the encoded target values).
print('Predicted Classes are : ', model.classes_)

In [None]:
# Per-row class probabilities for the training split (one column per class).
print('Predicted probability on training dataset : ', model.predict_proba(train_X))

In [None]:
#Accuracy on Training Dataset
# Mean accuracy on the training split, for comparison with the test accuracy.
print('Accuracy on Training Dataset : ', model.score(train_X, train_y))

##Confusion Matrix

In [None]:
# Confusion matrix on the held-out test split only. Scoring the full dataset
# would include the rows the model was trained on and overstate performance.
confusion_matrix(test_y, model.predict(test_X))

1. True negatives in the upper-left position
2. False negatives in the lower-left position
3. False positives in the upper-right position
4. True positives in the lower-right position

In [None]:
# Confusion matrix on the held-out test split only. Scoring the full dataset
# would include the rows the model was trained on and overstate performance.
cm = confusion_matrix(test_y, model.predict(test_X))

# Draw the 2x2 matrix as an image: rows are actual classes, columns are
# predicted classes.
fig, ax = plt.subplots(figsize=(8, 8))
ax.imshow(cm)
ax.grid(False)
ax.xaxis.set(ticks=(0, 1), ticklabels=('Predicted Non-Persistent', 'Predicted Persistent'))
ax.yaxis.set(ticks=(0, 1), ticklabels=('Actual Non-Persistent', 'Actual Persistent'))
# Explicit, inverted y-limits keep row 0 at the top and both rows fully visible.
ax.set_ylim(1.5, -0.5)
# Write each cell's count at its centre.
for i in range(2):
    for j in range(2):
        ax.text(j, i, cm[i, j], ha='center', va='center', color='red')
plt.show()

##Classification Report on Confusion Matrix

In [None]:
# Precision, recall and F1 per class on the held-out test split only.
# Including the training rows would overstate the model's performance.
print(classification_report(test_y, model.predict(test_X)))

##Target Class
Where:\
Class "1" is Persistent\
Class "0" is Non-Persistent

**If you like this notebook then please upvote the notebook. It will motivate me to work more and more.**