# Find the Relationship Between Duration of Drug Exposure, Age, and the Likelihood of Both Heart Disease and Methadone Reliance

### Data Preprocessing

In [20]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [21]:
# Load the heart-procedure patient drug records and preview the first rows.
DATA_FILE = "HeartProcedurePatientsDrugs.csv"
data = pd.read_csv(DATA_FILE)
data.head()

Unnamed: 0,drug_concept_id,person_id,concept_name,drug_era_start_date,drug_era_end_date,Duration_of_exposure,Age_at_procedure,Used_Methadone,Older_Than_50,Extended_Use
0,959665,1707195,Pyrethrins,9/9/2010,10/9/2010,30,62,0,1,1
1,1316354,601185,Bendroflumethiazide,3/10/2009,4/9/2009,30,80,0,1,1
2,40161692,1139042,black walnut pollen extract,3/17/2009,3/27/2009,10,82,0,1,0
3,40175669,1173478,Stachybotrys chartarum allergenic extract,6/13/2009,7/13/2009,30,69,0,1,1
4,40175801,1609782,barley allergenic extract,3/1/2009,3/31/2009,30,26,0,0,1


In [22]:
# Keep only the two predictor columns and the target label for modelling.
model_columns = ["Age_at_procedure", "Duration_of_exposure", "Used_Methadone"]
data = data.loc[:, model_columns]
data.head(20)

Unnamed: 0,Age_at_procedure,Duration_of_exposure,Used_Methadone
0,62,30,0
1,80,30,0
2,82,10,0
3,69,30,0
4,26,30,0
5,70,10,1
6,79,10,1
7,80,10,1
8,79,30,1
9,76,30,0


In [23]:
# Check the dataset dimensions as (rows, columns).
data.shape

(200, 3)

In [24]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column.
data.describe()

Unnamed: 0,Age_at_procedure,Duration_of_exposure,Used_Methadone
count,200.0,200.0,200.0
mean,71.56,28.96,0.38
std,14.413366,21.834885,0.486604
min,26.0,1.0,0.0
25%,67.0,10.0,0.0
50%,74.0,30.0,0.0
75%,81.0,30.0,1.0
max,99.0,90.0,1.0


In [25]:
# True if any cell in the frame is missing; False means no imputation is needed.
data.isna().to_numpy().any()

False

In [26]:
# Separate the feature matrix X from the label vector Y.
# The target column name is stated once so both statements always agree.
target_column = "Used_Methadone"
X = data.drop(columns=[target_column])  # `columns=` already selects the column axis; no `axis=1` needed
Y = data[target_column]

In [27]:
# Inspect the feature matrix (every column except the target).
print(X)

     Age_at_procedure  Duration_of_exposure
0                  62                    30
1                  80                    30
2                  82                    10
3                  69                    30
4                  26                    30
..                ...                   ...
195                85                    30
196                46                    10
197                88                    30
198                46                    90
199                88                    30

[200 rows x 2 columns]


In [28]:
# Inspect the label vector (the Used_Methadone column).
print(Y)

0      0
1      0
2      0
3      0
4      0
      ..
195    0
196    0
197    0
198    0
199    0
Name: Used_Methadone, Length: 200, dtype: int64


In [29]:
# Hold out 20% of the rows for testing. Stratify on Y and fix the seed so the
# split is reproducible from run to run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=1,
)

In [30]:
# Report the shapes of the full, training and test feature matrices, in that order.
print(*(frame.shape for frame in (X, X_train, X_test)))

(200, 2) (160, 2) (40, 2)


### Model Training

In [31]:
# Create the logistic-regression classifier with its default settings.
model = LogisticRegression()

In [32]:
# Train the logistic-regression model on the training split only;
# the test split stays unseen until evaluation.
model.fit(X_train, Y_train)

LogisticRegression()

### Model Evaluation

In [33]:
# Accuracy on the training data.
# accuracy_score is documented as (y_true, y_pred): pass the true labels first
# so the call stays correct if it is later swapped for an asymmetric metric.
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [34]:
# Report the fraction of training rows the model classifies correctly.
print("Accuracy on training data : ", training_data_accuracy)

Accuracy on training data :  0.71875


In [35]:
# Accuracy on the held-out test data.
# accuracy_score is documented as (y_true, y_pred): pass the true labels first
# so the call stays correct if it is later swapped for an asymmetric metric.
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [36]:
# Report the fraction of held-out test rows the model classifies correctly.
print("Accuracy on test data : ", test_data_accuracy)

Accuracy on test data :  0.725


In [37]:
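# Accuracy could likely be improved with more samples: with only 200 rows, the
# ~0.72 accuracy is a modest gain over the ~0.62 obtained by always predicting
# the majority class (Used_Methadone = 0).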

### Making a Predictive System

In [40]:
def prediction(age, duration_of_exposure, estimator=None):
    """Predict the Used_Methadone label for one patient.

    Parameters
    ----------
    age : int or float
        Patient age (the ``Age_at_procedure`` feature).
    duration_of_exposure : int or float
        Duration of drug exposure (the ``Duration_of_exposure`` feature).
    estimator : object with a ``predict`` method, optional
        Fitted classifier to query. Defaults to the notebook-level ``model``.

    Returns
    -------
    The result of ``estimator.predict`` on the single input row; for the
    notebook's model this is a one-element array holding the class label.
    """
    if estimator is None:
        estimator = model  # notebook-level classifier fitted in an earlier cell

    single_row = [[age, duration_of_exposure]]

    # An estimator fitted on a DataFrame records its column names in
    # `feature_names_in_`. Passing a DataFrame with those same names avoids the
    # "X does not have valid feature names" warning a bare array would trigger.
    feature_names = getattr(estimator, "feature_names_in_", None)
    if feature_names is not None:
        features = pd.DataFrame(single_row, columns=list(feature_names))
    else:
        # Estimator was fitted without feature names: a plain (1, 2) array is correct.
        features = np.asarray(single_row)

    return estimator.predict(features)

In [46]:
# Example: predicted Used_Methadone label for age 70 and a duration of exposure of 10.
print(prediction(70,10))

[1]


