### November 2, 2020 Feature Selection

#### Multi-class classification using:
- Logistic Regression (One vs Rest)
- Random Forest Classifier

y: Health Canada CRC code

In [1]:
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn import metrics
from sklearn.metrics import f1_score
from sklearn.metrics import cohen_kappa_score

In [2]:
df_full = pd.read_csv('fda_matched_new.csv')
print(df_full.shape)
df_full.head(3)

(7916, 13)


Unnamed: 0,FDA_Chapter,FDA_Section,FDA_Code,FDA_Subpart,FDA_Description,fda_desc_cleaned,matched,score,HC_Code,HC_Chapter,HC_Section,HC_Subpart,hc_desc_cleaned
0,GENERAL ENFORCEMENT REGULATIONS,General Provisions,"""1.1""",General.,(a) The provisions of regulations promulgated ...,provision regulation promulgated federal food ...,1.0,0.772684,B.01.008.1,Foods,General,General,information appearing label prepackaged produc...
1,GENERAL ENFORCEMENT REGULATIONS,General Provisions,"""1.3""",Definitions.,"(a) Labeling includes all written, printed,...",labeling includes written printed graphic matt...,1.0,0.635591,C.01.031,Drugs,Cautionary Statements and Child Resistant Pack...,Cautionary Statements and Child Resistant Pack...,subject section c person shall sell drug descr...
2,GENERAL ENFORCEMENT REGULATIONS,General Provisions,"""1.4""",Authority citations.,"(a) For each part of its regulations, the Food...",part regulation food drug administration inclu...,1.0,0.779126,A.01.062,Administration,Labelling of Food and Drugs in Pressurized Con...,Not Available,subject section food drug packaged container d...


In [3]:
df_full['matched'].value_counts()

1.0    7705
0.0     102
Name: matched, dtype: int64

In [4]:
# Matched regulations dataset
df_full = df_full.loc[df_full['matched'] == 1.0]
print(df_full.shape)
df_full.head(3)

(7705, 13)


Unnamed: 0,FDA_Chapter,FDA_Section,FDA_Code,FDA_Subpart,FDA_Description,fda_desc_cleaned,matched,score,HC_Code,HC_Chapter,HC_Section,HC_Subpart,hc_desc_cleaned
0,GENERAL ENFORCEMENT REGULATIONS,General Provisions,"""1.1""",General.,(a) The provisions of regulations promulgated ...,provision regulation promulgated federal food ...,1.0,0.772684,B.01.008.1,Foods,General,General,information appearing label prepackaged produc...
1,GENERAL ENFORCEMENT REGULATIONS,General Provisions,"""1.3""",Definitions.,"(a) Labeling includes all written, printed,...",labeling includes written printed graphic matt...,1.0,0.635591,C.01.031,Drugs,Cautionary Statements and Child Resistant Pack...,Cautionary Statements and Child Resistant Pack...,subject section c person shall sell drug descr...
2,GENERAL ENFORCEMENT REGULATIONS,General Provisions,"""1.4""",Authority citations.,"(a) For each part of its regulations, the Food...",part regulation food drug administration inclu...,1.0,0.779126,A.01.062,Administration,Labelling of Food and Drugs in Pressurized Con...,Not Available,subject section food drug packaged container d...


In [5]:
reference_df = pd.DataFrame(sorted(list(zip(df_full['HC_Code'].unique()))), columns = ['HC_Code'])

print(reference_df.shape)
reference_df.head(3)

(717, 1)


Unnamed: 0,HC_Code
0,A.01.001
1,A.01.002
2,A.01.010


In [6]:
reference_df['y'] = (reference_df.index).astype(str)
print(reference_df.shape)
reference_df.head(3)

(717, 2)


Unnamed: 0,HC_Code,y
0,A.01.001,0
1,A.01.002,1
2,A.01.010,2


In [7]:
df_full = df_full.merge(reference_df, on='HC_Code', how='left')
print(df_full.shape)
df_full.head(3)

(7705, 14)


Unnamed: 0,FDA_Chapter,FDA_Section,FDA_Code,FDA_Subpart,FDA_Description,fda_desc_cleaned,matched,score,HC_Code,HC_Chapter,HC_Section,HC_Subpart,hc_desc_cleaned,y
0,GENERAL ENFORCEMENT REGULATIONS,General Provisions,"""1.1""",General.,(a) The provisions of regulations promulgated ...,provision regulation promulgated federal food ...,1.0,0.772684,B.01.008.1,Foods,General,General,information appearing label prepackaged produc...,29
1,GENERAL ENFORCEMENT REGULATIONS,General Provisions,"""1.3""",Definitions.,"(a) Labeling includes all written, printed,...",labeling includes written printed graphic matt...,1.0,0.635591,C.01.031,Drugs,Cautionary Statements and Child Resistant Pack...,Cautionary Statements and Child Resistant Pack...,subject section c person shall sell drug descr...,356
2,GENERAL ENFORCEMENT REGULATIONS,General Provisions,"""1.4""",Authority citations.,"(a) For each part of its regulations, the Food...",part regulation food drug administration inclu...,1.0,0.779126,A.01.062,Administration,Labelling of Food and Drugs in Pressurized Con...,Not Available,subject section food drug packaged container d...,17


### 1. X: 'HC_Chapter','HC_Section','HC_Subpart'

In [8]:
# define df and drop duplicates
df = df_full[['HC_Chapter','HC_Section','HC_Subpart','y']]
print('Original shape:',df.shape)
df.drop_duplicates(subset = df.columns, keep='last',inplace =True)
features = df.columns[:-1]
df = df.reset_index(drop=True)
print('Shape after removing duplicates:',df.shape)
df.head(3)

Original shape: (7705, 4)
Shape after removing duplicates: (717, 4)


Unnamed: 0,HC_Chapter,HC_Section,HC_Subpart,y
0,Drugs,Cautionary Statements and Child Resistant Pack...,Cautionary Statements and Child Resistant Pack...,356
1,Controlled Drugs,Licensed Dealers,Import Permits,633
2,Restricted Drugs,Test Kits,Not Available,681


In [9]:
# define X and y
y = df[['y']]

X = df[['HC_Chapter','HC_Section','HC_Subpart']]
X = pd.get_dummies(data = X, columns = X.columns)

print(X.shape)
print(y.shape)

(717, 247)
(717, 1)


In [10]:
# run classifier
print('Features:',features)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

print('---------- Logistic Regression ----------')
lr = LogisticRegression(multi_class='ovr')
lr.fit(X_train, y_train)
lr_predictions = lr.predict(X_test)

f1_score = metrics.f1_score(y_test, lr_predictions, average = 'micro') 
print('F1 Score:',f1_score)

ck = cohen_kappa_score(y_test, lr_predictions)
print('Cohen Kappa Score:',ck)

print('---------- Random Forest Classifier ----------')
rf = RandomForestClassifier(n_estimators = 100, random_state=42).fit(X_train, y_train) 
rf_predictions = rf.predict(X_test) 

f1_score = metrics.f1_score(y_test, rf_predictions, average = 'micro') 
print('F1 Score:',f1_score)

ck = cohen_kappa_score(y_test, rf_predictions)
print('Cohen Kappa Score:',ck)

Features: Index(['HC_Chapter', 'HC_Section', 'HC_Subpart'], dtype='object')
---------- Logistic Regression ----------
F1 Score: 0.0
Cohen Kappa Score: 0.0
---------- Random Forest Classifier ----------
F1 Score: 0.0
Cohen Kappa Score: 0.0


### 2. X: 'HC_Chapter','HC_Section','HC_Subpart', 'hc_desc_cleaned'

In [11]:
# define df and drop duplicates
df = df_full[['HC_Chapter','HC_Section','HC_Subpart','hc_desc_cleaned','y']]
print('Original shape:',df.shape)
df.drop_duplicates(subset=df.columns, keep='last',inplace =True)
features = df.columns[:-1]
df = df.reset_index(drop=True)
print('Shape after removing duplicates:',df.shape)
df.head(3)

Original shape: (7705, 5)
Shape after removing duplicates: (718, 5)


Unnamed: 0,HC_Chapter,HC_Section,HC_Subpart,hc_desc_cleaned,y
0,Drugs,Cautionary Statements and Child Resistant Pack...,Cautionary Statements and Child Resistant Pack...,subject section c person shall sell drug descr...,356
1,Controlled Drugs,Licensed Dealers,Import Permits,providing copy permit holder import permit mus...,633
2,Restricted Drugs,Test Kits,Not Available,application registration number manufacturer t...,681


In [12]:
# define X and y
y = df[['y']]
X = df[['HC_Chapter','HC_Section','HC_Subpart']]
X = pd.get_dummies(data = X, columns = X.columns)

tfidf_vectorizer = TfidfVectorizer()
X_text = df[['hc_desc_cleaned']]
X_text['hc_desc_cleaned'] = tfidf_vectorizer.fit_transform(X_text['hc_desc_cleaned']).toarray()
X = pd.concat([X, X_text], axis=1, sort=True)

print(X.shape)
print(y.shape)

(718, 248)
(718, 1)


In [13]:
# run classifier
print('Features:',features)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

print('---------- Logistic Regression ----------')
lr = LogisticRegression(multi_class='ovr')
lr.fit(X_train, y_train)
lr_predictions = lr.predict(X_test)

f1_score = metrics.f1_score(y_test, lr_predictions, average = 'micro') 
print('F1 Score:',f1_score)

ck = cohen_kappa_score(y_test, lr_predictions)
print('Cohen Kappa Score:',ck)

print('---------- Random Forest Classifier ----------')
rf = RandomForestClassifier(n_estimators = 100, random_state=42).fit(X_train, y_train) 
rf_predictions = rf.predict(X_test) 

f1_score = metrics.f1_score(y_test, rf_predictions, average = 'micro') 
print('F1 Score:',f1_score)

ck = cohen_kappa_score(y_test, rf_predictions)
print('Cohen Kappa Score:',ck)

Features: Index(['HC_Chapter', 'HC_Section', 'HC_Subpart', 'hc_desc_cleaned'], dtype='object')
---------- Logistic Regression ----------
F1 Score: 0.0
Cohen Kappa Score: 0.0
---------- Random Forest Classifier ----------
F1 Score: 0.0
Cohen Kappa Score: 0.0


### 3. X: 'HC_Chapter','HC_Section','HC_Subpart', 'fda_desc_cleaned'

In [14]:
# define df and drop duplicates
df = df_full[['HC_Chapter','HC_Section','HC_Subpart','fda_desc_cleaned','y']]
print('Original shape:',df.shape)
df.drop_duplicates(subset=df.columns, keep='last',inplace =True)
features = df.columns[:-1]
df = df.reset_index(drop=True)
print('Shape after removing duplicates:',df.shape)
df.head(3)

Original shape: (7705, 5)
Shape after removing duplicates: (7705, 5)


Unnamed: 0,HC_Chapter,HC_Section,HC_Subpart,fda_desc_cleaned,y
0,Foods,General,General,provision regulation promulgated federal food ...,29
1,Drugs,Cautionary Statements and Child Resistant Pack...,Cautionary Statements and Child Resistant Pack...,labeling includes written printed graphic matt...,356
2,Administration,Labelling of Food and Drugs in Pressurized Con...,Not Available,part regulation food drug administration inclu...,17


In [15]:
# define X and y
y = df[['y']]
X = df[['HC_Chapter','HC_Section','HC_Subpart']]
X = pd.get_dummies(data = X, columns = X.columns)

tfidf_vectorizer = TfidfVectorizer()
X_text = df[['fda_desc_cleaned']]
X_text['fda_desc_cleaned'] = tfidf_vectorizer.fit_transform(X_text['fda_desc_cleaned']).toarray()
X = pd.concat([X, X_text], axis=1, sort=True)

print(X.shape)
print(y.shape)



(7705, 248)
(7705, 1)


In [16]:
# run classifier
print('Features:',features)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

print('---------- Logistic Regression ----------')
lr = LogisticRegression(multi_class='ovr')
lr.fit(X_train, y_train)
lr_predictions = lr.predict(X_test)

f1_score = metrics.f1_score(y_test, lr_predictions, average = 'micro') 
print('F1 Score:',f1_score)

ck = cohen_kappa_score(y_test, lr_predictions)
print('Cohen Kappa Score:',ck)

print('---------- Random Forest Classifier ----------')
rf = RandomForestClassifier(n_estimators = 100, random_state=42).fit(X_train, y_train) 
rf_predictions = rf.predict(X_test) 

f1_score = metrics.f1_score(y_test, rf_predictions, average = 'micro') 
print('F1 Score:',f1_score)

ck = cohen_kappa_score(y_test, rf_predictions)
print('Cohen Kappa Score:',ck)

Features: Index(['HC_Chapter', 'HC_Section', 'HC_Subpart', 'fda_desc_cleaned'], dtype='object')
---------- Logistic Regression ----------
F1 Score: 0.5944192083062946
Cohen Kappa Score: 0.5892240541010847
---------- Random Forest Classifier ----------
F1 Score: 0.6132381570408826
Cohen Kappa Score: 0.6083420220512197


### 4. X: 'HC_Chapter','HC_Section','HC_Subpart','hc_desc_cleaned', 'fda_desc_cleaned'

In [17]:
# define df and drop duplicates
df = df_full[['HC_Chapter','HC_Section','HC_Subpart','hc_desc_cleaned','fda_desc_cleaned','y']]
print('Original shape:',df.shape)
df.drop_duplicates(subset=df.columns, keep='last',inplace =True)
features = df.columns[:-1]
df = df.reset_index(drop=True)
print('Shape after removing duplicates:',df.shape)
df.head(3)

Original shape: (7705, 6)
Shape after removing duplicates: (7705, 6)


Unnamed: 0,HC_Chapter,HC_Section,HC_Subpart,hc_desc_cleaned,fda_desc_cleaned,y
0,Foods,General,General,information appearing label prepackaged produc...,provision regulation promulgated federal food ...,29
1,Drugs,Cautionary Statements and Child Resistant Pack...,Cautionary Statements and Child Resistant Pack...,subject section c person shall sell drug descr...,labeling includes written printed graphic matt...,356
2,Administration,Labelling of Food and Drugs in Pressurized Con...,Not Available,subject section food drug packaged container d...,part regulation food drug administration inclu...,17


In [18]:
# define X and y
y = df[['y']]
X = df[['HC_Chapter','HC_Section','HC_Subpart']]
X = pd.get_dummies(data = X, columns = X.columns)

tfidf_vectorizer = TfidfVectorizer()
X_text = df[['hc_desc_cleaned','fda_desc_cleaned']]
X_text['hc_desc_cleaned'] = tfidf_vectorizer.fit_transform(X_text['hc_desc_cleaned']).toarray()
X_text['fda_desc_cleaned'] = tfidf_vectorizer.fit_transform(X_text['fda_desc_cleaned']).toarray()
X = pd.concat([X, X_text], axis=1, sort=True)

print(X.shape)
print(y.shape)



(7705, 249)
(7705, 1)


In [19]:
# run classifier
print('Features:',features)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

print('---------- Logistic Regression ----------')
lr = LogisticRegression(multi_class='ovr')
lr.fit(X_train, y_train)
lr_predictions = lr.predict(X_test)

f1_score = metrics.f1_score(y_test, lr_predictions, average = 'micro') 
print('F1 Score:',f1_score)

ck = cohen_kappa_score(y_test, lr_predictions)
print('Cohen Kappa Score:',ck)

print('---------- Random Forest Classifier ----------')
rf = RandomForestClassifier(n_estimators = 100, random_state=42).fit(X_train, y_train) 
rf_predictions = rf.predict(X_test) 

f1_score = metrics.f1_score(y_test, rf_predictions, average = 'micro') 
print('F1 Score:',f1_score)

ck = cohen_kappa_score(y_test, rf_predictions)
print('Cohen Kappa Score:',ck)

Features: Index(['HC_Chapter', 'HC_Section', 'HC_Subpart', 'hc_desc_cleaned',
       'fda_desc_cleaned'],
      dtype='object')
---------- Logistic Regression ----------
F1 Score: 0.5944192083062946
Cohen Kappa Score: 0.5892235285085803
---------- Random Forest Classifier ----------
F1 Score: 0.6346528228423102
Cohen Kappa Score: 0.6300746346297178


### 5. X: 'HC_Chapter','HC_Section','HC_Subpart','hc_desc_cleaned', 'fda_desc_cleaned','FDA_Chapter'

In [20]:
# define df and drop duplicates
df = df_full[['FDA_Chapter','HC_Chapter','HC_Section','HC_Subpart','hc_desc_cleaned','fda_desc_cleaned','y']]
print('Original shape:',df.shape)
df.drop_duplicates(subset=df.columns, keep='last',inplace =True)
features = df.columns[:-1]
df = df.reset_index(drop=True)
print('Shape after removing duplicates:',df.shape)
df.head(3)

Original shape: (7705, 7)
Shape after removing duplicates: (7705, 7)


Unnamed: 0,FDA_Chapter,HC_Chapter,HC_Section,HC_Subpart,hc_desc_cleaned,fda_desc_cleaned,y
0,GENERAL ENFORCEMENT REGULATIONS,Foods,General,General,information appearing label prepackaged produc...,provision regulation promulgated federal food ...,29
1,GENERAL ENFORCEMENT REGULATIONS,Drugs,Cautionary Statements and Child Resistant Pack...,Cautionary Statements and Child Resistant Pack...,subject section c person shall sell drug descr...,labeling includes written printed graphic matt...,356
2,GENERAL ENFORCEMENT REGULATIONS,Administration,Labelling of Food and Drugs in Pressurized Con...,Not Available,subject section food drug packaged container d...,part regulation food drug administration inclu...,17


In [21]:
# define X and y
y = df[['y']]
X = df[['FDA_Chapter','HC_Chapter','HC_Section','HC_Subpart']]
X = pd.get_dummies(data = X, columns = X.columns)

tfidf_vectorizer = TfidfVectorizer()
X_text = df[['hc_desc_cleaned','fda_desc_cleaned']]
X_text['hc_desc_cleaned'] = tfidf_vectorizer.fit_transform(X_text['hc_desc_cleaned']).toarray()
X_text['fda_desc_cleaned'] = tfidf_vectorizer.fit_transform(X_text['fda_desc_cleaned']).toarray()
X = pd.concat([X, X_text], axis=1, sort=True)

print(X.shape)
print(y.shape)

(7705, 459)
(7705, 1)


In [22]:
# run classifier
print('Features:',features)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

print('---------- Logistic Regression ----------')
lr = LogisticRegression(multi_class='ovr')
lr.fit(X_train, y_train)
lr_predictions = lr.predict(X_test)

f1_score = metrics.f1_score(y_test, lr_predictions, average = 'micro') 
print('F1 Score:',f1_score)

ck = cohen_kappa_score(y_test, lr_predictions)
print('Cohen Kappa Score:',ck)

print('---------- Random Forest Classifier ----------')
rf = RandomForestClassifier(n_estimators = 100, random_state=42).fit(X_train, y_train) 
rf_predictions = rf.predict(X_test) 

f1_score = metrics.f1_score(y_test, rf_predictions, average = 'micro') 
print('F1 Score:',f1_score)

ck = cohen_kappa_score(y_test, rf_predictions)
print('Cohen Kappa Score:',ck)

Features: Index(['FDA_Chapter', 'HC_Chapter', 'HC_Section', 'HC_Subpart',
       'hc_desc_cleaned', 'fda_desc_cleaned'],
      dtype='object')
---------- Logistic Regression ----------
F1 Score: 0.6541207008436081
Cohen Kappa Score: 0.649859769637686
---------- Random Forest Classifier ----------
F1 Score: 0.6768332251784556
Cohen Kappa Score: 0.6731453970547147


### 6. X: 'HC_Chapter','HC_Section','HC_Subpart','hc_desc_cleaned', 'fda_desc_cleaned','FDA_Section'

In [23]:
# define df and drop duplicates
df = df_full[['FDA_Section','HC_Chapter','HC_Section','HC_Subpart',
              'hc_desc_cleaned','fda_desc_cleaned','y']]
print('Original shape:',df.shape)
df.drop_duplicates(subset=df.columns, keep='last',inplace =True)
features = df.columns[:-1]
df = df.reset_index(drop=True)
print('Shape after removing duplicates:',df.shape)
df.head(3)

Original shape: (7705, 7)
Shape after removing duplicates: (7705, 7)


Unnamed: 0,FDA_Section,HC_Chapter,HC_Section,HC_Subpart,hc_desc_cleaned,fda_desc_cleaned,y
0,General Provisions,Foods,General,General,information appearing label prepackaged produc...,provision regulation promulgated federal food ...,29
1,General Provisions,Drugs,Cautionary Statements and Child Resistant Pack...,Cautionary Statements and Child Resistant Pack...,subject section c person shall sell drug descr...,labeling includes written printed graphic matt...,356
2,General Provisions,Administration,Labelling of Food and Drugs in Pressurized Con...,Not Available,subject section food drug packaged container d...,part regulation food drug administration inclu...,17


In [24]:
# define X and y
y = df[['y']]
X = df[['FDA_Section','HC_Chapter','HC_Section','HC_Subpart']]
X = pd.get_dummies(data = X, columns = X.columns)

tfidf_vectorizer = TfidfVectorizer()
X_text = df[['hc_desc_cleaned','fda_desc_cleaned']]
X_text['hc_desc_cleaned'] = tfidf_vectorizer.fit_transform(X_text['hc_desc_cleaned']).toarray()
X_text['fda_desc_cleaned'] = tfidf_vectorizer.fit_transform(X_text['fda_desc_cleaned']).toarray()
X = pd.concat([X, X_text], axis=1, sort=True)

print(X.shape)
print(y.shape)

(7705, 1547)
(7705, 1)


In [25]:
# run classifier
print('Features:',features)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

print('---------- Logistic Regression ----------')
lr = LogisticRegression(multi_class='ovr')
lr.fit(X_train, y_train)
lr_predictions = lr.predict(X_test)

f1_score = metrics.f1_score(y_test, lr_predictions, average = 'micro') 
print('F1 Score:',f1_score)

ck = cohen_kappa_score(y_test, lr_predictions)
print('Cohen Kappa Score:',ck)

print('---------- Random Forest Classifier ----------')
rf = RandomForestClassifier(n_estimators = 100, random_state=42).fit(X_train, y_train) 
rf_predictions = rf.predict(X_test) 

f1_score = metrics.f1_score(y_test, rf_predictions, average = 'micro') 
print('F1 Score:',f1_score)

ck = cohen_kappa_score(y_test, rf_predictions)
print('Cohen Kappa Score:',ck)

Features: Index(['FDA_Section', 'HC_Chapter', 'HC_Section', 'HC_Subpart',
       'hc_desc_cleaned', 'fda_desc_cleaned'],
      dtype='object')
---------- Logistic Regression ----------
F1 Score: 0.6391953277092797
Cohen Kappa Score: 0.6346866788353069
---------- Random Forest Classifier ----------
F1 Score: 0.6761842959117457
Cohen Kappa Score: 0.6724110680692822


### 7. X: 'HC_Chapter','HC_Section','HC_Subpart','hc_desc_cleaned', 'fda_desc_cleaned','FDA_Subpart'

In [26]:
# define df and drop duplicates
df = df_full[['FDA_Subpart','HC_Chapter','HC_Section','HC_Subpart',
              'hc_desc_cleaned','fda_desc_cleaned','y']]
print('Original shape:',df.shape)
df.drop_duplicates(subset=df.columns, keep='last',inplace =True)
features = df.columns[:-1]
df = df.reset_index(drop=True)
print('Shape after removing duplicates:',df.shape)
df.head(3)

Original shape: (7705, 7)
Shape after removing duplicates: (7705, 7)


Unnamed: 0,FDA_Subpart,HC_Chapter,HC_Section,HC_Subpart,hc_desc_cleaned,fda_desc_cleaned,y
0,General.,Foods,General,General,information appearing label prepackaged produc...,provision regulation promulgated federal food ...,29
1,Definitions.,Drugs,Cautionary Statements and Child Resistant Pack...,Cautionary Statements and Child Resistant Pack...,subject section c person shall sell drug descr...,labeling includes written printed graphic matt...,356
2,Authority citations.,Administration,Labelling of Food and Drugs in Pressurized Con...,Not Available,subject section food drug packaged container d...,part regulation food drug administration inclu...,17


In [27]:
# define X and y
y = df[['y']]
X = df[['FDA_Subpart','HC_Chapter','HC_Section','HC_Subpart']]
X = pd.get_dummies(data = X, columns = X.columns)

tfidf_vectorizer = TfidfVectorizer()
X_text = df[['hc_desc_cleaned','fda_desc_cleaned']]
X_text['hc_desc_cleaned'] = tfidf_vectorizer.fit_transform(X_text['hc_desc_cleaned']).toarray()
X_text['fda_desc_cleaned'] = tfidf_vectorizer.fit_transform(X_text['fda_desc_cleaned']).toarray()
X = pd.concat([X, X_text], axis=1, sort=True)

print(X.shape)
print(y.shape)


(7705, 6828)
(7705, 1)


In [28]:
# run classifier
print('Features:',features)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

print('---------- Logistic Regression ----------')
lr = LogisticRegression(multi_class='ovr')
lr.fit(X_train, y_train)
lr_predictions = lr.predict(X_test)

f1_score = metrics.f1_score(y_test, lr_predictions, average = 'micro') 
print('F1 Score:',f1_score)

ck = cohen_kappa_score(y_test, lr_predictions)
print('Cohen Kappa Score:',ck)

print('---------- Random Forest Classifier ----------')
rf = RandomForestClassifier(n_estimators = 100, random_state=42).fit(X_train, y_train) 
rf_predictions = rf.predict(X_test) 

f1_score = metrics.f1_score(y_test, rf_predictions, average = 'micro') 
print('F1 Score:',f1_score)

ck = cohen_kappa_score(y_test, rf_predictions)
print('Cohen Kappa Score:',ck)

Features: Index(['FDA_Subpart', 'HC_Chapter', 'HC_Section', 'HC_Subpart',
       'hc_desc_cleaned', 'fda_desc_cleaned'],
      dtype='object')
---------- Logistic Regression ----------
F1 Score: 0.5983127839065542
Cohen Kappa Score: 0.5931813839387767
---------- Random Forest Classifier ----------
F1 Score: 0.6359506813757301
Cohen Kappa Score: 0.631532982981063


### 8. X: 'HC_Chapter','HC_Section','HC_Subpart','hc_desc_cleaned', 'fda_desc_cleaned','FDA_Chapter', 'FDA_Section'

In [29]:
# define df and drop duplicates
df = df_full[['FDA_Chapter','FDA_Section','HC_Chapter','HC_Section','HC_Subpart',
              'hc_desc_cleaned','fda_desc_cleaned','y']]
print('Original shape:',df.shape)
df.drop_duplicates(subset=df.columns, keep='last',inplace =True)
features = df.columns[:-1]
df = df.reset_index(drop=True)
print('Shape after removing duplicates:',df.shape)
df.head(3)

Original shape: (7705, 8)
Shape after removing duplicates: (7705, 8)


Unnamed: 0,FDA_Chapter,FDA_Section,HC_Chapter,HC_Section,HC_Subpart,hc_desc_cleaned,fda_desc_cleaned,y
0,GENERAL ENFORCEMENT REGULATIONS,General Provisions,Foods,General,General,information appearing label prepackaged produc...,provision regulation promulgated federal food ...,29
1,GENERAL ENFORCEMENT REGULATIONS,General Provisions,Drugs,Cautionary Statements and Child Resistant Pack...,Cautionary Statements and Child Resistant Pack...,subject section c person shall sell drug descr...,labeling includes written printed graphic matt...,356
2,GENERAL ENFORCEMENT REGULATIONS,General Provisions,Administration,Labelling of Food and Drugs in Pressurized Con...,Not Available,subject section food drug packaged container d...,part regulation food drug administration inclu...,17


In [30]:
# define X and y
y = df[['y']]
X = df[['FDA_Chapter','FDA_Section','HC_Chapter','HC_Section','HC_Subpart']]
X = pd.get_dummies(data = X, columns = X.columns)

tfidf_vectorizer = TfidfVectorizer()
X_text = df[['hc_desc_cleaned','fda_desc_cleaned']]
X_text['hc_desc_cleaned'] = tfidf_vectorizer.fit_transform(X_text['hc_desc_cleaned']).toarray()
X_text['fda_desc_cleaned'] = tfidf_vectorizer.fit_transform(X_text['fda_desc_cleaned']).toarray()
X = pd.concat([X, X_text], axis=1, sort=True)

print(X.shape)
print(y.shape)


(7705, 1757)
(7705, 1)


In [31]:
# run classifier
print('Features:',features)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

print('---------- Logistic Regression ----------')
lr = LogisticRegression(multi_class='ovr')
lr.fit(X_train, y_train)
lr_predictions = lr.predict(X_test)

f1_score = metrics.f1_score(y_test, lr_predictions, average = 'micro') 
print('F1 Score:',f1_score)

ck = cohen_kappa_score(y_test, lr_predictions)
print('Cohen Kappa Score:',ck)

print('---------- Random Forest Classifier ----------')
rf = RandomForestClassifier(n_estimators = 100, random_state=42).fit(X_train, y_train) 
rf_predictions = rf.predict(X_test) 

f1_score = metrics.f1_score(y_test, rf_predictions, average = 'micro') 
print('F1 Score:',f1_score)

ck = cohen_kappa_score(y_test, rf_predictions)
print('Cohen Kappa Score:',ck)

Features: Index(['FDA_Chapter', 'FDA_Section', 'HC_Chapter', 'HC_Section', 'HC_Subpart',
       'hc_desc_cleaned', 'fda_desc_cleaned'],
      dtype='object')
---------- Logistic Regression ----------
F1 Score: 0.6625567813108372
Cohen Kappa Score: 0.6584491343587419
---------- Random Forest Classifier ----------
F1 Score: 0.6807268007787152
Cohen Kappa Score: 0.6770740516734672


### 9. X: 'HC_Chapter','HC_Section','HC_Subpart','hc_desc_cleaned', 'fda_desc_cleaned','FDA_Chapter', 'FDA_Subpart'

In [32]:
# define df and drop duplicates
df = df_full[['FDA_Chapter','FDA_Subpart','HC_Chapter','HC_Section','HC_Subpart',
              'hc_desc_cleaned','fda_desc_cleaned','y']]
print('Original shape:',df.shape)
df.drop_duplicates(subset=df.columns, keep='last',inplace =True)
features = df.columns[:-1]
df = df.reset_index(drop=True)
print('Shape after removing duplicates:',df.shape)
df.head(3)

Original shape: (7705, 8)
Shape after removing duplicates: (7705, 8)


Unnamed: 0,FDA_Chapter,FDA_Subpart,HC_Chapter,HC_Section,HC_Subpart,hc_desc_cleaned,fda_desc_cleaned,y
0,GENERAL ENFORCEMENT REGULATIONS,General.,Foods,General,General,information appearing label prepackaged produc...,provision regulation promulgated federal food ...,29
1,GENERAL ENFORCEMENT REGULATIONS,Definitions.,Drugs,Cautionary Statements and Child Resistant Pack...,Cautionary Statements and Child Resistant Pack...,subject section c person shall sell drug descr...,labeling includes written printed graphic matt...,356
2,GENERAL ENFORCEMENT REGULATIONS,Authority citations.,Administration,Labelling of Food and Drugs in Pressurized Con...,Not Available,subject section food drug packaged container d...,part regulation food drug administration inclu...,17


In [33]:
# define X and y
y = df[['y']]
X = df[['FDA_Chapter','FDA_Subpart','HC_Chapter','HC_Section','HC_Subpart']]
X = pd.get_dummies(data = X, columns = X.columns)

tfidf_vectorizer = TfidfVectorizer()
X_text = df[['hc_desc_cleaned','fda_desc_cleaned']]
X_text['hc_desc_cleaned'] = tfidf_vectorizer.fit_transform(X_text['hc_desc_cleaned']).toarray()
X_text['fda_desc_cleaned'] = tfidf_vectorizer.fit_transform(X_text['fda_desc_cleaned']).toarray()
X = pd.concat([X, X_text], axis=1, sort=True)

print(X.shape)
print(y.shape)

(7705, 7038)
(7705, 1)


In [34]:
# run classifier
print('Features:',features)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

print('---------- Logistic Regression ----------')
lr = LogisticRegression(multi_class='ovr')
lr.fit(X_train, y_train)
lr_predictions = lr.predict(X_test)

f1_score = metrics.f1_score(y_test, lr_predictions, average = 'micro') 
print('F1 Score:',f1_score)

ck = cohen_kappa_score(y_test, lr_predictions)
print('Cohen Kappa Score:',ck)

print('---------- Random Forest Classifier ----------')
rf = RandomForestClassifier(n_estimators = 100, random_state=42).fit(X_train, y_train) 
rf_predictions = rf.predict(X_test) 

f1_score = metrics.f1_score(y_test, rf_predictions, average = 'micro') 
print('F1 Score:',f1_score)

ck = cohen_kappa_score(y_test, rf_predictions)
print('Cohen Kappa Score:',ck)

Features: Index(['FDA_Chapter', 'FDA_Subpart', 'HC_Chapter', 'HC_Section', 'HC_Subpart',
       'hc_desc_cleaned', 'fda_desc_cleaned'],
      dtype='object')
---------- Logistic Regression ----------
F1 Score: 0.6534717715768981
Cohen Kappa Score: 0.6491742809491164
---------- Random Forest Classifier ----------
F1 Score: 0.682024659312135
Cohen Kappa Score: 0.678298928202749


### 10. X: 'HC_Chapter','HC_Section','HC_Subpart','hc_desc_cleaned', 'fda_desc_cleaned','FDA_Section', 'FDA_Subpart'

In [35]:
# define df and drop duplicates
df = df_full[['FDA_Section','FDA_Subpart','HC_Chapter','HC_Section','HC_Subpart',
              'hc_desc_cleaned','fda_desc_cleaned','y']]
print('Original shape:',df.shape)
df.drop_duplicates(subset=df.columns, keep='last',inplace =True)
features = df.columns[:-1]
df = df.reset_index(drop=True)
print('Shape after removing duplicates:',df.shape)
df.head(3)

Original shape: (7705, 8)
Shape after removing duplicates: (7705, 8)


Unnamed: 0,FDA_Section,FDA_Subpart,HC_Chapter,HC_Section,HC_Subpart,hc_desc_cleaned,fda_desc_cleaned,y
0,General Provisions,General.,Foods,General,General,information appearing label prepackaged produc...,provision regulation promulgated federal food ...,29
1,General Provisions,Definitions.,Drugs,Cautionary Statements and Child Resistant Pack...,Cautionary Statements and Child Resistant Pack...,subject section c person shall sell drug descr...,labeling includes written printed graphic matt...,356
2,General Provisions,Authority citations.,Administration,Labelling of Food and Drugs in Pressurized Con...,Not Available,subject section food drug packaged container d...,part regulation food drug administration inclu...,17


In [36]:
# define X and y
y = df[['y']]
X = df[['FDA_Section','FDA_Subpart','HC_Chapter','HC_Section','HC_Subpart']]
X = pd.get_dummies(data = X, columns = X.columns)

tfidf_vectorizer = TfidfVectorizer()
X_text = df[['hc_desc_cleaned','fda_desc_cleaned']]
X_text['hc_desc_cleaned'] = tfidf_vectorizer.fit_transform(X_text['hc_desc_cleaned']).toarray()
X_text['fda_desc_cleaned'] = tfidf_vectorizer.fit_transform(X_text['fda_desc_cleaned']).toarray()
X = pd.concat([X, X_text], axis=1, sort=True)

print(X.shape)
print(y.shape)

(7705, 8126)
(7705, 1)


In [37]:
# run classifier
print('Features:',features)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

print('---------- Logistic Regression ----------')
lr = LogisticRegression(multi_class='ovr')
lr.fit(X_train, y_train)
lr_predictions = lr.predict(X_test)

f1_score = metrics.f1_score(y_test, lr_predictions, average = 'micro') 
print('F1 Score:',f1_score)

ck = cohen_kappa_score(y_test, lr_predictions)
print('Cohen Kappa Score:',ck)

print('---------- Random Forest Classifier ----------')
rf = RandomForestClassifier(n_estimators = 100, random_state=42).fit(X_train, y_train) 
rf_predictions = rf.predict(X_test) 

f1_score = metrics.f1_score(y_test, rf_predictions, average = 'micro') 
print('F1 Score:',f1_score)

ck = cohen_kappa_score(y_test, rf_predictions)
print('Cohen Kappa Score:',ck)

Features: Index(['FDA_Section', 'FDA_Subpart', 'HC_Chapter', 'HC_Section', 'HC_Subpart',
       'hc_desc_cleaned', 'fda_desc_cleaned'],
      dtype='object')
---------- Logistic Regression ----------
F1 Score: 0.6424399740428294
Cohen Kappa Score: 0.6379723336471423
---------- Random Forest Classifier ----------
F1 Score: 0.6768332251784556
Cohen Kappa Score: 0.6729931672770737


### 11. X: 'HC_Chapter','HC_Section','HC_Subpart','hc_desc_cleaned', 'fda_desc_cleaned','FDA_Chapter','FDA_Section', 'FDA_Subpart'

In [38]:
# define df and drop duplicates
df = df_full[['FDA_Chapter','FDA_Section','FDA_Subpart','HC_Chapter','HC_Section','HC_Subpart',
              'hc_desc_cleaned','fda_desc_cleaned','y']]
print('Original shape:',df.shape)
df.drop_duplicates(subset=df.columns, keep='last',inplace =True)
features = df.columns[:-1]
df = df.reset_index(drop=True)
print('Shape after removing duplicates:',df.shape)
df.head(3)

Original shape: (7705, 9)
Shape after removing duplicates: (7705, 9)


Unnamed: 0,FDA_Chapter,FDA_Section,FDA_Subpart,HC_Chapter,HC_Section,HC_Subpart,hc_desc_cleaned,fda_desc_cleaned,y
0,GENERAL ENFORCEMENT REGULATIONS,General Provisions,General.,Foods,General,General,information appearing label prepackaged produc...,provision regulation promulgated federal food ...,29
1,GENERAL ENFORCEMENT REGULATIONS,General Provisions,Definitions.,Drugs,Cautionary Statements and Child Resistant Pack...,Cautionary Statements and Child Resistant Pack...,subject section c person shall sell drug descr...,labeling includes written printed graphic matt...,356
2,GENERAL ENFORCEMENT REGULATIONS,General Provisions,Authority citations.,Administration,Labelling of Food and Drugs in Pressurized Con...,Not Available,subject section food drug packaged container d...,part regulation food drug administration inclu...,17


In [39]:
# define X and y
y = df[['y']]
X = df[['FDA_Chapter','FDA_Section','FDA_Subpart','HC_Chapter','HC_Section','HC_Subpart']]
X = pd.get_dummies(data = X, columns = X.columns)

tfidf_vectorizer = TfidfVectorizer()
X_text = df[['hc_desc_cleaned','fda_desc_cleaned']]
X_text['hc_desc_cleaned'] = tfidf_vectorizer.fit_transform(X_text['hc_desc_cleaned']).toarray()
X_text['fda_desc_cleaned'] = tfidf_vectorizer.fit_transform(X_text['fda_desc_cleaned']).toarray()
X = pd.concat([X, X_text], axis=1, sort=True)

print(X.shape)
print(y.shape)

(7705, 8336)
(7705, 1)


In [40]:
# run classifier
print('Features:',features)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

print('---------- Logistic Regression ----------')
lr = LogisticRegression(multi_class='ovr')
lr.fit(X_train, y_train)
lr_predictions = lr.predict(X_test)

f1_score = metrics.f1_score(y_test, lr_predictions, average = 'micro') 
print('F1 Score:',f1_score)

ck = cohen_kappa_score(y_test, lr_predictions)
print('Cohen Kappa Score:',ck)

print('---------- Random Forest Classifier ----------')
rf = RandomForestClassifier(n_estimators = 100, random_state=42).fit(X_train, y_train) 
rf_predictions = rf.predict(X_test) 

f1_score = metrics.f1_score(y_test, rf_predictions, average = 'micro') 
print('F1 Score:',f1_score)

ck = cohen_kappa_score(y_test, rf_predictions)
print('Cohen Kappa Score:',ck)

Features: Index(['FDA_Chapter', 'FDA_Section', 'FDA_Subpart', 'HC_Chapter', 'HC_Section',
       'HC_Subpart', 'hc_desc_cleaned', 'fda_desc_cleaned'],
      dtype='object')
---------- Logistic Regression ----------
F1 Score: 0.6645035691109669
Cohen Kappa Score: 0.6604077509334879
---------- Random Forest Classifier ----------
F1 Score: 0.6872160934458144
Cohen Kappa Score: 0.6835282693845657
