# To Predict Depression Among Patients Admitted to ICU 

## Part I: Data Preprocessing

### Import packages

In [1]:
# Adjust notebook settings to widen the notebook.
# `display` and `HTML` come from the public IPython.display module; importing them
# from IPython.core.display is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container {width:95% !important;}</style>"))

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Passing None removes the display limit, so every column and every row is rendered
# whenever a DataFrame or Series is shown.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

### Import modules/datasets

In [3]:
import os
from pathlib import Path

# Folder that contains both downloads. Set the MIMIC_ROOT environment variable to run
# the notebook on another machine; the default is the original author's location.
MIMIC_ROOT = Path(os.environ.get(
    "MIMIC_ROOT",
    "/Users/chiufengyap/OneDrive - The University of Texas Health Science Center at Houston/Research/MIMIC",
))
MIMIC_DIR = MIMIC_ROOT / "mimic-iii-clinical-database-1.4"
PHENOTYPE_DIR = MIMIC_ROOT / "phenotype-annotations-for-patient-notes-in-the-mimic-iii-database-1.20.03"

# NOTE(review): noteevents, services and prescriptions are only subset later in the
# visible notebook and never used for modelling - confirm they still need to be loaded.

# every unique hospitalization for each patient in the database (defines HADM_ID)
admissions = pd.read_csv(MIMIC_DIR / "ADMISSIONS.csv")
# Diagnosis Related Groups (DRG), which are used by the hospital for billing purposes.
drgcodes = pd.read_csv(MIMIC_DIR / "DRGCODES.csv")
# Deidentified notes, including nursing and physician notes, ECG reports, imaging reports, and discharge summaries.
noteevents = pd.read_csv(MIMIC_DIR / "NOTEEVENTS.csv")
# every unique patient in the database (defines subject_id)
patients = pd.read_csv(MIMIC_DIR / "PATIENTS.csv")
# the clinical service under which a patient is registered
services = pd.read_csv(MIMIC_DIR / "SERVICES.csv")
# Medications ordered, and not necessarily administered, for a given patient
prescriptions = pd.read_csv(MIMIC_DIR / "PRESCRIPTIONS.csv")
# Ground truth dataset
phenotypes = pd.read_csv(PHENOTYPE_DIR / "ACTdb102003.csv")

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [4]:
def _lowercase_frame(frame):
    """Return `frame` with every value cast to str and lowercased, and lowercase column names.

    Because of the str cast, missing values become the literal string 'nan'.
    """
    lowered = frame.apply(lambda column: column.astype(str).str.lower())
    lowered.columns = lowered.columns.str.lower()
    return lowered


# Normalize text case across all loaded tables (cell values and column headers alike)
admissions = _lowercase_frame(admissions)
drgcodes = _lowercase_frame(drgcodes)
noteevents = _lowercase_frame(noteevents)
patients = _lowercase_frame(patients)
services = _lowercase_frame(services)
prescriptions = _lowercase_frame(prescriptions)
phenotypes = _lowercase_frame(phenotypes)

In [5]:
# Shrink every table to the patients that appear in the phenotypes dataset and
# renumber the surviving rows from 0.
annotated_ids = phenotypes['subject_id']

admissions_reduced = admissions.loc[admissions['subject_id'].isin(annotated_ids)].reset_index(drop=True)
drgcodes_reduced = drgcodes.loc[drgcodes['subject_id'].isin(annotated_ids)].reset_index(drop=True)
noteevents_reduced = noteevents.loc[noteevents['subject_id'].isin(annotated_ids)].reset_index(drop=True)
patients_reduced = patients.loc[patients['subject_id'].isin(annotated_ids)].reset_index(drop=True)
services_reduced = services.loc[services['subject_id'].isin(annotated_ids)].reset_index(drop=True)
prescriptions_reduced = prescriptions.loc[prescriptions['subject_id'].isin(annotated_ids)].reset_index(drop=True)

### Functions

In [6]:
def unique(list1):
    """Print the sorted distinct values of `list1`; nothing is returned."""
    distinct_values = np.unique(np.array(list1))
    print(distinct_values)

### Clean dataset: phenotypes

In [7]:
# Keep the identifiers plus the outcome of interest, with one row per
# (subject_id, hadm_id) pair and a fresh 0-based index.
outcome_columns = ['hadm_id', 'subject_id', 'depression']
phenotypes_reduced = (
    phenotypes[outcome_columns]
    .drop_duplicates(subset=['subject_id', 'hadm_id'], ignore_index=True)
)

In [8]:
# Size of phenotypes_reduced as (rows, columns) after dropping duplicate admissions
phenotypes_reduced.shape

(813, 3)

In [9]:
# Preview the first five rows of the outcome table
phenotypes_reduced.head()

Unnamed: 0,hadm_id,subject_id,depression
0,100103,3365,0
1,100137,27290,0
2,100473,5525,0
3,100485,41515,1
4,100548,2265,0


### Clean dataset: admissions

#### Regarding diagnosis feature from admission: 

15,693 distinct diagnoses for 58,976 admissions. The diagnoses can be very informative (e.g. chronic kidney failure) or quite vague (e.g. weakness). Final diagnoses for a patient’s hospital stay are coded on discharge and can be found in the DIAGNOSES_ICD table. While this field can provide information about the status of a patient on hospital admission, it is not recommended to use it to stratify patients.


In [10]:
# Size of admissions_reduced as (rows, columns) before any feature engineering
admissions_reduced.shape

(1944, 19)

In [11]:
# The dates and times in the database are deidentified, so only durations are used:
# parse the four timestamps, derive the two stay lengths, then discard the timestamps.
ed_registered = pd.to_datetime(admissions_reduced['edregtime'])
ed_left = pd.to_datetime(admissions_reduced['edouttime'])
admitted = pd.to_datetime(admissions_reduced['admittime'])
discharged = pd.to_datetime(admissions_reduced['dischtime'])

admissions_reduced = (
    admissions_reduced
    .assign(
        # .dt.days keeps only the whole-day component of each duration
        length_ed=(ed_left - ed_registered).dt.days,
        length_admit=(discharged - admitted).dt.days,
    )
    .drop(columns=['edregtime', 'edouttime', 'dischtime', 'admittime'])
)

In [12]:
# One-hot encode the categorical admission columns. Each entry is
# (source column, prefix of the generated indicator columns); the indicator columns
# are appended to admissions_reduced in this order.
dummy_specs = [
    ('admission_type', 'admission_type'),
    ('admission_location', 'admission_loc'),
    ('discharge_location', 'discharge_loc'),
    ('insurance', 'insurance'),
    ('religion', 'religion'),
    ('language', 'language'),
    ('marital_status', 'marital_status'),
    ('ethnicity', 'ethnicity'),
]
for source_column, prefix in dummy_specs:
    just_dummies = pd.get_dummies(admissions_reduced[source_column], prefix=prefix)
    admissions_reduced = pd.concat([admissions_reduced, just_dummies], axis=1)

In [13]:
# Drop the categorical columns that were one-hot encoded, together with
# row_id, deathtime and diagnosis, which are not used as features.
unused_admission_columns = [
    'row_id', 'deathtime', 'diagnosis',
    'religion', 'language', 'marital_status', 'ethnicity',
    'insurance', 'admission_location', 'discharge_location', 'admission_type',
]
admissions_reduced = admissions_reduced.drop(columns=unused_admission_columns)

In [14]:
# Preview the first two rows of the engineered admissions table
admissions_reduced.head(2)

Unnamed: 0,subject_id,hadm_id,hospital_expire_flag,has_chartevents_data,length_ed,length_admit,admission_type_elective,admission_type_emergency,admission_type_urgent,admission_loc_clinic referral/premature,admission_loc_emergency room admit,admission_loc_phys referral/normal deli,admission_loc_transfer from hosp/extram,admission_loc_transfer from other healt,admission_loc_transfer from skilled nur,discharge_loc_dead/expired,discharge_loc_disc-tran cancer/chldrn h,discharge_loc_disch-tran to psych hosp,discharge_loc_home,discharge_loc_home health care,discharge_loc_home with home iv providr,discharge_loc_hospice-home,discharge_loc_hospice-medical facility,discharge_loc_icf,discharge_loc_left against medical advi,discharge_loc_long term care hospital,discharge_loc_other facility,discharge_loc_rehab/distinct part hosp,discharge_loc_short term hospital,discharge_loc_snf,insurance_government,insurance_medicaid,insurance_medicare,insurance_private,insurance_self pay,religion_baptist,religion_buddhist,religion_catholic,religion_christian scientist,religion_episcopalian,religion_greek orthodox,religion_hebrew,religion_jehovah's witness,religion_jewish,religion_muslim,religion_nan,religion_not specified,religion_other,religion_protestant quaker,religion_romanian east. 
orth,religion_unitarian-universalist,religion_unobtainable,language_*chi,language_*hun,language_*man,language_arab,language_camb,language_cant,language_cape,language_engl,language_fren,language_gree,language_hait,language_ital,language_nan,language_pers,language_port,language_ptun,language_russ,language_span,language_urdu,marital_status_divorced,marital_status_life partner,marital_status_married,marital_status_nan,marital_status_separated,marital_status_single,marital_status_unknown (default),marital_status_widowed,ethnicity_asian,ethnicity_asian - asian indian,ethnicity_asian - chinese,ethnicity_black/african,ethnicity_black/african american,ethnicity_black/cape verdean,ethnicity_black/haitian,ethnicity_hispanic or latino,ethnicity_hispanic/latino - guatemalan,ethnicity_hispanic/latino - puerto rican,ethnicity_other,ethnicity_patient declined to answer,ethnicity_portuguese,ethnicity_unable to obtain,ethnicity_unknown/not specified,ethnicity_white,ethnicity_white - brazilian,ethnicity_white - eastern european,ethnicity_white - russian
0,368,105889,0,1,0.0,4,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
1,368,138061,0,1,1.0,5,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0


In [15]:
# The two flag columns are cast to float so they can be used as numeric features
admissions_reduced = admissions_reduced.astype({
    "hospital_expire_flag": float,
    "has_chartevents_data": float,
})

In [16]:
# Size of the cleaned admissions_reduced dataset as (rows, columns), indicator columns included
admissions_reduced.shape

(1944, 98)

### Clean dataset: patients

In [None]:
# Size of the patients_reduced dataset as (rows, columns)
patients_reduced.shape

In [None]:
# Create a numerical code for the gender feature: 'm' -> 1, 'f' -> 0.
# The assignment goes through .loc on the frame itself; the previous chained form
# (`patients_reduced.gender[mask] = ...`) triggers SettingWithCopyWarning and is not
# guaranteed to write back to patients_reduced.
is_male = patients_reduced['gender'] == 'm'
is_female = patients_reduced['gender'] == 'f'
patients_reduced.loc[is_male, 'gender'] = 1
patients_reduced.loc[is_female, 'gender'] = 0

In [None]:
# Remove the row identifier and the deidentified date-of-birth / date-of-death columns
deidentified_columns = ['row_id', 'dob', 'dod', 'dod_hosp', 'dod_ssn']
patients_reduced = patients_reduced.drop(columns=deidentified_columns)

In [None]:
# Cast the patient-level columns to numeric dtypes.
# This cell used to reference `main`, which is only created by the merge further down,
# so it raised a NameError on a fresh top-to-bottom run. The conversions are applied to
# patients_reduced, the table this section is cleaning.
patients_reduced["expire_flag"] = patients_reduced.expire_flag.astype(float)
patients_reduced['gender'] = patients_reduced.gender.astype(int)

In [None]:
# Final features left for patients_reduced dataset (first two rows)
patients_reduced.head(2)

### Cleaning dataset:  drgcodes

In [None]:
# Size of drgcodes_reduced as (rows, columns) before cleaning
drgcodes_reduced.shape

In [None]:
# Preview the first three DRG rows
drgcodes_reduced.head(3)

In [None]:
# Distinct raw values of drg_severity (still strings at this point)
drgcodes_reduced.drg_severity.unique()

In [None]:
# Distinct raw values of drg_mortality (still strings at this point)
drgcodes_reduced.drg_mortality.unique()

In [None]:
# One-hot encode drg_code and then drg_type; each set of indicator columns is
# prefixed with its source column name and appended to drgcodes_reduced.
for source_column in ('drg_code', 'drg_type'):
    just_dummies = pd.get_dummies(drgcodes_reduced[source_column], prefix=source_column)
    drgcodes_reduced = pd.concat([drgcodes_reduced, just_dummies], axis=1)

In [None]:
# Transform the DRG mortality/severity scores from strings to nullable 32-bit integers.
# Values that cannot be parsed become NaN (errors='coerce') and are then replaced by 0.
# The deprecated `downcast` argument of fillna is dropped: the explicit cast to 'Int32'
# already determines the final dtype.
# NOTE(review): the zeros filled in here take part in the per-admission means computed
# afterwards - confirm that is intended.
drgcodes_reduced['drg_mortality'] = pd.to_numeric(drgcodes_reduced.drg_mortality, errors='coerce').fillna(0).astype('Int32')
drgcodes_reduced['drg_severity'] = pd.to_numeric(drgcodes_reduced.drg_severity, errors='coerce').fillna(0).astype('Int32')

In [None]:
# To end up with one record per (subject_id, hadm_id), each row receives the mean
# drg_mortality and mean drg_severity of its admission as avg_<score> columns.
admission_key = ['subject_id', 'hadm_id']
for score in ('drg_mortality', 'drg_severity'):
    drgcodes_reduced['avg_' + score] = drgcodes_reduced.groupby(admission_key)[score].transform('mean')

In [None]:
# Store both per-admission averages as plain floats
drgcodes_reduced = drgcodes_reduced.astype({
    'avg_drg_mortality': float,
    'avg_drg_severity': float,
})

In [None]:
# Keep the first row of every (subject_id, hadm_id) pair and renumber the index from 0.
# NOTE(review): the drg_code_* / drg_type_* indicator values on the later rows of an
# admission are discarded here rather than combined - confirm this is intended.
repeated_admission = drgcodes_reduced.duplicated(subset=['subject_id', 'hadm_id'])
drgcodes_reduced = drgcodes_reduced[~repeated_admission].reset_index(drop=True)

In [None]:
# Remove the raw DRG columns now that indicators and per-admission averages exist
raw_drg_columns = ['row_id', 'description', 'drg_code', 'drg_type', 'drg_severity', 'drg_mortality']
drgcodes_reduced = drgcodes_reduced.drop(columns=raw_drg_columns)

In [None]:
# Final size of the drgcodes_reduced as (rows, columns), one row per admission
drgcodes_reduced.shape

### Merge Datasets

In [None]:
# Row count of the outcome table, for comparison with the merged table below
phenotypes_reduced.shape

In [None]:
# Build the modelling table: the right join keeps every row of phenotypes_reduced,
# then patient columns are attached by subject and DRG columns by admission.
admission_key = ['subject_id', 'hadm_id']
main = (
    admissions_reduced
    .merge(phenotypes_reduced, how='right', on=admission_key)
    .merge(patients_reduced, how='left', on=['subject_id'])
    .merge(drgcodes_reduced, how='left', on=admission_key)
)

In [None]:
# Size of the merged table as (rows, columns)
main.shape

### Missingness of the final merged datasets

In [None]:
# Share of missing values per column, captured before imputation.
# Previously this expression was not the last statement of the cell, so its result
# was computed and then silently discarded.
missing_share = main.isnull().mean() # length_ed had ~30% missingness
# Replace missing length_ed values with 0
main['length_ed'] = main['length_ed'].fillna(0)
# Last expression of the cell, so the summary is rendered
missing_share

### Restructuring -- Move the outcome variable to be the last column in the dataset

In [None]:
# Reorder the columns so that the outcome, depression, is the final one
column_order = main.columns.tolist()
column_order.remove('depression')
column_order.append('depression')
main = main[column_order]

In [None]:
# Cast the flag, gender and DRG-average columns to numeric dtypes in one pass
main = main.astype({
    "hospital_expire_flag": float,
    "has_chartevents_data": float,
    "expire_flag": float,
    "gender": int,
    "avg_drg_mortality": float,
    "avg_drg_severity": float,
})

#### Set X as features and y as the outcome

In [None]:
# The identifiers are not features, so they are removed before modelling
identifier_columns = ['subject_id', 'hadm_id']
main = main.drop(columns=identifier_columns)

In [None]:
# Every column except the last is a feature; the last column is the outcome
feature_frame = main.iloc[:, :-1]
outcome_series = main.iloc[:, -1]
X = feature_frame.values
y = outcome_series.values

### Encoding the Independent Variable

In [None]:
# from sklearn.compose import ColumnTransformer
# from sklearn.preprocessing import OneHotEncoder
# ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])], remainder='passthrough')
# X = np.array(ct.fit_transform(X))

### Encoding the Dependent Variable

In [None]:
# from sklearn.preprocessing import LabelEncoder
# le = LabelEncoder()
# y = le.fit_transform(y)

## Splitting the dataset into the Training set and Test set

In [None]:
# Hold out 20% of the rows as a test set; random_state fixes the shuffle so the split is reproducible.
# NOTE(review): the split is not stratified on y - confirm the class balance is acceptable in both parts.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

## Feature Scaling

Feature scaling is essential for machine learning algorithms that calculate distances between data. Therefore, the range of all features should be normalized so that each feature contributes approximately proportionately to the final distance.

* Normalization is recommended when the observations are normally distributed.
* Standardization works all the time. (recommended)
* We need to perform Feature Scaling when we are dealing with Gradient Descent Based algorithms (Linear and Logistic Regression, Neural Network) and Distance-based algorithms (KNN, K-means, SVM) as these are very sensitive to the range of the data points.

In [None]:
# Standardize the features. The scaler is fitted on the training set only and the
# fitted transformation is then applied to the test set.
# X_train and X_test are overwritten with their scaled versions.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

## Dimensionality Reduction

### Principal Component Analysis (PCA)

In [None]:
# Project the features onto 2 principal components; PCA is fitted on the training set only.
# X_train and X_test are overwritten, so every later cell sees the reduced features.
from sklearn.decomposition import PCA
pca = PCA(n_components = 2)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

##### Logistic Regression

In [None]:
# Fit a logistic regression on the current training features (seed fixed for reproducibility)
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, y_train)

In [None]:
# Predict the test set, print the confusion matrix (rows: true labels, columns: predicted
# labels) and display the accuracy as the cell output
from sklearn.metrics import confusion_matrix, accuracy_score
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

##### KNN

In [None]:
# Fit a 5-nearest-neighbours classifier; Minkowski distance with p = 2 is the Euclidean distance
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2) # for Euclidean distance
classifier.fit(X_train, y_train)

In [None]:
# Predict test-set labels with the classifier fitted in the preceding cell
y_pred = classifier.predict(X_test)

In [None]:
# Confusion matrix (rows: true labels, columns: predicted labels), followed by the
# test-set accuracy as the cell output
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

#### Support Vector Machine

Assumption: the classes are linearly separable.

In [None]:
# Fit a support vector classifier with a linear kernel on the current training features
from sklearn.svm import SVC
classifier = SVC(kernel = 'linear', random_state = 0)
classifier.fit(X_train, y_train)

In [None]:
# Predict test-set labels with the classifier fitted in the preceding cell
y_pred = classifier.predict(X_test)

In [None]:
# Confusion matrix (rows: true labels, columns: predicted labels), followed by the
# test-set accuracy as the cell output
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

#### Kernel SVM

Mapping to a higher dimensional space can be highly compute-intensive.

Types of Kernel Functions:
* Gaussian RBF Kernel
* Sigmoid Kernel
* Polynomial Kernel
* etc.

In [None]:
from sklearn.svm import SVC
# Kernel SVM: use the Gaussian RBF kernel. With kernel = 'linear' this cell only
# repeated the linear SVM of the previous section.
classifier = SVC(kernel = 'rbf', random_state = 0)
classifier.fit(X_train, y_train)

In [None]:
# Predict test-set labels with the classifier fitted in the preceding cell
y_pred = classifier.predict(X_test)

In [None]:
# Confusion matrix (rows: true labels, columns: predicted labels), followed by the
# test-set accuracy as the cell output
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

#### Naive Bayes

Advantages: This algorithm requires a small amount of training data to estimate the necessary parameters. Naive Bayes classifiers are extremely fast compared to more sophisticated methods.

Disadvantages: Naive Bayes is known to be a bad estimator.

In [None]:
# Fit a Gaussian naive Bayes classifier on the current training features
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)

In [None]:
# Predict test-set labels with the classifier fitted in the preceding cell
y_pred = classifier.predict(X_test)

In [None]:
# Confusion matrix (rows: true labels, columns: predicted labels), followed by the
# test-set accuracy as the cell output
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

#### Decision Tree

Advantages: Decision Tree is simple to understand and visualise, requires little data preparation, and can handle both numerical and categorical data.

Disadvantages: Decision tree can create complex trees that do not generalise well, and decision trees can be unstable because small variations in the data might result in a completely different tree being generated.

In [None]:
# Fit a decision tree that chooses splits by entropy (seed fixed for reproducibility)
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
classifier.fit(X_train, y_train)

In [None]:
# Predict test-set labels with the classifier fitted in the preceding cell
y_pred = classifier.predict(X_test)

In [None]:
# Confusion matrix (rows: true labels, columns: predicted labels), followed by the
# test-set accuracy as the cell output
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

#### Random Forest

Ensemble learning: combining the predictions of several machine learning models.

The algorithm does not work well for datasets having a lot of outliers, something which needs addressing prior to the model building.

Advantages: Reduction in over-fitting and random forest classifier is more accurate than decision trees in most cases.

Disadvantages: Slow real time prediction, difficult to implement, and complex algorithm.

Built on top of Decision Trees

In [None]:
# Fit a random forest of 10 entropy-based trees (seed fixed for reproducibility)
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
classifier.fit(X_train, y_train)

In [None]:
# Predict test-set labels with the classifier fitted in the preceding cell
y_pred = classifier.predict(X_test)

In [None]:
# Confusion matrix (rows: true labels, columns: predicted labels), followed by the
# test-set accuracy as the cell output
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

### Linear Discriminant Analysis (LDA)

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# X_train / X_test were overwritten by the 2-component PCA projection earlier in the
# notebook, so LDA would otherwise be fitted on PCA output instead of the features.
# Rebuild the standardized split from X and y (same test_size and random_state as the
# original split) before reducing; this also makes the cell safe to re-run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Supervised reduction to a single discriminant axis, fitted on the training set only
lda = LDA(n_components = 1)
X_train = lda.fit_transform(X_train, y_train)
X_test = lda.transform(X_test)

### Logistic Regression

In [None]:
# Fit a logistic regression on the current training features (seed fixed for reproducibility)
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, y_train)

In [None]:
# Predict test-set labels with the classifier fitted in the preceding cell
y_pred = classifier.predict(X_test)

In [None]:
# Confusion matrix (rows: true labels, columns: predicted labels), followed by the
# test-set accuracy as the cell output
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

In [None]:
# Applying k-Fold Cross Validation

In [None]:
# 10-fold cross-validation of the current classifier on the training set; one score per fold.
# NOTE(review): X_train has already been transformed by steps fitted on the whole
# training set, so the folds are not independent of that preprocessing - confirm this is acceptable.
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = classifier, X = X_train, y = y_train, cv = 10)

In [None]:
# Mean of the fold scores, shown as a percentage
print("Accuracy: {:.2f} %".format(accuracies.mean()*100))

In [None]:
# Spread of the fold scores, shown as a percentage
print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

In [None]:
# Applying Grid Search to find the best model and the best parameters


#### K Nearest Neighbor (KNN)

K-Nearest Neighbor (KNN) algorithm predicts based on the specified number (k) of the nearest neighboring data points. Here, the pre-processing of the data is significant as it impacts the distance measurements directly. Unlike others, the model does not have a mathematical formula, neither any descriptive ability.

It is a simple, fairly accurate model preferable mostly for smaller datasets, owing to huge computations involved on the continuous predictors.

Step 1: Choose the number of K of neighbors

Step 2: Take the K nearest neighbors of the new data point, according to the Euclidean distance

Step 3: Among these K neighbors, count the number of data points in each category

Step 4: Assign the new data point to the category where you counted the most neighbors

Then, the model will be ready. 

In [None]:
# Fit a 5-nearest-neighbours classifier; Minkowski distance with p = 2 is the Euclidean distance
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2) # for Euclidean distance
classifier.fit(X_train, y_train)

In [None]:
# Predict test-set labels with the classifier fitted in the preceding cell
y_pred = classifier.predict(X_test)

In [None]:
# Confusion matrix (rows: true labels, columns: predicted labels), followed by the
# test-set accuracy as the cell output
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

#### Support Vector Machine

Assumption: the classes are linearly separable.

In [None]:
# Fit a support vector classifier with a linear kernel on the current training features
from sklearn.svm import SVC
classifier = SVC(kernel = 'linear', random_state = 0)
classifier.fit(X_train, y_train)

In [None]:
# Predict test-set labels with the classifier fitted in the preceding cell
y_pred = classifier.predict(X_test)

In [None]:
# Confusion matrix (rows: true labels, columns: predicted labels), followed by the
# test-set accuracy as the cell output
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

#### Kernel SVM

Mapping to a higher dimensional space can be highly compute-intensive.

Types of Kernel Functions:
* Gaussian RBF Kernel
* Sigmoid Kernel
* Polynomial Kernel
* etc.

In [None]:
from sklearn.svm import SVC
# Kernel SVM: use the Gaussian RBF kernel. With kernel = 'linear' this cell only
# repeated the linear SVM of the previous section.
classifier = SVC(kernel = 'rbf', random_state = 0)
classifier.fit(X_train, y_train)

In [None]:
# Predict test-set labels with the classifier fitted in the preceding cell
y_pred = classifier.predict(X_test)

In [None]:
# Confusion matrix (rows: true labels, columns: predicted labels), followed by the
# test-set accuracy as the cell output
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

#### Naive Bayes

Advantages: This algorithm requires a small amount of training data to estimate the necessary parameters. Naive Bayes classifiers are extremely fast compared to more sophisticated methods.

Disadvantages: Naive Bayes is known to be a bad estimator.

In [None]:
# Fit a Gaussian naive Bayes classifier on the current training features
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)

In [None]:
# Predict test-set labels with the classifier fitted in the preceding cell
y_pred = classifier.predict(X_test)

In [None]:
# Confusion matrix (rows: true labels, columns: predicted labels), followed by the
# test-set accuracy as the cell output
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

#### Decision Tree

Advantages: Decision Tree is simple to understand and visualise, requires little data preparation, and can handle both numerical and categorical data.

Disadvantages: Decision tree can create complex trees that do not generalise well, and decision trees can be unstable because small variations in the data might result in a completely different tree being generated.

In [None]:
# Fit a decision tree that chooses splits by entropy (seed fixed for reproducibility)
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
classifier.fit(X_train, y_train)

In [None]:
# Predict test-set labels with the classifier fitted in the preceding cell
y_pred = classifier.predict(X_test)

In [None]:
# Confusion matrix (rows: true labels, columns: predicted labels), followed by the
# test-set accuracy as the cell output
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

#### Random Forest

Ensemble learning: combining the predictions of several machine learning models.

The algorithm does not work well for datasets having a lot of outliers, something which needs addressing prior to the model building.

Advantages: Reduction in over-fitting and random forest classifier is more accurate than decision trees in most cases.

Disadvantages: Slow real time prediction, difficult to implement, and complex algorithm.

Built on top of Decision Trees

In [None]:
# Fit a random forest of 10 entropy-based trees (seed fixed for reproducibility)
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
classifier.fit(X_train, y_train)

In [None]:
# Predict test-set labels with the classifier fitted in the preceding cell
y_pred = classifier.predict(X_test)

In [None]:
# Confusion matrix (rows: true labels, columns: predicted labels), followed by the
# test-set accuracy as the cell output
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

In [None]:
# Fit a logistic regression on the current training features (seed fixed for reproducibility)
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, y_train)

In [None]:
# Predict the test set, print the confusion matrix (rows: true labels, columns: predicted
# labels) and display the accuracy as the cell output
from sklearn.metrics import confusion_matrix, accuracy_score
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

### Kernel PCA

In [None]:
from sklearn.decomposition import KernelPCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# X_train / X_test were overwritten by the earlier dimensionality-reduction cells
# (most recently the 1-component LDA), so Kernel PCA would otherwise be fitted on that
# output instead of the features. Rebuild the standardized split from X and y (same
# test_size and random_state as the original split); this also makes the cell safe to re-run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Non-linear reduction to 2 components with an RBF kernel, fitted on the training set only
kpca = KernelPCA(n_components = 2, kernel = 'rbf')
X_train = kpca.fit_transform(X_train)
X_test = kpca.transform(X_test)

In [None]:
# Fit a logistic regression on the current training features (seed fixed for reproducibility)
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, y_train)

In [None]:
# Predict the test set, print the confusion matrix (rows: true labels, columns: predicted
# labels) and display the accuracy as the cell output
from sklearn.metrics import confusion_matrix, accuracy_score
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

## Model Section

### k-fold Cross Validation

### Grid Search 