In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC

In [2]:
import os
os.chdir('C://Users//MathurS1/Desktop/Breath_Test/')

In [4]:
input_and_output = pd.read_csv("input_and_output.csv")
input_and_output.columns

Index(['Unnamed: 0', 'age', 'BMI', 'Diagnosis', 'group.Postive H2',
       'group.Positive CH4', 'group.Normal BT', 'group.Positive h2 & Ch4',
       'group.Flatliner', 'group.Equivocal',
       ...
       'ICD.252', 'ICD.413', 'ICD.151', 'ICD.410', 'ICD.696', 'ICD.634',
       'ICD.695', 'ICD.284', 'ICD.222', 'ICD.694'],
      dtype='object', length=109)

In [5]:
input_and_output = input_and_output.drop('Unnamed: 0', 1)


In [6]:
X = input_and_output.drop('Diagnosis',1)

In [7]:
y = input_and_output['Diagnosis']

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)


In [21]:
from sklearn.ensemble import RandomForestClassifier
feat_labels = X.columns[0:]

#defining our hyperparameters
random_forest = RandomForestClassifier(n_estimators=600, random_state=42, n_jobs=-1)

#fit 
random_forest.fit(X_train,y_train)

#getting the important features and labels
importances = random_forest.feature_importances_
indices = np.argsort(importances)[::-1]
for f in range(X_train.shape[1]):
    print("%2d) %-*s %f" % (f+1,25,feat_labels[f],importances[indices[f]]))


 1) age                       0.269846
 2) BMI                       0.138996
 3) group.Postive H2          0.131695
 4) group.Positive CH4        0.126950
 5) group.Normal BT           0.093633
 6) group.Positive h2 & Ch4   0.042277
 7) group.Flatliner           0.013116
 8) group.Equivocal           0.012724
 9) gender.F                  0.012606
10) gender.M                  0.012017
11) race.White                0.011855
12) race.Black or African American 0.008732
13) race.Asian                0.008722
14) race.Other                0.005769
15) dinf.2                    0.005692
16) dinf.0                    0.005167
17) dinf.1                    0.005047
18) dinf.3                    0.004896
19) ICD.174                   0.004829
20) ICD.V10                   0.004671
21) ICD.244                   0.004588
22) ICD.562                   0.004547
23) ICD.401                   0.004422
24) ICD.724                   0.004083
25) ICD.579                   0.003676
26) ICD.285         

In [10]:
svc_model = LinearSVC(random_state=42,multi_class='crammer_singer')

In [11]:
svc_model.fit(X_train,y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='crammer_singer', penalty='l2', random_state=42,
     tol=0.0001, verbose=0)

In [12]:
predictions = svc_model.predict(X_test)

In [13]:
from sklearn.metrics import confusion_matrix,classification_report

In [14]:
print(classification_report(y_test,predictions))

             precision    recall  f1-score   support

          0       0.99      1.00      0.99      3184
          1       0.84      0.73      0.78        67
          2       0.74      0.55      0.63        47

avg / total       0.98      0.99      0.98      3298



In [15]:
print(accuracy_score(y_test,predictions))

0.9857489387507581


In [22]:
r_predictions = random_forest.predict(X_test)

In [23]:
print(classification_report(y_test,r_predictions))

             precision    recall  f1-score   support

          0       0.99      1.00      0.99      3184
          1       0.80      0.66      0.72        67
          2       0.69      0.43      0.53        47

avg / total       0.98      0.98      0.98      3298



In [18]:
print(accuracy_score(y_test,r_predictions))

0.9824135839902971


In [20]:
#logisitic regression
pipe_lr = Pipeline([('clf', LogisticRegression(random_state=42,multi_class='multinomial'))])

#linear SVC
pipe_svm = Pipeline([(('clf', LinearSVC(random_state=42,multi_class='crammer_singer')))])

pipe_rf = Pipeline([('clf', RandomForestClassifier(random_state=42,n_estimators=300))])



#GRID_SEARCH_PARAMETERS

param_range = [1, 2, 3, 4, 5]
param_range_fl = [1.0, 0.5, 0.1]

grid_params_lr = [{'clf__C': param_range_fl,
        'clf__solver': ['newton-cg']}] 

grid_params_rf = [{'clf__criterion': ['gini'],
        'clf__min_samples_leaf': param_range,
        'clf__max_depth': param_range,
        'clf__min_samples_split': param_range[1:]}]

grid_params_svc = [{'clf__C': param_range_fl}]


# Construct grid searches
jobs = -1

gs_lr = GridSearchCV(estimator = pipe_lr,param_grid=grid_params_lr,scoring='accuracy',cv=5) 

gs_rf = GridSearchCV(estimator = pipe_rf,param_grid=grid_params_rf,scoring='accuracy',cv=5, n_jobs=jobs)

gs_svc = GridSearchCV(estimator = pipe_svm,param_grid=grid_params_svc,scoring='accuracy',cv=5, n_jobs=jobs)

#creating a list of all this for ease of interation

grids = [gs_lr,gs_rf,gs_svc]

# Dictionary of pipelines and classifier types for ease of reference
grid_dict = {0: 'Logistic Regression', 1: 'Random Forest', 2: 'Support Vector Machine'}




# Fit the grid search objects
print('Performing model optimizations...')
best_acc = 0.0
best_clf = 0
best_gs = ''
for idx, gs in enumerate(grids):
    print('\nEstimator: %s' % grid_dict[idx])
    # Fit grid search	
    gs.fit(X_train, y_train)
    # Best params
    print('Best params: %s' % gs.best_params_)
    # Best training data accuracy
    print('Best training accuracy: %.3f' % gs.best_score_)
    # Predict on test data with best params
    y_pred = gs.predict(X_test)
    # Test data accuracy of model with best params
    print('Test set accuracy score for best params: %.3f ' % accuracy_score(y_test, y_pred))
    print(classification_report(y_test,y_pred))
    # Track best (highest test accuracy) model
    if accuracy_score(y_test, y_pred) > best_acc:
        best_acc = accuracy_score(y_test, y_pred)
        best_gs = gs
        best_clf = idx
print('\nClassifier with best test set accuracy: %s' % grid_dict[best_clf])

Performing model optimizations...

Estimator: Logistic Regression
Best params: {'clf__C': 1.0, 'clf__solver': 'newton-cg'}
Best training accuracy: 0.980
Test set accuracy score for best params: 0.985 
             precision    recall  f1-score   support

          0       0.99      1.00      0.99      3184
          1       0.86      0.72      0.78        67
          2       0.75      0.45      0.56        47

avg / total       0.98      0.98      0.98      3298


Estimator: Random Forest
Best params: {'clf__criterion': 'gini', 'clf__max_depth': 5, 'clf__min_samples_leaf': 1, 'clf__min_samples_split': 2}
Best training accuracy: 0.952
Test set accuracy score for best params: 0.965 
             precision    recall  f1-score   support

          0       0.97      1.00      0.98      3184
          1       0.00      0.00      0.00        67
          2       0.00      0.00      0.00        47

avg / total       0.93      0.97      0.95      3298


Estimator: Support Vector Machine


  'precision', 'predicted', average, warn_for)


Best params: {'clf__C': 0.5}
Best training accuracy: 0.980
Test set accuracy score for best params: 0.986 
             precision    recall  f1-score   support

          0       0.99      1.00      0.99      3184
          1       0.85      0.75      0.79        67
          2       0.74      0.55      0.63        47

avg / total       0.98      0.99      0.99      3298


Classifier with best test set accuracy: Support Vector Machine


In [35]:
from sklearn.neighbors import KNeighborsClassifier
## Instantiate the model with 5 neighbors. 
knn = KNeighborsClassifier(n_neighbors=3)
## Fit the model on the training data.
knn.fit(X_train, y_train)
## See how the model performs on the test data.

knn_predictions = knn.predict(X_test)
print(classification_report(y_test,knn_predictions))

             precision    recall  f1-score   support

          0       0.97      1.00      0.98      3184
          1       0.33      0.01      0.03        67
          2       0.40      0.04      0.08        47

avg / total       0.95      0.96      0.95      3298



In [38]:
CD_and_UC = pd.read_csv("CD_UC_ML.csv")

In [40]:
CD_and_UC = CD_and_UC.drop('Unnamed: 0',1)

In [85]:
X = CD_and_UC.drop('Diagnosis',1)

y = CD_and_UC['Diagnosis']
X = X.drop('X',1)

In [86]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)

In [49]:
from sklearn.svm import SVC
svc_model = SVC(random_state=42)
svc_model.fit(X_train,y_train)
predictions = svc_model.predict(X_test)

In [50]:
print(classification_report(y_test,predictions))

             precision    recall  f1-score   support

          1       0.63      0.98      0.77        92
          2       0.33      0.02      0.04        54

avg / total       0.52      0.62      0.50       146



In [60]:
param_grid = {'C':[0.1,1,10,100,1000],'gamma':[10,1,0.1,0.01,0.001],'kernel':['linear','rbf']}
grid = GridSearchCV(SVC(),param_grid,verbose=2)
grid.fit(X_train,y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits
[CV] C=0.1, gamma=10, kernel=linear ..................................
[CV] ................... C=0.1, gamma=10, kernel=linear, total=  26.4s
[CV] C=0.1, gamma=10, kernel=linear ..................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   26.4s remaining:    0.0s


[CV] ................... C=0.1, gamma=10, kernel=linear, total=  31.1s
[CV] C=0.1, gamma=10, kernel=linear ..................................
[CV] ................... C=0.1, gamma=10, kernel=linear, total=  36.8s
[CV] C=0.1, gamma=10, kernel=rbf .....................................
[CV] ...................... C=0.1, gamma=10, kernel=rbf, total=   0.0s
[CV] C=0.1, gamma=10, kernel=rbf .....................................
[CV] ...................... C=0.1, gamma=10, kernel=rbf, total=   0.0s
[CV] C=0.1, gamma=10, kernel=rbf .....................................
[CV] ...................... C=0.1, gamma=10, kernel=rbf, total=   0.0s
[CV] C=0.1, gamma=1, kernel=linear ...................................
[CV] .................... C=0.1, gamma=1, kernel=linear, total=  26.5s
[CV] C=0.1, gamma=1, kernel=linear ...................................
[CV] .................... C=0.1, gamma=1, kernel=linear, total=  31.3s
[CV] C=0.1, gamma=1, kernel=linear ...................................
[CV] .

[CV] .................... C=10, gamma=10, kernel=linear, total=  53.0s
[CV] C=10, gamma=10, kernel=linear ...................................
[CV] .................... C=10, gamma=10, kernel=linear, total=  52.5s
[CV] C=10, gamma=10, kernel=linear ...................................
[CV] .................... C=10, gamma=10, kernel=linear, total=  46.8s
[CV] C=10, gamma=10, kernel=rbf ......................................
[CV] ....................... C=10, gamma=10, kernel=rbf, total=   0.0s
[CV] C=10, gamma=10, kernel=rbf ......................................
[CV] ....................... C=10, gamma=10, kernel=rbf, total=   0.0s
[CV] C=10, gamma=10, kernel=rbf ......................................
[CV] ....................... C=10, gamma=10, kernel=rbf, total=   0.0s
[CV] C=10, gamma=1, kernel=linear ....................................
[CV] ..................... C=10, gamma=1, kernel=linear, total=  52.4s
[CV] C=10, gamma=1, kernel=linear ....................................
[CV] .

[CV] .................. C=1000, gamma=10, kernel=linear, total= 2.3min
[CV] C=1000, gamma=10, kernel=linear .................................
[CV] .................. C=1000, gamma=10, kernel=linear, total=  44.0s
[CV] C=1000, gamma=10, kernel=linear .................................
[CV] .................. C=1000, gamma=10, kernel=linear, total=  50.3s
[CV] C=1000, gamma=10, kernel=rbf ....................................
[CV] ..................... C=1000, gamma=10, kernel=rbf, total=   0.0s
[CV] C=1000, gamma=10, kernel=rbf ....................................
[CV] ..................... C=1000, gamma=10, kernel=rbf, total=   0.0s
[CV] C=1000, gamma=10, kernel=rbf ....................................
[CV] ..................... C=1000, gamma=10, kernel=rbf, total=   0.0s
[CV] C=1000, gamma=1, kernel=linear ..................................
[CV] ................... C=1000, gamma=1, kernel=linear, total= 2.2min
[CV] C=1000, gamma=1, kernel=linear ..................................
[CV] .

[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed: 76.1min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [0.1, 1, 10, 100, 1000], 'gamma': [10, 1, 0.1, 0.01, 0.001], 'kernel': ['linear', 'rbf']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=2)

In [61]:
grid.best_params_

{'C': 0.1, 'gamma': 10, 'kernel': 'linear'}

In [64]:
grid_perdictions = grid.predict(X_test)

In [65]:
print(classification_report(y_test,grid_perdictions))

             precision    recall  f1-score   support

          1       0.77      0.93      0.85        92
          2       0.83      0.54      0.65        54

avg / total       0.79      0.79      0.77       146



In [105]:
#SVC


svc_model = SVC(C=0.1,gamma=10,kernel='linear')
svc_model.fit(X_train,y_train)
svc_predictions = svc_model.predict(X_test)
print(classification_report(y_test,svc_predictions))
print(accuracy_score(y_test,svc_predictions))


             precision    recall  f1-score   support

          1       0.86      0.96      0.91        96
          2       0.90      0.70      0.79        50

avg / total       0.87      0.87      0.87       146

0.8698630136986302


In [139]:
##KNN
training_accuracy =[]
testing_accuracy=[]
neighbours_setting = range(1,11)

# for n_neigbors in neighbours_setting:
#     knn = KNeighborsClassifier(n_neighbors=n_neigbors)
#     knn.fit(X_train,y_train)
#     testing_accuracy.append(knn.score(X_train,y_train))
    
# plt.plot(neighbours_setting,testing_accuracy,label = "testing accuracy")
# plt.xlabel("n-neighbours")
# plt.ylabel("Accuracy")

# plt.legend()
# plt.savefig('knn')


knn = KNeighborsClassifier(n_neighbors=3)
# ## Fit the model on the training data.
knn.fit(X_train, y_train)
# ## See how the model performs on the test data.

# knn_predictions = knn.predict(X_test)
print(classification_report(y_test,knn_predictions))
print(accuracy_score(y_test,knn_predictions))

             precision    recall  f1-score   support

          1       0.70      0.93      0.80        96
          2       0.63      0.24      0.35        50

avg / total       0.68      0.69      0.64       146

0.6917808219178082


In [102]:
# from sklearn.feature_extraction.text import CountVectorizer

# def f_importances(coef, names):
#     imp = coef
#     imp,names = zip(*sorted(zip(imp,names)))
#     plt.barh(range(len(names)), imp, align='center')
#     plt.yticks(range(len(names)), names)
#     plt.show()

# features_names = ['input1', 'input2']
# svc_model = SVC(C=0.1,gamma=10,kernel='linear')
# svc_model.fit(X_train,y_train)
# f_importances(svc_model.coef_, features_names)


#LOGISTIC REGRESSION

logres_model = LogisticRegression(C=0.1,penalty='l1')
logres_model.fit(X_train,y_train)
logres_predictions = logres_model.predict(X_test)
print(classification_report(y_test,logres_predictions))
print(accuracy_score(y_test,logres_predictions))


             precision    recall  f1-score   support

          1       0.86      0.96      0.91        96
          2       0.90      0.70      0.79        50

avg / total       0.87      0.87      0.87       146

0.8698630136986302


In [130]:
#RANDOM FOREST
# X.columns[0:]

from sklearn.ensemble import RandomForestClassifier
feat_labels = X.columns[0:]

#defining our hyperparameters
random_forest = RandomForestClassifier(n_estimators=1000, random_state=42, n_jobs=-1)

#fit 
random_forest.fit(X_train,y_train)

#getting the important features and labels
importances = random_forest.feature_importances_
indices = np.argsort(importances)[::-1]
for f in range(X_train.shape[1]):
    print("%2d) %-*s %f" % (f+1,25,feat_labels[f],importances[indices[f]]))


 1) age                       0.260683
 2) BMI                       0.146877
 3) group.Postive.H2          0.125955
 4) group.Positive.CH4        0.025777
 5) group.Normal.BT           0.024735
 6) group.Positive.h2...Ch4   0.020540
 7) group.Flatliner           0.020186
 8) group.Equivocal           0.019348
 9) gender.F                  0.017765
10) gender.M                  0.017424
11) race.White                0.017399
12) race.Black.or.African.American 0.016069
13) race.Asian                0.015631
14) race.Other                0.013149
15) dinf.2                    0.012763
16) dinf.0                    0.012075
17) dinf.1                    0.011793
18) dinf.3                    0.011540
19) ICD.174                   0.011279
20) ICD.V10                   0.010865
21) ICD.244                   0.010816
22) ICD.562                   0.010136
23) ICD.401                   0.009171
24) ICD.724                   0.008608
25) ICD.579                   0.008427
26) ICD.285         

In [131]:
rf_predctions = random_forest.predict(X_test)
print(classification_report(y_test,rf_predctions))
print(accuracy_score(y_test,rf_predctions))


             precision    recall  f1-score   support

          1       0.86      0.91      0.88        96
          2       0.80      0.72      0.76        50

avg / total       0.84      0.84      0.84       146

0.8424657534246576


In [134]:
CD_and_UC = pd.read_csv("CD_UC_ML.csv")

In [145]:
X = CD_and_UC.drop(['Diagnosis','Unnamed: 0'],1)
y = CD_and_UC["Diagnosis"]


from sklearn.metrics import confusion_matrix,classification_report
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)

# Create logistic regression
logistic = LogisticRegression()

# Create regularization penalty space
penalty = ['l1', 'l2']

# Create regularization hyperparameter space
C = np.logspace(0, 4, 10)

# Create hyperparameter options
hyperparameters = dict(C=C, penalty=penalty)

# Create grid search using 5-fold cross validation
clf = GridSearchCV(logistic, hyperparameters, cv=5, verbose=0)

# Fit grid search
best_model = clf.fit(X_train,y_train)

# View best hyperparameters
print('Best Penalty:', best_model.best_estimator_.get_params()['penalty'])
print('Best C:', best_model.best_estimator_.get_params()['C'])


logres_model = LogisticRegression(C=1.0,penalty='l1')
logres_model.fit(X_train,y_train)

logres_model_predictions = logres_model.predict(X_test)
print(classification_report(y_test,logres_model_predictions))
print(accuracy_score(y_test,logres_model_predictions))


Best Penalty: l1
Best C: 1.0
             precision    recall  f1-score   support

          0       0.35      0.14      0.20        44
          1       0.71      0.89      0.79       102

avg / total       0.60      0.66      0.61       146

0.6643835616438356


In [152]:
from sklearn.svm import SVC

svc_model = SVC(C=1,gamma=10,kernel='linear')
svc_model.fit(X_train,y_train)
svc_predictions = svc_model.predict(X_test)
print(classification_report(y_test,svc_predictions))
print(accuracy_score(y_test,svc_predictions))


             precision    recall  f1-score   support

          0       0.33      0.02      0.04        44
          1       0.70      0.98      0.82       102

avg / total       0.59      0.69      0.58       146

0.6917808219178082


In [72]:
print(logres_model.coef_)

[[-0.00619939  0.0259974   0.01277971  0.          0.          0.11571751
  -0.00744877  0.          0.          0.01471577  0.         -0.73124159
   0.          0.42640501  0.61876364]]


In [153]:
#RANDOM FOREST
# X.columns[0:]

from sklearn.ensemble import RandomForestClassifier
feat_labels = X.columns[0:]

#defining our hyperparameters
random_forest = RandomForestClassifier(n_estimators=1000, random_state=42, n_jobs=-1)

#fit 
random_forest.fit(X_train,y_train)

#getting the important features and labels
importances = random_forest.feature_importances_
indices = np.argsort(importances)[::-1]
for f in range(X_train.shape[1]):
    print("%2d) %-*s %f" % (f+1,25,feat_labels[f],importances[indices[f]]))

 1) age                       0.524300
 2) BMI                       0.314677
 3) group.Normal BT           0.018938
 4) group.Positive h2 & Ch4   0.017570
 5) group.Postive H2          0.017206
 6) group.Flatliner           0.014414
 7) group.Positive CH4        0.014383
 8) group.Equivocal           0.014008
 9) gender.F                  0.013054
10) gender.M                  0.012257
11) race.White                0.010288
12) race.Asian                0.009652
13) race.Other                0.007703
14) race.Unknown              0.006288
15) race.Black or African American 0.005262


In [154]:
rf_predctions = random_forest.predict(X_test)
print(classification_report(y_test,rf_predctions))
print(accuracy_score(y_test,rf_predctions))

             precision    recall  f1-score   support

          0       0.30      0.39      0.34        44
          1       0.70      0.62      0.66       102

avg / total       0.58      0.55      0.56       146

0.547945205479452


In [76]:
from sklearn.model_selection import cross_validate

scoring = {'accuracy': 'accuracy', 'log_loss': 'neg_log_loss', 'auc': 'roc_auc'}

modelCV = LogisticRegression(C=1.0,penalty='l1')

results = cross_validate(modelCV, X, y, cv=10, scoring=list(scoring.values()), 
                         return_train_score=False)

print('K-fold cross-validation results:')
for sc in range(len(scoring)):
    print(modelCV.__class__.__name__+" average %s: %.3f (+/-%.3f)" % (list(scoring.keys())[sc], -results['test_%s' % list(scoring.values())[sc]].mean()
                               if list(scoring.values())[sc]=='neg_log_loss' 
                               else results['test_%s' % list(scoring.values())[sc]].mean(), 
                               results['test_%s' % list(scoring.values())[sc]].std()))

K-fold cross-validation results:
LogisticRegression average accuracy: 0.613 (+/-0.018)
LogisticRegression average log_loss: 0.670 (+/-0.016)
LogisticRegression average auc: 0.525 (+/-0.098)


In [80]:
import statsmodels.api as sm
from scipy import stats
logit_model=sm.Logit(y,X)
result=logit_model.fit()
print(result.summary2())

Optimization terminated successfully.
         Current function value: 0.652469
         Iterations 6
                               Results: Logit
Model:                    Logit                No. Iterations:       6.0000  
Dependent Variable:       Diagnosis            Pseudo R-squared:     0.022   
Date:                     2018-07-03 14:05     AIC:                  660.1998
No. Observations:         486                  BIC:                  714.6205
Df Model:                 12                   Log-Likelihood:       -317.10 
Df Residuals:             473                  LL-Null:              -324.31 
Converged:                1.0000               Scale:                1.0000  
-----------------------------------------------------------------------------
                                Coef.  Std.Err.    z    P>|z|   [0.025 0.975]
-----------------------------------------------------------------------------
age                            -0.0078   0.0062 -1.2666 0.2053 -0.0199 0

  return np.sqrt(np.diag(self.cov_params()))
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


In [94]:
from sklearn import model_selection
from sklearn.model_selection import cross_val_score
kfold = model_selection.KFold(n_splits=10, random_state=7)
modelCV = LogisticRegression(C=0.01,penalty='l1')
scoring = 'accuracy'
results = model_selection.cross_val_score(modelCV, X_train, y_train, cv=kfold, scoring=scoring)
print("10-fold cross validation average accuracy: %.3f" % (results.mean()))

10-fold cross validation average accuracy: 0.611


In [106]:
only_BT = pd.read_csv("only_BT_classification.csv")

In [107]:
y=only_BT["Diagnosis"]
X = only_BT.drop('Diagnosis',1)

In [113]:
X = X.drop('Unnamed: 0',1)

In [127]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)
X_test

Unnamed: 0,group.Equivocal,group.Flatliner,group.Normal BT,group.Positive h2 & Ch4,group.Positive CH4,group.Postive H2
290,0,0,0,0,0,1
177,0,0,1,0,0,0
258,0,0,0,0,0,1
294,0,0,0,0,0,1
475,0,1,0,0,0,0
48,0,0,1,0,0,0
270,0,0,1,0,0,0
55,0,0,0,0,0,1
241,0,0,0,0,0,1
432,0,0,1,0,0,0


In [120]:
# Create logistic regression
logistic = LogisticRegression()

# Create regularization penalty space
penalty = ['l1', 'l2']

# Create regularization hyperparameter space
C = np.logspace(0, 4, 10)

# Create hyperparameter options
hyperparameters = dict(C=C, penalty=penalty)

# Create grid search using 5-fold cross validation
clf = GridSearchCV(logistic, hyperparameters, cv=5, verbose=0)

# Fit grid search
best_model = clf.fit(X_train,y_train)

# View best hyperparameters
print('Best Penalty:', best_model.best_estimator_.get_params()['penalty'])
print('Best C:', best_model.best_estimator_.get_params()['C'])


logres_model = LogisticRegression(C=10,penalty='l1')
logres_model.fit(X_train,y_train)

logres_model_predictions = logres_model.predict(X_test)
print(classification_report(y_test,logres_model_predictions))
print(accuracy_score(y_test,logres_model_predictions))


Best Penalty: l1
Best C: 1.0
             precision    recall  f1-score   support

          0       0.00      0.00      0.00        55
          1       0.62      1.00      0.77        91

avg / total       0.39      0.62      0.48       146

0.6232876712328768


  'precision', 'predicted', average, warn_for)


In [131]:
from sklearn.svm import SVC

svc_model = SVC(C=10,gamma=10,kernel='linear')
svc_model.fit(X_train,y_train)
svc_predictions = svc_model.predict(X_test)
print(classification_report(y_test,svc_predictions))
print(accuracy_score(y_test,svc_predictions))


             precision    recall  f1-score   support

          0       0.00      0.00      0.00        48
          1       0.67      1.00      0.80        98

avg / total       0.45      0.67      0.54       146

0.6712328767123288


  'precision', 'predicted', average, warn_for)


In [155]:
only_BT_model = pd.read_csv("Only_BT_model.csv")

In [157]:
only_BT_model = only_BT_model.drop("Unnamed: 0",1)

In [158]:
y = only_BT_model['Diagnosis']
X = only_BT_model['groups']

In [159]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)


In [160]:
from sklearn.svm import SVC

svc_model = SVC(C=10,gamma=10,kernel='linear')
svc_model.fit(X_train,y_train)
svc_predictions = svc_model.predict(X_test)
print(classification_report(y_test,svc_predictions))
print(accuracy_score(y_test,svc_predictions))

ValueError: Expected 2D array, got 1D array instead:
array=[1. 1. 4. 1. 4. 4. 4. 4. 4. 4. 1. 4. 5. 1. 4. 1. 1. 4. 4. 4. 1. 1. 1. 4.
 4. 4. 1. 5. 4. 4. 1. 2. 1. 4. 4. 4. 4. 2. 1. 4. 1. 4. 4. 1. 4. 4. 4. 4.
 1. 4. 1. 1. 2. 1. 1. 4. 1. 1. 1. 4. 4. 4. 1. 4. 1. 1. 4. 1. 1. 5. 4. 4.
 4. 1. 4. 4. 1. 4. 4. 4. 4. 4. 1. 4. 1. 1. 5. 1. 4. 4. 1. 1. 4. 1. 4. 1.
 4. 1. 4. 1. 4. 1. 5. 1. 1. 1. 1. 1. 1. 1. 4. 1. 1. 4. 4. 4. 4. 1. 1. 1.
 4. 1. 4. 4. 1. 4. 1. 4. 4. 4. 4. 4. 1. 1. 4. 4. 5. 1. 1. 2. 1. 4. 4. 1.
 4. 2. 1. 1. 1. 1. 4. 1. 4. 5. 1. 1. 1. 4. 4. 4. 4. 5. 4. 4. 1. 4. 4. 1.
 4. 3. 4. 1. 4. 1. 1. 5. 4. 1. 5. 4. 4. 4. 1. 2. 4. 4. 2. 1. 1. 1. 4. 1.
 1. 4. 5. 1. 1. 1. 4. 1. 4. 1. 4. 1. 1. 4. 4. 4. 4. 1. 4. 4. 1. 4. 4. 1.
 5. 2. 1. 4. 1. 4. 5. 4. 4. 4. 1. 1. 4. 2. 1. 4. 4. 5. 1. 6. 1. 4. 4. 1.
 1. 1. 4. 1. 4. 4. 1. 2. 1. 1. 4. 2. 1. 1. 1. 4. 4. 1. 4. 4. 1. 4. 1. 4.
 2. 4. 1. 4. 4. 4. 4. 2. 4. 1. 4. 1. 4. 1. 4. 4. 4. 5. 4. 4. 2. 4. 4. 1.
 1. 1. 1. 1. 4. 1. 1. 1. 4. 1. 4. 6. 1. 1. 1. 1. 1. 4. 1. 4. 4. 1. 4. 1.
 1. 1. 1. 4. 4. 1. 4. 2. 1. 1. 1. 4. 3. 1. 4. 1. 1. 1. 4. 4. 1. 4. 4. 2.
 4. 1. 4. 5.].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.