In [49]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split

## Things to look for each dataset
1. Cervical Cancer Dataset - looking to see if we can classify if the person has cancer or not based on their characteristics)
2. Wine Dataset - looking to see if we can classify the class of the wine based on its attributes
3. Breast Cancer Dataset - looking to see if we can classify if the culture is malignant or benign (2 or 4) based on the attributes

In [488]:
cervical_cancer = pd.read_csv("sobar-72.csv")
cervical_cancer

Unnamed: 0,behavior_sexualRisk,behavior_eating,behavior_personalHygine,intention_aggregation,intention_commitment,attitude_consistency,attitude_spontaneity,norm_significantPerson,norm_fulfillment,perception_vulnerability,perception_severity,motivation_strength,motivation_willingness,socialSupport_emotionality,socialSupport_appreciation,socialSupport_instrumental,empowerment_knowledge,empowerment_abilities,empowerment_desires,ca_cervix
0,10,13,12,4,7,9,10,1,8,7,3,14,8,5,7,12,12,11,8,1
1,10,11,11,10,14,7,7,5,5,4,2,15,13,7,6,5,5,4,4,1
2,10,15,3,2,14,8,10,1,4,7,2,7,3,3,6,11,3,3,15,1
3,10,11,10,10,15,7,7,1,5,4,2,15,13,7,4,4,4,4,4,1
4,8,11,7,8,10,7,8,1,5,3,2,15,5,3,6,12,5,4,7,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67,10,14,14,10,15,6,7,5,15,14,10,15,13,9,8,12,12,11,9,0
68,10,12,15,10,15,8,8,5,15,14,8,12,14,11,7,13,15,11,14,0
69,10,8,11,6,10,6,4,3,13,9,8,14,12,9,7,11,12,10,10,0
70,9,12,13,10,13,6,6,5,14,13,10,13,12,11,8,12,11,13,15,0


In [382]:
wine_df = pd.read_csv("wine/wine.data", names = ["class", "alcohol", "malic acid", "ash", "alcalinity of ash", "magnesium", 
                                                 "total phenols", "flavanoids", "nonflavanoid phenols", "proanthocyanins",
                                                 "color intensity", "hue", "OD280/OD315 of diluted wines", "proline"])
wine_df = wine_df.reset_index()
wine_df = wine_df.drop(wine_df.columns[0], axis = 1)
wine_df

Unnamed: 0,class,alcohol,malic acid,ash,alcalinity of ash,magnesium,total phenols,flavanoids,nonflavanoid phenols,proanthocyanins,color intensity,hue,OD280/OD315 of diluted wines,proline
0,1,14.23,1.71,2.43,15.6,127,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.20,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050
2,1,13.16,2.36,2.67,18.6,101,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.50,16.8,113,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,3,13.71,5.65,2.45,20.5,95,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740
174,3,13.40,3.91,2.48,23.0,102,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750
175,3,13.27,4.28,2.26,20.0,120,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835
176,3,13.17,2.59,2.37,20.0,120,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840


In [145]:
breast_cancer_df = pd.read_csv("breast+cancer+wisconsin+original/breast-cancer-wisconsin.data", 
                       names = ["Sample code number", "Clump Thickness", "Uniformity of Cell Size", 
                                "Uniformity of Cell Shape", "Marginal Adhesion", 
                                "Single Epithelial Cell Size", "Bare Nuclei", "Bland Chromatin", 
                                "Normal Nucleoli", "Mitoses", "Class"])
breast_cancer_df

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2
...,...,...,...,...,...,...,...,...,...,...,...
694,776715,3,1,1,1,3,2,1,1,1,2
695,841769,2,1,1,1,2,1,1,1,1,2
696,888820,5,10,10,3,7,3,8,10,2,4
697,897471,4,8,6,4,3,4,10,6,1,4


In [146]:
breast_cancer = breast_cancer_df[breast_cancer_df['Bare Nuclei'] != '?']
breast_cancer

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2
...,...,...,...,...,...,...,...,...,...,...,...
694,776715,3,1,1,1,3,2,1,1,1,2
695,841769,2,1,1,1,2,1,1,1,1,2
696,888820,5,10,10,3,7,3,8,10,2,4
697,897471,4,8,6,4,3,4,10,6,1,4


In [174]:
wine_df['class'].iloc(0)[173]

3

# Logistic Regression - Wine Dataset - 50/50, 80/20, 20/80

### Round 1: 20/80 Split for Train and Test - Trial 1

In [261]:
# wine = datasets.load_wine()
# wine_X = wine.data
# wine_Y = (wine.target > 1.5).reshape(-1,1).astype(float)
# X_and_Y = np.hstack((wine_X, wine_Y))     
# np.random.seed(1)             
# np.random.shuffle(X_and_Y)     

# print(wine_X.shape)
# print(wine_Y.shape)
# print(X_and_Y[0])      

In [262]:
wine_df = wine_df[wine_df['class'] != 3]
wine_df

Unnamed: 0,class,alcohol,malic acid,ash,alcalinity of ash,magnesium,total phenols,flavanoids,nonflavanoid phenols,proanthocyanins,color intensity,hue,OD280/OD315 of diluted wines,proline
0,1,14.23,1.71,2.43,15.6,127,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.20,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050
2,1,13.16,2.36,2.67,18.6,101,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.50,16.8,113,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125,2,12.07,2.16,2.17,21.0,85,2.60,2.65,0.37,1.35,2.76,0.86,3.28,378
126,2,12.43,1.53,2.29,21.5,86,2.74,3.15,0.39,1.77,3.94,0.69,2.84,352
127,2,11.79,2.13,2.78,28.5,92,2.13,2.24,0.58,1.76,3.00,0.97,2.44,466
128,2,12.37,1.63,2.30,24.5,88,2.22,2.45,0.40,1.90,2.12,0.89,2.78,342


In [263]:
wine_df_shuffle = wine_df.sample(frac = 1).reset_index(drop = True)
wine_df_shuffle

Unnamed: 0,class,alcohol,malic acid,ash,alcalinity of ash,magnesium,total phenols,flavanoids,nonflavanoid phenols,proanthocyanins,color intensity,hue,OD280/OD315 of diluted wines,proline
0,2,12.08,2.08,1.70,17.5,97,2.23,2.17,0.26,1.40,3.30,1.270,2.96,710
1,1,13.83,1.57,2.62,20.0,115,2.95,3.40,0.40,1.72,6.60,1.130,2.57,1130
2,2,12.69,1.53,2.26,20.7,80,1.38,1.46,0.58,1.62,3.05,0.960,2.06,495
3,2,12.42,4.43,2.73,26.5,102,2.20,2.13,0.43,1.71,2.08,0.920,3.12,365
4,2,12.25,1.73,2.12,19.0,80,1.65,2.03,0.37,1.63,3.40,1.000,3.17,510
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125,2,12.08,1.39,2.50,22.5,84,2.56,2.29,0.43,1.04,2.90,0.930,3.19,385
126,2,12.64,1.36,2.02,16.8,100,2.02,1.41,0.53,0.62,5.75,0.980,1.59,450
127,1,13.50,1.81,2.61,20.0,96,2.53,2.61,0.28,1.66,3.52,1.120,3.82,845
128,1,13.51,1.80,2.65,19.0,110,2.35,2.53,0.29,1.54,4.20,1.100,2.87,1095


In [290]:
wine_X = wine_df_shuffle[wine_df_shuffle.columns[1:]].to_numpy()
wine_Y = wine_df_shuffle[wine_df_shuffle.columns[0]].to_numpy()

In [297]:
#Test, train, and validation sets
X_train, X_test, y_train, y_test = train_test_split(wine_X, wine_Y, test_size=0.2, random_state=42)

In [298]:
#Classifier - imports
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [299]:
#training classifier
params = {'C' : [0.001, 0.01, 0.1, 1], 'solver' : ['lbfgs', 'liblinear', 'newton-cg']}

# have to scale data you ran into an error
scaler = StandardScaler()
scaler.fit(X_train)
X_train_new = scaler.transform(X_train)

# call classifier and get the best hyper parameter for this case - combinations of above params
classifier = LogisticRegression(class_weight = 'balanced')
class_hyper_tune = GridSearchCV(classifier, param_grid = params, cv = 3, return_train_score = True)
class_hyper_tune.fit(X_train_new, y_train)
print("training score for best hyperparameter " + str(class_hyper_tune.cv_results_.get('mean_train_score')[0]))
print("test score for best hyperparameter " + str(class_hyper_tune.cv_results_.get('mean_test_score')[0]))
class_hyper_tune.best_estimator_

#call classifier and get the best hyper parameter for this case - trial 2
# classifier1 = LogisticRegression(solver = 'liblinear', class_weight = 'balanced')
# class_hyper_tune1 = GridSearchCV(classifier1, param_grid = params_C, cv = 3, return_train_score = True)
# class_hyper_tune1.fit(X_train_new, y_train)
# print(class_hyper_tune1.cv_results_.get('mean_train_score'))
# print(class_hyper_tune1.cv_results_.get('mean_test_score'))
# class_hyper_tune1.best_estimator_

# #call classifier and get the best hyper parameter for this case - trail 3
# classifier2 = LogisticRegression(solver = 'liblinear', class_weight = 'balanced')
# class_hyper_tune2 = GridSearchCV(classifier2, param_grid = params_C, cv = 3, return_train_score = True)
# class_hyper_tune2.fit(X_train_new, y_train)
# print(class_hyper_tune1.cv_results_.get('mean_train_score'))
# print(class_hyper_tune1.cv_results_.get('mean_test_score'))
# class_hyper_tune1.best_estimator_

training score for best hyperparameter 0.9566597653554175
test score for best hyperparameter 0.922689075630252


In [425]:
#train data
classifier_tuned = LogisticRegression(C = 1, class_weight = 'balanced')
classifier_tuned.fit(X_train, y_train)
classifier_tuned.score(X_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


1.0

In [665]:
classifier_tuned.score(X_train, y_train)

0.9519230769230769

In [428]:
wine_df_shuffle_1 = wine_df.sample(frac = 1).reset_index(drop = True)
X_train1, X_test1, y_train1, y_test1 = train_test_split(wine_X_1, wine_Y_1, test_size=0.2, random_state=42)

classifier_tuned = LogisticRegression(C = 1, class_weight = 'balanced')
classifier_tuned.fit(X_train1, y_train1)
classifier_tuned.score(X_test1, y_test1)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.9615384615384616

In [666]:
classifier_tuned.score(X_train1, y_train1)

0.9711538461538461

In [429]:
wine_df_shuffle_2 = wine_df.sample(frac = 1).reset_index(drop = True)
wine_X_2 = wine_df_shuffle_2[wine_df_shuffle_2.columns[1:]].to_numpy()
wine_Y_2 = wine_df_shuffle_2[wine_df_shuffle_2.columns[0]].to_numpy()
X_train2, X_test2, y_train2, y_test2 = train_test_split(wine_X_2, wine_Y_2, test_size=0.2, random_state=42)
classifier_tuned = LogisticRegression(C = 1, class_weight = 'balanced')
classifier_tuned.fit(X_train2, y_train2)
classifier_tuned.score(X_test2, y_test2)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.9166666666666666

In [668]:
classifier_tuned.score(X_train2, y_train2)

0.9788732394366197

In [660]:
(0.9166666666666666 + 0.9615384615384616 + 1.0) / 3

0.9594017094017094

In [715]:
(0.9788732394366197 + 0.9711538461538461 + 0.9519230769230769
) / 3

0.9673167208378475

## Round 2 - 50/50 Split for Wine Data

In [317]:
wine_df_shuffle_new = wine_df.sample(frac = 1).reset_index(drop = True)
wine_df_shuffle_new

Unnamed: 0,class,alcohol,malic acid,ash,alcalinity of ash,magnesium,total phenols,flavanoids,nonflavanoid phenols,proanthocyanins,color intensity,hue,OD280/OD315 of diluted wines,proline
0,1,13.05,2.05,3.22,25.0,124,2.63,2.68,0.47,1.92,3.58,1.13,3.20,830
1,2,11.82,1.47,1.99,20.8,86,1.98,1.60,0.30,1.53,1.95,0.95,3.33,495
2,2,12.43,1.53,2.29,21.5,86,2.74,3.15,0.39,1.77,3.94,0.69,2.84,352
3,1,13.39,1.77,2.62,16.1,93,2.85,2.94,0.34,1.45,4.80,0.92,3.22,1195
4,1,14.06,2.15,2.61,17.6,121,2.60,2.51,0.31,1.25,5.05,1.06,3.58,1295
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125,2,12.33,0.99,1.95,14.8,136,1.90,1.85,0.35,2.76,3.40,1.06,2.31,750
126,2,12.42,2.55,2.27,22.0,90,1.68,1.84,0.66,1.42,2.70,0.86,3.30,315
127,2,12.60,1.34,1.90,18.5,88,1.45,1.36,0.29,1.35,2.45,1.04,2.77,562
128,1,13.83,1.57,2.62,20.0,115,2.95,3.40,0.40,1.72,6.60,1.13,2.57,1130


In [318]:
wine_X_new = wine_df_shuffle_new[wine_df_shuffle_new.columns[1:]].to_numpy()
wine_Y_new = wine_df_shuffle_new[wine_df_shuffle_new.columns[0]].to_numpy()

In [321]:
X_train_new, X_test_new, y_train_new, y_test_new = train_test_split(wine_X_new, wine_Y_new, test_size=0.5, random_state=42)

In [322]:
params = {'C' : [0.001, 0.01, 0.1, 1], 'solver' : ['lbfgs', 'liblinear', 'newton-cg']}

# have to scale data you ran into an error
scaler_new = StandardScaler()
scaler_new.fit(X_train_new)
X_train_new = scaler.transform(X_train_new)

# call classifier and get the best hyper parameter for this case - combinations of above params
classifier1 = LogisticRegression(class_weight = 'balanced')
class_hyper_tune1 = GridSearchCV(classifier1, param_grid = params, cv = 3, return_train_score = True)
class_hyper_tune1.fit(X_train_new, y_train_new)
print("training score for best hyperparameter " + str(class_hyper_tune1.cv_results_.get('mean_train_score')[0]))
print("test score for best hyperparameter " + str(class_hyper_tune1.cv_results_.get('mean_test_score')[0]))
class_hyper_tune1.best_estimator_

training score for best hyperparameter 0.9770965468639887
test score for best hyperparameter 0.9696969696969697


In [323]:
classifier_tuned1 = LogisticRegression(C = 0.1, class_weight = 'balanced')
classifier_tuned1.fit(X_train_new, y_train_new)
classifier_tuned1.score(X_test_new, y_test_new)

0.4153846153846154

In [324]:
wine_df_shuffle_2 = wine_df.sample(frac = 1).reset_index(drop = True)
wine_X_2 = wine_df_shuffle_2[wine_df_shuffle_2.columns[1:]].to_numpy()
wine_Y_2 = wine_df_shuffle_2[wine_df_shuffle_2.columns[0]].to_numpy()
X_train2, X_test2, y_train2, y_test2 = train_test_split(wine_X_2, wine_Y_2, test_size=0.5, random_state=42)
classifier_tuned1.fit(X_train2, y_train2)
classifier_tuned1.score(X_test2, y_test2)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.9538461538461539

In [325]:
wine_df_shuffle_3 = wine_df.sample(frac = 1).reset_index(drop = True)
wine_X_3 = wine_df_shuffle_3[wine_df_shuffle_3.columns[1:]].to_numpy()
wine_Y_3 = wine_df_shuffle_3[wine_df_shuffle_3.columns[0]].to_numpy()
X_train3, X_test3, y_train3, y_test3 = train_test_split(wine_X_3, wine_Y_3, test_size=0.5, random_state=42)
classifier_tuned1.fit(X_train3, y_train3)
classifier_tuned1.score(X_test3, y_test3)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.9692307692307692

In [672]:
((0.4153846153846154) + (0.9692307692307692) + (0.9538461538461539)) / 3

0.7794871794871795

In [716]:
(classifier_tuned1.score(X_train_new, y_train_new) + classifier_tuned1.score(X_train2, y_train2) + classifier_tuned1.score(X_train3, y_train3)) / 3

0.7298302636330805

## Round 3 - 80/20 Split for Wine Data

#### Trial 1

In [674]:
wine_df_shuffle_80 = wine_df.sample(frac = 1).reset_index(drop = True)
wine_X_80 = wine_df_shuffle_80[wine_df_shuffle_80.columns[1:]].to_numpy()
wine_Y_80 = wine_df_shuffle_80[wine_df_shuffle_80.columns[0]].to_numpy()
X_train80, X_test80, y_train80, y_test80 = train_test_split(wine_X_80, wine_Y_80, test_size=0.8, random_state=42)

In [676]:
params = {'C' : [0.001, 0.01, 0.1, 1], 'solver' : ['lbfgs', 'liblinear', 'newton-cg']}

# have to scale data you ran into an error
# scaler_new80 = StandardScaler()
# scaler_new80.fit(X_train80)
# X_train80_new = scaler.transform(X_train80)

# call classifier and get the best hyper parameter for this case - combinations of above params
classifier8 = LogisticRegression(class_weight = 'balanced')
class_hyper_tune8 = GridSearchCV(classifier8, param_grid = params, cv = 3, return_train_score = True)
class_hyper_tune8.fit(X_train80, y_train80)
print("training score for best hyperparameter " + str(class_hyper_tune8.cv_results_.get('mean_train_score')[0]))
print("test score for best hyperparameter " + str(class_hyper_tune8.cv_results_.get('mean_test_score')[0]))
class_hyper_tune8.best_estimator_

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

training score for best hyperparameter 0.8852657004830918
test score for best hyperparameter 0.8308080808080809


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [677]:
classifier_tuned8 = LogisticRegression(C = 0.1, class_weight = 'balanced')
classifier_tuned8.fit(X_train80, y_train80)
classifier_tuned8.score(X_test80, y_test80)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.7832167832167832

In [671]:
classifier_tuned8.score(X_train80, y_train80)

0.7714285714285715

#### Trial 2 

In [678]:
wine_df_shuffle_ = wine_df.sample(frac = 1).reset_index(drop = True)
wine_X_ = wine_df_shuffle_[wine_df_shuffle_.columns[1:]].to_numpy()
wine_Y_ = wine_df_shuffle_[wine_df_shuffle_.columns[0]].to_numpy()
X_train_, X_test_, y_train_, y_test_ = train_test_split(wine_X_, wine_Y_, test_size=0.8, random_state=42)
classifier_tuned8.fit(X_train_, y_train_)
classifier_tuned8.score(X_test_, y_test_)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.916083916083916

#### Trial 3

In [679]:
wine_df_shuffle1 = wine_df.sample(frac = 1).reset_index(drop = True)
wine_X1 = wine_df_shuffle1[wine_df_shuffle_.columns[1:]].to_numpy()
wine_Y1 = wine_df_shuffle1[wine_df_shuffle_.columns[0]].to_numpy()
X_train1, X_test1, y_train1, y_test1 = train_test_split(wine_X1, wine_Y1, test_size=0.8, random_state=42)
classifier_tuned8.fit(X_train1, y_train1)
classifier_tuned8.score(X_test1, y_test1)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.9300699300699301

In [680]:
(0.7714285714285715 + 0.916083916083916 + 0.9300699300699301) / 3

0.8725274725274725

In [717]:
(classifier_tuned8.score(X_train80, y_train80) + classifier_tuned8.score(X_train1, y_train1) + classifier_tuned8.score(X_train_, y_train_)) / 3

0.980952380952381

# KNN - Wine Data Set - 20/80, 50/50, 80/20

## KNN - 80/20

In [337]:
#Train and Test Splits 
from sklearn.neighbors import KNeighborsClassifier
wine_df_shuffled = wine_df.sample(frac = 1).reset_index(drop = True)
wine_Xs = wine_df_shuffled[wine_df_shuffled.columns[1:]].to_numpy()
wine_Ys = wine_df_shuffled[wine_df_shuffled.columns[0]].to_numpy()
X_trains, X_tests, y_trains, y_tests = train_test_split(wine_Xs, wine_Ys, test_size=0.2, random_state=42)

params = {'n_neighbors' : [3, 5, 7, 10]}

# have to scale data you ran into an error

# call classifier and get the best hyper parameter for this case - combinations of above params
classifiers = KNeighborsClassifier(weights = 'uniform')
class_hyper_tunes = GridSearchCV(classifiers, param_grid = params, cv = 3, return_train_score = True)
class_hyper_tunes.fit(X_trains, y_trains)
print("training score for best hyperparameter " + str(class_hyper_tunes.cv_results_.get('mean_train_score')[0]))
print("test score for best hyperparameter " + str(class_hyper_tunes.cv_results_.get('mean_test_score')[0]))
class_hyper_tunes.best_estimator_



training score for best hyperparameter 0.9567287784679089
test score for best hyperparameter 0.884313725490196


In [338]:
tuned_KNN = KNeighborsClassifier(n_neighbors = 10)
tuned_KNN.fit(X_trains, y_trains)
tuned_KNN.score(X_tests, y_tests)

0.9615384615384616

#### Trial 2

In [339]:
wine_df_shuffleds = wine_df.sample(frac = 1).reset_index(drop = True)
wine_Xss = wine_df_shuffleds[wine_df_shuffleds.columns[1:]].to_numpy()
wine_Yss = wine_df_shuffleds[wine_df_shuffleds.columns[0]].to_numpy()
X_trainss, X_testss, y_trainss, y_testss = train_test_split(wine_Xss, wine_Yss, test_size=0.2, random_state=42)
tuned_KNN.fit(X_trainss, y_trainss)
tuned_KNN.score(X_testss, y_testss)

0.8461538461538461

#### Trial 3

In [719]:
wine_df_shuffledsw = wine_df.sample(frac = 1).reset_index(drop = True)
wine_Xssw = wine_df_shuffledsw[wine_df_shuffledsw.columns[1:]].to_numpy()
wine_Yssw = wine_df_shuffledsw[wine_df_shuffledsw.columns[0]].to_numpy()
X_trainssw, X_testssw, y_trainssw, y_testssw = train_test_split(wine_Xssw, wine_Yssw, test_size=0.2, random_state=42)
tuned_KNN.fit(X_trainssw, y_trainssw)
tuned_KNN.score(X_testssw, y_testssw)

0.6944444444444444

In [720]:
(0.9615384615384616 + 0.8461538461538461 + 0.9230769230769231) / 3

0.9102564102564102

In [721]:
(tuned_KNN.score(X_trainssw, y_trainssw) + 
tuned_KNN.score(X_trainss, y_trainss) + 
tuned_KNN.score(X_trains, y_trains)) / 3

0.8280516431924884

## KNN 20/80

In [344]:
wine_df_shuff = wine_df.sample(frac = 1).reset_index(drop = True)
wine_Xf = wine_df_shuff[wine_df_shuff.columns[1:]].to_numpy()
wine_Ysf = wine_df_shuff[wine_df_shuff.columns[0]].to_numpy()
X_trainsf, X_testsf, y_trainsf, y_testsf = train_test_split(wine_Xf, wine_Ysf, test_size=0.8, random_state=42)

params = {'n_neighbors' : [3, 5, 7, 10]}

# call classifier and get the best hyper parameter for this case - combinations of above params
classifiersf = KNeighborsClassifier(weights = 'uniform')
class_hyper_tunesf = GridSearchCV(classifiersf, param_grid = params, cv = 3, return_train_score = True)
class_hyper_tunesf.fit(X_trainsf, y_trainsf)
print("training score for best hyperparameter " + str(class_hyper_tunesf.cv_results_.get('mean_train_score')[0]))
print("test score for best hyperparameter " + str(class_hyper_tunesf.cv_results_.get('mean_test_score')[0]))
class_hyper_tunesf.best_estimator_


training score for best hyperparameter 0.9041394335511983
test score for best hyperparameter 0.8888888888888888


#### Trial 1

In [345]:
tuned_KNN80 = KNeighborsClassifier(n_neighbors = 3)
tuned_KNN80.fit(X_trainsf, y_trainsf)
tuned_KNN80.score(X_testsf, y_testsf)

0.8653846153846154

#### Trial 2

In [346]:
wine_df_shuffs = wine_df.sample(frac = 1).reset_index(drop = True)
wine_Xe = wine_df_shuffs[wine_df_shuffs.columns[1:]].to_numpy()
wine_Ye = wine_df_shuffs[wine_df_shuffs.columns[0]].to_numpy()
X_trainse, X_testse, y_trainse, y_testse = train_test_split(wine_Xe, wine_Ye, test_size=0.8, random_state=42)
tuned_KNN80.fit(X_trainse, y_trainse)
tuned_KNN80.score(X_testse, y_testse)

0.9038461538461539

#### Trial 3

In [347]:
wine_df_shuffss = wine_df.sample(frac = 1).reset_index(drop = True)
wine_Xes = wine_df_shuffss[wine_df_shuffss.columns[1:]].to_numpy()
wine_Yes = wine_df_shuffss[wine_df_shuffss.columns[0]].to_numpy()
X_trainses, X_testses, y_trainses, y_testses = train_test_split(wine_Xes, wine_Yes, test_size=0.8, random_state=42)
tuned_KNN80.fit(X_trainses, y_trainses)
tuned_KNN80.score(X_testses, y_testses)

0.9134615384615384

In [682]:
(0.9134615384615384 + 0.9038461538461539 + 0.8653846153846154) / 3

0.8942307692307692

In [722]:
(tuned_KNN80.score(X_trainsf, y_trainsf) + 
 tuned_KNN80.score(X_trainse, y_trainse) + 
 tuned_KNN80.score(X_trainses, y_trainses)) / 3

0.935897435897436

## KNN 50/50

#### Trial 1

In [350]:
wine_df_shuf = wine_df.sample(frac = 1).reset_index(drop = True)
wine_Xu = wine_df_shuf[wine_df_shuf.columns[1:]].to_numpy()
wine_Yu = wine_df_shuf[wine_df_shuf.columns[0]].to_numpy()
X_trainu, X_testu, y_trainu, y_testu = train_test_split(wine_Xu, wine_Yu, test_size=0.5, random_state=42)

params = {'n_neighbors' : [3, 5, 7, 10]}

# call classifier and get the best hyper parameter for this case - combinations of above params
classifiersu = KNeighborsClassifier(weights = 'uniform')
class_hyper_tunesu = GridSearchCV(classifiersf, param_grid = params, cv = 3, return_train_score = True)
class_hyper_tunesu.fit(X_trainu, y_trainu)
print("training score for best hyperparameter " + str(class_hyper_tunesu.cv_results_.get('mean_train_score')[0]))
print("test score for best hyperparameter " + str(class_hyper_tunesu.cv_results_.get('mean_test_score')[0]))
class_hyper_tunesu.best_estimator_

training score for best hyperparameter 0.9460887949260043
test score for best hyperparameter 0.937950937950938


#### Trial 1

In [724]:
tuned_50 = KNeighborsClassifier(n_neighbors = 3)
tuned_50.fit(X_trainu, y_trainu)
tuned_50.score(X_testu, y_testu)

0.9230769230769231

#### Trial 2

In [725]:
wine_df_shufl = wine_df.sample(frac = 1).reset_index(drop = True)
wine_Xul = wine_df_shufl[wine_df_shufl.columns[1:]].to_numpy()
wine_Yul = wine_df_shufl[wine_df_shufl.columns[0]].to_numpy()
X_trainul, X_testul, y_trainul, y_testul = train_test_split(wine_Xul, wine_Yul, test_size=0.5, random_state=42)
tuned_50.fit(X_trainul, y_trainul)
tuned_50.score(X_testul, y_testul)

0.6741573033707865

#### Trial 3

In [726]:
wine_df_shufle = wine_df.sample(frac = 1).reset_index(drop = True)
wine_Xule = wine_df_shufle[wine_df_shufle.columns[1:]].to_numpy()
wine_Yule = wine_df_shufle[wine_df_shufle.columns[0]].to_numpy()
X_trainule, X_testule, y_trainule, y_testule = train_test_split(wine_Xule, wine_Yule, test_size=0.5, random_state=42)
tuned_50.fit(X_trainule, y_trainule)
tuned_50.score(X_testule, y_testule)

0.6404494382022472

In [727]:
(0.9230769230769231 + 0.9076923076923077 + 0.9384615384615385) /3

0.923076923076923

In [728]:
(tuned_50.score(X_trainul, y_trainul) + 
tuned_50.score(X_trainule, y_trainule) + 
tuned_50.score(X_trainu, y_trainu)) / 3

0.7882454624027657

# SVM on Wine Dataset - 20/80, 50/50, 80/20

## 20/80 Split

In [686]:
from sklearn.svm import SVC
wine_df_SVM = wine_df.sample(frac = 1).reset_index(drop = True)
wine_XS = wine_df_SVM[wine_df_SVM.columns[1:]].to_numpy()
wine_YS = wine_df_SVM[wine_df_SVM.columns[0]].to_numpy()
X_trainS, X_testS, y_trainS, y_testS = train_test_split(wine_XS, wine_YS, test_size=0.8, random_state=42)

params = {'C': [0.001, 0.01, 0.1, 1], 'kernel':['linear', 'poly', 'sigmoid', "rbf"]}
# scaler_newSVM = StandardScaler()
# scaler_newSVM.fit(X_trainS)
# X_train_SVM = scaler.transform(X_trainS)


# call classifier and get the best hyper parameter for this case - combinations of above params
classifiere = SVC(class_weight = 'balanced')
class_hyper_tunee = GridSearchCV(classifiere, param_grid = params, cv = 3, return_train_score = True)
class_hyper_tunee.fit(X_trainS, y_trainS)
print("training score for best hyperparameter " + str(class_hyper_tunee.cv_results_.get('mean_train_score')[0]))
print("test score for best hyperparameter " + str(class_hyper_tunee.cv_results_.get('mean_test_score')[0]))
class_hyper_tunee.best_estimator_

training score for best hyperparameter 0.7294685990338164
test score for best hyperparameter 0.48484848484848486


#### Trial 1

In [687]:
tuned_SVM = SVC(C = 0.1, kernel = "linear", class_weight = 'balanced')
tuned_SVM.fit(X_trainS, y_trainS)
tuned_SVM.score(X_testS, y_testS)

0.9020979020979021

#### Trial 2

In [688]:
wine_df_SVM1 = wine_df.sample(frac = 1).reset_index(drop = True)
wine_XS1 = wine_df_SVM1[wine_df_SVM1.columns[1:]].to_numpy()
wine_YS1 = wine_df_SVM1[wine_df_SVM1.columns[0]].to_numpy()
X_trainS1, X_testS1, y_trainS1, y_testS1 = train_test_split(wine_XS1, wine_YS1, test_size=0.8, random_state=42)

tuned_SVM.fit(X_trainS1, y_trainS1)
tuned_SVM.score(X_testS1, y_testS1)

0.7062937062937062

#### Trial 3

In [689]:
wine_df_SVM2 = wine_df.sample(frac = 1).reset_index(drop = True)
wine_XS2 = wine_df_SVM2[wine_df_SVM2.columns[1:]].to_numpy()
wine_YS2 = wine_df_SVM2[wine_df_SVM2.columns[0]].to_numpy()
X_trainS2, X_testS2, y_trainS2, y_testS2 = train_test_split(wine_XS2, wine_YS2, test_size=0.8, random_state=42)

tuned_SVM.fit(X_trainS2, y_trainS2)
tuned_SVM.score(X_testS2, y_testS2)

0.916083916083916

In [690]:
(0.9020979020979021 + 0.7062937062937062 + 0.916083916083916) /3

0.8414918414918414

In [729]:
(tuned_SVM.score(X_trainS2, y_trainS2) + 
tuned_SVM.score(X_trainS1, y_trainS1) + 
tuned_SVM.score(X_trainS, y_trainS)) / 3

0.9428571428571427

## 50/50 Split

## Hyper Parameter Tuning

In [403]:
wine_df_S = wine_df.sample(frac = 1).reset_index(drop = True)
wine_XSS = wine_df_S[wine_df_S.columns[1:]].to_numpy()
wine_YSS = wine_df_S[wine_df_S.columns[0]].to_numpy()
X_trainSS, X_testSS, y_trainSS, y_testSS = train_test_split(wine_XSS, wine_YSS, test_size=0.5, random_state=42)

params = {'C': [0.001, 0.01, 0.1, 1], 'kernel':['linear', 'poly', 'sigmoid', "rbf"]}
# scaler_newSVM = StandardScaler()
# scaler_newSVM.fit(X_trainS)
# X_train_SVM = scaler.transform(X_trainS)


# call classifier and get the best hyper parameter for this case - combinations of above params
classifiers = SVC(class_weight = 'balanced')
class_hyper_tunes = GridSearchCV(classifiers, param_grid = params, cv = 3, return_train_score = True)
class_hyper_tunes.fit(X_trainSS, y_trainSS)
print("training score for best hyperparameter " + str(class_hyper_tunes.cv_results_.get('mean_train_score')[0]))
print("test score for best hyperparameter " + str(class_hyper_tunes.cv_results_.get('mean_test_score')[0]))
class_hyper_tunes.best_estimator_

training score for best hyperparameter 0.797834274952919
test score for best hyperparameter 0.7524904214559386


#### Trial 1

In [404]:
tuned_50 = SVC(class_weight = 'balanced', C = 1, kernel = 'linear')
tuned_50.fit(X_trainSS, y_trainSS)
tuned_50.score(X_testSS, y_testSS)

0.9213483146067416

#### Trial 2

In [406]:
wine_df_T = wine_df.sample(frac = 1).reset_index(drop = True)
wine_XST = wine_df_T[wine_df_T.columns[1:]].to_numpy()
wine_YST = wine_df_T[wine_df_T.columns[0]].to_numpy()
X_trainST, X_testST, y_trainST, y_testST = train_test_split(wine_XST, wine_YST, test_size=0.5, random_state=42)

tuned_50.fit(X_trainST, y_trainST)
tuned_50.score(X_testST, y_testST)

0.9550561797752809

#### Trial 3

In [407]:
wine_df_U = wine_df.sample(frac = 1).reset_index(drop = True)
wine_XSU = wine_df_U[wine_df_U.columns[1:]].to_numpy()
wine_YSU = wine_df_U[wine_df_U.columns[0]].to_numpy()
X_trainSU, X_testSU, y_trainSU, y_testSU = train_test_split(wine_XSU, wine_YSU, test_size=0.5, random_state=42)

tuned_50.fit(X_trainSU, y_trainSU)
tuned_50.score(X_testSU, y_testSU)

0.9887640449438202

In [692]:
(0.9887640449438202 + 0.9550561797752809 + 0.9213483146067416) / 3

0.9550561797752809

In [730]:
(tuned_50.score(X_trainSU, y_trainSU) + 
 tuned_50.score(X_trainST, y_trainST) + 
tuned_50.score(X_trainSS, y_trainSS)) / 3

0.7228464419475656

## 80/20 Split

## Hyperparameter Tuning

In [408]:
wine_df_80 = wine_df.sample(frac = 1).reset_index(drop = True)
wine_X80 = wine_df_80[wine_df_80.columns[1:]].to_numpy()
wine_Y80 = wine_df_S[wine_df_80.columns[0]].to_numpy()
X_train80, X_test80, y_train80, y_test80 = train_test_split(wine_X80, wine_Y80, test_size=0.2, random_state=42)

params = {'C': [0.001, 0.01, 0.1, 1], 'kernel':['linear', 'poly', 'sigmoid', "rbf"]}
# scaler_newSVM = StandardScaler()
# scaler_newSVM.fit(X_trainS)
# X_train_SVM = scaler.transform(X_trainS)


# call classifier and get the best hyper parameter for this case - combinations of above params
classifiers80 = SVC(class_weight = 'balanced')
class_hyper_tune80 = GridSearchCV(classifiers80, param_grid = params, cv = 3, return_train_score = True)
class_hyper_tune80.fit(X_train80, y_train80)
print("training score for best hyperparameter " + str(class_hyper_tune80.cv_results_.get('mean_train_score')[0]))
print("test score for best hyperparameter " + str(class_hyper_tune80.cv_results_.get('mean_test_score')[0]))
class_hyper_tune80.best_estimator_

training score for best hyperparameter 0.3346024636058231
test score for best hyperparameter 0.30289598108747046


#### Trial 1

In [733]:
wine_df_801 = wine_df.sample(frac = 1).reset_index(drop = True)
wine_X801 = wine_df_801[wine_df_801.columns[1:]].to_numpy()
wine_Y801 = wine_df_801[wine_df_801.columns[0]].to_numpy()
X_train801, X_test801, y_train801, y_test801 = train_test_split(wine_X801, wine_Y801, test_size=0.2, random_state=42)

classifiers801 = SVC(class_weight = 'balanced')
class_hyper_tune801 = GridSearchCV(classifiers801, param_grid = params, cv = 3, return_train_score = True)
class_hyper_tune801.fit(X_train801, y_train801)
print("training score for best hyperparameter " + str(class_hyper_tune801.cv_results_.get('mean_train_score')[0]))
print("test score for best hyperparameter " + str(class_hyper_tune801.cv_results_.get('mean_test_score')[0]))
class_hyper_tune801.best_estimator_

training score for best hyperparameter 0.8204180664427025
test score for best hyperparameter 0.7752659574468085


In [734]:
tuned_8020 = SVC(class_weight = 'balanced', kernel = 'linear', C = 1)
tuned_8020.fit(X_train801, y_train801)
tuned_8020.score(X_test801, y_test801)

0.9722222222222222

#### Trial 2

In [735]:
wine_df_802 = wine_df.sample(frac = 1).reset_index(drop = True)
wine_X802 = wine_df_802[wine_df_802.columns[1:]].to_numpy()
wine_Y802 = wine_df_802[wine_df_802.columns[0]].to_numpy()
X_train802, X_test802, y_train802, y_test802 = train_test_split(wine_X802, wine_Y802, test_size=0.2, random_state=42)

tuned_8020.fit(X_train802, y_train802)
tuned_8020.score(X_test802, y_test802)

0.9722222222222222

#### Trial 3

In [736]:
wine_df_803 = wine_df.sample(frac = 1).reset_index(drop = True)
wine_X803 = wine_df_803[wine_df_803.columns[1:]].to_numpy()
wine_Y803 = wine_df_803[wine_df_803.columns[0]].to_numpy()
X_train803, X_test803, y_train803, y_test803 = train_test_split(wine_X803, wine_Y803, test_size=0.2, random_state=42)

tuned_8020.fit(X_train803, y_train803)
tuned_8020.score(X_test803, y_test803)

0.9444444444444444

In [737]:
(0.9444444444444444 + 0.9722222222222222 + 0.9166666666666666) / 3

0.9444444444444443

In [738]:
(tuned_8020.score(X_train803, y_train803) + 
 tuned_8020.score(X_train802, y_train802) + 
 tuned_8020.score(X_train801, y_train801)) / 3

0.9906103286384976

# Logistic Regression with Breast Cancer data

In [491]:
breast_cancer_df = breast_cancer_df[breast_cancer_df[breast_cancer_df.columns[6]] != '?']
breast_cancer_df

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2
...,...,...,...,...,...,...,...,...,...,...,...
694,776715,3,1,1,1,3,2,1,1,1,2
695,841769,2,1,1,1,2,1,1,1,1,2
696,888820,5,10,10,3,7,3,8,10,2,4
697,897471,4,8,6,4,3,4,10,6,1,4


In [459]:
breast_cancer_df.shape

(683, 11)

In [418]:
#Initial code for splitting
cancer_shuffle = breast_cancer_df.sample(frac = 1).reset_index(drop = True)
bc_X = cancer_shuffle[cancer_shuffle.columns[1:10]].to_numpy()
bc_Y = cancer_shuffle[cancer_shuffle.columns[10]].to_numpy()

## 20/80 Split

In [419]:
X_trainbc, X_testbc, y_trainbc, y_testbc = train_test_split(bc_X, bc_Y, test_size = 0.8, random_state = 42)
params = {'C' : [0.001, 0.01, 0.1, 1], 'solver' : ['lbfgs', 'liblinear', 'newton-cg']}

# have to scale data you ran into an error
scaler = StandardScaler()
scaler.fit(X_trainbc)
X_train_newbc = scaler.transform(X_trainbc)

# call classifier and get the best hyper parameter for this case - combinations of above params
classi = LogisticRegression(class_weight = 'balanced')
class_hyper_tunei = GridSearchCV(classi, param_grid = params, cv = 3, return_train_score = True)
class_hyper_tunei.fit(X_train_newbc, y_trainbc)
print("training score for best hyperparameter " + str(class_hyper_tunei.cv_results_.get('mean_train_score')[0]))
print("test score for best hyperparameter " + str(class_hyper_tunei.cv_results_.get('mean_test_score')[0]))
class_hyper_tunei.best_estimator_

training score for best hyperparameter 0.9595441595441595
test score for best hyperparameter 0.9631239935587761


#### Trial 1

In [694]:
tuned_LR_BC = LogisticRegression(C = 1, class_weight = 'balanced', solver = 'liblinear')
tuned_LR_BC.fit(X_train_newbc, y_trainbc)
tuned_LR_BC.score(X_testbc, y_testbc)

0.34186471663619744

#### Trial 2

In [695]:
cancer_shuffle1 = breast_cancer_df.sample(frac = 1).reset_index(drop = True)
bc_X1 = cancer_shuffle1[cancer_shuffle1.columns[1:10]].to_numpy()
bc_Y1 = cancer_shuffle1[cancer_shuffle1.columns[10]].to_numpy()
X_trainbc1, X_testbc1, y_trainbc1, y_testbc1 = train_test_split(bc_X1, bc_Y1, test_size = 0.8, random_state = 42)

scaler1 = StandardScaler()
scaler.fit(X_trainbc1)
X_train_newbc1 = scaler.transform(X_trainbc1)

tuned_LR_BC.fit(X_train_newbc1, y_trainbc1)
tuned_LR_BC.score(X_testbc1, y_testbc1)

0.3473491773308958

#### Trial 3

In [696]:
cancer_shuffle2 = breast_cancer_df.sample(frac = 1).reset_index(drop = True)
bc_X2 = cancer_shuffle2[cancer_shuffle2.columns[1:10]].to_numpy()
bc_Y2 = cancer_shuffle2[cancer_shuffle2.columns[10]].to_numpy()
X_trainbc2, X_testbc2, y_trainbc2, y_testbc2 = train_test_split(bc_X2, bc_Y2, test_size = 0.8, random_state = 42)

scaler2 = StandardScaler()
scaler.fit(X_trainbc2)
X_train_newbc2 = scaler.transform(X_trainbc2)

tuned_LR_BC.fit(X_train_newbc2, y_trainbc2)
tuned_LR_BC.score(X_testbc2, y_testbc2)

0.33638025594149906

In [697]:
(0.34186471663619744 + 0.3473491773308958 + 0.33638025594149906) / 3

0.34186471663619744

In [739]:
(tuned_LR_BC.score(X_train_newbc2, y_trainbc2) + 
tuned_LR_BC.score(X_train_newbc1, y_trainbc1) + 
tuned_LR_BC.score(X_train_newbc, y_trainbc)) / 3

0.9681372549019608

## 50/50 Split

In [448]:
cancer_shuffleds = breast_cancer_df.sample(frac = 1).reset_index(drop = True)
bc_Xds = cancer_shuffleds[cancer_shuffleds.columns[1:10]].to_numpy()
bc_Yds = cancer_shuffleds[cancer_shuffleds.columns[10]].to_numpy()
X_trainbcds, X_testbcds, y_trainbcds, y_testbcds = train_test_split(bc_Xds, bc_Yds, test_size = 0.5, random_state = 42)
params = {'C' : [0.001, 0.01, 0.1, 1], 'solver' : ['lbfgs', 'liblinear', 'newton-cg']}

# have to scale data you ran into an error
scaler50 = StandardScaler()
scaler50.fit(X_trainbcds)
X_train_newbcds = scaler.transform(X_trainbcds)

# call classifier and get the best hyper parameter for this case - combinations of above params
classier = LogisticRegression(class_weight = 'balanced')
classier_hyper_tune = GridSearchCV(classier, param_grid = params, cv = 3, return_train_score = True)
classier_hyper_tune.fit(X_trainbcds, y_trainbcds)
print("training score for best hyperparameter " + str(classier_hyper_tune.cv_results_.get('mean_train_score')[0]))
print("test score for best hyperparameter " + str(classier_hyper_tune.cv_results_.get('mean_test_score')[0]))
classier_hyper_tune.best_estimator_

training score for best hyperparameter 0.9692080789344875
test score for best hyperparameter 0.9618589245976298


#### Trial 1

In [449]:
tuned_50_LR = LogisticRegression(C = 0.01, class_weight = 'balanced', solver = 'lbfgs')
tuned_50_LR.fit(X_trainbcds, y_trainbcds)
tuned_50_LR.score(X_testbcds, y_testbcds)

0.9678362573099415

#### Trial 2

In [457]:
cancer_shuffled50s = breast_cancer_df.sample(frac = 1).reset_index(drop = True)
bc_Xd50s = cancer_shuffled50s[cancer_shuffled50s.columns[1:10]].to_numpy()
bc_Yd50s = cancer_shuffled50s[cancer_shuffled50s.columns[10]].to_numpy()
X_train50s, X_test50s, y_train50s, y_test50s = train_test_split(bc_Xd50s, bc_Yd50s, test_size = 0.5, random_state = 42)

tuned_50_LR.fit(X_train50s, y_train50s)
tuned_50_LR.score(X_test50s, y_test50s)

0.9824561403508771

#### Trial 3

In [458]:
cancer_shuffle5s = breast_cancer_df.sample(frac = 1).reset_index(drop = True)
bc_X5s = cancer_shuffle5s[cancer_shuffle5s.columns[1:10]].to_numpy()
bc_Y5s = cancer_shuffle5s[cancer_shuffle5s.columns[10]].to_numpy()
X_train5s, X_test5s, y_train5s, y_test5s = train_test_split(bc_X5s, bc_Y5s, test_size = 0.5, random_state = 42)

tuned_50_LR.fit(X_train5s, y_train5s)
tuned_50_LR.score(X_test5s, y_test5s)

0.9707602339181286

In [698]:
(0.9707602339181286 + 0.9824561403508771 + 0.9678362573099415) / 3

0.9736842105263158

In [740]:
(tuned_50_LR.score(X_train5s, y_train5s) + 
tuned_50_LR.score(X_train50s, y_train50s) + 
tuned_50_LR.score(X_trainbcds, y_trainbcds)) / 3

0.9736070381231672

## 80/20 Split

In [451]:
cancer_shuffle20 = breast_cancer_df.sample(frac = 1).reset_index(drop = True)
bc_X20 = cancer_shuffle20[cancer_shuffle20.columns[1:10]].to_numpy()
bc_Y20 = cancer_shuffle20[cancer_shuffle20.columns[10]].to_numpy()
X_train20, X_test20, y_train20, y_test20 = train_test_split(bc_X20, bc_Y20, test_size = 0.2, random_state = 42)
params = {'C' : [0.001, 0.01, 0.1, 1], 'solver' : ['lbfgs', 'liblinear', 'newton-cg']}

# have to scale data you ran into an error
scaler20 = StandardScaler()
scaler20.fit(X_train20)
X_train_new20 = scaler.transform(X_train20)

# call classifier and get the best hyper parameter for this case - combinations of above params
classier20 = LogisticRegression(class_weight = 'balanced')
classier20_hyper_tune = GridSearchCV(classier20, param_grid = params, cv = 3, return_train_score = True)
classier20_hyper_tune.fit(X_train20, y_train20)
print("training score for best hyperparameter " + str(classier20_hyper_tune.cv_results_.get('mean_train_score')[0]))
print("test score for best hyperparameter " + str(classier20_hyper_tune.cv_results_.get('mean_test_score')[0]))
classier20_hyper_tune.best_estimator_

training score for best hyperparameter 0.9752747252747253
test score for best hyperparameter 0.9743589743589745


#### Trial 1

In [452]:
tuned_20_LR = LogisticRegression(C = 0.1, class_weight = 'balanced', solver = 'lbfgs')
tuned_20_LR.fit(X_train20, y_train20)
tuned_20_LR.score(X_test20, y_test20)

0.9562043795620438

#### Trial 2

In [454]:
cancer_shuffle21 = breast_cancer_df.sample(frac = 1).reset_index(drop = True)
bc_X21 = cancer_shuffle21[cancer_shuffle21.columns[1:10]].to_numpy()
bc_Y21 = cancer_shuffle21[cancer_shuffle21.columns[10]].to_numpy()
X_train21, X_test21, y_train21, y_test21 = train_test_split(bc_X21, bc_Y21, test_size = 0.2, random_state = 42)

tuned_20_LR.fit(X_train21, y_train21)
tuned_20_LR.score(X_test21, y_test21)

0.9854014598540146

#### Trial 3

In [456]:
cancer_shuffle22 = breast_cancer_df.sample(frac = 1).reset_index(drop = True)
bc_X22 = cancer_shuffle22[cancer_shuffle22.columns[1:10]].to_numpy()
bc_Y22 = cancer_shuffle22[cancer_shuffle22.columns[10]].to_numpy()
X_train22, X_test22, y_train22, y_test22 = train_test_split(bc_X22, bc_Y22, test_size = 0.2, random_state = 42)

tuned_20_LR.fit(X_train22, y_train22)
tuned_20_LR.score(X_test22, y_test22)

0.948905109489051

In [699]:
(0.9562043795620438 + 0.9854014598540146 + 0.948905109489051) / 3

0.9635036496350365

In [741]:
(tuned_20_LR.score(X_train22, y_train22) + 
 tuned_20_LR.score(X_train21, y_train21) + 
tuned_20_LR.score(X_train20, y_train20)) / 3

0.9774114774114774

# KNN with Breast Cancer Data

## 20/80 Split

In [743]:
#KNN 
cancer_shuffled = breast_cancer_df.sample(frac = 1).reset_index(drop = True)
bc_Xd = cancer_shuffled[cancer_shuffled.columns[1:10]].to_numpy()
bc_Yd = cancer_shuffled[cancer_shuffled.columns[10]].to_numpy()
X_traind, X_testd, y_traind, y_testd = train_test_split(bc_Xd, bc_Yd, test_size = 0.8, random_state = 42)

params = {'n_neighbors' : [3, 5, 7, 10]}

# have to scale data you ran into an error

# call classifier and get the best hyper parameter for this case - combinations of above params
classifierd = KNeighborsClassifier(weights = 'uniform')
class_hyper_tuned = GridSearchCV(classifierd, param_grid = params, cv = 3, return_train_score = True)
class_hyper_tuned.fit(X_traind, y_traind)
print("training score for best hyperparameter " + str(class_hyper_tuned.cv_results_.get('mean_train_score')[0]))
print("test score for best hyperparameter " + str(class_hyper_tuned.cv_results_.get('mean_test_score')[0]))
class_hyper_tuned.best_estimator_

training score for best hyperparameter 0.9669108669108669
test score for best hyperparameter 0.9484702093397747


#### Trial 1

In [744]:
tuned_KNN = KNeighborsClassifier(n_neighbors = 3, weights = 'uniform')
tuned_KNN.fit(X_traind, y_traind)
tuned_KNN.score(X_testd, y_testd)

0.9689213893967094

#### Trial 2

In [432]:
cancer_shuffled3 = breast_cancer_df.sample(frac = 1).reset_index(drop = True)
bc_Xd3 = cancer_shuffled3[cancer_shuffled3.columns[1:10]].to_numpy()
bc_Yd3 = cancer_shuffled3[cancer_shuffled3.columns[10]].to_numpy()
X_traind3, X_testd3, y_traind3, y_testd3 = train_test_split(bc_Xd3, bc_Yd3, test_size = 0.8, random_state = 42)

tuned_KNN.fit(X_traind3, y_traind3)
tuned_KNN.score(X_testd3, y_testd3)

0.9579524680073126

#### Trial 3

In [434]:
cancer_shuffled4 = breast_cancer_df.sample(frac = 1).reset_index(drop = True)
bc_Xd4 = cancer_shuffled4[cancer_shuffled4.columns[1:10]].to_numpy()
bc_Yd4 = cancer_shuffled4[cancer_shuffled4.columns[10]].to_numpy()
X_traind4, X_testd4, y_traind4, y_testd4 = train_test_split(bc_Xd4, bc_Yd4, test_size = 0.8, random_state = 42)

tuned_KNN.fit(X_traind4, y_traind4)
tuned_KNN.score(X_testd4, y_testd4)

0.9670932358318098

In [701]:
(0.979890310786106 + 0.9579524680073126 + 0.9670932358318098) / 3

0.9683120048750761

In [745]:
(tuned_KNN.score(X_traind4, y_traind4) + 
tuned_KNN.score(X_traind3, y_traind3) + 
tuned_KNN.score(X_traind, y_traind)) / 3

0.9622865275142316

## 50/50 Split

### Trial 1

In [747]:
cancer_shuffled4 = breast_cancer_df.sample(frac = 1).reset_index(drop = True)
bc_Xd4 = cancer_shuffled4[cancer_shuffled4.columns[1:10]].to_numpy()
bc_Yd4 = cancer_shuffled4[cancer_shuffled4.columns[10]].to_numpy()
X_traind4, X_testd4, y_traind4, y_testd4 = train_test_split(bc_Xd4, bc_Yd4, test_size = 0.5, random_state = 42)

params = {'n_neighbors' : [3, 5, 7, 10]}

# have to scale data you ran into an error

# call classifier and get the best hyper parameter for this case - combinations of above params
classifier4 = KNeighborsClassifier(weights = 'uniform')
class_hyper_tuned4 = GridSearchCV(classifier4, param_grid = params, cv = 3, return_train_score = True)
class_hyper_tuned4.fit(X_traind4, y_traind4)
print("training score for best hyperparameter " + str(class_hyper_tuned4.cv_results_.get('mean_train_score')[0]))
print("test score for best hyperparameter " + str(class_hyper_tuned4.cv_results_.get('mean_test_score')[0]))
class_hyper_tuned4.best_estimator_

training score for best hyperparameter 0.9867905814462735
test score for best hyperparameter 0.9706049785230036


In [748]:
tuned_50 = KNeighborsClassifier(n_neighbors = 3, weights ='uniform')
tuned_50.fit(X_traind4, y_traind4)
tuned_50.score(X_testd4, y_testd4)

0.9707602339181286

#### Trial 2

In [438]:
cancer_shuffled5 = breast_cancer_df.sample(frac = 1).reset_index(drop = True)
bc_Xd5 = cancer_shuffled5[cancer_shuffled5.columns[1:10]].to_numpy()
bc_Yd5 = cancer_shuffled5[cancer_shuffled5.columns[10]].to_numpy()
X_traind5, X_testd5, y_traind5, y_testd5 = train_test_split(bc_Xd5, bc_Yd5, test_size = 0.5, random_state = 42)

tuned_50.fit(X_traind5, y_traind5)
tuned_50.score(X_testd5, y_testd5)

0.9766081871345029

In [440]:
cancer_shuffled6 = breast_cancer_df.sample(frac = 1).reset_index(drop = True)
bc_Xd6 = cancer_shuffled6[cancer_shuffled6.columns[1:10]].to_numpy()
bc_Yd6 = cancer_shuffled6[cancer_shuffled6.columns[10]].to_numpy()
X_traind6, X_testd6, y_traind6, y_testd6 = train_test_split(bc_Xd6, bc_Yd6, test_size = 0.5, random_state = 42)

tuned_50.fit(X_traind6, y_traind6)
tuned_50.score(X_testd6, y_testd6)

0.9649122807017544

In [702]:
(0.9707602339181286 + 0.9766081871345029 + 0.9649122807017544) / 3

0.9707602339181286

In [749]:
(tuned_50.score(X_traind6, y_traind6) + 
tuned_50.score(X_traind5, y_traind5) + 
tuned_50.score(X_traind4, y_traind4)) / 3

0.9804496578690127

#### Trial 3

In [441]:
cancer_shuffled7 = breast_cancer_df.sample(frac = 1).reset_index(drop = True)
bc_Xd7 = cancer_shuffled7[cancer_shuffled7.columns[1:10]].to_numpy()
bc_Yd7 = cancer_shuffled7[cancer_shuffled7.columns[10]].to_numpy()
X_traind7, X_testd7, y_traind7, y_testd7 = train_test_split(bc_Xd7, bc_Yd7, test_size = 0.5, random_state = 42)

tuned_50.fit(X_traind7, y_traind7)
tuned_50.score(X_testd7, y_testd7)

0.9619883040935673

## 80/20 Split

In [442]:
cancer_shuffled8 = breast_cancer_df.sample(frac = 1).reset_index(drop = True)
bc_Xd8 = cancer_shuffled8[cancer_shuffled8.columns[1:10]].to_numpy()
bc_Yd8 = cancer_shuffled8[cancer_shuffled8.columns[10]].to_numpy()
X_traind8, X_testd8, y_traind8, y_testd8 = train_test_split(bc_Xd8, bc_Yd8, test_size = 0.2, random_state = 42)

params = {'n_neighbors' : [3, 5, 7, 10]}

# have to scale data you ran into an error

# call classifier and get the best hyper parameter for this case - combinations of above params
classifier8 = KNeighborsClassifier(weights = 'uniform')
class_hyper_tuned8 = GridSearchCV(classifier8, param_grid = params, cv = 3, return_train_score = True)
class_hyper_tuned8.fit(X_traind8, y_traind8)
print("training score for best hyperparameter " + str(class_hyper_tuned8.cv_results_.get('mean_train_score')[0]))
print("test score for best hyperparameter " + str(class_hyper_tuned8.cv_results_.get('mean_test_score')[0]))
class_hyper_tuned8.best_estimator_

training score for best hyperparameter 0.9835164835164836
test score for best hyperparameter 0.9597069597069597


#### Trial 1

In [751]:
tuned_20 = KNeighborsClassifier(n_neighbors = 5, weights = 'uniform')
tuned_20.fit(X_traind8, y_traind8)
tuned_20.score(X_testd8, y_testd8)

0.8793103448275862

#### Trial 2

In [444]:
cancer_shuffled9 = breast_cancer_df.sample(frac = 1).reset_index(drop = True)
bc_Xd9 = cancer_shuffled9[cancer_shuffled9.columns[1:10]].to_numpy()
bc_Yd9 = cancer_shuffled9[cancer_shuffled9.columns[10]].to_numpy()
X_traind9, X_testd9, y_traind9, y_testd9 = train_test_split(bc_Xd9, bc_Yd9, test_size = 0.2, random_state = 42)

tuned_20.fit(X_traind9, y_traind9)
tuned_20.score(X_testd9, y_testd9)

0.9781021897810219

#### Trial 3

In [445]:
cancer_shuffled10 = breast_cancer_df.sample(frac = 1).reset_index(drop = True)
bc_Xd10 = cancer_shuffled10[cancer_shuffled10.columns[1:10]].to_numpy()
bc_Yd10 = cancer_shuffled10[cancer_shuffled10.columns[10]].to_numpy()
X_traind10, X_testd10, y_traind10, y_testd10 = train_test_split(bc_Xd10, bc_Yd10, test_size = 0.2, random_state = 42)

tuned_20.fit(X_traind10, y_traind10)
tuned_20.score(X_testd10, y_testd10)

0.9708029197080292

In [703]:
(0.9708029197080292 + 0.9781021897810219 + 0.9562043795620438) / 3

0.9683698296836983

In [752]:
(tuned_20.score(X_traind10, y_traind10) + 
tuned_20.score(X_traind9, y_traind9) + 
tuned_20.score(X_traind8, y_traind8)) / 3

0.9047619047619048

# SVM with Breast Cancer Data

## 80/20 Split

In [631]:
breast_cancer_df80 = breast_cancer_df.sample(frac = 1).reset_index(drop = True)
breast_X80 = breast_cancer_df80[breast_cancer_df80.columns[1:10]].to_numpy()
breast_Y80 = breast_cancer_df80[breast_cancer_df80.columns[10]].to_numpy()
X_trainb80, X_testb80, y_trainb80, y_testb80 = train_test_split(breast_X80, breast_Y80, test_size=0.2, random_state=42)

params = {'C': [0.001, 0.01, 0.1, 1], 'kernel':['linear', 'poly', 'sigmoid', "rbf"]}
# scaler_newSVM = StandardScaler()
# scaler_newSVM.fit(X_trainS)
# X_train_SVM = scaler.transform(X_trainS)


# call classifier and get the best hyper parameter for this case - combinations of above params
classifierb80 = SVC(class_weight = 'balanced')
class_hyper_tuneb80 = GridSearchCV(classifierb80, param_grid = params, cv = 3, return_train_score = True)
class_hyper_tuneb80.fit(X_trainb80, y_trainb80)
print("training score for best hyperparameter " + str(class_hyper_tuneb80.cv_results_.get('mean_train_score')[0]))
print("test score for best hyperparameter " + str(class_hyper_tuneb80.cv_results_.get('mean_test_score')[0]))
class_hyper_tuneb80.best_estimator_

training score for best hyperparameter 0.9725274725274725
test score for best hyperparameter 0.9725274725274725


In [632]:
#classifier
tuned_SVC80 = SVC(C = 0.01, class_weight = 'balanced', kernel = 'linear')
tuned_SVC80.fit(X_trainb80, y_trainb80)
tuned_SVC80.score(X_testb80, y_testb80)

0.9708029197080292

#### Trial 2

In [633]:
breast_cancer_df801 = breast_cancer_df.sample(frac = 1).reset_index(drop = True)
breast_X801 = breast_cancer_df801[breast_cancer_df801.columns[1:10]].to_numpy()
breast_Y801 = breast_cancer_df801[breast_cancer_df801.columns[10]].to_numpy()
X_trainb801, X_testb801, y_trainb801, y_testb801 = train_test_split(breast_X801, breast_Y801, test_size=0.2, random_state=42)
tuned_SVC80.fit(X_trainb801, y_trainb801)
tuned_SVC80.score(X_testb801, y_testb801)

0.9854014598540146

#### Trial 3

In [634]:
breast_cancer_df802 = breast_cancer_df.sample(frac = 1).reset_index(drop = True)
breast_X802 = breast_cancer_df802[breast_cancer_df802.columns[1:10]].to_numpy()
breast_Y802 = breast_cancer_df802[breast_cancer_df802.columns[10]].to_numpy()
X_trainb802, X_testb802, y_trainb802, y_testb802 = train_test_split(breast_X802, breast_Y802, test_size=0.2, random_state=42)
tuned_SVC80.fit(X_trainb802, y_trainb802)
tuned_SVC80.score(X_testb802, y_testb802)

0.948905109489051

In [704]:
(0.948905109489051 + 0.9854014598540146 + 0.9708029197080292) / 3

0.9683698296836983

In [753]:
(tuned_SVC80.score(X_trainb802, y_trainb802) + 
tuned_SVC80.score(X_trainb801, y_trainb801) + 
tuned_SVC80.score(X_trainb80, y_trainb80)) / 3

0.9713064713064713

## 50/50 Split

In [635]:
breast_cancer_df805 = breast_cancer_df.sample(frac = 1).reset_index(drop = True)
breast_X805 = breast_cancer_df805[breast_cancer_df805.columns[1:10]].to_numpy()
breast_Y805 = breast_cancer_df805[breast_cancer_df805.columns[10]].to_numpy()
X_trainb805, X_testb805, y_trainb805, y_testb805 = train_test_split(breast_X805, breast_Y805, test_size=0.5, random_state=42)

params = {'C': [0.001, 0.01, 0.1, 1], 'kernel':['linear', 'poly', 'sigmoid', "rbf"]}
# scaler_newSVM = StandardScaler()
# scaler_newSVM.fit(X_trainS)
# X_train_SVM = scaler.transform(X_trainS)


# call classifier and get the best hyper parameter for this case - combinations of above params
classifierb805 = SVC(class_weight = 'balanced')
class_hyper_tuneb805 = GridSearchCV(classifierb805, param_grid = params, cv = 3, return_train_score = True)
class_hyper_tuneb805.fit(X_trainb805, y_trainb805)
print("training score for best hyperparameter " + str(class_hyper_tuneb805.cv_results_.get('mean_train_score')[0]))
print("test score for best hyperparameter " + str(class_hyper_tuneb805.cv_results_.get('mean_test_score')[0]))
class_hyper_tuneb805.best_estimator_

training score for best hyperparameter 0.9692145194115981
test score for best hyperparameter 0.9618589245976298


In [637]:
#classifier
tuned_SVC801 = SVC(C = 1, class_weight = 'balanced', kernel = 'rbf')
tuned_SVC801.fit(X_trainb805, y_trainb805)
tuned_SVC801.score(X_testb805, y_testb805)

0.9649122807017544

#### Trial 2

In [638]:
breast_cancer_df806 = breast_cancer_df.sample(frac = 1).reset_index(drop = True)
breast_X806 = breast_cancer_df806[breast_cancer_df806.columns[1:10]].to_numpy()
breast_Y806 = breast_cancer_df806[breast_cancer_df806.columns[10]].to_numpy()
X_trainb806, X_testb806, y_trainb806, y_testb806 = train_test_split(breast_X806, breast_Y806, test_size=0.5, random_state=42)

tuned_SVC801.fit(X_trainb806, y_trainb806)
tuned_SVC801.score(X_testb806, y_testb806)

0.9824561403508771

#### Trial 3

In [639]:
breast_cancer_df807 = breast_cancer_df.sample(frac = 1).reset_index(drop = True)
breast_X807 = breast_cancer_df807[breast_cancer_df807.columns[1:10]].to_numpy()
breast_Y807 = breast_cancer_df807[breast_cancer_df807.columns[10]].to_numpy()
X_trainb807, X_testb807, y_trainb807, y_testb807 = train_test_split(breast_X807, breast_Y807, test_size=0.5, random_state=42)

tuned_SVC801.fit(X_trainb807, y_trainb807)
tuned_SVC801.score(X_testb807, y_testb807)

0.9619883040935673

In [705]:
(0.9619883040935673 + 0.9824561403508771 + 0.9649122807017544) / 3

0.969785575048733

In [754]:
(tuned_SVC801.score(X_trainb807, y_trainb807) +
tuned_SVC801.score(X_trainb806, y_trainb806) + 
tuned_SVC801.score(X_trainb805, y_trainb805)
) / 3

0.9726295210166178

## 20/80 Split

In [640]:
breast_cancer_df808 = breast_cancer_df.sample(frac = 1).reset_index(drop = True)
breast_X808 = breast_cancer_df808[breast_cancer_df808.columns[1:10]].to_numpy()
breast_Y808 = breast_cancer_df808[breast_cancer_df808.columns[10]].to_numpy()
X_trainb808, X_testb808, y_trainb808, y_testb808 = train_test_split(breast_X808, breast_Y808, test_size=0.8, random_state=42)

params = {'C': [0.001, 0.01, 0.1, 1], 'kernel':['linear', 'poly', 'sigmoid', "rbf"]}
# scaler_newSVM = StandardScaler()
# scaler_newSVM.fit(X_trainS)
# X_train_SVM = scaler.transform(X_trainS)


# call classifier and get the best hyper parameter for this case - combinations of above params
classifierb808 = SVC(class_weight = 'balanced')
class_hyper_tuneb808 = GridSearchCV(classifierb808, param_grid = params, cv = 2, return_train_score = True)
class_hyper_tuneb808.fit(X_trainb808, y_trainb808)
print("training score for best hyperparameter " + str(class_hyper_tuneb808.cv_results_.get('mean_train_score')[0]))
print("test score for best hyperparameter " + str(class_hyper_tuneb808.cv_results_.get('mean_test_score')[0]))
class_hyper_tuneb808.best_estimator_

training score for best hyperparameter 0.9779411764705883
test score for best hyperparameter 0.9705882352941178


In [641]:
#classifier
tuned_SVC808 = SVC(C = 0.001, class_weight = 'balanced', kernel = 'linear')
tuned_SVC808.fit(X_trainb808, y_trainb808)
tuned_SVC808.score(X_testb808, y_testb808)

0.9652650822669104

#### Trial 2

In [642]:
breast_cancer_df809 = breast_cancer_df.sample(frac = 1).reset_index(drop = True)
breast_X809 = breast_cancer_df809[breast_cancer_df809.columns[1:10]].to_numpy()
breast_Y809 = breast_cancer_df809[breast_cancer_df809.columns[10]].to_numpy()
X_trainb809, X_testb809, y_trainb809, y_testb809 = train_test_split(breast_X809, breast_Y809, test_size=0.8, random_state=42)

tuned_SVC808.fit(X_trainb809, y_trainb809)
tuned_SVC808.score(X_testb809, y_testb809)

0.9670932358318098

#### Trial 3

In [643]:
breast_cancer_df810 = breast_cancer_df.sample(frac = 1).reset_index(drop = True)
breast_X810 = breast_cancer_df810[breast_cancer_df810.columns[1:10]].to_numpy()
breast_Y810 = breast_cancer_df810[breast_cancer_df810.columns[10]].to_numpy()
X_trainb810, X_testb810, y_trainb810, y_testb810 = train_test_split(breast_X810, breast_Y810, test_size=0.8, random_state=42)

tuned_SVC808.fit(X_trainb810, y_trainb810)
tuned_SVC808.score(X_testb810, y_testb810)

0.9597806215722121

In [706]:
(0.9597806215722121 + 0.9670932358318098 + 0.9652650822669104) / 3

0.9640463132236441

In [755]:
(tuned_SVC808.score(X_trainb809, y_trainb809) + 
tuned_SVC808.score(X_trainb810, y_trainb810) + 
tuned_SVC808.score(X_trainb808, y_trainb808)) / 3

0.9583333333333334

# Logistic Regression with Cervical Data

## Cleaning Data

In [490]:
cervical_cancer.isnull().sum()
cervical_cancer.shape[1]

20

In [492]:
cervical_cancer

Unnamed: 0,behavior_sexualRisk,behavior_eating,behavior_personalHygine,intention_aggregation,intention_commitment,attitude_consistency,attitude_spontaneity,norm_significantPerson,norm_fulfillment,perception_vulnerability,perception_severity,motivation_strength,motivation_willingness,socialSupport_emotionality,socialSupport_appreciation,socialSupport_instrumental,empowerment_knowledge,empowerment_abilities,empowerment_desires,ca_cervix
0,10,13,12,4,7,9,10,1,8,7,3,14,8,5,7,12,12,11,8,1
1,10,11,11,10,14,7,7,5,5,4,2,15,13,7,6,5,5,4,4,1
2,10,15,3,2,14,8,10,1,4,7,2,7,3,3,6,11,3,3,15,1
3,10,11,10,10,15,7,7,1,5,4,2,15,13,7,4,4,4,4,4,1
4,8,11,7,8,10,7,8,1,5,3,2,15,5,3,6,12,5,4,7,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67,10,14,14,10,15,6,7,5,15,14,10,15,13,9,8,12,12,11,9,0
68,10,12,15,10,15,8,8,5,15,14,8,12,14,11,7,13,15,11,14,0
69,10,8,11,6,10,6,4,3,13,9,8,14,12,9,7,11,12,10,10,0
70,9,12,13,10,13,6,6,5,14,13,10,13,12,11,8,12,11,13,15,0


## 80/20 Split

In [501]:
cc_shuffled = cervical_cancer.sample(frac = 1).reset_index(drop = True)
cc_X = cc_shuffled[cc_shuffled.columns[:19]].to_numpy()
cc_Y = cc_shuffled[cc_shuffled.columns[19]].to_numpy()
X_traincc, X_testcc, y_traincc, y_testcc = train_test_split(cc_X, cc_Y, test_size = 0.2, random_state = 42)
params = {'C' : [0.001, 0.01, 0.1, 1], 'solver' : ['lbfgs', 'liblinear', 'newton-cg']}

# have to scale data you ran into an error
scalercc = StandardScaler()
scalercc.fit(X_traincc)
X_train_newcc = scalercc.transform(X_traincc)

# call classifier and get the best hyper parameter for this case - combinations of above params
classier_cc = LogisticRegression(class_weight = 'balanced', max_iter = 1000)
classier_hyper_tunecc = GridSearchCV(classier_cc, param_grid = params, cv = 3, return_train_score = True)
classier_hyper_tunecc.fit(X_traincc, y_traincc)
print("training score for best hyperparameter " + str(classier_hyper_tunecc.cv_results_.get('mean_train_score')[0]))
print("test score for best hyperparameter " + str(classier_hyper_tunecc.cv_results_.get('mean_test_score')[0]))
classier_hyper_tunecc.best_estimator_

training score for best hyperparameter 0.8508771929824562
test score for best hyperparameter 0.8421052631578947


In [593]:
y_traincc

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0])

#### Trial 1

In [502]:
tuned_cc_LR = LogisticRegression(C = 0.1, class_weight = 'balanced', solver = 'lbfgs', max_iter = 1000)
tuned_cc_LR.fit(X_traincc, y_traincc)
tuned_cc_LR.score(X_testcc, y_testcc)

0.9333333333333333

#### Trial 2

In [503]:
cc_shuffled1 = cervical_cancer.sample(frac = 1).reset_index(drop = True)
cc_X1 = cc_shuffled1[cc_shuffled1.columns[:19]].to_numpy()
cc_Y1 = cc_shuffled1[cc_shuffled1.columns[19]].to_numpy()
X_traincc1, X_testcc1, y_traincc1, y_testcc1 = train_test_split(cc_X1, cc_Y1, test_size = 0.2, random_state = 42)

tuned_cc_LR.fit(X_traincc1, y_traincc1)
tuned_cc_LR.score(X_testcc1, y_testcc1)

1.0

#### Trial 3

In [504]:
cc_shuffled2 = cervical_cancer.sample(frac = 1).reset_index(drop = True)
cc_X2 = cc_shuffled2[cc_shuffled2.columns[:19]].to_numpy()
cc_Y2 = cc_shuffled2[cc_shuffled2.columns[19]].to_numpy()
X_traincc2, X_testcc2, y_traincc2, y_testcc2 = train_test_split(cc_X2, cc_Y2, test_size = 0.2, random_state = 42)

tuned_cc_LR.fit(X_traincc2, y_traincc2)
tuned_cc_LR.score(X_testcc2, y_testcc2)

0.8666666666666667

In [707]:
(1.0 + 0.9333333333333333 + 0.8666666666666667) / 3

0.9333333333333332

In [756]:
(tuned_cc_LR.score(X_traincc2, y_traincc2) + 
tuned_cc_LR.score(X_traincc1, y_traincc1) + 
tuned_cc_LR.score(X_traincc, y_traincc)) / 3

0.9824561403508771

## 50/50 Split

In [505]:
cc_shuffled50 = cervical_cancer.sample(frac = 1).reset_index(drop = True)
cc_X50 = cc_shuffled50[cc_shuffled50.columns[:19]].to_numpy()
cc_Y50 = cc_shuffled50[cc_shuffled50.columns[19]].to_numpy()
X_traincc50, X_testcc50, y_traincc50, y_testcc50 = train_test_split(cc_X50, cc_Y50, test_size = 0.5, random_state = 42)
params = {'C' : [0.001, 0.01, 0.1, 1], 'solver' : ['lbfgs', 'liblinear', 'newton-cg']}

# have to scale data you ran into an error
# scalercc = StandardScaler()
# scalercc.fit(X_traincc)
# X_train_newcc = scalercc.transform(X_traincc)

# call classifier and get the best hyper parameter for this case - combinations of above params
classier_cc50 = LogisticRegression(class_weight = 'balanced', max_iter = 1000)
classier_hyper_tunecc50 = GridSearchCV(classier_cc50, param_grid = params, cv = 3, return_train_score = True)
classier_hyper_tunecc50.fit(X_traincc50, y_traincc50)
print("training score for best hyperparameter " + str(classier_hyper_tunecc50.cv_results_.get('mean_train_score')[0]))
print("test score for best hyperparameter " + str(classier_hyper_tunecc50.cv_results_.get('mean_test_score')[0]))
classier_hyper_tunecc50.best_estimator_

training score for best hyperparameter 0.8888888888888888
test score for best hyperparameter 0.861111111111111


In [594]:
y_traincc50

array([0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0])

#### Trial 1

In [508]:
tuned_cc_LR1 = LogisticRegression(C = 1, class_weight = 'balanced', solver = 'lbfgs', max_iter = 1000)
tuned_cc_LR1.fit(X_traincc, y_traincc)
tuned_cc_LR1.score(X_testcc, y_testcc)

0.8666666666666667

#### Trial 2

In [510]:
cc_shuffled51 = cervical_cancer.sample(frac = 1).reset_index(drop = True)
cc_X51 = cc_shuffled51[cc_shuffled51.columns[:19]].to_numpy()
cc_Y51 = cc_shuffled51[cc_shuffled51.columns[19]].to_numpy()
X_traincc51, X_testcc51, y_traincc51, y_testcc51 = train_test_split(cc_X51, cc_Y51, test_size = 0.5, random_state = 42)

tuned_cc_LR1.fit(X_traincc51, y_traincc51)
tuned_cc_LR1.score(X_testcc51, y_testcc51)

0.8611111111111112

#### Trial 3

In [512]:
cc_shuffled52 = cervical_cancer.sample(frac = 1).reset_index(drop = True)
cc_X52 = cc_shuffled52[cc_shuffled52.columns[:19]].to_numpy()
cc_Y52 = cc_shuffled52[cc_shuffled52.columns[19]].to_numpy()
X_traincc52, X_testcc52, y_traincc52, y_testcc52 = train_test_split(cc_X52, cc_Y52, test_size = 0.5, random_state = 42)

tuned_cc_LR1.fit(X_traincc52, y_traincc52)
tuned_cc_LR1.score(X_testcc52, y_testcc52)

0.9166666666666666

In [708]:
(0.8666666666666667 + 0.8611111111111112 + 0.9166666666666666) / 3

0.8814814814814814

In [757]:
(tuned_cc_LR1.score(X_traincc52, y_traincc52) + 
tuned_cc_LR1.score(X_traincc51, y_traincc51) + 
tuned_cc_LR1.score(X_traincc, y_traincc)) / 3

0.9639376218323586

## 20/80 Split

In [513]:
cc_shuffled20 = cervical_cancer.sample(frac = 1).reset_index(drop = True)
cc_X20 = cc_shuffled20[cc_shuffled20.columns[:19]].to_numpy()
cc_Y20 = cc_shuffled20[cc_shuffled20.columns[19]].to_numpy()
X_traincc20, X_testcc20, y_traincc20, y_testcc20 = train_test_split(cc_X20, cc_Y20, test_size = 0.8, random_state = 42)
params = {'C' : [0.001, 0.01, 0.1, 1], 'solver' : ['lbfgs', 'liblinear', 'newton-cg']}

# have to scale data you ran into an error
# scalercc = StandardScaler()
# scalercc.fit(X_traincc)
# X_train_newcc = scalercc.transform(X_traincc)

# call classifier and get the best hyper parameter for this case - combinations of above params
classier_cc20 = LogisticRegression(class_weight = 'balanced', max_iter = 1000)
classier_hyper_tunecc20 = GridSearchCV(classier_cc20, param_grid = params, cv = 3, return_train_score = True)
classier_hyper_tunecc20.fit(X_traincc20, y_traincc20)
print("training score for best hyperparameter " + str(classier_hyper_tunecc20.cv_results_.get('mean_train_score')[0]))
print("test score for best hyperparameter " + str(classier_hyper_tunecc20.cv_results_.get('mean_test_score')[0]))
classier_hyper_tunecc20.best_estimator_

training score for best hyperparameter 0.9296296296296296
test score for best hyperparameter 0.9333333333333332


In [595]:
y_traincc20

array([0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0])

#### Trial 1

In [514]:
tuned_cc_LR2 = LogisticRegression(C = 0.01, class_weight = 'balanced', solver = 'liblinear', max_iter = 1000)
tuned_cc_LR2.fit(X_traincc20, y_traincc20)
tuned_cc_LR2.score(X_testcc20, y_testcc20)

0.7931034482758621

#### Trial 2

In [515]:
cc_shuffled21 = cervical_cancer.sample(frac = 1).reset_index(drop = True)
cc_X21 = cc_shuffled21[cc_shuffled21.columns[:19]].to_numpy()
cc_Y21 = cc_shuffled21[cc_shuffled21.columns[19]].to_numpy()
X_traincc21, X_testcc21, y_traincc21, y_testcc21 = train_test_split(cc_X21, cc_Y21, test_size = 0.8, random_state = 42)

tuned_cc_LR2.fit(X_traincc21, y_traincc21)
tuned_cc_LR2.score(X_testcc21, y_testcc21)

0.896551724137931

#### Trial 3

In [516]:
cc_shuffled22 = cervical_cancer.sample(frac = 1).reset_index(drop = True)
cc_X22 = cc_shuffled22[cc_shuffled22.columns[:19]].to_numpy()
cc_Y22 = cc_shuffled22[cc_shuffled22.columns[19]].to_numpy()
X_traincc22, X_testcc22, y_traincc22, y_testcc22 = train_test_split(cc_X22, cc_Y22, test_size = 0.8, random_state = 42)

tuned_cc_LR2.fit(X_traincc22, y_traincc22)
tuned_cc_LR.score(X_testcc22, y_testcc22)

0.9655172413793104

In [709]:
(0.7931034482758621 + 0.896551724137931 + 0.9655172413793104) / 3

0.8850574712643678

In [758]:
(tuned_cc_LR2.score(X_traincc22, y_traincc22) + 
tuned_cc_LR2.score(X_traincc21, y_traincc21) + 
tuned_cc_LR2.score(X_traincc20, y_traincc20)) / 3

0.880952380952381

# KNN Classifier with Cervical Cancer Data

## 80/20 Split

In [646]:
cc_shuffled_3 = cervical_cancer.sample(frac = 1).reset_index(drop = True)
cc_X8 = cc_shuffled_3[cc_shuffled_3.columns[:19]].to_numpy()
cc_Y8 = cc_shuffled_3[cc_shuffled_3.columns[19]].to_numpy()
X_traind8, X_testd8, y_traind8, y_testd8 = train_test_split(cc_X8, cc_Y8, test_size = 0.8, random_state = 42)

params = {'n_neighbors' : [3, 5, 6, 7]}

# have to scale data you ran into an error

# call classifier and get the best hyper parameter for this case - combinations of above params
classify = KNeighborsClassifier(weights = 'uniform')
class_hyper_tunedy = GridSearchCV(classify, param_grid = params, cv = 3, return_train_score = True)
class_hyper_tunedy.fit(X_traind8, y_traind8)
print("training score for best hyperparameter " + str(class_hyper_tunedy.cv_results_.get('mean_train_score')[0]))
print("test score for best hyperparameter " + str(class_hyper_tunedy.cv_results_.get('mean_test_score')[0]))
class_hyper_tunedy.best_estimator_

training score for best hyperparameter 0.8592592592592592
test score for best hyperparameter 0.7833333333333333


In [647]:
y_traind8

array([1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1])

#### Trial 1

In [648]:
tuned_class = KNeighborsClassifier(n_neighbors = 3, weights = 'uniform')
tuned_class.fit(X_traind8, y_traind8)
tuned_class.score(X_testd8, y_testd8)

0.8793103448275862

#### Trial 2

In [649]:
cc_shuffled_4 = cervical_cancer.sample(frac = 1).reset_index(drop = True)
cc_X9 = cc_shuffled_4[cc_shuffled_4.columns[:19]].to_numpy()
cc_Y9 = cc_shuffled_4[cc_shuffled_4.columns[19]].to_numpy()
X_traind9, X_testd9, y_traind9, y_testd9 = train_test_split(cc_X9, cc_Y9, test_size = 0.8, random_state = 42)

tuned_class.fit(X_traind9, y_traind9)
tuned_class.score(X_testd9, y_testd9)

0.8103448275862069

#### Trial 3

In [650]:
cc_shuffled_5 = cervical_cancer.sample(frac = 1).reset_index(drop = True)
cc_X10 = cc_shuffled_5[cc_shuffled_5.columns[:19]].to_numpy()
cc_Y10 = cc_shuffled_5[cc_shuffled_5.columns[19]].to_numpy()
X_traind10, X_testd10, y_traind10, y_testd10 = train_test_split(cc_X10, cc_Y10, test_size = 0.8, random_state = 42)

tuned_class.fit(X_traind10, y_traind10)
tuned_class.score(X_testd10, y_testd10)

0.9137931034482759

In [710]:
(0.9137931034482759 + 0.8103448275862069 + 0.8793103448275862) / 3

0.867816091954023

In [760]:
(tuned_class.score(X_traind10, y_traind10) + 
tuned_class.score(X_traind9, y_traind9) + 
tuned_class.score(X_traind8, y_traind8)) / 3

0.8809523809523809

## 50/50 Split

In [651]:
cc_shuffled_5 = cervical_cancer.sample(frac = 1).reset_index(drop = True)
cc_X501 = cc_shuffled_5[cc_shuffled_5.columns[:19]].to_numpy()
cc_Y501 = cc_shuffled_5[cc_shuffled_5.columns[19]].to_numpy()
X_traind501, X_testd501, y_traind501, y_testd501 = train_test_split(cc_X501, cc_Y501, test_size = 0.5, random_state = 42)

params = {'n_neighbors' : [3, 5, 6, 7]}

# have to scale data you ran into an error

# call classifier and get the best hyper parameter for this case - combinations of above params
classify1 = KNeighborsClassifier(weights = 'uniform')
class_hyper_tunedy1 = GridSearchCV(classify1, param_grid = params, cv = 3, return_train_score = True)
class_hyper_tunedy1.fit(X_traind501, y_traind501)
print("training score for best hyperparameter " + str(class_hyper_tunedy1.cv_results_.get('mean_train_score')[0]))
print("test score for best hyperparameter " + str(class_hyper_tunedy1.cv_results_.get('mean_test_score')[0]))
class_hyper_tunedy1.best_estimator_

training score for best hyperparameter 0.9027777777777777
test score for best hyperparameter 0.8611111111111112


#### Trial 1

In [652]:
tuned_class1 = KNeighborsClassifier(n_neighbors = 3, weights = 'uniform')
tuned_class1.fit(X_traind501, y_traind501)
tuned_class1.score(X_testd501, y_testd501)

0.9166666666666666

#### Trial 2

In [653]:
cc_shuffled_6 = cervical_cancer.sample(frac = 1).reset_index(drop = True)
cc_X502 = cc_shuffled_6[cc_shuffled_6.columns[:19]].to_numpy()
cc_Y502 = cc_shuffled_6[cc_shuffled_6.columns[19]].to_numpy()
X_traind502, X_testd502, y_traind502, y_testd502 = train_test_split(cc_X502, cc_Y502, test_size = 0.5, random_state = 42)

tuned_class1.fit(X_traind502, y_traind502)
tuned_class1.score(X_testd502, y_testd502)

0.8055555555555556

#### Trial 3

In [654]:
cc_shuffled_7 = cervical_cancer.sample(frac = 1).reset_index(drop = True)
cc_X503 = cc_shuffled_7[cc_shuffled_7.columns[:19]].to_numpy()
cc_Y503 = cc_shuffled_7[cc_shuffled_7.columns[19]].to_numpy()
X_traind503, X_testd503, y_traind503, y_testd503 = train_test_split(cc_X503, cc_Y503, test_size = 0.5, random_state = 42)

tuned_class1.fit(X_traind503, y_traind503)
tuned_class1.score(X_testd503, y_testd503)

0.8611111111111112

In [711]:
(0.9166666666666666 + 0.8055555555555556 + 0.8611111111111112) / 3

0.8611111111111112

In [761]:
(tuned_class1.score(X_traind503, y_traind503) + 
tuned_class1.score(X_traind502, y_traind502) + 
tuned_class1.score(X_traind501, y_traind501)) / 3

0.9166666666666666

## 20/80 Split

In [655]:
cc_shuffled_8 = cervical_cancer.sample(frac = 1).reset_index(drop = True)
cc_X2080 = cc_shuffled_8[cc_shuffled_8.columns[:19]].to_numpy()
cc_Y2080 = cc_shuffled_8[cc_shuffled_8.columns[19]].to_numpy()
X_traind2080, X_testd2080, y_traind2080, y_testd2080 = train_test_split(cc_X2080, cc_Y2080, test_size = 0.8, random_state = 42)

params = {'n_neighbors' : [3, 5, 6, 7]}

# have to scale data you ran into an error

# call classifier and get the best hyper parameter for this case - combinations of above params
classify2 = KNeighborsClassifier(weights = 'uniform')
class_hyper_tunedy2 = GridSearchCV(classify2, param_grid = params, cv = 3, return_train_score = True)
class_hyper_tunedy2.fit(X_traind2080, y_traind2080)
print("training score for best hyperparameter " + str(class_hyper_tunedy2.cv_results_.get('mean_train_score')[0]))
print("test score for best hyperparameter " + str(class_hyper_tunedy2.cv_results_.get('mean_test_score')[0]))
class_hyper_tunedy2.best_estimator_

training score for best hyperparameter 1.0
test score for best hyperparameter 0.9166666666666666


#### Trial 1

In [657]:
tuned_class2 = KNeighborsClassifier(n_neighbors = 3, weights = 'uniform')
tuned_class2.fit(X_traind2080, y_traind2080)
tuned_class2.score(X_testd2080, y_testd2080)

0.8275862068965517

#### Trial 2

In [658]:
cc_shuffled_9 = cervical_cancer.sample(frac = 1).reset_index(drop = True)
cc_X20801 = cc_shuffled_9[cc_shuffled_9.columns[:19]].to_numpy()
cc_Y20801 = cc_shuffled_9[cc_shuffled_9.columns[19]].to_numpy()
X_traind20801, X_testd20801, y_traind20801, y_testd20801 = train_test_split(cc_X20801, cc_Y20801, test_size = 0.8, random_state = 42)

tuned_class2.fit(X_traind20801, y_traind20801)
tuned_class2.score(X_testd20801, y_testd20801)

0.8448275862068966

#### Trial 3

In [659]:
cc_shuffled_10 = cervical_cancer.sample(frac = 1).reset_index(drop = True)
cc_X20802 = cc_shuffled_10[cc_shuffled_10.columns[:19]].to_numpy()
cc_Y20802 = cc_shuffled_10[cc_shuffled_10.columns[19]].to_numpy()
X_traind20802, X_testd20802, y_traind20802, y_testd20802 = train_test_split(cc_X20802, cc_Y20802, test_size = 0.8, random_state = 42)

tuned_class2.fit(X_traind20802, y_traind20802)
tuned_class2.score(X_testd20802, y_testd20802)

0.6379310344827587

In [712]:
(0.8275862068965517 + 0.8448275862068966 + 0.6379310344827587) / 3

0.7701149425287358

In [762]:
(tuned_class2.score(X_traind20802, y_traind20802) + 
tuned_class2.score(X_traind20801, y_traind20801) + 
tuned_class2.score(X_traind2080, y_traind2080)) / 3

0.880952380952381

# SVM with Cervical Cancer Data 

## 80/20 Split

In [611]:
cc_SVM = cervical_cancer.sample(frac = 1).reset_index(drop = True)
cc_XSVM = cc_SVM[cc_SVM.columns[:19]].to_numpy()
cc_YSVM = cc_SVM[cc_SVM.columns[19]].to_numpy()
X_trainsvm, X_testsvm, y_trainsvm, y_testsvm = train_test_split(cc_XSVM, cc_YSVM, test_size=0.2, random_state=42)

params = {'C': [0.001, 0.01, 0.1, 1], 'kernel':['linear', 'poly', 'sigmoid', "rbf"]}

classifierSVM = SVC(class_weight = 'balanced')
class_hyper_tuneSVM = GridSearchCV(classifierSVM, param_grid = params, cv = 3, return_train_score = True)
class_hyper_tuneSVM.fit(X_trainsvm, y_trainsvm)
print("training score for best hyperparameter " + str(class_hyper_tuneSVM.cv_results_.get('mean_train_score')[0]))
print("test score for best hyperparameter " + str(class_hyper_tuneSVM.cv_results_.get('mean_test_score')[0]))
class_hyper_tuneSVM.best_estimator_

training score for best hyperparameter 0.8508771929824562
test score for best hyperparameter 0.7543859649122807


In [612]:
#### Trial 1

In [613]:
tuned_param = SVC(class_weight = 'balanced', C = 0.1, kernel = 'linear')
tuned_param.fit(X_trainsvm, y_trainsvm)
tuned_param.score(X_testsvm, y_testsvm)

0.9333333333333333

In [614]:
#### Trial 2

In [615]:
cc_SVM1 = cervical_cancer.sample(frac = 1).reset_index(drop = True)
cc_XSVM1 = cc_SVM1[cc_SVM1.columns[:19]].to_numpy()
cc_YSVM1 = cc_SVM1[cc_SVM1.columns[19]].to_numpy()
X_trainsvm1, X_testsvm1, y_trainsvm1, y_testsvm1 = train_test_split(cc_XSVM1, cc_YSVM1, test_size=0.2, random_state=42)

tuned_param.fit(X_trainsvm1, y_trainsvm1)
tuned_param.score(X_testsvm1, y_testsvm1)

0.7333333333333333

In [616]:
#### Trial 3

In [617]:
cc_SVM2 = cervical_cancer.sample(frac = 1).reset_index(drop = True)
cc_XSVM2 = cc_SVM2[cc_SVM2.columns[:19]].to_numpy()
cc_YSVM2 = cc_SVM2[cc_SVM2.columns[19]].to_numpy()
X_trainsvm2, X_testsvm2, y_trainsvm2, y_testsvm2 = train_test_split(cc_XSVM2, cc_YSVM2, test_size=0.2, random_state=42)

tuned_param.fit(X_trainsvm2, y_trainsvm2)
tuned_param.score(X_testsvm2, y_testsvm2)

0.9333333333333333

In [713]:
(0.9333333333333333 + 0.7333333333333333 + 0.9333333333333333) / 3

0.8666666666666666

In [763]:
(tuned_param.score(X_trainsvm2, y_trainsvm2) + 
tuned_param.score(X_trainsvm1, y_trainsvm1) + 
tuned_param.score(X_trainsvm, y_trainsvm)) / 3

0.9883040935672515

## 50/50 Split

In [618]:
cc_SVM50 = cervical_cancer.sample(frac = 1).reset_index(drop = True)
cc_XSVM50 = cc_SVM50[cc_SVM50.columns[:19]].to_numpy()
cc_YSVM50 = cc_SVM50[cc_SVM50.columns[19]].to_numpy()
X_trainsvm50, X_testsvm50, y_trainsvm50, y_testsvm50 = train_test_split(cc_XSVM50, cc_YSVM50, test_size=0.5, random_state=42)

params = {'C': [0.001, 0.01, 0.1, 1], 'kernel':['linear', 'poly', 'sigmoid', "rbf"]}

classifierSVM50 = SVC(class_weight = 'balanced')
class_hyper_tuneSVM50 = GridSearchCV(classifierSVM50, param_grid = params, cv = 3, return_train_score = True)
class_hyper_tuneSVM50.fit(X_trainsvm50, y_trainsvm50)
print("training score for best hyperparameter " + str(class_hyper_tuneSVM50.cv_results_.get('mean_train_score')[0]))
print("test score for best hyperparameter " + str(class_hyper_tuneSVM50.cv_results_.get('mean_test_score')[0]))
class_hyper_tuneSVM50.best_estimator_

training score for best hyperparameter 0.8055555555555555
test score for best hyperparameter 0.75


In [619]:
#### Trial 1

In [620]:
tuned_param1 = SVC(class_weight = 'balanced', C = 0.01, kernel = 'linear')
tuned_param1.fit(X_trainsvm50, y_trainsvm50)
scored = tuned_param1.score(X_testsvm50, y_testsvm50)
scored

0.8888888888888888

In [621]:
#### Trial 2

In [622]:
cc_SVM501 = cervical_cancer.sample(frac = 1).reset_index(drop = True)
cc_XSVM501 = cc_SVM501[cc_SVM501.columns[:19]].to_numpy()
cc_YSVM501 = cc_SVM501[cc_SVM501.columns[19]].to_numpy()
X_trainsvm501, X_testsvm501, y_trainsvm501, y_testsvm501 = train_test_split(cc_XSVM501, cc_YSVM501, test_size=0.5, random_state=42)

tuned_param1.fit(X_trainsvm501, y_trainsvm501)
scored1 = tuned_param1.score(X_testsvm501, y_testsvm501)
scored1

0.8888888888888888

In [623]:
#### Trial 3

In [624]:
cc_SVM502 = cervical_cancer.sample(frac = 1).reset_index(drop = True)
cc_XSVM502 = cc_SVM50[cc_SVM50.columns[:19]].to_numpy()
cc_YSVM502 = cc_SVM50[cc_SVM50.columns[19]].to_numpy()
X_trainsvm502, X_testsvm502, y_trainsvm502, y_testsvm502 = train_test_split(cc_XSVM502, cc_YSVM502, test_size=0.5, random_state=42)

tuned_param1.fit(X_trainsvm502, y_trainsvm502)
scored2 = tuned_param1.score(X_testsvm502, y_testsvm502)
scored2

0.8888888888888888

In [765]:
(scored + scored1 + scored2) / 3

0.8888888888888888

In [767]:
tuned_param1.fit(X_trainsvm502, y_trainsvm502)
print(tuned_param1.score(X_trainsvm502, y_trainsvm502))
tuned_param1.fit(X_trainsvm501, y_trainsvm501)
print(tuned_param1.score(X_trainsvm501, y_trainsvm501))
tuned_param1.fit(X_trainsvm50, y_trainsvm50)
print(tuned_param1.score(X_trainsvm50, y_trainsvm50))

1.0
0.9722222222222222
1.0


In [764]:
(tuned_param1.score(X_trainsvm502, y_trainsvm502) + 
tuned_param1.score(X_trainsvm501, y_trainsvm501) + 
tuned_param1.score(X_trainsvm50, y_trainsvm50)
) / 3

0.9814814814814815

## 20/80 Split

In [626]:
cc_SVM80 = cervical_cancer.sample(frac = 1).reset_index(drop = True)
cc_XSVM80 = cc_SVM80[cc_SVM80.columns[:19]].to_numpy()
cc_YSVM80 = cc_SVM80[cc_SVM80.columns[19]].to_numpy()
X_trainsvm80, X_testsvm80, y_trainsvm80, y_testsvm80 = train_test_split(cc_XSVM80, cc_YSVM80, test_size=0.8, random_state=42)

params = {'C': [0.001, 0.01, 0.1, 1], 'kernel':['linear', 'poly', 'sigmoid', "rbf"]}

classifierSVM80 = SVC(class_weight = 'balanced')
class_hyper_tuneSVM80 = GridSearchCV(classifierSVM80, param_grid = params, cv = 3, return_train_score = True)
class_hyper_tuneSVM80.fit(X_trainsvm80, y_trainsvm80)
print("training score for best hyperparameter " + str(class_hyper_tuneSVM80.cv_results_.get('mean_train_score')[0]))
print("test score for best hyperparameter " + str(class_hyper_tuneSVM80.cv_results_.get('mean_test_score')[0]))
class_hyper_tuneSVM80.best_estimator_

training score for best hyperparameter 0.8925925925925925
test score for best hyperparameter 0.6333333333333333


In [None]:
#### Trial 1

In [627]:
tuned_8020 = SVC(class_weight = 'balanced', C = 0.01, kernel = 'poly')
tuned_8020.fit(X_trainsvm80, y_trainsvm80)
tuned_8020.score(X_testsvm80, y_testsvm80)

0.8103448275862069

In [628]:
#### Trial 2

In [629]:
cc_SVM801 = cervical_cancer.sample(frac = 1).reset_index(drop = True)
cc_XSVM801 = cc_SVM801[cc_SVM801.columns[:19]].to_numpy()
cc_YSVM801 = cc_SVM801[cc_SVM801.columns[19]].to_numpy()
X_trainsvm801, X_testsvm801, y_trainsvm801, y_testsvm801 = train_test_split(cc_XSVM801, cc_YSVM801, test_size=0.8, random_state=42)

tuned_8020.fit(X_trainsvm801, y_trainsvm801)
tuned_8020.score(X_testsvm801, y_testsvm801)

0.6724137931034483

In [None]:
#### Trial 3

In [769]:
cc_SVM802 = cervical_cancer.sample(frac = 1).reset_index(drop = True)
cc_XSVM802 = cc_SVM802[cc_SVM802.columns[:19]].to_numpy()
cc_YSVM802 = cc_SVM802[cc_SVM802.columns[19]].to_numpy()
X_trainsvm802, X_testsvm802, y_trainsvm802, y_testsvm802 = train_test_split(cc_XSVM802, cc_YSVM802, test_size=0.8, random_state=42)

tuned_8020.fit(X_trainsvm802, y_trainsvm802)
tuned_8020.score(X_testsvm802, y_testsvm802)

0.7068965517241379

In [770]:
(0.8103448275862069 + 0.6724137931034483 + 0.6206896551724138) / 3

0.7011494252873564

In [772]:
(tuned_8020.score(X_trainsvm802, y_trainsvm802) + 
tuned_8020.score(X_trainsvm80, y_trainsvm80) + 
tuned_8020.score(X_trainsvm801, y_trainsvm801)) / 3

0.9761904761904763