In [2]:
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler


In [1]:
# Number of parallel workers; passed as n_jobs to GridSearchCV in run_experiment.
JOBS = 1
def run_experiment(X, y, pipe, classifier):
    """Grid-search ``pipe`` on ``(X, y)`` and return the fitted search object.

    The hyper-parameter grid is looked up with ``param_selection(classifier)``
    and every candidate is scored with 5-fold stratified cross-validation.
    Parallelism is controlled by the module-level ``JOBS`` constant, and
    ``verbose=2`` makes the search log its progress while fitting.

    Returns the fitted ``GridSearchCV`` instance.
    """
    search = GridSearchCV(
        pipe,
        param_selection(classifier),
        cv=StratifiedKFold(n_splits=5),
        n_jobs=JOBS,
        verbose=2,
    )
    search.fit(X, y)
    return search

def build_pipeline(classifier):
    """Return the scaler + estimator pipeline registered for ``classifier``.

    ``classifier`` is the short model key ('lr', 'dt', 'rf', 'knn', 'svm' or
    'mnb'). It is prefixed with 'pipe_' for the lookup, so an unknown key
    raises ``KeyError``.
    """
    # (short key, pipeline step name, estimator) for every supported model.
    candidates = [
        ('lr', 'LR', LogisticRegression(random_state=42)),
        ('dt', 'DT', DecisionTreeClassifier(random_state=42)),
        ('rf', 'RF', RandomForestClassifier(random_state=42)),
        ('knn', 'KNN', KNeighborsClassifier()),
        ('svm', 'SVM', SVC(random_state=42)),
        ('mnb', 'MNB', MultinomialNB()),
    ]

    # Each pipeline gets its own scaler step named 'scl'; with_mean=False
    # disables centering, so only the scaling part is applied.
    registry = {
        'pipe_' + key: Pipeline([('scl', StandardScaler(with_mean=False)),
                                 (step, estimator)])
        for key, step, estimator in candidates
    }
    return registry['pipe_' + classifier]

def param_selection(classifier):
    """Return the GridSearchCV parameter grid for one classifier.

    Parameters
    ----------
    classifier : str
        Short model key: 'lr', 'dt', 'rf', 'knn', 'svm' or 'mnb'. Parameter
        names carry the matching pipeline step prefix ('LR__', 'DT__', ...).

    Returns
    -------
    dict or list of dict
        A grid suitable for ``GridSearchCV(param_grid=...)``. Every model
        returns a single dict except 'lr', which returns a list of two
        dicts so that only valid penalty/solver pairs are searched.

    Raises
    ------
    KeyError
        If ``classifier`` is not one of the known keys.
    """
    param_range = range(1, 10)
    param_range_fl = [1.0, 0.1, 0.01]
    # The dict-valued weights assume the class labels are 0 and 1.
    class_weights = [{0: 0.6, 1: 0.4}, 'balanced', {1: 0.6, 0: 0.4}]

    params_list = {
        # 'lbfgs' does not support the 'l1' penalty. Crossing both penalties
        # with both solvers in one dict makes every l1+lbfgs candidate fail
        # to fit (FitFailedWarning, NaN score), so the grid is split into
        # sub-grids that only contain supported combinations.
        'params_lr': [
            {'LR__penalty': ['l1', 'l2'],
             'LR__C': param_range_fl,
             'LR__solver': ['liblinear'],
             'LR__class_weight': class_weights},
            {'LR__penalty': ['l2'],
             'LR__C': param_range_fl,
             'LR__solver': ['lbfgs'],
             'LR__class_weight': class_weights},
        ],

        'params_dt': {'DT__criterion': ['gini', 'entropy'],
                      'DT__min_samples_leaf': param_range,
                      'DT__max_depth': param_range,
                      # min_samples_split must be at least 2, hence the slice.
                      'DT__min_samples_split': param_range[1:]},

        'params_rf': {'RF__min_samples_leaf': param_range,
                      'RF__max_depth': param_range,
                      'RF__min_samples_split': param_range[1:]},

        'params_knn': {'KNN__n_neighbors': param_range,
                       'KNN__weights': ['uniform', 'distance'],
                       'KNN__metric': ['euclidean', 'manhattan']},

        'params_svm': {'SVM__kernel': ['linear', 'sigmoid', 'poly', 'rbf'],
                       #'SVM__C': param_range,
                       'SVM__gamma': ['scale', 'auto'],
                       #'SVM__coef0' : param_range_fl,
                       'SVM__class_weight': class_weights},

        'params_mnb': {'MNB__force_alpha': [True]},
    }
    return params_list["params_" + classifier]


In [23]:
# More experiments saved here just in case.
# Two exploratory runs: (1) KMeans inertia for K = 1..6, plotted against K,
# and (2) KNN test accuracy and confusion matrix for k = 1..4.
# NOTE(review): x_train, y_train, x_test, y_test, plt, accuracy_score and
# confusion_matrix are not defined in this cell; they must already exist in
# the kernel from earlier cells.

from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier

# One inertia value (within-cluster sum of squared distances) per K tried
ssd = []

# Values of K to try
k_values = range(1, 7)

for k in k_values:
    # Fixed random_state so repeated runs use the same seed
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(x_train)
    ssd.append(kmeans.inertia_)

    
# Plot inertia against K to look for the point where adding clusters stops helping
plt.plot(k_values, ssd, marker='o')
plt.title('Scree Plot')
plt.xlabel('Number of Clusters (K)')
plt.ylabel('Sum of Squared Distances (SSD)')
plt.show()

# Neighbour counts to evaluate for the KNN classifier
k_range = range(1, 5)

# Test-set accuracy for each k, in the same order as k_range.
# NOTE(review): scores is filled but never displayed or used in this cell.
scores = []

for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(x_train, y_train)
    y_pred = knn.predict(x_test)
    scores.append(accuracy_score(y_test, y_pred))
    # One confusion matrix is printed per k
    print(confusion_matrix(y_test,y_pred))
