# Read the Data

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the feature table. The file has no header row; its first column is
# used as the index and the table is then transposed, so the values of that
# first column become df's column names and every other original column
# becomes one row of df.
df = (
    pd.read_csv('xtrain.txt', header=None, sep='\t')
    .set_index(0)
    .transpose()
)

# Read the y values (single column, no header).
# NOTE(review): assumes the rows of ytrain.txt are in the same order as the
# sample columns of xtrain.txt -- confirm against the data source.
df_y = pd.read_csv('ytrain.txt', header=None)
assert len(df_y) == len(df), 'xtrain.txt and ytrain.txt disagree on the number of samples'
# Reuse df's row labels rather than a hard-coded 1..184 range, so the targets
# line up with the features for any number of samples.
df_y.index = df.index

# Concatenate both: assign column 0 as a Series (not the whole one-column
# frame) so the values are aligned on the shared index.
df['y'] = df_y[0]

In [3]:
# Store the target column as a pandas categorical, then print a summary of
# the frame (index range, column count, dtypes, memory usage).
df = df.assign(y=df['y'].astype('category'))
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 184 entries, 1 to 184
Columns: 4655 entries, x10006_at to y
dtypes: category(1), float64(4654)
memory usage: 6.5 MB


In [4]:
# Persist the combined features + target table as CSV; the row index is
# left out of the file.
df.to_csv(path_or_buf='breast_genes.csv', index=False)

# Applying Different Models

In [5]:
# Feature matrix: every column except the target, as a NumPy array.
x = df.drop(labels=['y'], axis=1).values
# Target vector: the values of the 'y' column as a NumPy array.
y = np.array(df['y'].values)

# Hold out 33% of the samples for testing; the fixed seed keeps the split
# identical across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.33,
    random_state=42,
)

In [6]:
# Random forest, leaving the number of trees at the library default.
# fit() returns the fitted estimator, so construction and training chain.
rf = RandomForestClassifier(random_state=42).fit(x_train, y_train)
# Mean accuracy on the held-out split (displayed as the cell output).
rf.score(x_test, y_test)

0.68852459016393441

In [7]:
# Random forest with 100 trees, same seed as the default-size forest.
rf = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(x_train, y_train)
# Mean accuracy on the held-out split.
rf.score(x_test, y_test)

0.62295081967213117

In [8]:
# Support vector classifier with every hyperparameter left at its default.
clf = SVC(random_state=42).fit(x_train, y_train)
# Print the held-out accuracy, then show how many training points ended up
# as support vectors (one row per support vector).
print(clf.score(x_test, y_test))
clf.support_vectors_.shape[0]

0.622950819672


120

In [9]:
# Logistic regression baseline: default hyperparameters, fixed seed.
lr = LogisticRegression(random_state=42).fit(x_train, y_train)
# Mean accuracy on the held-out split.
lr.score(x_test, y_test)

0.73770491803278693

In [10]:
# Logistic regression with C = 0.01.
# C is the inverse of the regularization strength and must be a positive
# float; as with support vector machines, a smaller C means stronger
# regularization.
lr = LogisticRegression(C=0.01, random_state=42).fit(x_train, y_train)
# Mean accuracy on the held-out split.
lr.score(x_test, y_test)

0.75409836065573765

In [11]:
# k-nearest-neighbours classifier with the library's default settings.
knn = KNeighborsClassifier().fit(x_train, y_train)
# Mean accuracy on the held-out split.
knn.score(x_test, y_test)

0.52459016393442626

In [12]:
# SVM with a polynomial kernel and C = 10.
# Author's note from earlier runs: (sigmoid 62) and (rbf 65), (poly 70)
# -- presumably the test accuracy in % for each kernel; verify before citing.
clf = SVC(
    C=10,
    kernel='poly',
    random_state=42,
).fit(x_train, y_train)
# Print the held-out accuracy, then show the number of support vectors.
print(clf.score(x_test, y_test))
clf.support_vectors_.shape[0]

0.704918032787


112

# Hyperparameter Tuning for SVM

In [13]:
from sklearn.model_selection import GridSearchCV

def svc_param_selection(X, y, nfolds):
    """Pick SVC hyperparameters by exhaustive cross-validated grid search.

    Every combination of ``C``, ``kernel`` and ``degree`` listed below is
    fitted and scored with ``nfolds``-fold cross-validation.

    Parameters
    ----------
    X : array-like
        Feature matrix passed straight to ``GridSearchCV.fit``.
    y : array-like
        Target values passed straight to ``GridSearchCV.fit``.
    nfolds : int
        Forwarded as the ``cv`` argument of ``GridSearchCV``.

    Returns
    -------
    dict
        ``best_params_`` of the fitted search, with the keys ``'C'``,
        ``'kernel'`` and ``'degree'``.
    """
    # gamma is deliberately not searched (a candidate list
    # [0.001, 0.01, 0.1, 1] was tried and disabled by the author).
    param_grid = {
        'C': [0.001, 0.01, 0.1, 1, 10, 100],
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'degree': [2, 3, 4, 5],
    }
    # Use the SVC class imported at the top of the notebook; the `svm`
    # module itself is never imported, so `svm.SVC()` raised NameError.
    grid_search = GridSearchCV(SVC(), param_grid, cv=nfolds)
    grid_search.fit(X, y)
    return grid_search.best_params_

In [14]:
# takes 4 min to run
# best_params = svc_param_selection(x, y, 10)
# best_params # were {'C': 10, 'degree': 2, 'kernel': 'rbf'}

In [15]:
# SVM with an RBF kernel and C = 10 (the C / kernel pair recorded as the
# grid-search result in the previous cell), evaluated on the held-out split.
clf = SVC(
    C=10,
    kernel='rbf',
    random_state=42,
).fit(x_train, y_train)
# Print the held-out accuracy, then show the number of support vectors.
print(clf.score(x_test, y_test))
clf.support_vectors_.shape[0]

0.655737704918


121

In [16]:
# SVM with a degree-7 polynomial kernel and C = 10.
clf = SVC(
    C=10,
    kernel='poly',
    degree=7,
    random_state=42,
).fit(x_train, y_train)
# Print the held-out accuracy, then show the number of support vectors.
print(clf.score(x_test, y_test))
clf.support_vectors_.shape[0]

0.754098360656


115