# Read the Data

In [1]:
from sklearn.feature_selection import mutual_info_classif, f_classif, SelectFromModel
import warnings
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LogisticRegression, Lasso
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from sklearn.manifold import Isomap
import matplotlib.pyplot as plt
from sklearn import preprocessing
from collections import Counter
from sklearn.datasets import make_classification
# doctest: +NORMALIZE_WHITESPACE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE

%matplotlib inline

warnings.filterwarnings('ignore')

In [2]:
df_main = pd.read_csv('xtrain.txt', header = None, sep = '\t', index_col = 0)
df_test = pd.read_csv('xtest.txt', header = None, sep = '\t', index_col = 0)
# df_main.set_index(0, inplace = True)
df_main = df_main.transpose()
df_test = df_test.transpose()
df_main.head()

# Read the y values
df_y = pd.read_csv('ytrain.txt', header = None)
df_y.index = 1+np.arange(184)

df_y.iloc[:,0].value_counts()

-1    118
 1     66
Name: 0, dtype: int64

In [None]:
# Save the data
# df_main.to_csv('breast_genes.csv', index = False)

# Handle class imbalance

In [3]:
print('Original dataset shape %s' % Counter(df_y.values.reshape(-1,)))
rus = SMOTE(random_state=42)
x_res, y_res = rus.fit_resample(df_main.values, df_y.values)

print('Original dataset shape %s' % Counter(y_res))

Original dataset shape Counter({-1: 118, 1: 66})
Original dataset shape Counter({-1: 118, 1: 118})


# Feature Selection with Lasso

In [4]:
lass = Lasso(
        max_iter=3000,
        alpha= 0.00281,
        random_state= 42 
        )
lass.fit(df_main, df_y)
lass.coef_.min(), lass.coef_.max()

# selected features
good_features = np.abs(np.array(lass.coef_)) > 0


# df_lasso = df_main.iloc[:, good_features]
df_lasso = x_res[:, good_features]
# df_y.index = 1+np.arange(184)

# df_all = df_lasso.copy()
# df_all['y'] = df_y[0:]
# df_all.to_csv('breast_genes_lasso_all.csv', index = False)

In [5]:
df_lasso.shape

(236, 184)

In [6]:
# x = df_lasso.iloc[:,0:184]
# y = np.array(df_y.values).reshape(-1,)

x = df_lasso[:, 0:184]
y = y_res
scaler = preprocessing.StandardScaler()
x = scaler.fit_transform(x)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42, shuffle=True)
x.shape

(236, 184)

# Applying Different Models

In [8]:
def train(model, xtrain, ytrain, xtest, ytest):
    
    model.fit(xtrain, ytrain)
    pred = model.predict(xtest)
    
    kfold = KFold(n_splits=10)#, random_state=42)
    result = cross_val_score(model, xtrain, ytrain, cv=kfold, scoring='balanced_accuracy')
    print(f'Accuracy =  {model.score(xtest, ytest):.4} - ROC = {roc_auc_score(ytest, pred):.4} - CV mean = {result.mean():.4}')
    

In [None]:
# Random Forest with default number of estimators
rf = RandomForestClassifier(random_state=42)
train(rf, x_train, y_train, x_test, y_test)

In [None]:
lgb = LGBMClassifier()
train(lgb, x_train, y_train, x_test, y_test)

In [None]:
# RandomForest with 100 estimators
rf = RandomForestClassifier(n_estimators= 100, random_state=42)
train(rf, x_train, y_train, x_test, y_test)

In [None]:
# SVM with default parameters
clf = SVC(random_state=42)
train(clf, x_train, y_train, x_test, y_test)

In [None]:
# SVM with default parameters
clf = LinearSVC(random_state=42)
train(clf, x_train, y_train, x_test, y_test)

In [None]:
# Logistic Regression with default parameters
lr = LogisticRegression(random_state=42)
train(lr, x_train, y_train, x_test, y_test)

In [None]:
# Logistic Regression with dual form 
lr = LogisticRegression(random_state=42, dual =True) # best one till now
train(lr, x_train, y_train, x_test, y_test)

In [None]:
# Logistic Regression with C = 0.01
# C : Inverse of regularization strength; must be a positive float. 
# Like in support vector machines, smaller values specify stronger regularization.

lr = LogisticRegression(random_state=42, C = 100)
train(lr, x_train, y_train, x_test, y_test)

In [None]:
# KNN with default params
knn = KNeighborsClassifier()
train(knn, x_train, y_train, x_test, y_test)

In [None]:
# SVM with polynomial kernel
clf = SVC(random_state=42 , C = 10, kernel = 'poly') # (sigmoid 62) and (rbf 65), (poly 70)
train(clf, x_train, y_train, x_test, y_test)

# Hyperparams Tuning for SVM

## SVM

In [None]:
def svc_param_selection(X, y, nfolds):
    Cs = [0.001, 0.01, 0.1, 1, 10, 100]
#     gammas = [0.001, 0.01, 0.1, 1]
    kernels = ['linear', 'poly', 'rbf', 'sigmoid']
    degrees = [2, 3, 4, 5]
    param_grid = {'C': Cs, 'kernel' : kernels, 'degree': degrees}
    grid_search = GridSearchCV(SVC(), param_grid, cv=nfolds)
    grid_search.fit(X, y)
    grid_search.best_params_
    return grid_search.best_params_

In [None]:
# takes 4 min to run
best_params = svc_param_selection(x_train, y_train, 10)
best_params # were {'C': 0.01, 'degree': 2, 'kernel': 'linear'}

In [None]:
clf = SVC(random_state=42 , C= 0.1, kernel= 'linear')
train(clf, x_train, y_train, x_test, y_test)

In [None]:
clf = SVC(random_state=42 , C = 10, kernel = 'poly', degree = 5) 
train(clf, x_train, y_train, x_test, y_test)

## Logistic Regression

In [None]:
def lr_param_selection(X, y, nfolds):
    Cs = [0.001, 0.01, 0.1, 1, 10, 100]
    dual = [False]
#     solvers = [ 'liblinear'] 
    solvers = ['lbfgs', 'liblinear', 'sag', 'saga']
    reg = ['l1', 'l2']
    param_grid = {'C': Cs ,
                  'dual': dual,
                  'solver': solvers
                 }

    grid_search = GridSearchCV(LogisticRegression(), param_grid, cv=nfolds)
    grid_search.fit(X, y)
    grid_search.best_params_
    return grid_search.best_params_


In [None]:
best_lr_params = lr_param_selection(x_train, y_train, 10)
best_lr_params # were {'C': 0.001, 'dual': False, 'solver': 'newton-cg'}

In [None]:
lr = LogisticRegression(random_state=42, C = 100, dual= False)
# lr = LogisticRegression(random_state=42, C = 100, dual= False, solver='lbfgs')
train(lr, x_train, y_train, x_test, y_test)

# Genetic Algorithm for choosing the best pipeline

In [None]:
from tpot import TPOTClassifier

In [None]:
# auto_ml = TPOTClassifier(generations = 100, population_size = 100, scoring = 'balanced_accuracy', n_jobs = 4, random_state = 42, warm_start = True, verbosity = 2)
# auto_ml.fit(x_train, y_train)

In [None]:
# auto_ml.export('tpot_exported_pipeline.py')

In [9]:
# Best pipeline: 
lsvm = LinearSVC(C=1) #, dual=False, loss="squared_hinge", penalty="l2", tol=1e-05)
lsvm.fit(x_train, y_train)
train(lsvm, x_train, y_train.reshape(-1,), x_test, y_test.reshape(-1,))

# y_final = lsvm.predict(df_test.iloc[:, good_features])
# y_final
# with open('ytest.txt', 'w') as out_file:
#     for line in y_final:
#         out_file.write(str(line)+'\r')

Accuracy =  0.9872 - ROC = 0.9861 - CV mean = 0.9648


In [10]:
y_real_valued = df_test.iloc[:, good_features] @ lsvm.coef_.T
y_real_valued = y_real_valued.values.reshape(-1)
with open('ytest_real_value_smote.txt', 'w') as out_file:
    for line in y_real_valued:
        out_file.write(str(line)+'\n')

# Visualization

 ## PCA

In [None]:
pca = PCA(n_components=3)
pca_result = pca.fit_transform(x)
print(pca_result.shape)
df = df_main.copy()
df['pca-one'] = pca_result[:,0]
df['pca-two'] = pca_result[:,1] 
df['pca-three'] = pca_result[:,2]

plt.scatter(df['pca-one'], df['pca-two'], c = df_y.values.reshape(-1,))

In [None]:
from mpl_toolkits.mplot3d import axes3d

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
ax = fig.gca(projection='3d')
ax.scatter(df['pca-one'], df['pca-two'], df['pca-three'], alpha=0.8, c=df_y.values.reshape(-1,), edgecolors='none', s=30)

## t-SNE

In [None]:
n_sne = 7000

tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300)
tsne_results = tsne.fit_transform(x)

plt.scatter(tsne_results[:,0], tsne_results[:,1], c = df_y.values.reshape(-1,))

## ISOMAP

In [None]:
n_neighbors = 10
n_components = 2
y_iso = Isomap(n_neighbors, n_components).fit_transform(x)

ax = fig.add_subplot(111)
plt.scatter(y_iso[:, 0], y_iso[:, 1], c=y.reshape(-1,))#, cmap=plt.cm.Spectral)

plt.axis('tight')
plt.show()

# Feature selection trials

## F-Test

F-Test is useful in feature selection as we get to know the significance of each feature in improving the model.

In [None]:
F, pval = f_classif(x, y)
idx = np.argsort(-F)
idx

In [None]:
df_new = df_lasso.iloc[:,idx[0:50]]
x_red = df_new.values
y_red = np.array(df_y.values)

x_train_red, x_test_red, y_train_red, y_test_red = train_test_split(x_red, y_red, test_size=0.33, random_state=42)

In [None]:
lr = LogisticRegression(random_state=42, dual =True) # best one till now
train(lr, x_train_red, y_train_red, x_test_red, y_test_red)

xgb = XGBClassifier(learning_rate=0.1, n_estimators=100)
train(xgb, x_train_red, y_train_red, x_test_red, y_test_red)

lsvm = LinearSVC(C=1)
lsvm.fit(x_train, y_train)
train(lsvm, x_train, y_train.reshape(-1,), x_test, y_test.reshape(-1,))

In [None]:
# best_params = svc_param_selection(x_train_red, y_train_red, 10)
# best_params # were {'C': 0.001, 'degree': 5, 'kernel': 'poly'}
clf = SVC(C = 0.001, kernel = 'poly', degree = 5)
train(clf, x_train_red, y_train_red, x_test_red, y_test_red)

In [None]:
num_feats = np.linspace(3, 50, 30)

for f in num_feats:
    df_new = df_main.iloc[:,idx[0:int(f)]]
    df_new['y'] = df_y[0:]

    x_red = df_new.drop(['y'], axis = 1).values
    y_red = np.array(df_new.y.values)

    x_train_red, x_test_red, y_train_red, y_test_red = train_test_split(x, y, test_size=0.33, random_state=42)

    lr = LogisticRegression(random_state=42, dual =True) # best one till now
    train(lr, x_train_red, y_train_red, x_test_red, y_test_red)

## Mututal Information

In [None]:
mi = mutual_info_classif(x, y)

In [None]:
mi.shape

In [None]:
idx_mi = np.argsort(-mi)
idx_mi

In [None]:
df_new = df_main.iloc[:,idx_mi[0:100]]
df_new['y'] = df_y[0:]

x_red = df_new.drop(['y'], axis = 1).values
y_red = np.array(df_new.y.values)

x_train_red, x_test_red, y_train_red, y_test_red = train_test_split(x_red, y_red, test_size=0.33, random_state=42)

lr = LogisticRegression(random_state=42, dual =True) # best one till now
train(lr, x_train_red, y_train_red, x_test_red, y_test_red)
kfold = KFold(n_splits=10, random_state=42)
result = cross_val_score(lr, x_train_red, y_train_red, cv=kfold, scoring='balanced_accuracy')
result.mean()

## Chi

In [None]:
from sklearn.feature_selection import chi2

In [None]:
chi_scores, _ = chi2(x, y)

idx_chi = np.argsort(-chi_scores)
df_new = df_main.iloc[:,idx_chi[0:100]]
df_new['y'] = df_y[0:]

x_red = df_new.drop(['y'], axis = 1).values
y_red = np.array(df_new.y.values)

x_train_red, x_test_red, y_train_red, y_test_red = train_test_split(x_red, y_red, test_size=0.33, random_state=42)

lr = LogisticRegression(random_state=42, dual =True) # best one till now
train(lr, x_train_red, y_train_red, x_test_red, y_test_red)

kfold = KFold(n_splits=10, random_state=42)
result = cross_val_score(lr, x_train_red, y_train_red, cv=kfold, scoring='balanced_accuracy')
result.mean()

## Variance Threshold

In [None]:
from sklearn.feature_selection import VarianceThreshold

In [None]:
sel = VarianceThreshold()
selected_feats = sel.fit_transform(x)

In [None]:
selected_feats.shape

## Other

In [None]:
# clf = Pipeline([
#   ('feature_selection', SelectFromModel(LassoCV())),
#   ('classification', RandomForestClassifier())
# ])

ls = LassoCV()
ls.fit(x_train, y_train)
train(ls, x_train, y_train, x_test, y_test)

# train(lr, x_train_red, y_train_red, x_test_red, y_test_red)

In [None]:
# from sklearn.feature_selection import RFECV
# model = LogisticRegression() 
# rfe = RFECV(model, 3)

# fit = rfe.fit(x_train, y_train)

In [None]:
# fit.min_features_to_select

In [None]:
from sklearn.linear_model import SGDClassifier
tuned_parameters = {'alpha': [10 ** a for a in range(-6, -2)]}
sgd = SGDClassifier(loss='log', penalty='elasticnet',l1_ratio=0.15, n_iter=5, shuffle=True, verbose=False, n_jobs=10, average=False, class_weight='balanced')
clf = GridSearchCV(SGDClassifier('log')
                  , tuned_parameters, cv=10, scoring='balanced_accuracy')

#now clf is the best classifier found given the search space
train(clf, x_train_red, y_train_red, x_test_red, y_test_red)
#you can find the best alpha here
print(clf.best_params_)   


In [None]:
# sgd.fit(x_train_red, y_train_red)
# sgd.coef_.shape
# np.argsort(-sgd.coef_)

# Feature Engineering Trials

In [None]:
import featuretools as ft

In [None]:
df = df.reset_index()

In [None]:
es = ft.EntitySet(id='main_df')

In [None]:
es = es.entity_from_dataframe(entity_id= 'main_df', dataframe=df)#, make_index=False)