In [1]:
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import Ridge
from sklearn.feature_selection import SelectKBest
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.decomposition import PCA
from sklearn.feature_selection import chi2
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import QuantileTransformer
from sklearn.compose import TransformedTargetRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

###########################################################################
# TRY SUBMITTING 
# PCA and Variation for SVC

# FOR REGRESSION
# Ridge Elastic Lasso Tuned

In [2]:
# Load training and testing data
X_train = np.loadtxt('X_train.csv', delimiter=',', skiprows=1)
X_test = np.loadtxt('X_test.csv', delimiter=',', skiprows=1)
y_train = np.loadtxt('y_train.csv', delimiter=',', skiprows=1)[:, 1]

In [3]:
# Arrange answer in two columns. First column (with header "Id") is an
# enumeration from 0 to n-1, where n is the number of test points. Second
# column (with header "EpiOrStroma" is the predictions.
def saveFile(y_pred,name):
    test_header = "Id,EpiOrStroma"
    n_points = X_test.shape[0]
    y_pred_pp = np.ones((n_points, 2))
    y_pred_pp[:, 0] = range(n_points)
    y_pred_pp[:, 1] = y_pred
    np.savetxt(name, y_pred_pp, fmt='%d', delimiter=",",
               header=test_header, comments="")

In [36]:
#Feature Selection (all models)
'''
bestfeatures = SelectKBest(score_func=chi2, k=50)
fit = bestfeatures.fit(X_train,y_train)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X_train.columns)
#concat two dataframes for better visualization 
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Specs','Score']  #naming the dataframe columns
print(featureScores.nlargest(50,'Score')) 
'''


#Feature Importance
model = ExtraTreesClassifier()
model.fit(X_train,y_train)
feat_scores = model.feature_importances_
#print(feat_scores) 

feat_indexes = np.argsort(feat_scores)
top_indexes = feat_indexes[:85]
print(feat_indexes)
print(top_indexes)


new_X_train = np.empty([len(X_train),len(top_indexes)])
new_X_test = np.empty([len(X_test),len(top_indexes)])
#new_y_train = np.empty([len(y_train)])

#print(X_train.shape, y_train.shape)
#print(new_X_train.shape, new_y_train.shape)
#print(X_train.shape[0],y_train.shape[0])
#print(new_X_train.shape[0],new_y_train.shape[0])

counter = 0
for index in top_indexes:
    new_X_train[:,counter] = X_train[:,index]
    new_X_test[:,counter] = X_test[:,index]
    #new_y_train[counter] = y_train[index]
    counter += 1



[ 79  52  85 106  17   1  73  16 100  15  71  89  75  55  62  20   6  45
  88  54  50  33   7  65  41 109  72  57  74  83  98 104  34  23  14 105
  44  61 108   8  35  22  37  70  58  40  92  21  27  69  66  87  13  80
  78  95  68  43  90  63   3  64  49  82  39 101   5  53   4  25  77  32
  12  86  84  76 103  26  81  51  94  59  67   2 110  48  38  19  42  29
  56  31  46  30  99  11  91   9  47  10  28  97  96  93 111  36  24 107
  60 102   0  18]
[ 79  52  85 106  17   1  73  16 100  15  71  89  75  55  62  20   6  45
  88  54  50  33   7  65  41 109  72  57  74  83  98 104  34  23  14 105
  44  61 108   8  35  22  37  70  58  40  92  21  27  69  66  87  13  80
  78  95  68  43  90  63   3  64  49  82  39 101   5  53   4  25  77  32
  12  86  84  76 103  26  81  51  94  59  67   2 110]


In [19]:
#Remove Features with Low Variance (all models)
sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
X_train2 = sel.fit_transform(X_train)
X_test2 = sel.fit_transform(X_test)

print(X_train2.shape, X_train.shape)
print(X_test2.shape, X_test.shape)

(200, 82) (200, 112)
(798, 82) (798, 112)


In [25]:
# PCA 
pca = PCA(n_components=90)
X_train3 = pca.fit_transform(X_train)
X_test3 = pca.fit_transform(X_test)

print(X_train3.shape, X_train.shape)
print(X_test3.shape, X_test.shape)

(200, 90) (200, 112)
(798, 90) (798, 112)


In [80]:
# Linear Regression ##############################################################################
reg = LinearRegression().fit(X_train, y_train)
y_pred = reg.predict(X_test)
print("\nLinear Regression:",reg.score(X_train, y_train),"\n(Kaggle: 0.36820)")
saveFile(y_pred,'basicLinReg.csv')

reg2 = LinearRegression().fit(new_X_train, y_train)
y_pred = reg2.predict(new_X_test)
print("Linear Regression (Feat. Selection):",reg2.score(new_X_train, y_train),"\n(Kaggle: 0.)")
# 60 Features: 0.6965845503795097 
# 80 Features: 0.7746426562345887 
# 90 Features: 0.7886728742183744 
# 100 Features: 0.8228482666617984 
# 110 Features: 0.8436246969455491 
reg3 = LinearRegression().fit(X_train2, y_train)
y_pred = reg3.predict(X_test2)
print("Linear Regression (Feat. Selection 2):",reg3.score(X_train2, y_train),"\n(Kaggle: 0.)")
# Variance 0.90: 0.8085832361913172 
# Variance 0.80: 0.7926576460719514 
reg4 = LinearRegression().fit(X_train3, y_train)
y_pred = reg4.predict(X_test3)
print("Linear Regression (PCA):",reg4.score(X_train3, y_train),"\n(Kaggle: 0.)")
# 80 Features: 0.7917244508641572 
# 90 Features: 0.8064496058438628 
# 100 Features: 0.8412367324820228 
# 110 Features: 0.8495380381821245 

#saveFile(y_pred,'basicLinReg.csv')


# Transformed Linear Regression ###################################################################
regressor = LinearRegression()
transformer = QuantileTransformer(output_distribution='normal')
reg = TransformedTargetRegressor(regressor=regressor, transformer=transformer)
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
print("\nTransforemed Linear Regression:",reg.score(X_train, y_train),"\n(Kaggle: 0.62761)")
saveFile(y_pred,'basicLinRegTrans.csv')

reg2 = TransformedTargetRegressor(regressor=regressor, transformer=transformer)
reg2.fit(new_X_train, y_train)
y_pred = reg2.predict(new_X_test)
print("Transforemed Linear Regression (Feat. Selection):",reg2.score(new_X_train, y_train),"\n(Kaggle: 0.)")
# 60 Features: 0.6558356108917907 
# 80 Features: 0.8785302156088672 
# 90 Features: 0.8482536955108613 
# 100 Features: 0.9190201437392449 
# 110 Features: 0.9797550359348112 

reg3 = TransformedTargetRegressor(regressor=regressor, transformer=transformer)
reg3.fit(X_train2, y_train)
y_pred = reg3.predict(X_test2)
print("Transforemed Linear Regression (Feat. Selection 2):",reg3.score(X_train2, y_train),"\n(Kaggle: 0.)")
# Variance 0.90: 0.9190201437392449 
# Variance 0.80: 0.9190201437392449 
reg4 = TransformedTargetRegressor(regressor=regressor, transformer=transformer)
reg4.fit(X_train3, y_train)
y_pred = reg4.predict(X_test3)
print("Transforemed Linear Regression (PCA):",reg4.score(X_train3, y_train),"\n(Kaggle: 0.)")
# 80 Features: 0.9190201437392449 
# 90 Features: 0.9392651078044336 
# 100 Features: 0.9797550359348112 
# 110 Features: 0.9595100718696224

#saveFile(y_pred,'LinRegTransFeatSelect.csv')

# Logistic Regression ############################################################################
clf = LogisticRegression()
clf = clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("\nLogistic Regression:",clf.score(X_train, y_train),"\n(Kaggle: 0.59832)")

clf2 = LogisticRegression()
clf2 = clf2.fit(new_X_train, y_train)
y_pred = clf2.predict(new_X_test)
print("Logistic Regression (Feat. Selection):",clf2.score(new_X_train, y_train),"\n(Kaggle: 0.)")

clf3 = LogisticRegression()
clf3 = clf3.fit(X_train2, y_train)
y_pred = clf3.predict(X_test2)
print("Logistic Regression (Feat. Selection 2):",clf3.score(X_train2, y_train),"\n(Kaggle: 0.)")

clf4 = LogisticRegression()
clf4 = clf4.fit(X_train3, y_train)
y_pred = clf4.predict(X_test3)
print("Logistic Regression (PCA):",clf4.score(X_train3, y_train),"\n(Kaggle: 0.)")
# 80 Features:  0.835
# 90 Features: 0.835 
# 100 Features: 0.825 
# 110 Features: 0.835


# Ridge Regression ################################################################################
reg = linear_model.Ridge()
reg = reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
print("\nRidge Regression:",reg.score(X_train, y_train),"\n(Kaggle: 0.42677)")
saveFile(y_pred,'RidgeReg.csv')

reg2 = linear_model.Ridge()
reg2 = reg2.fit(new_X_train, y_train)
y_pred = reg2.predict(new_X_test)
print("Ridge Regression (Feat. Selection):",reg2.score(new_X_train, y_train),"\n(Kaggle: 0.)")
# 60 Features: 0.678622066513977 
# 80 Features: 0.7392222257479466 
# 90 Features: 0.7576009814592887 
# 100 Features: 0.778140536752562 
# 110 Features: 0.7939592598975257

reg3 = linear_model.Ridge()
reg3 = reg3.fit(X_train2, y_train)
y_pred = reg3.predict(X_test2)
print("Ridge Regression (Feat. Selection) 2:",reg3.score(X_train2, y_train),"\n(Kaggle: 0.)")
# Variance 0.90: 0.796431416130921
# Variance 0.80: 0.7908871733635068 

#saveFile(y_pred,'basicLinReg.csv')

# SVC Classifier Implementation ###################################################################
svc = SVC(gamma='auto')
svc.fit(X_train, y_train)
print("\nSVC:",svc.score(X_train, y_train),"\n(Kaggle: 0.47280)")
y_pred = svc.predict(X_test)
saveFile(y_pred,'basicSVC.csv')

reg2 = SVC(gamma='auto')
reg2.fit(new_X_train, y_train)
y_pred = reg2.predict(new_X_test)
print("SVC (Feat. Selection):",reg2.score(new_X_train, y_train),"\n(Kaggle: 0.)")
# 60 Features: 1.0
# 80 Features: 1.0
# 90 Features: 1.0
# 100 Features: 1.0
# 110 Features: 1.0

reg3 = SVC(gamma='auto')
reg3.fit(X_train2, y_train)
y_pred = reg3.predict(X_test2)
print("SVC (Feat. Selection):",reg3.score(X_train2, y_train),"\n(Kaggle: 0.)")
# Variance 0.90: 1.0
# Variance 0.85: 1.0
# Variance 0.90: 1.0


#saveFile(y_pred,'SVCFeatSelect.csv')



Linear Regression: 0.8228384794054174 
(Kaggle: 0.36820)
Linear Regression (Feat. Selection): 0.8436246969455491 
(Kaggle: 0.)
Linear Regression (Feat. Selection 2): 0.8085832361913172 
(Kaggle: 0.)
Linear Regression (PCA): 0.8064496058438628 
(Kaggle: 0.)

Transforemed Linear Regression: 0.9797550359348112 
(Kaggle: 0.62761)
Transforemed Linear Regression (Feat. Selection): 0.9797550359348112 
(Kaggle: 0.)
Transforemed Linear Regression (Feat. Selection 2): 0.9190201437392449 
(Kaggle: 0.)
Transforemed Linear Regression (PCA): 0.9392651078044336 
(Kaggle: 0.)

Logistic Regression: 0.61 
(Kaggle: 0.59832)
Logistic Regression (Feat. Selection): 0.61 
(Kaggle: 0.)
Logistic Regression (Feat. Selection 2): 0.61 
(Kaggle: 0.)
Logistic Regression (PCA): 0.835 
(Kaggle: 0.)

Ridge Regression: 0.8031060028098278 
(Kaggle: 0.42677)
Ridge Regression (Feat. Selection): 0.7939592598975257 
(Kaggle: 0.)
Ridge Regression (Feat. Selection) 2: 0.796431416130921 
(Kaggle: 0.)

SVC: 1.0 
(Kaggle: 0.472

  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  % (self.n_quantiles, n_samples))
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T


SVC (Feat. Selection): 1.0 
(Kaggle: 0.)
SVC (Feat. Selection): 1.0 
(Kaggle: 0.)


In [5]:
# LOGISTIC REGRESSION

#Default = 0.61

clf = LogisticRegression(C=0.01)#(random_state=0, solver='lbfgs',multi_class='multinomial')
clf = clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(clf.score(X_train, y_train))



0.61


In [37]:
## SPLITTING
X_trainSPLIT, X_testSPLIT, y_trainSPLIT, y_testSPLIT = train_test_split(X_train,y_train,test_size=0.3,random_state=0)

print("# training samples: ", len(X_trainSPLIT))
print("# testing samples: ", len(X_testSPLIT))


## SCALING
sc = StandardScaler()
X_train_split_std = sc.fit_transform(X_trainSPLIT)
X_test_split_std = sc.transform(X_testSPLIT)

X_train_scaled = sc.fit_transform(X_train)
X_test_scaled = sc.fit_transform(X_test)

X_train_scaled_feat = sc.fit_transform(new_X_train)
X_test_scaled_feat = sc.fit_transform(new_X_test)

# training samples:  140
# testing samples:  60


In [32]:
#Testing Data with different parameter values
C_param_range = [0.001,0.01,0.1,1,10,100]
for i in C_param_range:
    
    # Apply logistic regression model to training data
    lr = LogisticRegression(penalty = 'l2', C = i,random_state = 0)
    lr.fit(X_train_split_std,y_trainSPLIT)

    # Predict using model
    y_pred = lr.predict(X_test_split_std)
    
    lr2 = LogisticRegression(penalty = 'l2', C = i,random_state = 0)
    lr2.fit(X_trainSPLIT,y_trainSPLIT)
    
    # Saving accuracy score in table
    print("C value:",i,"      Accuracy:",accuracy_score(y_testSPLIT,y_pred))
    
print("Score:",lr.score(X_trainSPLIT, y_trainSPLIT))
print("Score:",lr2.score(X_train, y_train))
    

C value: 0.001       Accuracy: 0.85
C value: 0.01       Accuracy: 0.9
C value: 0.1       Accuracy: 0.8666666666666667
C value: 1       Accuracy: 0.8833333333333333
C value: 10       Accuracy: 0.8333333333333334
C value: 100       Accuracy: 0.85
Score: 0.55
Score: 0.565




In [38]:
# Logistic Regression - Further Tuning with Grid Search
lr = LogisticRegression()
# Look at parameters used by our regression
print('Parameters currently in use:\n')
print(lr.get_params())

#Creating the Random Grid
pipe = Pipeline([('classifier' , LogisticRegression())])

param_grid = [
    {'classifier' : [LogisticRegression()],
     'classifier__penalty' : ['l1', 'l2'],
    'classifier__C' : np.logspace(-4, 4, 20),
    'classifier__solver' : ['liblinear', 'saga'],
    'classifier__max_iter' : [2,5,10,15,25,50,100,150,200]},
]

#print("\n",param_grid)

Parameters currently in use:

{'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'warn', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'warn', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}


In [39]:
# Fit Model LOGISTIC REG

# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
clf = GridSearchCV(pipe, param_grid = param_grid, cv = 10, verbose=True, n_jobs=-1)
# Fit the random search model
best_clf = clf.fit(X_train_scaled_feat, y_train)


best_clf.best_params_

Fitting 10 folds for each of 720 candidates, totalling 7200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:    3.1s
[Parallel(n_jobs=-1)]: Done 3032 tasks      | elapsed:   11.5s
[Parallel(n_jobs=-1)]: Done 7200 out of 7200 | elapsed:   45.2s finished


{'classifier': LogisticRegression(C=0.03359818286283781, class_weight=None, dual=False,
                    fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                    max_iter=10, multi_class='warn', n_jobs=None, penalty='l1',
                    random_state=None, solver='saga', tol=0.0001, verbose=0,
                    warm_start=False),
 'classifier__C': 0.03359818286283781,
 'classifier__max_iter': 10,
 'classifier__penalty': 'l1',
 'classifier__solver': 'saga'}

In [40]:
clf = LogisticRegression()
clf = clf.fit(X_train, y_train)

print("Basic Score:",clf.score(X_train, y_train))
y_predBasic = clf.predict(X_test)

print("Tuning Score:",best_clf.score(X_train_scaled_feat, y_train))

# 'FeatTuningLR.csv'       - 100 Feats = 0.85  = 0.84937 Kaggle
# 'ScaledTuningLR.csv'                 = 0.85  = 0.84937 Kaggle

# 'FeatScaledTuningLR.csv' - 110 Feats = 0.845 = 0.89121 Kaggle
# 'FeatScaledTuningLR.csv' - 100 Feats = 0.85  = 0.88702 Kaggle
# 'FeatScaledTuningLR.csv' - 90 Feats  = 0.87  = 0.89539 Kaggle
# 'FeatScaledTuningLR.csv' - 85 Feats  = 0.88  = 0.92050 Kaggle
# 'FeatScaledTuningLR.csv' - 80 Feats  = 0.845 = 0.89958 Kaggle
# 'FeatScaledTuningLR.csv' - 70 Feats  = 0.87  = 0.88702 Kaggle



y_pred = best_clf.predict(X_test_scaled_feat)
saveFile(y_pred,'FeatScaledTuningLRTEST.csv')




Basic Score: 0.61
Tuning Score: 0.855


In [51]:
# SVC Classifier Tuning
svc = SVC().fit(X_train,y_train)
y_pred = svc.predict(X_test)
print("Basic SVC",svc.score(X_train, y_train))

svc2 = SVC().fit(X_train_scaled,y_train)
y_pred = svc2.predict(X_test_scaled)
print("Scaled SVC",svc2.score(X_train_scaled, y_train))
#saveFile(y_pred,'scaledSVC.csv')

svc3 = SVC().fit(X_train_scaled_feat,y_train)
y_pred = svc3.predict(X_test_scaled_feat)
print("Scaled SVC",svc3.score(X_train_scaled_feat, y_train))
#saveFile(y_pred,'scaledSVCTopFeats.csv')
# 110 Features: 0.93
# 100 Features: 0.94
# 90 Features: 0.925
# 80 Features: 0.925

# Look at parameters used by our regression
print('Parameters currently in use:\n')
print(svc.get_params())

#Creating the Random Grid
#pipe = Pipeline([('classifier' , SVC())])

param_grid = [
    {'kernel' : ['linear','rbf'],
     'gamma' :[1,0.1,0.001,0.0001],
     'C' : [0.0001, 0.001,0.01, 0.1, 1, 10, 100, 1000]},
]

#print("\n",param_grid)

Basic SVC 1.0
Scaled SVC 0.935
Scaled SVC 0.93
Parameters currently in use:

{'C': 1.0, 'cache_size': 200, 'class_weight': None, 'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 'auto_deprecated', 'kernel': 'rbf', 'max_iter': -1, 'probability': False, 'random_state': None, 'shrinking': True, 'tol': 0.001, 'verbose': False}




In [52]:
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
#svc = GridSearchCV(pipe, param_grid = param_grid, cv = 5, verbose=True, n_jobs=-1)
svc = GridSearchCV(SVC(),param_grid,refit = True)
# Fit the random search model
best_svc = svc.fit(X_train_scaled_feat, y_train)


best_svc.best_params_



{'C': 0.1, 'gamma': 1, 'kernel': 'linear'}

In [53]:
svc = SVC().fit(X_train, y_train)

print("Basic Score:",svc.score(X_train, y_train))
y_predBasic = svc.predict(X_test)
#saveFile(y_predBasic,'basicSVC.csv')

print("Tuning Score:",best_svc.score(X_train_scaled_feat, y_train))
y_pred = best_svc.predict(X_test_scaled_feat)
saveFile(y_pred,'ScaledTunedFeatSVC.csv')

# Tuned + Scaled = 0.865

# Tuned + Scaled + Feat Selected 115 =  Kaggle
# Tuned + Scaled + Feat Selected 110 = 0.88702 Kaggle
# Tuned + Scaled + Feat Selected 105 = 0.86192 Kaggle
# Tuned + Scaled + Feat Selected 100 = 0.86192 Kaggle
# Tuned + Scaled + Feat Selected 90 = 0.85355 Kaggle
# Tuned + Scaled + Feat Selected 80 = 0.84518 Kaggle




Basic Score: 1.0
Tuning Score: 0.935


