In [1]:
# import packages
import pandas as pd
import numpy as np

import swifter
import great_expectations as ge

In [2]:
# import datasets
# Single place to change where the capstone CSV files live.
DATA_DIR = '/Users/eesoonhang/Desktop/capstone_data'


def _is_data_column(col_name):
    """Tell ``pd.read_csv`` which columns to keep.

    The previous filter compared every column name to the literal string
    '*Unnamed*', which no column is ever called, so nothing was filtered out.
    pandas labels header-less columns 'Unnamed: 0', 'Unnamed: 1', ...; those
    are the ones skipped here.

    Parameters
    ----------
    col_name : column label handed over by ``pd.read_csv``.

    Returns
    -------
    bool
        False for 'Unnamed...' columns, True for everything else.
    """
    return not str(col_name).startswith('Unnamed')


def _read_capstone_csv(file_name):
    """Read ``file_name`` from ``DATA_DIR`` without auto-named index columns."""
    return pd.read_csv(DATA_DIR + '/' + file_name, usecols=_is_data_column)


train_X = _read_capstone_csv('train_in_encodedToNacSeq.csv')
train_y = _read_capstone_csv('train_out.csv')
test_X = _read_capstone_csv('test_in_encodedToNacSeq.csv')
test_y = _read_capstone_csv('test_out.csv')

# split to respective input datasets
# The split is purely positional: the first 4 columns, the next 16, and the
# remainder. This only lines up when no stray index column precedes the
# features, which is why 'Unnamed' columns are dropped on load.
# NOTE(review): presumably these are the nac/dac/tac feature groups in that
# order - confirm against the encoding step that produced the CSV files.
train_X_nac = train_X.iloc[:, 0:4]
train_X_dac = train_X.iloc[:, 4:20]
train_X_tac = train_X.iloc[:, 20:]
test_X_nac = test_X.iloc[:, 0:4]
test_X_dac = test_X.iloc[:, 4:20]
test_X_tac = test_X.iloc[:, 20:]

# compressed output y
# transform target >=1 to 1
def normalize_output(x):
    """Cap the ``target`` value at 1, leaving values of 1 or less untouched.

    Parameters
    ----------
    x : object exposing a ``target`` attribute, e.g. one DataFrame row
        (a Series) or a whole DataFrame with a ``target`` column.

    Returns
    -------
    numpy.ndarray
        Float array (0-d for a single row) holding 1.0 wherever
        ``target > 1`` and the original target value otherwise.
    """
    # np.where evaluates both branches eagerly, so the former
    # ``x.target / x.target`` computed 0/0 (NaN plus a RuntimeWarning on
    # numpy scalars) for every zero target. The constant 1.0 yields the same
    # float result without that division.
    return np.where(x.target > 1, 1.0, x.target)
    
# compressed y-variable
# Collapse all output columns into one row-wise total named 'target' ...
compressed_train_y = pd.DataFrame({'target': train_y.sum(axis=1)})
compressed_test_y = pd.DataFrame({'target': test_y.sum(axis=1)})

# ... then pass every row through normalize_output so totals above 1 become 1.
compressed_train_y['target'] = compressed_train_y.swifter.apply(normalize_output, axis=1)
compressed_test_y['target'] = compressed_test_y.swifter.apply(normalize_output, axis=1)

### train with scaler (StandardScaler is applied inside train_Model)

In [6]:
from sklearn.svm import SVC
from sklearn.feature_selection import RFE
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import math

def train_Model(k='rbf', c=1, g='scale'):
    """Fit an SVC on standardized features and report hold-out metrics.

    Reads the notebook-level ``train_X`` and ``compressed_train_y``, splits
    them 50/50 (stratified on the target, fixed ``random_state=1234``),
    standardizes the features and trains a support vector classifier on the
    training half. Progress and metrics are printed along the way.

    Parameters
    ----------
    k : kernel passed to ``SVC(kernel=...)``.
    c : regularization strength passed to ``SVC(C=...)``.
    g : kernel coefficient passed to ``SVC(gamma=...)``.

    Returns
    -------
    tuple
        ``(rmse, score, svc, scaler)``: validation RMSE, validation accuracy
        as reported by ``svc.score``, the fitted ``SVC`` and the fitted
        ``StandardScaler``.
    """
    # Split BEFORE scaling. Fitting the scaler on the full training set would
    # leak the validation rows' mean/variance into the features the model is
    # later evaluated on. The same seed and stratification select the same
    # rows as before; only the scaling statistics change.
    X_train_raw, X_valid_raw, Y_train, Y_valid = train_test_split(
        train_X,
        compressed_train_y,
        test_size=0.5,
        random_state=1234,
        stratify=compressed_train_y,
    )

    print('scaling data')
    # The scaler is fitted on the training half only and then applied as-is
    # to the validation half. Results are wrapped back into DataFrames so
    # the SVC is fitted with feature names.
    scaler = StandardScaler()
    X_train = pd.DataFrame(
        scaler.fit_transform(X_train_raw),
        columns=X_train_raw.columns,
        index=X_train_raw.index,
    )
    X_valid = pd.DataFrame(
        scaler.transform(X_valid_raw),
        columns=X_valid_raw.columns,
        index=X_valid_raw.index,
    )

    # No feature selection happens in this function, so the progress message
    # no longer claims an RFE step.
    print('instantiating model...')
    svc = SVC(kernel=k, C=c, gamma=g)

    print('fit model...')
    # scikit-learn expects a 1-d label vector, hence the ravel().
    svc.fit(X_train, Y_train.values.ravel())
    y_predict = svc.predict(X_valid)

    print('evaluate model...')
    y_valid = Y_valid.values.ravel()

    rmse = math.sqrt(mean_squared_error(y_valid, y_predict))
    print('rmse: %4.2f' %rmse)

    score = svc.score(X_valid, y_valid)
    print('score: %4.2f' %score)

    return rmse, score, svc, scaler

In [7]:
# train model with train_Model's default arguments (k='rbf', c=1, g='scale')
# `model` holds the tuple returned by train_Model: (rmse, score, svc, scaler)
# %time reports how long the training call took
%time model = train_Model()

scaling data
selecting feature via RFE..
fit model...
evaluate model...
rmse: 0.50
score: 0.75


NameError: name 'n_rfe_features' is not defined

In [None]:
# export the model
import pickle

# train_Model returns (rmse, score, svc, scaler): index 2 is the fitted SVC
# and index 3 the fitted scaler. The `with` blocks make sure each file is
# flushed and closed once the dump finishes, instead of leaving the handle
# open for the rest of the kernel session.
with open('/Users/eesoonhang/Desktop/capstone_data/SVC_ovo_withScaler.pkl', 'wb') as svc_file:
    pickle.dump(model[2], svc_file)
with open('/Users/eesoonhang/Desktop/capstone_data/SVC_scaler.pkl', 'wb') as scaler_file:
    pickle.dump(model[3], scaler_file)

In [None]:
# evaluate accuracy on the test set
# train_Model returns the 4-tuple (rmse, score, svc, scaler), so the
# classifier sits at index 2 and the scaler at index 3. The previous indices
# (3 and 4) picked the scaler as the classifier and then ran off the end of
# the tuple with an IndexError.
clf = model[2]
scaler = model[3]
# Re-attach the column names: the SVC was fitted on a DataFrame, and handing
# it a bare array would trigger scikit-learn's feature-name warning.
test_X_scaled = pd.DataFrame(scaler.transform(test_X), columns=test_X.columns)
clf.score(test_X_scaled, compressed_test_y.values.ravel())

In [None]:
from sklearn.metrics import confusion_matrix as cfm
import matplotlib.pyplot as plt
import seaborn as sns
# visualize confusion matrix on the test set
# train_Model returns (rmse, score, svc, scaler). Both objects are pulled out
# here so this cell only depends on `model`, `test_X` and `compressed_test_y`.
# The name `rfe_test_X` used previously is not defined anywhere in the notebook.
fitted_svc, fitted_scaler = model[2], model[3]
test_features = pd.DataFrame(fitted_scaler.transform(test_X), columns=test_X.columns)
cnf_matrix = cfm(compressed_test_y.values.ravel(), fitted_svc.predict(test_features))

fig, ax = plt.subplots()

# create heatmap
# Draw on the explicit axes. seaborn writes its own tick labels (the
# DataFrame's 0/1 row and column labels), so manual tick setup before this
# call would simply be overwritten.
sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, cmap="YlGnBu", fmt='g', ax=ax)
ax.xaxis.set_label_position("top")
# The title names what this notebook actually trains; no RFE feature
# selection is performed here.
ax.set_title('Confusion matrix for SVC with StandardScaler', y=1.1)
ax.set_ylabel('Actual label')
ax.set_xlabel('Predicted label')
# Lay out last so the raised title and the axis labels are accounted for.
fig.tight_layout()