In [1]:
import preprocessing
from ngramGenerator import *
from featureIdentifier import *
from mlModel import *

# models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier


# Validation libraries
from sklearn import metrics
from sklearn.metrics import accuracy_score, mean_squared_error, precision_recall_curve
from sklearn.model_selection import cross_val_score


#Bagging
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier

#Naive bayes
from sklearn.naive_bayes import GaussianNB 

from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from pandas import DataFrame


In [2]:

def main():
    articles, train_labels_set,  test_labels_set = [], set(), set()

    ''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
    ''' Pre-processing                                                   '''
    ''' (1) Load data and split data into train/test sets                '''
    ''' (2) Hashset the labels and remove labels on the data             '''
    ''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
    # add all files' data into articles
    preprocessing.read_data(articles)

    # split data to train and test sets
    train_set, test_set = preprocessing.data_split(articles)
    train_label_count, test_label_count = 0, 0

    # take off label and add names to labels
    for i in range(len(train_set)):
        train_set[i], train_label_count, train_labels_set =\
            preprocessing.label_extraction_takeoff(paragraphs=train_set[i], count=train_label_count, labels=train_labels_set)

    for i in range(len(test_set)):
        test_set[i], test_label_count, test_labels_set =\
            preprocessing.label_extraction_takeoff(paragraphs=test_set[i], count=test_label_count, labels=test_labels_set)

    ''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
    ''' N-gram generation                                                '''
    ''' (1) Generate all n-gram (with first feature whether contains 's) '''
    ''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
    

    train_ngram_result, test_ngram_result, train_single_gram2 = [], [], []
    train_single_gram, test_single_gram, test_single_gram2 = [], [], []        # save single ones in order for later use

    for i in range(len(train_set)):
        ngrams, singles, singles2 = generate_ngrams(filename=train_set[i][0], content=train_set[i][1], n=5)
        train_ngram_result.append(ngrams)
        train_single_gram.append(singles)
        train_single_gram2.append(singles2)

    for i in range(len(test_set)):
        ngrams, singles, singles2 = generate_ngrams(filename=test_set[i][0], content=test_set[i][1], n=5)
        test_ngram_result.append(ngrams)
        test_single_gram.append(singles)
        test_single_gram2.append(singles2)

    ''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
    ''' Take out n-gram with only lowercase (only for training data)     '''
    ''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
    for index in range(len(train_ngram_result)):
        train_ngram_result[index] = eliminate_all_lower(train_ngram_result[index])

    ''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
    ''' Create a test ngram result without n-gram has only lowercase     '''
    ''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
    test_ngram_result_without_all_lower = test_ngram_result[:]
    for index in range(len(test_ngram_result_without_all_lower)):
        test_ngram_result_without_all_lower[index] = eliminate_all_lower(test_ngram_result_without_all_lower[index])

    ''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
    ''' Feature creation                                                 '''
    ''' (1) 's (added during generation of ngram)                        '''
    ''' (2) country                                                      '''
    ''' (3) conjunction                                                  '''
    ''' (4) all capitalised                                              '''
    ''' (5) prefix before n-gram                                         '''
    ''' (6) verbs for humans                                             '''
    ''' (7) prefix in n-gram                                             '''
    ''' (8) after preposition                                            '''
    ''' (9) contains organization                                        '''
    ''' (10) comma before n-gram                                         '''
    ''' (11) start of a sentence                                         '''
    ''' (12) has no more than 1 word without capitalised starting letter '''
    ''' (13) contains month                                              '''
    ''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
    country_set, conjunction_set, prefix_set, verb_set , preposition_set, organ_set, month_set= \
        load_country_file(), load_conjunction_file(), load_prefix_library(), load_verb_file(), load_preposition_file(), load_organ_library(),load_month_file()
    for ngram_set_index in range(len(train_ngram_result)):
        for ngram_index in range(len(train_ngram_result[ngram_set_index])):
            ngram = train_ngram_result[ngram_set_index][ngram_index]

            train_ngram_result[ngram_set_index][ngram_index] = ngram +\
                (contains_country(ngram=ngram, country_set=country_set),
                 contains_conjunction(ngram=ngram, conjunctions_set=conjunction_set),
                 is_all_upper(ngram=ngram),
                 has_prefix_before_ngram(ngram=ngram, single_grams=train_single_gram[ngram_set_index], prefix_set=prefix_set),
                 has_human_verb(ngram=ngram, single_grams=train_single_gram[ngram_set_index], verb_set=verb_set),
                 contains_prefix(ngram=ngram, prefix_set=prefix_set),
                 afterpreposition(ngram=ngram, single_grams=train_single_gram[ngram_set_index], preposition_set=preposition_set),
                 contains_organization(ngram=ngram, organ_set=organ_set),
                 has_comma_before_ngram(ngram=ngram, single_grams2=train_single_gram2[ngram_set_index]),
                 has_fullstop_before_ngram(ngram=ngram, single_grams2=train_single_gram2[ngram_set_index]),
                 has_duplicate(ngram=ngram),
                 count_occurrences(ngram=ngram, single_grams=train_single_gram[ngram_set_index]),
                 no_more_than_one_lower(ngram=ngram),
                 contains_month(ngram=ngram, month_set=month_set),)

    for ngram_set_index in range(len(test_ngram_result_without_all_lower)):
        for ngram_index in range(len(test_ngram_result_without_all_lower[ngram_set_index])):
            ngram = test_ngram_result_without_all_lower[ngram_set_index][ngram_index]

            test_ngram_result_without_all_lower[ngram_set_index][ngram_index] = ngram +\
                (contains_country(ngram=ngram, country_set=country_set),
                 contains_conjunction(ngram=ngram, conjunctions_set=conjunction_set),
                 is_all_upper(ngram=ngram),
                 has_prefix_before_ngram(ngram=ngram, single_grams=test_single_gram[ngram_set_index], prefix_set=prefix_set),
                 has_human_verb(ngram=ngram, single_grams=test_single_gram[ngram_set_index], verb_set=verb_set),
                 contains_prefix(ngram=ngram, prefix_set=prefix_set),
                 afterpreposition(ngram=ngram, single_grams=test_single_gram[ngram_set_index], preposition_set=preposition_set),
                 contains_organization(ngram=ngram, organ_set=organ_set),
                 has_comma_before_ngram(ngram=ngram, single_grams2=test_single_gram2[ngram_set_index]),
                 has_fullstop_before_ngram(ngram=ngram, single_grams2=test_single_gram2[ngram_set_index]),
                 has_duplicate(ngram=ngram),
                 count_occurrences(ngram=ngram, single_grams=test_single_gram[ngram_set_index]),
                 no_more_than_one_lower(ngram=ngram),
                 contains_month(ngram=ngram, month_set=month_set),)

                
                                
    return train_ngram_result,test_ngram_result_without_all_lower,train_labels_set,test_labels_set

'''
    new_train = [ngram[4:] for ngram in train_ngram_result[0]]
    label = [1 if ngram[0] in labels_set else 0 for ngram in train_ngram_result[0]]
    tree = build_decision_tree(new_train, label)
    print sum([1 if a == b else 0 for a, b in zip(tree.predict(new_train), label)])
    print len(label)
'''


'\n    new_train = [ngram[4:] for ngram in train_ngram_result[0]]\n    label = [1 if ngram[0] in labels_set else 0 for ngram in train_ngram_result[0]]\n    tree = build_decision_tree(new_train, label)\n    print sum([1 if a == b else 0 for a, b in zip(tree.predict(new_train), label)])\n    print len(label)\n'

In [3]:
if __name__ == "__main__":
    train_ngram_result, test_ngram_result_without_all_lower, train_labels_set, test_labels_set= main()


In [4]:

df=pd.DataFrame(train_ngram_result[0],columns=['elements','file','loc_start','loc_end','end_\'s','country','conjunction','capitalised','prefix','verb','prefix_in','preposition','organ','comma','fullstop','duplicate','count','no_more','month'])
true_label=[]
for i in range(len(df['elements'])):
    if list(df['elements'])[i] in train_labels_set:
        true_label.append(1)
    else:
        true_label.append(0)
df['true_label']=true_label
true_list = df.loc[(df['true_label']== 1),:].index.tolist()
new_list=[]
for i in true_list:
    if len(df['elements'][i].split()) > 1:
        new_list.append(i)   
            
for i in new_list:
    for j in range(len(df['elements'])):
        if (df['file'][j] != 'NULL') & (df['file'][i] != 'NULL'):
            if (df['file'][j] == df['file'][i]) & (df['loc_start'][j] >= df['loc_start'][i]) & (df['loc_end'][j] <= df['loc_end'][i]) & ((df['loc_end'][j]-df['loc_start'][j])!=(df['loc_end'][i]-df['loc_start'][i])):
                df.iloc[j,:] = 'NULL'
                    
df = df.loc[(df['file'] != 'NULL'),:]

In [5]:
for i in range(1,len(train_ngram_result)):
    df_temp = pd.DataFrame(train_ngram_result[i],columns=['elements','file','loc_start','loc_end','end_\'s','country','conjunction','capitalised','prefix','verb','prefix_in','preposition','organ','comma','fullstop','duplicate','count','no_more','month'])
    true_label=[]
    for i in range(len(df_temp['elements'])):
        if list(df_temp['elements'])[i] in train_labels_set:
            true_label.append(1)
        else:
            true_label.append(0)
    df_temp['true_label']=true_label
    true_list = df_temp.loc[(df_temp['true_label']== 1),:].index.tolist()
    new_list=[]
    for i in true_list:
        if len(df_temp['elements'][i].split()) > 1:
            new_list.append(i)   
            
    for i in new_list:
        for j in range(len(df_temp['elements'])):
            if (df_temp['file'][j] != 'NULL') & (df_temp['file'][i] != 'NULL'):
                if (df_temp['file'][j] == df_temp['file'][i]) & (df_temp['loc_start'][j] >= df_temp['loc_start'][i]) & (df_temp['loc_end'][j] <= df_temp['loc_end'][i]) & ((df_temp['loc_end'][j]-df_temp['loc_start'][j])!=(df_temp['loc_end'][i]-df_temp['loc_start'][i])):
                    df_temp.iloc[j,:] = 'NULL'
                    
    df_append = df_temp.loc[(df_temp['file'] != 'NULL'),:]
    df = df.append(df_append, ignore_index=True)

## random forest

In [6]:
length=[]
for i in range(len(df)):
    length.append(int(df['loc_end'][i])-int(df['loc_start'][i])+1)
    pattern1 = re.compile('[0-9]+')
    match1 = pattern1.findall(df['elements'][i])
    if match1:
        df.iloc[i,:]='NULL'
    pattern2 = re.compile('-')
    match2 = pattern2.findall(df['elements'][i])
    if match2:
        df.iloc[i,:]='NULL'
    
df['length']=length
df=df.loc[df['elements']!='NULL',:]
'''
df=df[['elements','file','loc_start','loc_end','length','end_\'s','country','conjunction','capitalised','prefix','verb','prefix_in','preposition','organ','comma','fullstop','duplicate','count','no_more','month','true_label']]
'''
#df=df[['elements','file','loc_start','loc_end','length','end_\'s','country','conjunction','capitalised','prefix','verb','prefix_in','preposition','organ','fullstop','duplicate','count','no_more','month','true_label']]

In [21]:
df=df[['elements','file','loc_start','loc_end','length','end_\'s','country','conjunction','capitalised','prefix','verb','prefix_in','preposition','organ','fullstop','duplicate','count','no_more','month','true_label']]

In [22]:
X = df.iloc[:,4:19].values.astype('int')
y = df.iloc[:,19].values.astype('int')

In [23]:
methodDict = {}
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

# just use original imbalanced data

In [9]:
pipe = make_pipeline(StandardScaler(),
                     RandomForestClassifier())

param_grid = {'n_estimators': list(range(1, 50))}

gs = GridSearchCV(estimator=RandomForestClassifier(), 
                  param_grid=param_grid, 
                  iid=False,
                  n_jobs=-1,
                  refit=True,
                  scoring='accuracy',
                  cv=10)

gs.fit(X, y)

print('Best Accuracy: %.2f%%' % (gs.best_score_*100))
print('Best Params: %s' % gs.best_params_)
print('Test Accuracy: %.2f%%' % (gs.best_estimator_.score(X, y)*100))

KeyboardInterrupt: 

In [11]:
RandomForest=RandomForestClassifier(n_estimators=15,n_jobs=-1,criterion='gini',
                     random_state=42)
RandomForest.fit(X, y)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=15, n_jobs=-1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [65]:
test_df=pd.read_csv("test.csv")
test_df=test_df.drop(["comma"],axis=1)
X_test = test_df.iloc[:,4:19].values.astype('int')
y_test = test_df.iloc[:,19].values.astype('int')

In [13]:
y_predict=RandomForest.predict(X_test)

In [10]:
def confusion_matrix(y_true, y_predicted):

    unique_labels = np.unique(np.concatenate((y_true, y_predicted)))
    num_labels=len(unique_labels)
    matrix = np.zeros(num_labels*num_labels).reshape(num_labels, num_labels).astype(int)
    for i, j in zip(y_true, y_predicted):
        matrix[i,j]+=1
    
    return matrix

In [15]:
result_matrix = confusion_matrix(y_test, y_predict)
result_matrix

array([[30418,   183],
       [  491,   270]])

# using SMOTE to deal with imbalance

In [16]:
from imblearn.over_sampling import SMOTE
smo = SMOTE(random_state=42)
X_smo, y_smo = smo.fit_sample(X, y)

In [17]:
from collections import Counter
print(Counter(y_smo))

Counter({1: 62450, 0: 62450})


In [18]:
pipe = make_pipeline(StandardScaler(),
                     RandomForestClassifier())

param_grid = {'n_estimators': list(range(1, 50))}

gs = GridSearchCV(estimator=RandomForestClassifier(), 
                  param_grid=param_grid, 
                  iid=False,
                  n_jobs=-1,
                  refit=True,
                  scoring='accuracy',
                  cv=10)

gs.fit(X_smo, y_smo)

print('Best Accuracy: %.2f%%' % (gs.best_score_*100))
print('Best Params: %s' % gs.best_params_)
print('Test Accuracy: %.2f%%' % (gs.best_estimator_.score(X_smo, y_smo)*100))

Best Accuracy: 95.04%
Best Params: {'n_estimators': 5}
Test Accuracy: 95.05%


In [None]:

RandomForest=RandomForestClassifier(n_estimators=2,n_jobs=-1,criterion='gini',
                     random_state=42)
RandomForest.fit(X_smo, y_smo)


In [55]:
y_predict=RandomForest.predict(X_test)

In [21]:
result_matrix = confusion_matrix(y_test, y_predict)
result_matrix

array([[27597,  3004],
       [   23,   738]])

In [56]:
test_df['predicted']=list(y_predict)
true_list = test_df.loc[(test_df['predicted']== 1),:].index.tolist()
new_list=[]
for i in true_list:
    if len(test_df['elements'][i].split()) > 1:
        new_list.append(i)   
    if test_df['preposition'][i]==0:
        test_df['predicted'][i]=0

for i in new_list:
    for j in range(len(test_df['elements'])):
        if (test_df['file'][j] == test_df['file'][i]) & (test_df['loc_start'][j] >= test_df['loc_start'][i]) & (test_df['loc_end'][j] <= test_df['loc_end'][i]) & ((test_df['loc_end'][j]-test_df['loc_start'][j])!=(test_df['loc_end'][i]-test_df['loc_start'][i])):
            test_df['true_label'][j]=0
            test_df['predicted'][j]=0 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


KeyboardInterrupt: 

In [None]:
result_matrix = confusion_matrix(test_df['true_label'], test_df['predicted'])
result_matrix

## fix imbalance with RUS


In [62]:
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter
rus = RandomUnderSampler(ratio={0: 6000}, random_state=0)
X_rus, y_rus = rus.fit_sample(X, y)
print(Counter(y_rus))

Counter({0: 6000, 1: 1278})


In [63]:
pipe = make_pipeline(StandardScaler(),
                     RandomForestClassifier())

param_grid = {'n_estimators': list(range(1, 50))}

gs = GridSearchCV(estimator=RandomForestClassifier(), 
                  param_grid=param_grid, 
                  iid=False,
                  n_jobs=-1,
                  refit=True,
                  scoring='accuracy',
                  cv=10)

gs.fit(X_rus, y_rus)

print('Best Accuracy: %.2f%%' % (gs.best_score_*100))
print('Best Params: %s' % gs.best_params_)
print('Test Accuracy: %.2f%%' % (gs.best_estimator_.score(X_rus, y_rus)*100))

Best Accuracy: 94.08%
Best Params: {'n_estimators': 4}
Test Accuracy: 94.82%


In [64]:
RandomForest=RandomForestClassifier(n_estimators=4,n_jobs=-1,criterion='gini',
                     random_state=42)
RandomForest.fit(X_rus, y_rus)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=4, n_jobs=-1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [66]:
y_predict=RandomForest.predict(X_test)

In [67]:
result_matrix = confusion_matrix(y_test, y_predict)
result_matrix

array([[25003,   999],
       [  133,   558]])

In [68]:
test_df['predicted']=list(y_predict)
true_list = test_df.loc[(test_df['predicted']== 1),:].index.tolist()
new_list=[]
for i in true_list:
    if len(test_df['elements'][i].split()) > 1:
        new_list.append(i)  

wrong_list = test_df.loc[(test_df['predicted']!=0)|(test_df['true_label']!=0),:].index.tolist()
for i in new_list:
    for j in wrong_list:
        if (test_df['file'][j] == test_df['file'][i]) & (test_df['loc_start'][j] >= test_df['loc_start'][i]) & (test_df['loc_end'][j] <= test_df['loc_end'][i]) & ((test_df['loc_end'][j]-test_df['loc_start'][j])!=(test_df['loc_end'][i]-test_df['loc_start'][i])):
            test_df['true_label'][j]=0
            test_df['predicted'][j]=0 
            print(j)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


82
89
89
188
169
173
188
190
190
360
363
363
315
319
363
365
365
454
458
564
718
823
904
1115
1119
1119
1120
1140
1141
1167
1310
1314
1338
1338
1476
1544
1496
1500
1544
1546
1546
1594
1650
1654
1892
1904
1904
2055
2171
2219
2220
2285
2288
2315
2319
2340
2340
2394
2481
2481
2851
2985
3288
3320
3337
3338
3339
3338
3339
3413
3414
3360
3363
3413
3414
3415
3414
3415
3418
3419
3384
3418
3419
3420
3420
3424
3458
3459
3479
3482
3483
3484
3588
3501
3588
3589
3588
3589
3592
3593
3697
3697
3698
3702
3704
3707
3726
3796
3798
3798
3800
3803
3804
3757
3761
3803
3804
3805
3804
3805
3808
3808
3809
4035
4200
4204
4331
4335
4376
4347
4351
4376
4376
4626
4629
4761
4765
5041
5045
5081
5226
5303
5305
5307
5422
5339
5422
5422
5599
5600
5603
5603
5675
5635
5637
5675
5675
5679
5757
5691
5695
5757
5759
5759
5762
5762
5765
5954
5957
5964
5964
6064
6065
6068
6300
6287
6291
6665
6666
6670
6747
6871
6856
6871
6878
6882
6963
6963
6964
6939
6943
6963
6964
6965
6964
6965
7028
7058
7184
7301
7303
7303
7271
7275
7303
7

In [19]:
from pandas import DataFrame
DataFrame.to_csv(test_df.loc[(test_df['predicted']==1)&(test_df['true_label']==0),:],"debug.csv",index=False)

In [69]:
result_matrix = confusion_matrix(test_df['true_label'], test_df['predicted'])
result_matrix

array([[25633,   564],
       [  162,   334]])

In [70]:
test_df.loc[(test_df['predicted']==1)&(test_df['true_label']==0),:]

Unnamed: 0,elements,file,loc_start,loc_end,length,end_'s,country,conjunction,capitalised,prefix,...,prefix_in,preposition,organ,fullstop,duplicate,count,no_more,month,true_label,predicted
152,States Declaration,276,92,93,2,0,0,0,1,0,...,0,0,0,0,0,1,1,0,0,1
170,David Hume Rousseau,276,124,126,3,0,0,0,1,0,...,0,0,0,0,0,1,1,0,0,1
303,Hilton Hotels,48,0,1,2,0,0,0,1,0,...,0,0,0,0,0,1,1,0,0,1
307,Hotels CEO,48,1,2,2,0,0,0,1,0,...,1,0,0,0,0,1,1,0,0,1
311,CEO Christopher,48,2,3,2,0,0,0,1,0,...,1,0,0,0,0,1,1,0,0,1
352,York City,48,19,20,2,0,0,0,1,0,...,0,0,0,0,0,1,1,0,0,1
370,Signia Hilton,48,37,38,2,0,0,0,1,0,...,0,0,0,0,0,2,1,0,0,1
401,Signia Hilton,48,78,79,2,0,0,0,1,0,...,0,0,0,0,0,2,1,0,0,1
419,Restoration Hardware,48,90,91,2,0,0,0,1,0,...,0,0,0,0,0,1,1,0,0,1
463,Hilton,48,107,107,1,0,0,0,1,0,...,0,0,0,0,0,4,1,0,0,1


## Logistic Regression

## NB

In [None]:
nb_classifier = build_nb_classifier(X, y)
y_predict=nb_classifier.predict(X_test)
result_matrix = confusion_matrix(y_test, y_predict)
result_matrix

## SVM

In [None]:
from sklearn import svm

In [None]:
model = svm.SVC(kernel='linear', gamma=1) 

In [None]:
from sklearn import svm

pipe = make_pipeline(StandardScaler(),
                     svm.SVC())

param_grid = {'C': list(np.arange(0.1,1,0.1))}

gs = GridSearchCV(estimator= svm.SVC() , 
                  param_grid=param_grid, 
                  iid=False,
                  n_jobs=-1,
                  refit=True,
                  scoring='accuracy',
                  cv=5)

gs.fit(X_rus, y_rus)

print('Best Accuracy: %.2f%%' % (gs.best_score_*100))
print('Best Params: %s' % gs.best_params_)
print('Test Accuracy: %.2f%%' % (gs.best_estimator_.score(X_rus, y_rus)*100))

In [None]:
svm=svm.SVC(C=0.9)
svm.fit(X_rus, y_rus)
y_predict=svm.predict(X_test)

In [None]:
result_matrix = confusion_matrix(y_test, y_predict)
result_matrix

In [None]:
y_test

In [None]:
y_predict