In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
import datetime
import sys
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import HashingVectorizer
from imblearn import over_sampling
from imblearn.over_sampling import SVMSMOTE

def overSample(X,y):
    """Over-sample ``(X, y)`` with imblearn's ``SVMSMOTE``.

    The sampler is built with a fixed ``random_state`` (47) and ``n_jobs=-1``,
    then ``fit_resample`` is applied to the inputs.

    Args:
        X: Feature data accepted by ``SVMSMOTE.fit_resample``.
        y: Target labels aligned with ``X``.

    Returns:
        The ``(X_resampled, y_resampled)`` pair produced by ``fit_resample``.
    """
    resampled_X, resampled_y = SVMSMOTE(random_state=47, n_jobs=-1).fit_resample(X, y)
    return resampled_X, resampled_y

def run_model(vectorizer,pipe,param_grid,
              train_path='PreTraining.csv',test_path='PreTesting.csv',
              results_path='results.txt',cv=10,n_jobs=32):
    """Grid-search ``pipe``, score the refit model on the test set, and log a report.

    The training and test CSVs must each contain a ``processed`` column (the
    model input) and a ``target`` column (the label). A ``GridSearchCV`` scored
    on accuracy is fitted on the training data, the refit best estimator
    predicts the test data, and a tab-separated text report is appended to
    ``results_path``.

    Args:
        vectorizer: Label written at the top of the report (e.g. the name of
            the vectorizer used in ``pipe``).
        pipe: Estimator / pipeline handed to ``GridSearchCV``.
        param_grid: Parameter grid handed to ``GridSearchCV``.
        train_path: CSV file with the training data.
        test_path: CSV file with the held-out test data.
        results_path: Text file the report is appended to.
        cv: Cross-validation setting forwarded to ``GridSearchCV``.
        n_jobs: Parallel job count forwarded to ``GridSearchCV``.

    Returns:
        The report text that was appended to ``results_path``.
    """
    training_df = pd.read_csv(train_path)
    testing_df = pd.read_csv(test_path)

    # TODO (carried over from the original notes): balance the classes by target.
    X_train = training_df['processed']
    y_train = training_df['target']
    X_test = testing_df['processed']
    y_test = testing_df['target']

    grid = GridSearchCV(
        pipe,
        param_grid,
        cv=cv,
        scoring=['accuracy'],
        refit='accuracy',
        return_train_score=True,
        n_jobs=n_jobs,
        )

    # The timed span covers the grid search, the refit and the test-set prediction.
    start_time = datetime.datetime.now()
    grid.fit(X_train,y_train)
    y_pred = grid.predict(X_test)
    test_score = accuracy_score(y_test, y_pred)
    finish_time = datetime.datetime.now()
    seconds = (finish_time - start_time).total_seconds()

    report = [
        "\n"+str(vectorizer),
        "\nstart_time\t\t\t\t"+start_time.strftime("%m/%d/%Y, %H:%M:%S"),
        "\nseconds\t\t\t\t"+"{:.2f}".format(seconds),
        "\nMinutes\t\t\t\t"+"{:.2f}".format(seconds/60),
    ]
    report.extend(
        "\nparam: "+str(k)+"\t\t\t\t"+str(v)
        for k,v in grid.best_params_.items()
    )
    report.extend([
        "\ngrid.best_score_\t\t\t\t"+"{:.4f}".format(grid.best_score_),
        "\ngrid.refit_time_\t\t\t\t"+"{:.4f}".format(grid.refit_time_),
        "\ntest_score\t\t\t\t"+"{:.4f}".format(test_score),
        "\n",
    ])
    content = ''.join(report)

    # Context manager guarantees the handle is closed even if the write fails.
    with open(results_path, "a") as results_file:  # append mode
        results_file.write(content)
    return content
###### function end

# Bare string expression: as the last expression in the cell, the notebook
# echoes its repr as the cell output.
# NOTE(review): presumably a checklist of results still to be reported
# (timing, loss figure, confusion matrix, accuracy) — confirm with the author.
'''
time
figure : loss
confusion matrix
accuracy
'''

'\ntime\nfigure : loss\nconfusion matrix\naccuracy\n'

In [None]:
##### CountVectorizer
# Bag-of-words counts feeding an SVC.
cv_pipe= Pipeline([
    ('vect', CountVectorizer()),
    ('svc', SVC()),
    ])
cv_param_grid= {
    'vect__stop_words':['english',None],
    # A float max_df is a proportion of documents, an int is an absolute document
    # count. The upper bound must be 1.0: the integer 1 would drop every term
    # that appears in more than one document.
    'vect__max_df':[0.25,0.35,0.5,0.75,1.0],
    # NOTE(review): a float min_df is a proportion of *documents*, not words, so
    # 0.0000088 only differs from min_df=1 on corpora above ~113k documents.
    'vect__min_df':[1,0.0000088], #float(1/112500) = 0.0000088; 112500 ~ words count
    'vect__ngram_range':[(1,1),(1,2)],
    'svc__kernel':['rbf','linear', 'poly', 'sigmoid'],
    'svc__C':[0.5,1.00,1.5],
    # degree is only used by the 'poly' kernel; the other kernels ignore it.
    'svc__degree':[1,3],
}
#run_model('CountVectorizer',cv_pipe,cv_param_grid)

In [None]:
### HashingVectorizer
# Hashed token features feeding an SVC.
hash_pipe = Pipeline(steps=[
    ('vect', HashingVectorizer()),
    ('svc', SVC()),
])

# Search space for the hashing pipeline; keys follow the <step>__<param> convention.
hash_param_grid = dict(
    vect__stop_words=['english', None],
    vect__ngram_range=[(1, 1), (1, 2)],
    vect__norm=['l1', 'l2'],
    svc__kernel=['rbf', 'poly'],
    #svc__C=[0.5, 1.00, 1.5],
    svc__degree=[1, 3],
)
#run_model('HashVectorizer',hash_pipe,hash_param_grid)

In [None]:
##### TFIDF Vectorizer
# TF-IDF features feeding an SVC.
tfidf_pipe= Pipeline([
    ('vect', TfidfVectorizer()),
    ('svc', SVC()),
    ])

# Exhaustive search space, kept for reference under its own name so it is no
# longer silently overwritten by the reduced grid below.
tfidf_param_grid_full= {
    #'vect__stop_words':['english',None],
    # A float max_df is a proportion of documents, an int is an absolute document
    # count. The upper bound must be 1.0: the integer 1 would drop every term
    # that appears in more than one document.
    'vect__max_df':[0.0625,0.125,0.25,0.35,0.5,0.75,1.0],
    'vect__min_df':[1,0.00002,0.0000088], #float(1/112500) = 0.0000088; 112500 ~ words count
    'vect__ngram_range':[(1,1),(1,2)],
    'vect__norm':['l1','l2'],
    'vect__use_idf':[True,False],
    'vect__smooth_idf':[True,False],
    'vect__sublinear_tf':[True,False],
    'svc__kernel':['rbf','linear', 'poly', 'sigmoid'],
    'svc__C':[0.5,1.00,1.5],
    'svc__degree':[1,3],
}

# Reduced search space actually used for the run below.
tfidf_param_grid= {
    #'vect__stop_words':['english',None],
    'vect__max_df':[0.125],
    #'vect__min_df':[1,0.0000088], #float(1/112500) = 0.0000088; 112500 ~ words count
    #'vect__ngram_range':[(1,1),],
    #'vect__norm':['l2'],
    'vect__use_idf':[False],
    #'vect__smooth_idf':[True,False],
    #'vect__sublinear_tf':[True,False],
    'svc__kernel':['rbf','linear', 'poly'],
    'svc__C':[0.5,1.00],
    # degree is only used by the 'poly' kernel; the other kernels ignore it.
    'svc__degree':[1,3],
}

print( run_model('TFIDF Vectorizer',tfidf_pipe,tfidf_param_grid) )


TFIDF Vectorizer
start_time				10/22/2021, 01:30:29
seconds				679.81
Minutes				11.33
param: svc__C				0.5
param: svc__degree				1
param: svc__kernel				poly
param: vect__max_df				0.125
param: vect__use_idf				False
grid.best_score_				0.8034
grid.refit_time_				3.9009
test_score				0.8214

