# Grid search for LinearSVC

In [1]:
import pandas as pd
import numpy as np
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV

from sklearn.svm import LinearSVC

In [2]:
def gridsearch():
    def load2():
        # load and clean data
        filepath = '~/fake_news/1mio-clean.csv'
        df = pd.read_csv(filepath,usecols=[3,5])
        df = df.dropna()
        df['b_type'] = df['type'].apply(lambda x: 'FAKE' if x in ['fake','satire','bias',
                                                                  'conspiracy','junksci'] else 'REAL')
        X, y = df['content'], df['b_type']
        
        # tf-idf
        vectorizer = TfidfVectorizer()
        X = vectorizer.fit_transform(X)
        return (X,y)
    
    X,y= load2()
    
    # grid search
    param_grid = {'C':[1,10,50,100,150,200]}
    gs = GridSearchCV(LinearSVC(), param_grid=param_grid, cv=5)
    gs.fit(X, y)
    
    print(gs.cv_results_)
    print(gs.best_params_)
    
gridsearch()



{'mean_fit_time': array([ 57.3751792 , 389.88415475, 593.46790943, 606.31561642,
       613.43005953, 622.84552917]), 'std_fit_time': array([ 4.36892158, 35.07715998, 20.7568969 , 24.34150044, 27.25752813,
       25.05422964]), 'mean_score_time': array([1.50826998, 2.517555  , 2.5302794 , 2.49420462, 2.48841724,
       2.46283474]), 'std_score_time': array([0.37549849, 0.22365498, 0.20960239, 0.25929919, 0.26022899,
       0.23066044]), 'param_C': masked_array(data=[1, 10, 50, 100, 150, 200],
             mask=[False, False, False, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'C': 1}, {'C': 10}, {'C': 50}, {'C': 100}, {'C': 150}, {'C': 200}], 'split0_test_score': array([0.86788075, 0.85101341, 0.83618113, 0.83100847, 0.82851814,
       0.82728943]), 'split1_test_score': array([0.85155564, 0.83914231, 0.82609817, 0.8204702 , 0.81913178,
       0.8171406 ]), 'split2_test_score': array([0.87830218, 0.86342593, 0.84692601, 0.8391478 , 0.83458399,
    

so the best parameter is C=1