# Files download

Download the Python helper module from the repository.

In [4]:
import os
import urllib.request

# Helper module that provides the tuning utilities imported later in this notebook
# (fit_tuning, read_dataset and the get_*_params functions).
if not os.path.exists("find_best_hyperparameters.py"):
  url = "https://raw.githubusercontent.com/rubensmchaves/unb/refs/heads/main/nlp/A03_text_classifier/find_best_hyperparameters.py"
  # Pure-Python download: no dependency on a `wget` binary, and a failed
  # download raises here instead of surfacing later as an ImportError.
  urllib.request.urlretrieve(url, "find_best_hyperparameters.py")

Download the text corpora (NSF and Dmoz-Science).

In [5]:
import os
import urllib.request

dest_folder = "data"

# Create the destination folder; exist_ok keeps the cell safe to re-run.
os.makedirs(dest_folder, exist_ok=True)

# Fetch each corpus only when it is not already present locally.
# urlretrieve raises on network/HTTP errors, so a failed download is not silently ignored.
corpus_base_url = "https://raw.githubusercontent.com/ragero/text-collections/refs/heads/master/complete_texts_csvs"
for corpus_file in ("NSF.csv", "Dmoz-Science.csv"):
  corpus_path = dest_folder + "/" + corpus_file
  if not os.path.exists(corpus_path):
    urllib.request.urlretrieve(corpus_base_url + "/" + corpus_file, corpus_path)

# Text classification

In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

from find_best_hyperparameters import fit_tuning
from find_best_hyperparameters import get_multinomial_naive_bayes_params
from find_best_hyperparameters import get_logistic_regression_params
from find_best_hyperparameters import get_support_vector_params
from find_best_hyperparameters import get_tfidf_params
from find_best_hyperparameters import get_tfidf_params
from find_best_hyperparameters import read_dataset

## Corpus: NSF

### Load texts

Load texts from the dataset file (corpus)

In [7]:
# Load the NSF corpus; read_dataset returns the dataframe plus the X / y used for the split below.
df, X, y = read_dataset(f"{dest_folder}/NSF.csv")

Raw dataframe loaded.

In [8]:
# Display the dataframe returned by read_dataset for a quick visual check.
df

Unnamed: 0,file_name,text,class
0,management_data_management.a9820721.txt,an intelligent visual database system hierarch...,data
1,management_data_management.a9116988.txt,spatio-temporal database management for global...,data
2,management_data_management.a9457613.txt,nyi theory and implementation of declarative d...,data
3,management_data_management.a9734191.txt,pecase providing a coherent view of diverse di...,data
4,management_data_management.a9116798.txt,an object-oriented toolbox for use with the pr...,data
...,...,...,...
10519,computing_theory_computing.a9003356.txt,fixed-point logic in finite structures,theory
10520,computing_theory_computing.a0092761.txt,making exponential-time learning algorithms ef...,theory
10521,computing_theory_computing.a9985458.txt,efficient algorithms for problems in combinato...,theory
10522,computing_theory_computing.a9877122.txt,on learning and characterizing classes of bool...,theory


Count the number of documents in each class.

In [9]:
# Number of rows per label in the "class" column.
class_counts = df["class"].value_counts()
print(class_counts)

class
economics       1409
math            1339
geophysics      1202
oceanography     990
ecology          889
sociology        739
statistics       647
politic          603
software         524
theory           442
data             402
hydro            355
networking       345
neuroscience     307
metals           201
gravitional      130
Name: count, dtype: int64


### Data split

Split the corpus into training data and test data.

In [10]:
# Stratified 80/20 hold-out split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
  X,
  y,
  test_size=0.2,
  stratify=y,
  random_state=1979,
)

In [11]:
# Report how many samples ended up in each partition.
for split_name, split_data in (("X_train", X_train), ("X_test", X_test)):
  print(f"len({split_name}): {len(split_data)}")

len(X_train): 8419
len(X_test): 2105


### Pipelines

Create one pipeline per model: TF-IDF builds the training features, followed by the classifier.

In [12]:
# One pipeline per classifier: a TF-IDF vectorizer feeding the model.
# The classifier step is named "clf", the same name passed to the get_*_params helpers.
pipeMNB = Pipeline([("tfidf", TfidfVectorizer()), ("clf", MultinomialNB())])
pipeLR  = Pipeline([("tfidf", TfidfVectorizer()), ("clf", LogisticRegression(random_state=1979))])
# LinearSVC uses a random generator when solving the dual problem, so it is seeded
# like LogisticRegression to keep the grid-search results reproducible.
pipeSVC = Pipeline([("tfidf", TfidfVectorizer()), ("clf", LinearSVC(random_state=1979))])

### GridSearch

For hyperparameter tuning we use a pipeline with TF-IDF and three different models (Multinomial Naive Bayes, Logistic Regression and a linear Support Vector Classifier). We perform the tuning using <code>GridSearchCV</code> with 3-fold cross-validation (<code>cv</code>) and "f1_macro" as the metric (<code>scoring</code>) used to evaluate each candidate on the held-out fold of every split.

Get params for fine tuning of the models.

In [13]:
# Search spaces for the three models; "clf" matches the classifier step name used in the pipelines.
paramMNB, paramLR, paramSVC = (
  get_multinomial_naive_bayes_params("clf"),
  get_logistic_regression_params("clf"),
  get_support_vector_params("clf"),
)

#### Multinomial Naive Bayes

In [14]:
# Tune the Multinomial Naive Bayes pipeline on the training split.
grid_search_MNB = fit_tuning(X_train, y_train, pipeMNB, paramMNB)

# Report the best score and the parameter combination that achieved it.
print("\nMNB best params:")
for label, value in (
  ("Best Score", grid_search_MNB.best_score_),
  ("Best Params", grid_search_MNB.best_params_),
):
  print(f"  {label}: ", value)


MNB best params:
  Best Score:  0.8222915423483269
  Best Params:  {'clf__alpha': 0.1, 'clf__fit_prior': False, 'clf__force_alpha': True}


In [15]:
print("\nCV Results:")
# Tabulate every evaluated candidate, highest mean test score first.
df_MNB = pd.DataFrame(grid_search_MNB.cv_results_).sort_values(
  by="mean_test_score", ascending=False
)
df_MNB


CV Results:


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf__alpha,param_clf__fit_prior,param_clf__force_alpha,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
2,0.094093,0.001591,0.049065,0.003977,0.1,False,True,"{'clf__alpha': 0.1, 'clf__fit_prior': False, '...",0.811992,0.827088,0.827794,0.822292,0.007288,1
3,0.098617,0.008013,0.046415,0.000784,0.1,False,False,"{'clf__alpha': 0.1, 'clf__fit_prior': False, '...",0.811992,0.827088,0.827794,0.822292,0.007288,1
6,0.095624,0.001274,0.046232,0.001005,0.255556,False,True,"{'clf__alpha': 0.25555555555555554, 'clf__fit_...",0.811012,0.823326,0.830134,0.821491,0.007914,3
7,0.099869,0.004703,0.047353,0.001247,0.255556,False,False,"{'clf__alpha': 0.25555555555555554, 'clf__fit_...",0.811012,0.823326,0.830134,0.821491,0.007914,3
10,0.098726,0.005059,0.04728,0.00121,0.411111,False,True,"{'clf__alpha': 0.4111111111111111, 'clf__fit_p...",0.803429,0.814676,0.819815,0.81264,0.006842,5
11,0.097381,0.000822,0.047692,0.001514,0.411111,False,False,"{'clf__alpha': 0.4111111111111111, 'clf__fit_p...",0.803429,0.814676,0.819815,0.81264,0.006842,5
15,0.093587,0.001144,0.046035,0.000167,0.566667,False,False,"{'clf__alpha': 0.5666666666666667, 'clf__fit_p...",0.795052,0.798933,0.814896,0.80296,0.008587,7
14,0.099233,0.00821,0.046091,0.001461,0.566667,False,True,"{'clf__alpha': 0.5666666666666667, 'clf__fit_p...",0.795052,0.798933,0.814896,0.80296,0.008587,7
19,0.097922,0.004756,0.046721,0.000408,0.722222,False,False,"{'clf__alpha': 0.7222222222222222, 'clf__fit_p...",0.775112,0.788168,0.806211,0.78983,0.01275,9
18,0.095854,0.001921,0.047057,0.000541,0.722222,False,True,"{'clf__alpha': 0.7222222222222222, 'clf__fit_p...",0.775112,0.788168,0.806211,0.78983,0.01275,9


In [16]:
# Persist the sorted MNB grid-search table for the NSF corpus.
df_MNB.to_csv(path_or_buf="NSF-mnb-results.csv")

#### Logistic Regression

In [17]:
# Tune the Logistic Regression pipeline on the training split.
grid_search_LR = fit_tuning(X_train, y_train, pipeLR, paramLR)

# Report the best score and the parameter combination that achieved it.
print("\nLR best params:")
for label, value in (
  ("Best Score", grid_search_LR.best_score_),
  ("Best Params", grid_search_LR.best_params_),
):
  print(f"  {label}: ", value)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



LR best params:
  Best Score:  0.8349633711471355
  Best Params:  {'clf__C': 5.994842503189409, 'clf__class_weight': 'balanced', 'clf__max_iter': 100, 'clf__solver': 'lbfgs'}


In [18]:
print("\nCV Results:")
# Tabulate every evaluated candidate, highest mean test score first.
df_LR = pd.DataFrame(grid_search_LR.cv_results_).sort_values(
  by="mean_test_score", ascending=False
)
df_LR


CV Results:


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf__C,param_clf__class_weight,param_clf__max_iter,param_clf__solver,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
59,3.440760,1.418895,0.102240,0.011472,5.994843,balanced,500,lbfgs,"{'clf__C': 5.994842503189409, 'clf__class_weig...",0.827383,0.837136,0.840371,0.834963,0.005520,1
55,2.675491,0.342286,0.117353,0.013390,5.994843,balanced,100,lbfgs,"{'clf__C': 5.994842503189409, 'clf__class_weig...",0.827383,0.837136,0.840371,0.834963,0.005520,1
57,3.827402,1.356909,0.116981,0.014381,5.994843,balanced,200,lbfgs,"{'clf__C': 5.994842503189409, 'clf__class_weig...",0.827383,0.837136,0.840371,0.834963,0.005520,1
54,0.545095,0.030368,0.078429,0.020055,5.994843,balanced,100,liblinear,"{'clf__C': 5.994842503189409, 'clf__class_weig...",0.824077,0.838748,0.841265,0.834697,0.007579,4
56,0.395367,0.003656,0.053031,0.000466,5.994843,balanced,200,liblinear,"{'clf__C': 5.994842503189409, 'clf__class_weig...",0.824077,0.838748,0.841265,0.834697,0.007579,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5,4.821654,0.384989,0.132822,0.022593,0.000010,,500,lbfgs,"{'clf__C': 1e-05, 'clf__class_weight': None, '...",0.014731,0.014771,0.014771,0.014757,0.000019,115
15,0.832771,0.033109,0.125057,0.027081,0.000278,,200,lbfgs,"{'clf__C': 0.0002782559402207126, 'clf__class_...",0.014731,0.014771,0.014771,0.014757,0.000019,115
3,3.759684,0.479518,0.119441,0.012010,0.000010,,200,lbfgs,"{'clf__C': 1e-05, 'clf__class_weight': None, '...",0.014731,0.014771,0.014771,0.014757,0.000019,115
17,1.268630,0.300484,0.134287,0.013652,0.000278,,500,lbfgs,"{'clf__C': 0.0002782559402207126, 'clf__class_...",0.014731,0.014771,0.014771,0.014757,0.000019,115


In [19]:
# Persist the sorted Logistic Regression grid-search table for the NSF corpus.
df_LR.to_csv(path_or_buf="NSF-lr-results.csv")

#### Support Vector Classifier

In [20]:
# Tune the LinearSVC pipeline on the training split.
grid_search_SVC = fit_tuning(X_train, y_train, pipeSVC, paramSVC)

# Report the best score and the parameter combination that achieved it.
print("\nSVC best params:")
for label, value in (
  ("Best Score", grid_search_SVC.best_score_),
  ("Best Params", grid_search_SVC.best_params_),
):
  print(f"  {label}: ", value)


SVC best params:
  Best Score:  0.8363562555128728
  Best Params:  {'clf__class_weight': 'balanced', 'clf__tol': 0.05179474679231213}


In [21]:
print("\nCV Results:")
# Tabulate every evaluated candidate, highest mean test score first.
df_SVC = pd.DataFrame(grid_search_SVC.cv_results_).sort_values(
  by="mean_test_score", ascending=False
)
df_SVC


CV Results:


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf__class_weight,param_clf__tol,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
19,0.15397,0.001374,0.054468,0.005369,balanced,0.05179475,"{'clf__class_weight': 'balanced', 'clf__tol': ...",0.830852,0.841977,0.836239,0.836356,0.004542,1
15,0.340351,0.005214,0.093872,0.007499,balanced,1e-05,"{'clf__class_weight': 'balanced', 'clf__tol': ...",0.830852,0.841977,0.835578,0.836136,0.004559,2
18,0.170371,0.008134,0.052261,0.0011,balanced,0.006105402,"{'clf__class_weight': 'balanced', 'clf__tol': ...",0.830852,0.841977,0.835578,0.836136,0.004559,2
17,0.18289,0.000891,0.052377,0.001721,balanced,0.0007196857,"{'clf__class_weight': 'balanced', 'clf__tol': ...",0.830852,0.841977,0.835578,0.836136,0.004559,2
16,0.234925,0.039212,0.053379,0.002489,balanced,8.483429e-05,"{'clf__class_weight': 'balanced', 'clf__tol': ...",0.830852,0.841977,0.835578,0.836136,0.004559,2
20,0.133134,0.001707,0.05099,0.000249,balanced,0.4393971,"{'clf__class_weight': 'balanced', 'clf__tol': ...",0.831078,0.84102,0.835383,0.835827,0.004071,6
5,0.132903,0.001893,0.051922,0.001254,,0.4393971,"{'clf__class_weight': None, 'clf__tol': 0.4393...",0.831332,0.832856,0.841523,0.835237,0.004488,7
4,0.151645,0.005567,0.051978,0.002839,,0.05179475,"{'clf__class_weight': None, 'clf__tol': 0.0517...",0.832351,0.83273,0.837477,0.834186,0.002333,8
1,0.194361,0.007212,0.056201,0.004674,,8.483429e-05,"{'clf__class_weight': None, 'clf__tol': 8.4834...",0.831788,0.832188,0.837903,0.833959,0.002793,9
0,0.210597,0.00788,0.061874,0.011025,,1e-05,"{'clf__class_weight': None, 'clf__tol': 1e-05}",0.831788,0.832188,0.837903,0.833959,0.002793,9


In [22]:
# Persist the sorted LinearSVC grid-search table for the NSF corpus.
df_SVC.to_csv(path_or_buf="NSF-svc-results.csv")

## Corpus: Dmoz-Science

### Load texts

Load texts from the dataset file (corpus)

In [23]:
# Load the Dmoz-Science corpus; this rebinds df, X and y from the NSF section above.
df, X, y = read_dataset(f"{dest_folder}/Dmoz-Science.csv")

Raw dataframe loaded.

In [24]:
# Display the dataframe returned by read_dataset for a quick visual check.
df

Unnamed: 0,file_name,text,class
0,2786497.txt,Texas A&amp;M Horticultural Extension - Vegeta...,Agriculture
1,2784741.txt,Algaculture Information from Wikipedia on this...,Agriculture
2,2785016.txt,Annual Canarygrass Factsheet on this grain cro...,Agriculture
3,2786133.txt,Black Sigatoka Photographs and information on ...,Agriculture
4,2786989.txt,Irrigation Training and Research Center Univer...,Agriculture
...,...,...,...
5995,2881706.txt,Canmet Energy Technology Center Information ab...,Technology
5996,2886909.txt,Utah Space Association Provides information ab...,Technology
5997,2887110.txt,Mission 51-L Provides a critical view of the r...,Technology
5998,2887157.txt,"New York (SEAoNY) News announcements, committe...",Technology


Count the number of documents in each class.

In [25]:
# Number of rows per label in the "class" column.
class_counts = df["class"].value_counts()
print(class_counts)

class
Agriculture    500
Astronomy      500
Biology        500
Chemistry      500
Earth          500
Environment    500
Instruments    500
Math           500
Physics        500
Science        500
Social         500
Technology     500
Name: count, dtype: int64


### Data split

Split the corpus into training data and test data.

In [26]:
# Stratified 80/20 hold-out split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
  X,
  y,
  test_size=0.2,
  stratify=y,
  random_state=1979,
)

In [27]:
# Report how many samples ended up in each partition.
for split_name, split_data in (("X_train", X_train), ("X_test", X_test)):
  print(f"len({split_name}): {len(split_data)}")

len(X_train): 4800
len(X_test): 1200


### Pipelines

Create one pipeline per model: TF-IDF builds the training features, followed by the classifier.

In [28]:
# One pipeline per classifier: a TF-IDF vectorizer feeding the model.
# The classifier step is named "clf", the same name passed to the get_*_params helpers.
pipeMNB = Pipeline([("tfidf", TfidfVectorizer()), ("clf", MultinomialNB())])
pipeLR  = Pipeline([("tfidf", TfidfVectorizer()), ("clf", LogisticRegression(random_state=1979))])
# LinearSVC uses a random generator when solving the dual problem, so it is seeded
# like LogisticRegression to keep the grid-search results reproducible.
pipeSVC = Pipeline([("tfidf", TfidfVectorizer()), ("clf", LinearSVC(random_state=1979))])

### GridSearch

For hyperparameter tuning we use a pipeline with TF-IDF and three different models (Multinomial Naive Bayes, Logistic Regression and a linear Support Vector Classifier). We perform the tuning using <code>GridSearchCV</code> with 3-fold cross-validation (<code>cv</code>) and "f1_macro" as the metric (<code>scoring</code>) used to evaluate each candidate on the held-out fold of every split.

Get params for fine tuning of the models.

In [29]:
# Search spaces for the three models; "clf" matches the classifier step name used in the pipelines.
paramMNB, paramLR, paramSVC = (
  get_multinomial_naive_bayes_params("clf"),
  get_logistic_regression_params("clf"),
  get_support_vector_params("clf"),
)

#### Multinomial Naive Bayes

In [30]:
# Tune the Multinomial Naive Bayes pipeline on the training split.
grid_search_MNB = fit_tuning(X_train, y_train, pipeMNB, paramMNB)

# Report the best score and the parameter combination that achieved it.
print("\nMNB best params:")
for label, value in (
  ("Best Score", grid_search_MNB.best_score_),
  ("Best Params", grid_search_MNB.best_params_),
):
  print(f"  {label}: ", value)


MNB best params:
  Best Score:  0.7088756661352962
  Best Params:  {'clf__alpha': 0.4111111111111111, 'clf__fit_prior': False, 'clf__force_alpha': True}


In [31]:
print("\nCV Results:")
# Tabulate every evaluated candidate, highest mean test score first.
df_MNB = pd.DataFrame(grid_search_MNB.cv_results_).sort_values(
  by="mean_test_score", ascending=False
)
df_MNB


CV Results:


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf__alpha,param_clf__fit_prior,param_clf__force_alpha,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
10,0.094488,0.002792,0.041771,0.001495,0.411111,False,True,"{'clf__alpha': 0.4111111111111111, 'clf__fit_p...",0.716929,0.70306,0.706638,0.708876,0.005879,1
11,0.0974,0.006455,0.041172,0.00023,0.411111,False,False,"{'clf__alpha': 0.4111111111111111, 'clf__fit_p...",0.716929,0.70306,0.706638,0.708876,0.005879,1
8,0.181339,0.006571,0.073399,0.001139,0.411111,True,True,"{'clf__alpha': 0.4111111111111111, 'clf__fit_p...",0.716941,0.702437,0.706638,0.708672,0.006093,3
9,0.134144,0.030256,0.051397,0.014554,0.411111,True,False,"{'clf__alpha': 0.4111111111111111, 'clf__fit_p...",0.716941,0.702437,0.706638,0.708672,0.006093,3
6,0.168149,0.010243,0.070161,0.00142,0.255556,False,True,"{'clf__alpha': 0.25555555555555554, 'clf__fit_...",0.715411,0.704458,0.700324,0.706731,0.006365,5
7,0.172192,0.005143,0.071347,0.002225,0.255556,False,False,"{'clf__alpha': 0.25555555555555554, 'clf__fit_...",0.715411,0.704458,0.700324,0.706731,0.006365,5
4,0.098345,0.003011,0.041402,0.001805,0.255556,True,True,"{'clf__alpha': 0.25555555555555554, 'clf__fit_...",0.715411,0.70375,0.700324,0.706495,0.006458,7
5,0.147209,0.025464,0.073784,0.002507,0.255556,True,False,"{'clf__alpha': 0.25555555555555554, 'clf__fit_...",0.715411,0.70375,0.700324,0.706495,0.006458,7
12,0.095438,0.002032,0.042886,0.002979,0.566667,True,True,"{'clf__alpha': 0.5666666666666667, 'clf__fit_p...",0.71103,0.698221,0.708916,0.706056,0.005607,9
13,0.099678,0.002423,0.044258,0.002926,0.566667,True,False,"{'clf__alpha': 0.5666666666666667, 'clf__fit_p...",0.71103,0.698221,0.708916,0.706056,0.005607,9


In [32]:
# Persist the sorted MNB grid-search table for the Dmoz-Science corpus.
df_MNB.to_csv(path_or_buf="dmoz_science-mnb-results.csv")

#### Logistic Regression

In [33]:
# Tune the Logistic Regression pipeline on the training split.
grid_search_LR = fit_tuning(X_train, y_train, pipeLR, paramLR)

# Report the best score and the parameter combination that achieved it.
print("\nLR best params:")
for label, value in (
  ("Best Score", grid_search_LR.best_score_),
  ("Best Params", grid_search_LR.best_params_),
):
  print(f"  {label}: ", value)


LR best params:
  Best Score:  0.7254091882083982
  Best Params:  {'clf__C': 5.994842503189409, 'clf__class_weight': 'balanced', 'clf__max_iter': 100, 'clf__solver': 'liblinear'}


In [34]:
print("\nCV Results:")
# Tabulate every evaluated candidate, highest mean test score first.
df_LR = pd.DataFrame(grid_search_LR.cv_results_).sort_values(
  by="mean_test_score", ascending=False
)
df_LR


CV Results:


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf__C,param_clf__class_weight,param_clf__max_iter,param_clf__solver,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
54,0.325774,0.008744,0.079313,0.001992,5.994843,balanced,100,liblinear,"{'clf__C': 5.994842503189409, 'clf__class_weig...",0.739989,0.710105,0.726134,0.725409,0.012211,1
58,0.465604,0.220890,0.095550,0.020843,5.994843,balanced,500,liblinear,"{'clf__C': 5.994842503189409, 'clf__class_weig...",0.739989,0.710105,0.726134,0.725409,0.012211,1
56,0.326219,0.010340,0.081881,0.003103,5.994843,balanced,200,liblinear,"{'clf__C': 5.994842503189409, 'clf__class_weig...",0.739989,0.710105,0.726134,0.725409,0.012211,1
48,0.321617,0.008670,0.083642,0.008533,5.994843,,100,liblinear,"{'clf__C': 5.994842503189409, 'clf__class_weig...",0.739989,0.709486,0.726134,0.725203,0.012470,4
50,0.566635,0.185312,0.080760,0.001707,5.994843,,200,liblinear,"{'clf__C': 5.994842503189409, 'clf__class_weig...",0.739989,0.709486,0.726134,0.725203,0.012470,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15,0.195409,0.007452,0.126040,0.003765,0.000278,,200,lbfgs,"{'clf__C': 0.0002782559402207126, 'clf__class_...",0.357852,0.369180,0.360132,0.362388,0.004892,115
17,0.221908,0.043873,0.122494,0.003405,0.000278,,500,lbfgs,"{'clf__C': 0.0002782559402207126, 'clf__class_...",0.357852,0.369180,0.360132,0.362388,0.004892,115
5,0.335569,0.123965,0.119636,0.003647,0.000010,,500,lbfgs,"{'clf__C': 1e-05, 'clf__class_weight': None, '...",0.347204,0.361747,0.363962,0.357638,0.007433,118
3,0.309130,0.092729,0.109053,0.010279,0.000010,,200,lbfgs,"{'clf__C': 1e-05, 'clf__class_weight': None, '...",0.347204,0.361747,0.363962,0.357638,0.007433,118


In [35]:
# Persist the sorted Logistic Regression grid-search table for the Dmoz-Science corpus.
df_LR.to_csv(path_or_buf="dmoz_science-lr-results.csv")

#### Support Vector Classifier

In [36]:
# Tune the LinearSVC pipeline on the training split.
grid_search_SVC = fit_tuning(X_train, y_train, pipeSVC, paramSVC)

# Report the best score and the parameter combination that achieved it.
print("\nSVC best params:")
for label, value in (
  ("Best Score", grid_search_SVC.best_score_),
  ("Best Params", grid_search_SVC.best_params_),
):
  print(f"  {label}: ", value)


SVC best params:
  Best Score:  0.7293623807251249
  Best Params:  {'clf__class_weight': None, 'clf__tol': 0.4393970560760795}


In [37]:
print("\nCV Results:")
# Tabulate every evaluated candidate, highest mean test score first.
df_SVC = pd.DataFrame(grid_search_SVC.cv_results_).sort_values(
  by="mean_test_score", ascending=False
)
df_SVC


CV Results:


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf__class_weight,param_clf__tol,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
5,0.1201,0.003847,0.051961,0.007254,,0.4393971,"{'clf__class_weight': None, 'clf__tol': 0.4393...",0.740809,0.717042,0.730236,0.729362,0.009722,1
0,0.266708,0.05881,0.065059,0.027701,,1e-05,"{'clf__class_weight': None, 'clf__tol': 1e-05}",0.740319,0.715922,0.731218,0.729153,0.010066,2
1,0.175869,0.004766,0.04817,0.002114,,8.483429e-05,"{'clf__class_weight': None, 'clf__tol': 8.4834...",0.740319,0.715922,0.731218,0.729153,0.010066,2
18,0.148957,0.001501,0.047882,0.002297,balanced,0.006105402,"{'clf__class_weight': 'balanced', 'clf__tol': ...",0.740319,0.715922,0.731218,0.729153,0.010066,2
17,0.158833,0.003915,0.049147,0.004742,balanced,0.0007196857,"{'clf__class_weight': 'balanced', 'clf__tol': ...",0.740319,0.715922,0.731218,0.729153,0.010066,2
16,0.181044,0.004512,0.045938,0.000664,balanced,8.483429e-05,"{'clf__class_weight': 'balanced', 'clf__tol': ...",0.740319,0.715922,0.731218,0.729153,0.010066,2
15,0.187543,0.005538,0.046347,0.00204,balanced,1e-05,"{'clf__class_weight': 'balanced', 'clf__tol': ...",0.740319,0.715922,0.731218,0.729153,0.010066,2
2,0.156668,0.003031,0.045751,0.000808,,0.0007196857,"{'clf__class_weight': None, 'clf__tol': 0.0007...",0.740319,0.715922,0.731218,0.729153,0.010066,2
3,0.149566,0.007811,0.045865,0.00097,,0.006105402,"{'clf__class_weight': None, 'clf__tol': 0.0061...",0.740319,0.715922,0.731166,0.729135,0.010063,9
19,0.193312,0.041186,0.083254,0.001646,balanced,0.05179475,"{'clf__class_weight': 'balanced', 'clf__tol': ...",0.740319,0.715285,0.731205,0.728936,0.010345,10


In [38]:
# Persist the sorted LinearSVC grid-search table for the Dmoz-Science corpus.
df_SVC.to_csv(path_or_buf="dmoz_science-svc-results.csv")