In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

from sklearn.preprocessing import FunctionTransformer
from sentence_transformers import SentenceTransformer
from sklearn.svm import SVC

from sklearn.metrics import f1_score

from sklearn.model_selection import GridSearchCV

In [38]:
SEP = ' - '

In [20]:
data = pd.read_csv('data/train.csv').fillna('')
data.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [23]:
data['input'] = data['text'] + SEP + data['keyword'] + SEP + data['location']
data.head()

Unnamed: 0,id,keyword,location,text,target,input
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,Our Deeds are the Reason of this #earthquake M...
1,4,,,Forest fire near La Ronge Sask. Canada,1,Forest fire near La Ronge Sask. Canada;;
2,5,,,All residents asked to 'shelter in place' are ...,1,All residents asked to 'shelter in place' are ...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,"13,000 people receive #wildfires evacuation or..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,Just got sent this photo from Ruby #Alaska as ...


In [24]:
transformer = SentenceTransformer('all-mpnet-base-v2')

In [25]:
sklearn_transformer = FunctionTransformer(lambda x: transformer.encode(x, convert_to_tensor=True, show_progress_bar=False).detach().cpu().numpy())

In [26]:
pipeline = Pipeline(
    [
        ("clf", SVC()),
    ]
)
pipeline

Pipeline(steps=[('clf', SVC())])

In [27]:
param_grid = {
    "clf__kernel": ('linear', 'poly', 'rbf', 'sigmoid'),
    "clf__C": (0.00001, 0.0001, 0.001, 0.01, 1, 10, 100, 1000)
}

In [28]:
grid = GridSearchCV(pipeline, param_grid, scoring='f1')

In [29]:
X_train, X_test, y_train, y_test = train_test_split(
    sklearn_transformer.transform(data["input"]), 
    data["target"]
)

In [30]:
grid.fit(X_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('clf', SVC())]),
             param_grid={'clf__C': (1e-05, 0.0001, 0.001, 0.01, 1, 10, 100,
                                    1000),
                         'clf__kernel': ('linear', 'poly', 'rbf', 'sigmoid')},
             scoring='f1')

In [31]:
y_predicted = grid.predict(X_test)

In [32]:
f1_score(y_test, y_predicted)

0.7888960619754681

In [33]:
grid.best_score_, grid.best_params_, grid.best_estimator_

(0.7852392959674661,
 {'clf__C': 1, 'clf__kernel': 'rbf'},
 Pipeline(steps=[('clf', SVC(C=1))]))

In [34]:
test_data = pd.read_csv('data/test.csv').fillna('')

In [35]:
test_data['input'] = test_data['text'] + SEP + test_data['keyword'] + SEP + test_data['location']

In [36]:
target = grid.predict(sklearn_transformer.transform(test_data["input"]))

In [37]:
output = pd.DataFrame(data = {
    "id": test_data["id"],
    "target": target
})
output.head()

Unnamed: 0,id,target
0,0,0
1,2,1
2,3,1
3,9,1
4,11,1


In [39]:
output.to_csv("submission.csv", index=False)

In [40]:
!kaggle competitions submit -c nlp-getting-started -f "submission.csv" -m "Submission using simple SVC and Sbert vectorizer with grid search params"

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
zsh:1: command not found: kaggle


In [41]:
!pip3 install kaggle

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Defaulting to user installation because normal site-packages is not writeable
