In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn import svm
from sklearn.metrics import classification_report
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
import pandas as pd

In [2]:
data_wiki = pd.read_csv('discofuse_v1/wikipedia/train_balanced.tsv', sep='\t')
data_sports = pd.read_csv('discofuse_v1/sports/train_balanced.tsv', sep='\t')

In [3]:
data_wiki.shape, data_sports.shape

((4490804, 8), (11838624, 8))

In [4]:
data_train = pd.concat([data_wiki, data_sports])
# data_train = data_wiki

In [5]:
data_train = data_train[['coherent_first_sentence', 'coherent_second_sentence', 'discourse_type']]

In [6]:
paired_data_train = data_train[pd.isna(data_train['coherent_second_sentence'])]
data_train = None
data_wiki = None
data_sports = None

In [8]:
paired_X_train = list(paired_data_train['coherent_first_sentence'])
paired_y_train = list(paired_data_train['discourse_type'])

In [11]:
len(paired_X_train), len(paired_y_train)

(12152664, 12152664)

In [12]:
# preprocess
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=200000)
tfidf.fit(paired_X_train)
X = tfidf.transform(paired_X_train)
label_encoder = LabelEncoder().fit(paired_y_train)
y = label_encoder.transform(paired_y_train)

In [13]:
clf = SGDClassifier(max_iter=1000, tol=1e-3, n_jobs=-1)
clf = make_pipeline(StandardScaler(with_mean=False), clf)

In [14]:
clf.fit(X, y)

Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=False, with_std=True)), ('sgdclassifier', SGDClassifier(alpha=0.0001, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='hinge', m..._state=None, shuffle=True, tol=0.001,
       validation_fraction=0.1, verbose=0, warm_start=False))])

In [15]:
data_wiki_test = pd.read_csv('discofuse_v1/wikipedia/test_balanced.tsv', sep='\t')
data_sports_test = pd.read_csv('discofuse_v1/sports/test_balanced.tsv', sep='\t')
data_wiki_test.shape, data_sports_test.shape

((44590, 8), (121351, 8))

In [16]:
data_test = pd.concat([data_wiki_test, data_sports_test])
data_test = data_test[['coherent_first_sentence', 'coherent_second_sentence', 'discourse_type']]
paired_data_test = data_test[pd.isna(data_test['coherent_second_sentence'])]
paired_X_test = list(paired_data_test['coherent_first_sentence'])
paired_y_test = list(paired_data_test['discourse_type'])
len(paired_X_test), len(paired_y_test)

(123757, 123757)

In [17]:
label_encoder.inverse_transform([3])

array(['SINGLE_CONN_INNER_ANAPHORA'], dtype='<U26')

In [18]:
X_test = tfidf.transform(paired_X_test)
y_test = label_encoder.transform(paired_y_test)

In [19]:
# test, y_true
y_pred = clf.predict(X_test)

              precision    recall  f1-score   support

           0       0.79      0.76      0.78     13191
           1       0.79      0.76      0.78      6336
           2       0.83      0.89      0.86     27555
           3       0.54      0.42      0.47      6154
           4       0.72      0.72      0.72      4633
           5       0.87      0.91      0.89     16935
           6       0.75      0.80      0.78     26804
           7       0.52      0.34      0.41      6527
           8       0.78      0.77      0.78     15622

   micro avg       0.78      0.78      0.78    123757
   macro avg       0.73      0.71      0.72    123757
weighted avg       0.77      0.78      0.78    123757



In [27]:
import numpy as np
target_strings = label_encoder.inverse_transform(np.arange(9))
print(classification_report(y_test, y_pred, target_names=target_strings))

                            precision    recall  f1-score   support

         SINGLE_APPOSITION       0.79      0.76      0.78     13191
          SINGLE_CATAPHORA       0.79      0.76      0.78      6336
         SINGLE_CONN_INNER       0.83      0.89      0.86     27555
SINGLE_CONN_INNER_ANAPHORA       0.54      0.42      0.47      6154
         SINGLE_CONN_START       0.72      0.72      0.72      4633
           SINGLE_RELATIVE       0.87      0.91      0.89     16935
            SINGLE_S_COORD       0.75      0.80      0.78     26804
   SINGLE_S_COORD_ANAPHORA       0.52      0.34      0.41      6527
           SINGLE_VP_COORD       0.78      0.77      0.78     15622

                 micro avg       0.78      0.78      0.78    123757
                 macro avg       0.73      0.71      0.72    123757
              weighted avg       0.77      0.78      0.78    123757



In [20]:
import joblib

In [21]:
joblib.dump(clf, 'discofuse_sgd_single.joblib')
joblib.dump(tfidf, 'discofuse_tfidf_single.joblib')
joblib.dump(label_encoder, 'discofuse_le_single.joblib')

In [28]:
with open('sgd_single_result.txt', 'w') as f:
    print(classification_report(y_test, y_pred, target_names=target_strings), file=f)