In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('clean_data.csv')

In [3]:
import sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.svm import SVC, LinearSVC

In [4]:
from sklearn.pipeline import Pipeline

In [5]:
model = Pipeline([('vectorize', TfidfVectorizer()),
                  ('cls', SVC(gamma='scale'))])

In [6]:
np.random.seed(42)
df_permuted = df.reindex(np.random.permutation(df.index))

In [7]:
X_train, X_test, y_train, y_test = train_test_split(df_permuted['clean_text_lemma'],
                                                    df_permuted['class'], test_size=.33, random_state=42)

In [8]:
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
X_train_enc = vectorizer.fit_transform(X_train)
X_test_enc = vectorizer.transform(X_test)

In [9]:
model = SVC(C=10)
model.fit(X_train_enc, y_train)



SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [10]:
y_pred = model.predict(X_test_enc)

In [11]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, model.predict(X_test_enc))

0.4893939393939394

In [12]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(n_jobs=3, random_state=42, solver='lbfgs', verbose=True)
lr.fit(X_train_enc, y_train)
accuracy_score(y_test, lr.predict(X_test_enc))

[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   1 out of   1 | elapsed:    2.3s finished


0.8166666666666667

In [13]:
vectorizer = TfidfVectorizer(max_features=10000)
X_train_enc = vectorizer.fit_transform(X_train)
X_test_enc = vectorizer.transform(X_test)

In [14]:
svc = LinearSVC()
svc.fit(X_train_enc, y_train)
accuracy_score(y_test, svc.predict(X_test_enc))

0.8515151515151516

In [15]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(n_jobs=3, random_state=42, solver='lbfgs')
lr.fit(X_train_enc, y_train)
accuracy_score(y_test, lr.predict(X_test_enc))

0.8196969696969697

In [16]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=1000)
rf.fit(X_train_enc, y_train)
accuracy_score(y_test, rf.predict(X_test_enc))

0.8045454545454546

In [17]:
import lightgbm

gbm = lightgbm.LGBMClassifier(n_estimators=1000, num_leaves=10)
gbm.fit(X_train_enc, y_train)
accuracy_score(y_test, gbm.predict(X_test_enc))

0.8272727272727273

# Results are slightly worse, then without using lemming.

Possible reason is the ambiguity of some words in stemmed form