In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the dataset from a CSV file given by a relative path. Later cells read its
# `text_lem_cor`, `sentiment`, `subjectivity` and `class` columns.
csv_path = 'enh_data.csv'
df = pd.read_csv(csv_path)

In [5]:
# Shuffle the rows reproducibly: seed NumPy's global RNG, draw a random ordering
# of the index labels, then reorder the frame to follow that ordering.
np.random.seed(42)
shuffled_labels = np.random.permutation(df.index)
pdf = df.reindex(shuffled_labels)

In [15]:
import sklearn
from scipy import sparse
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.svm import SVC, LinearSVC

In [8]:
# Split three aligned views of the shuffled frame in one call so that they share
# the same row partition:
#   T_* - the `text_lem_cor` text column
#   X_* - the `sentiment` and `subjectivity` columns
#   y_* - the `class` target
text_col = pdf['text_lem_cor']
score_cols = pdf.loc[:, ['sentiment', 'subjectivity']]
target_col = pdf['class']
(T_train, T_test,
 X_train, X_test,
 y_train, y_test) = train_test_split(text_col, score_cols, target_col,
                                     test_size=.33, random_state=42)

In [11]:
# TF-IDF encoding of the text using 1- to 3-grams, capped at 50000 features.
# The vectorizer is fitted on the training text only; the test text is encoded
# with the already-fitted vectorizer.
vectorizer = TfidfVectorizer(
    max_features=50000,
    ngram_range=(1, 3),
)
T_train_enc, T_test_enc = vectorizer.fit_transform(T_train), vectorizer.transform(T_test)

In [21]:
# Baseline: logistic regression on the `sentiment` and `subjectivity` columns only,
# scored by accuracy on the held-out split.
lr = LogisticRegression(solver='lbfgs', random_state=42, n_jobs=3, verbose=True)
lr.fit(X_train, y_train)
scores_only_preds = lr.predict(X_test)
accuracy_score(y_test, scores_only_preds)

[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   1 out of   1 | elapsed:    0.4s finished


0.7166666666666667

In [51]:
# Put the `sentiment`/`subjectivity` columns in front of the TF-IDF columns.
# The stack is done sparsely and returned as CSR: calling `.toarray()` on the
# TF-IDF matrix would materialise every zero of a very wide matrix, which costs a
# lot of memory and slows down every estimator fitted on the result.
X_train_comb = sparse.hstack([sparse.csr_matrix(X_train.values), T_train_enc], format='csr')

In [52]:
# Test-split counterpart of the combined feature matrix: the
# `sentiment`/`subjectivity` columns followed by the TF-IDF columns.
# Stacked sparsely (CSR) rather than via `.toarray()`, so the wide TF-IDF matrix
# is never expanded into a dense array.
X_test_comb = sparse.hstack([sparse.csr_matrix(X_test.values), T_test_enc], format='csr')

In [54]:
# Linear SVM on the TF-IDF text features alone, scored by held-out accuracy.
# `random_state` is pinned to the same seed the split and the logistic regressions
# use, so the reported score does not depend on the solver's random draws.
svc = LinearSVC(random_state=42)
svc.fit(T_train_enc, y_train)
accuracy_score(y_test, svc.predict(T_test_enc))

0.8469696969696969

In [55]:
# Linear SVM on the combined features (`sentiment`/`subjectivity` plus TF-IDF),
# scored by held-out accuracy. This rebinds `svc`, replacing any earlier model
# held under that name.
# `random_state` is pinned to the same seed the split and the logistic regressions
# use, so the reported score does not depend on the solver's random draws.
svc = LinearSVC(random_state=42)
svc.fit(X_train_comb, y_train)
accuracy_score(y_test, svc.predict(X_test_comb))

0.8469696969696969

In [59]:
# Logistic regression on the TF-IDF text features alone, scored by held-out accuracy.
lr = LogisticRegression(
    solver='lbfgs',
    random_state=42,
    n_jobs=3,
    verbose=True,
)
lr.fit(T_train_enc, y_train)
text_only_preds = lr.predict(T_test_enc)
accuracy_score(y_test, text_only_preds)

[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   1 out of   1 | elapsed:    0.9s finished


0.8196969696969697

In [58]:
# Logistic regression on the combined feature matrix (`X_train_comb`),
# scored by accuracy on the matching test matrix (`X_test_comb`).
lr_params = dict(solver='lbfgs', n_jobs=3, random_state=42, verbose=True)
lr = LogisticRegression(**lr_params).fit(X_train_comb, y_train)
accuracy_score(y_true=y_test, y_pred=lr.predict(X_test_comb))

[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   1 out of   1 | elapsed:   29.3s finished


0.8106060606060606

In [66]:
vectorizer = TfidfVectorizer(max_features=10000)
X_train_enc = vectorizer.fit_transform(X_train)
X_test_enc = vectorizer.transform(X_test)

In [67]:
# Linear SVM on the 10000-feature TF-IDF encoding (`X_train_enc`), scored by
# held-out accuracy. This rebinds `svc`, replacing any earlier model held under
# that name.
# `random_state` is pinned to the same seed the split and the logistic regressions
# use, so the reported score does not depend on the solver's random draws.
svc = LinearSVC(random_state=42)
svc.fit(X_train_enc, y_train)
accuracy_score(y_test, svc.predict(X_test_enc))

0.8545454545454545

In [59]:
from sklearn.linear_model import LogisticRegression
# Logistic regression on the 10000-feature TF-IDF encoding (`X_train_enc`),
# scored by held-out accuracy. Unlike the earlier logistic regressions this one
# does not request verbose solver output.
lr = LogisticRegression(solver='lbfgs', random_state=42, n_jobs=3)
lr.fit(X_train_enc, y_train)
enc_preds = lr.predict(X_test_enc)
accuracy_score(y_test, enc_preds)

0.8287878787878787

In [69]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with 1000 trees on the 10000-feature TF-IDF encoding, scored by
# held-out accuracy.
# A random forest is a randomised estimator; without a fixed `random_state` every
# run yields a different model and score. The seed matches the one used for the
# split and the logistic regressions.
rf = RandomForestClassifier(n_estimators=1000, random_state=42)
rf.fit(X_train_enc, y_train)
accuracy_score(y_test, rf.predict(X_test_enc))

0.8378787878787879

In [75]:
import lightgbm

# LightGBM classifier (1000 estimators, 10 leaves per tree) on the 10000-feature
# TF-IDF encoding, scored by held-out accuracy.
gbm = lightgbm.LGBMClassifier(num_leaves=10, n_estimators=1000)
gbm.fit(X_train_enc, y_train)
gbm_preds = gbm.predict(X_test_enc)
accuracy_score(y_test, gbm_preds)

0.8318181818181818