In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

from os import path

## 1. 

In [2]:
# Read the three labelled-sentence files into two parallel lists:
# texts[i] is a sentence and labels[i] is its integer label.
texts = []
labels = []

filenames = [path.join('sentiment labelled sentences', fn) for fn in
             ['amazon_cells_labelled.txt', 'imdb_labelled.txt', 'yelp_labelled.txt']]

for filename in filenames:
    # Explicit encoding: the default of open() depends on the platform locale.
    # NOTE(review): assumes the data files are UTF-8 encoded - confirm.
    with open(filename, encoding='utf-8') as f:
        for line in f:
            line = line.rstrip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            # The label is the last tab-separated field; splitting from the
            # right keeps sentences that themselves contain a tab intact.
            text, label = line.rsplit('\t', 1)
            texts.append(text)
            labels.append(int(label))

In [3]:
# Sanity check: both lists must have the same length (one label per text).
len(texts), len(labels)

(3000, 3000)

In [4]:
# Peek at the first three sentences and their labels.
texts[:3], labels[:3]

(['So there is no way for me to plug it in here in the US unless I go by a converter.',
  'Good case, Excellent value.',
  'Great for the jawbone.'],
 [0, 1, 1])

## 2.

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

_(1\*)_ _Не совсем понятно, что именно нужно разбивать на train/test: исходный массив текстов или результат применения к нему `TfidfVectorizer`? (И как потом строить визуализацию PCA / t-SNE — ведь сырой текст им не передашь?)_

In [6]:
# Split the raw texts (not their TF-IDF vectors): the vectorizer is fitted
# inside the pipeline on the training part only.
# A fixed random_state makes the split, and every score computed from it,
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    texts, labels, train_size=0.8, random_state=42)

## 3.

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

from mlxtend.preprocessing import DenseTransformer

In [8]:
# Baseline model: TF-IDF features followed by a linear-kernel SVM.
# NOTE(review): DenseTransformer (mlxtend) presumably converts the sparse
# TF-IDF output into a dense array before it reaches the SVM - confirm.
model = Pipeline(steps=[
    ('vect', TfidfVectorizer()),
    ('dense', DenseTransformer()),
    ('svm', SVC(kernel='linear')),
])

In [9]:
# Fit every pipeline step on the training split only.
model.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
  ...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [10]:
# Predict labels for the held-out test texts.
y_pred = model.predict(X_test)

In [11]:
# Accuracy of the baseline pipeline on the test split.
accuracy_score(y_test, y_pred)

0.8266666666666667

## 4.

In [12]:
# Fetch the fitted steps by name rather than by position, so the lookup
# keeps working if a step is added to or removed from the pipeline.
vectorizer = model.named_steps['vect']
svm = model.named_steps['svm']

In [13]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when it exists and fall back on older versions.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Pair every vocabulary term with its SVM coefficient and rank the pairs by
# absolute weight, largest first. ravel() always yields a 1-D array, even
# when the vocabulary contains a single term.
feature_importances = sorted(
    zip(feature_names, svm.coef_.ravel()),
    key=lambda pair: -abs(pair[1]))
pd.DataFrame(feature_importances, columns=['feature_name', 'feature_importance']).head(12)

Unnamed: 0,feature_name,feature_importance
0,not,-4.289733
1,great,3.676624
2,good,3.179028
3,bad,-2.900871
4,love,2.444754
5,worst,-2.418351
6,nice,2.263833
7,loved,2.230092
8,delicious,2.227952
9,poor,-2.192395


## 5.

### 5.1 _Cross-validation_ 

In [14]:
from sklearn.model_selection import GridSearchCV

_(2\*)_ _Мы ведь подбираем только параметры `SVC`, значит, наверное, нет необходимости делать кросс-валидацию для всего `Pipeline`: можно было бы один раз применить `TfidfVectorizer` к_ `X_train`_, а потом уже с его результатом запускать CV для `SVC`?_

In [15]:
# Pipeline to be tuned: the SVC is left at its defaults because its
# hyper-parameters are supplied by the grid defined below.
model = Pipeline(steps=[
    ('vect', TfidfVectorizer()),
    ('dense', DenseTransformer()),
    ('svm', SVC()),
])

# Keys follow the "<step name>__<parameter>" convention, so each entry is
# routed to the 'svm' step of the pipeline.
svc_grid_params = {
    'svm__C': [0.1, 0.5, 1.0, 10., 100.],
    'svm__kernel': ['linear', 'poly', 'rbf'],
}

In [None]:
# Exhaustive search over svc_grid_params with 5-fold cross-validation.
# n_jobs=-1 uses all available cores instead of a worker count hard-coded
# for one particular machine; the search results are unaffected.
gs = GridSearchCV(model, svc_grid_params, scoring='accuracy', cv=5, n_jobs=-1)
gs.fit(X_train, y_train)

In [18]:
# Show the SVC selected by the grid search; look it up by step name so the
# cell does not depend on the step's position in the pipeline.
gs.best_estimator_.named_steps['svm']

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [17]:
y_pred = gs.best_estimator_.predict(X_test)

# best_score_ is the mean cross-validated score of the best candidate,
# not an accuracy measured on the training set, so label it accordingly.
print('[SVC] accuracy for best (cv): {:.5f}'.format(gs.best_score_))
# accuracy_score expects (y_true, y_pred).
print('[SVC] accuracy for best (test): {:.5f}'.format(accuracy_score(y_test, y_pred)))
print('*' * 40)

[SVC] accuracy for best (train): 0.81458
[SVC] accuracy for best (test): 0.82667
****************************************


### 5.2 _Learning curve_

...

## 6. 

...

## 7. _PCA_

_(3\*)_ _Пока не совсем понял, что именно надо передавать в PCA: `texts` ведь в него не передашь (см. (1\*)). По идее, надо передавать выход `TfidfVectorizer`? Но тогда `TfidfVectorizer` надо применять один раз в начале ко всему `texts`. Правильно ли при этом отделять `TfidfVectorizer` от `SVC` (он ведь даст другой результат для_ `X_train`_)?_