# Анализ эмоциональной окраски предложений из Twitter - метод 1

Проект по курсу "Анализ неструктурированных данных" кафедры АЯ ВМК МГУ. 

Задача: сделать классификаторы двумя методами и оценить их.

Используемая здесь модель: Sentence BERT + SVM

## Подготовка
### Загрузка моделей, библиотек

In [10]:
!pip install deeppavlov
!pip install torch
!pip install transformers
import torch
import numpy as np
from sklearn.svm import LinearSVC
# Select the torch device: CUDA when a GPU is visible, otherwise the CPU.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    gpu_count = torch.cuda.device_count()
    print('There are %d GPU(s) available.' % gpu_count)
    # Report the name of GPU index 0.
    print('We will use the GPU:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [1]:
from deeppavlov import configs, build_model
# Build the DeepPavlov pipeline described by the bert_embedder config.
# download=True asks DeepPavlov to fetch the files the config refers to;
# the captured log shows the download is skipped when hashes already match.
model = build_model(configs.embedder.bert_embedder, download=True)

2020-11-22 22:35:42.64 INFO in 'deeppavlov.download'['download'] at line 138: Skipped http://files.deeppavlov.ai/deeppavlov_data/bert/multi_cased_L-12_H-768_A-12_pt.tar.gz download because of matching hashes
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package perluniprops to /root/nltk_data...
[nltk_data]   Package perluniprops is already up-to-date!
[nltk_data] Downloading package nonbreaking_prefixes to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package nonbreaking_prefixes is already up-to-date!


In [2]:
def get_emb(text):
    """Embed one text with the module-level ``model`` and return its vector.

    The model is called on a single-element batch and its result is unpacked
    into exactly seven outputs. Only the sixth output is used; it is treated
    here as the mean-pooled sentence embeddings (NOTE(review): confirm the
    output order against the bert_embedder config).

    Args:
        text: The text to embed.

    Returns:
        The first (and only) entry of the sixth model output.
    """
    batch = [text]
    _out1, _out2, _out3, _out4, _out5, sentence_means, _out7 = model(batch)
    first_item = sentence_means[0]
    return first_item

### Загрузка и подготовка данных

In [3]:
!git clone https://github.com/Samsung-IT-Academy/stepik-dl-nlp.git
!pip install pandas

import pandas as pd

# Both tweet dumps are read with the same options: UTF-8, ';' as the
# separator, and no header row (so columns get integer labels).
csv_options = dict(encoding='utf8', sep=';', header=None)
pos_texts = pd.read_csv(
    'stepik-dl-nlp/datasets/bert_sentiment_analysis/positive.csv',
    **csv_options
)
neg_texts = pd.read_csv(
    'stepik-dl-nlp/datasets/bert_sentiment_analysis/negative.csv',
    **csv_options
)
# Notebook preview of the first rows of the positive set.
pos_texts.head()

fatal: destination path 'stepik-dl-nlp' already exists and is not an empty directory.


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,408906692374446080,1386325927,pleease_shut_up,"@first_timee хоть я и школота, но поверь, у на...",1,0,0,0,7569,62,61,0
1,408906692693221377,1386325927,alinakirpicheva,"Да, все-таки он немного похож на него. Но мой ...",1,0,0,0,11825,59,31,2
2,408906695083954177,1386325927,EvgeshaRe,RT @KatiaCheh: Ну ты идиотка) я испугалась за ...,1,0,1,0,1273,26,27,0
3,408906695356973056,1386325927,ikonnikova_21,"RT @digger2912: ""Кто то в углу сидит и погибае...",1,0,1,0,1549,19,17,0
4,408906761416867842,1386325943,JumpyAlex,@irina_dyshkant Вот что значит страшилка :D\nН...,1,0,0,0,597,16,23,1


In [8]:
# Column 3 holds the tweet text; positive tweets come first, then negative.
pos_sents = pos_texts[3].values
neg_sents = neg_texts[3].values
sents = np.concatenate([pos_sents, neg_sents])
# Labels follow the same order: 1 per positive row, 0 per negative row.
labels = [1] * pos_texts.shape[0] + [0] * neg_texts.shape[0]
# Notebook preview of the first sentence.
sents[0]

'@first_timee хоть я и школота, но поверь, у нас то же самое :D общество профилирующий предмет типа)'

### Получим векторы предложений для всей выборки

In [11]:
# Embed every sentence, one model call per sentence, keeping the input order.
embs = list(map(get_emb, sents))

### Разделим и перемешаем

In [12]:
from sklearn.model_selection import train_test_split
from random import shuffle
# Bundle each sentence with its embedding and label so the three stay
# aligned through shuffling and splitting.
data = list(zip(sents, embs, labels))
shuffle(data)
# Hold out 30% of the examples for testing.
train, test = train_test_split(data, test_size=0.3)

## Обучим SVM на обучающем датасете

In [13]:
# Each item of `train` is a (sentence, embedding, label) tuple: fit the
# linear SVM on the embeddings against the labels.
train_features = np.array([sample[1] for sample in train])
train_targets = np.array([sample[2] for sample in train])
classifier = LinearSVC()
classifier.fit(train_features, train_targets)



LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

## Тестирование

In [18]:
# Each item of `test` is a (sentence, embedding, label) tuple: predict from
# the embeddings and keep the true labels for scoring.
test_features = [sample[1] for sample in test]
test_preds = classifier.predict(test_features)
test_labels = [sample[2] for sample in test]

from sklearn.metrics import precision_score, recall_score, accuracy_score

# Score the predictions against the true labels with each metric in turn.
accuracy, precision, recall = (
    metric(test_labels, test_preds)
    for metric in (accuracy_score, precision_score, recall_score)
)

In [19]:
accuracy

0.9925056207844117

In [20]:
precision

0.9923907068252177

In [21]:
recall

0.9928503198541118