In [5]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.tokenize import word_tokenize 
import re
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [6]:
#загрузка данных

df = pd.read_csv('train.csv')

In [15]:
df['Communication'] = (df['1category'] == 'Communication') | (df['2category'] == 'Communication')
df['Quality'] = (df['1category'] == 'Quality') | (df['2category'] == 'Quality')
df['Price'] = (df['1category'] == 'Price') | (df['2category'] == 'Price')
df['Safety'] = (df['1category'] == 'Safety') | (df['2category'] == 'Safety')

In [19]:
df = df.drop(columns = ['1category', '2category'])

In [20]:
df.to_csv('bebra.csv')

In [21]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [22]:
stopwords_list = stopwords.words('russian')
for word in ['не', 'нет', 'да', 'никогда', 'без', 'хорошо', 'лучше']:
  stopwords_list.remove(word)

In [23]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

In [24]:
#обработка

def preprocess(string):
  word_tokens = word_tokenize(string.lower())
  return " ".join([lemmatizer.lemmatize(w) for w in word_tokens if not w in stopwords_list and w.isalpha()])

In [25]:
#jxbcnrf текста

df['sentence'] = df['sentence'].map(preprocess)

In [29]:
#gfp,bnbt на тренировочный и валидационный наборы данных

training_data, test_data = train_test_split(df, train_size = 0.7, random_state=42)

In [32]:
#инициализация модели tf-idf

tfidf_transform = TfidfVectorizer(ngram_range=(1, 2), lowercase=True, max_features=30000)

In [33]:
X_train = tfidf_transform.fit_transform(training_data['sentence'])
X_validation = tfidf_transform.transform(test_data['sentence'])
y_train = training_data[['Communication', 'Quality', 'Price', 'Safety']]
y_test = test_data[['Communication', 'Quality', 'Price', 'Safety']]

In [34]:
logit = LogisticRegression()

In [35]:
#обучение
from sklearn.multiclass import OneVsRestClassifier
clf = OneVsRestClassifier(logit)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_validation)

In [40]:
y_test 

Unnamed: 0,Communication,Quality,Price,Safety
9672,False,True,False,False
10901,False,True,False,False
19283,True,False,False,False
18280,False,True,False,False
5466,False,False,False,False
...,...,...,...,...
4096,False,False,False,False
2213,True,False,False,False
6234,True,False,False,False
17651,True,False,False,False


In [41]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test, clf.predict_proba(X_validation), multi_class='ovr')

0.8226371037818205

In [47]:
data

array([[0.35793901, 0.22163683, 0.02484095, 0.01413242],
       [0.22058164, 0.54176241, 0.03410079, 0.01913691],
       [0.39394269, 0.22345062, 0.03843198, 0.01153791],
       ...,
       [0.31137111, 0.50017986, 0.01500022, 0.01009413],
       [0.32655853, 0.30686921, 0.02174855, 0.01074048],
       [0.65954746, 0.11928661, 0.01953005, 0.01008345]])

In [48]:
y_pred

array([[0, 0, 0, 0],
       [0, 1, 0, 0],
       [0, 0, 0, 0],
       ...,
       [0, 1, 0, 0],
       [0, 0, 0, 0],
       [1, 0, 0, 0]])

In [45]:
data = clf.predict_proba(X_validation)
dataset = pd.DataFrame({'Communication': data[:, 0], 'Quality': data[:, 1], 'Price': data[:, 2], 'Safety': data[:, 4]})

In [68]:
test_df = pd.read_csv('test_for_participants.csv')
X_test = tfidf_transform.transform(test_df['sentence'])

In [69]:
X_test.shape

(2152, 30000)

In [70]:
test_pred = clf.predict(X_test)

In [71]:
len(test_pred)

2152

In [72]:
test_data = clf.predict_proba(X_test)

In [73]:
dataset = pd.DataFrame({'Communication': test_data[:, 0], 'Quality': test_data[:, 1], 'Price': test_data[:, 2], 'Safety': test_data[:, 3]})

In [74]:
dataset

Unnamed: 0,Communication,Quality,Price,Safety
0,0.301038,0.504962,0.018177,0.018783
1,0.409743,0.217172,0.016680,0.011595
2,0.596976,0.236535,0.014084,0.022842
3,0.642198,0.198208,0.017027,0.010032
4,0.172263,0.363616,0.056753,0.014458
...,...,...,...,...
2147,0.220834,0.349715,0.230937,0.011885
2148,0.318443,0.400070,0.017178,0.008167
2149,0.409912,0.323713,0.028449,0.010897
2150,0.343435,0.330666,0.022860,0.015732


In [88]:
output_df = pd.DataFrame(columns = ['sentence', '1category', '1categoryprob', '2category', '2categoryprob'], index=range(2152))

In [89]:
output_df['1category'] = dataset.idxmax(axis=1)
output_df['1categoryprob'] = dataset.max(axis=1)
output_df['2category'] = dataset.mask(dataset.eq(dataset.max(axis=1), axis=0)).idxmax(axis=1)
output_df['2categoryprob'] = dataset.mask(dataset.eq(dataset.max(axis=1), axis=0)).max(axis=1)

In [91]:
output_df['sentence'] = test_df.sentence

In [92]:
output_df

Unnamed: 0,sentence,1category,1categoryprob,2category,2categoryprob
0,"Очень неприятная ситуация, надеюсь, банк либо ...",Quality,0.504962,Communication,0.301038
1,За что выражаю благодарность и банку и данному...,Communication,0.409743,Quality,0.217172
2,"Вывод: информация полученная в смс от банка, и...",Communication,0.596976,Quality,0.236535
3,Хочу по благодарить ее за чуткое отношение к н...,Communication,0.642198,Quality,0.198208
4,"Показал, что я и вклад могу свой пополнять пря...",Quality,0.363616,Communication,0.172263
...,...,...,...,...,...
2147,Верная (по их мнению) ставка 13%.,Quality,0.349715,Price,0.230937
2148,Спасибо Промсвязьбанку за гибкий и человечески...,Quality,0.400070,Communication,0.318443
2149,"Это говорит о том, что обслуживание находится ...",Communication,0.409912,Quality,0.323713
2150,Без платежки ничего не принимают!,Communication,0.343435,Quality,0.330666


In [94]:
output_df.to_csv('categories.csv')