In [1]:
from pathlib import Path

In [2]:
data_file = Path('D:/junta.txt')

In [3]:
temp_data = list()
with data_file.open('r') as f:
    comments = f.read()
    for i, line in enumerate(comments.split('\n')):
        line = line.strip()
        if line:
            line = line.replace('п»ї', ' ')
            try:
                label, author, text = line.split('|')
            except ValueError as e:
                pass
                print(f'Line number {i}')
                print(line, '\n')
            else:
                label = label.strip()
                author = author.strip()
                text = text.strip()
                if label in ('neutral', 'trolling'):
                    temp_data.append(dict(label=label, author=author, text=text))
                else:
                    print('BAD LABEL')
                    print(f'Line number {i}')
                    print(line, '\n')

In [4]:
class DataFrame:
    def __init__(self, comments):
        self.comments = comments
        
    def __iter__(self):
        for x in self.comments:
            yield x
        
    def _field_by_name(self, name):
        return [x[name] for x in self]
    
    @property
    def labels(self):
        return self._field_by_name('label')
    
    @property
    def authors(self):
        return self._field_by_name('author')
    
    @property
    def texts(self):
        return self._field_by_name('text')

In [5]:
df = DataFrame(temp_data)

In [6]:
from collections import Counter
author_counter = Counter(df.authors)
author_counter.most_common(10)

[('Piscator76', 19),
 ('Berlino10', 18),
 ('Stetschkin', 16),
 ('Fremdhier', 15),
 ('Mandalore', 14),
 ('Pippin', 11),
 ('faktenfaktenfakten', 10),
 ('Waltraud Gundlach', 10),
 ('Heekhof', 9),
 ('Reverend Wicks Cherrycoke', 9)]

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

In [8]:
data_train, data_test, label_train, label_test = train_test_split(df.texts, df.labels, test_size=0.3, 
                                                                  stratify=df.labels, shuffle=True,
                                                                  random_state=42)

le = LabelEncoder()
le.fit(df.labels)

label_train = le.transform(label_train)
label_test = le.transform(label_test)

In [9]:
class TextCleaner(TransformerMixin):
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, texts, y=None):
        texts = [self.transform_one(t) for t in texts]
        return texts
    
    def transform_one(self, text):
        text = text.lower()
        text = self.replace_numbers(text)
        return text
    
    def replace_numbers(self, text):
        return re.sub(r'\d+', '000', text)

In [10]:
import regex as re
def my_tokenize(text):
    return re.split(r'[^\p{L}]+', text)

In [11]:
from nltk.corpus import stopwords

In [12]:
features_pipe = Pipeline([
    ('cleaning', TextCleaner()),
    ('counts', TfidfVectorizer(tokenizer=my_tokenize, stop_words=stopwords.words('german'))),
])

pipe = Pipeline([
    ('features', features_pipe),
    ('nb', LogisticRegression()),
])

hyper = {
    'features__counts__analyzer': ['word'],
    'features__counts__ngram_range': [(1, 1), (1, 2), (1, 3), (3, 4), (3, 5), (4, 5), (4, 6)],
    'nb__C': [0.1, 1.0, 10],
}

clf = GridSearchCV(pipe, hyper, scoring='f1', cv=3, refit=True, verbose=2, n_jobs=-1)

In [13]:
clf.fit(data_train, label_train)

print('Best Score', clf.best_score_)
print('Best Params', clf.best_params_)

Fitting 3 folds for each of 21 candidates, totalling 63 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   11.1s
[Parallel(n_jobs=-1)]: Done  63 out of  63 | elapsed:   19.8s finished


Best Score 0.759145935469
Best Params {'features__counts__analyzer': 'word', 'features__counts__ngram_range': (1, 3), 'nb__C': 10}




In [14]:
predict_test = clf.predict(data_test)

In [15]:
report = classification_report(y_true=label_test, y_pred=predict_test)
print(report)

              precision    recall  f1-score   support

           0       0.76      0.41      0.53       147
           1       0.67      0.90      0.77       195

   micro avg       0.69      0.69      0.69       342
   macro avg       0.71      0.66      0.65       342
weighted avg       0.71      0.69      0.67       342



In [16]:
weights = clf.best_estimator_.__dict__['steps'][1][1].coef_[0]

In [17]:
ngrams = clf.best_estimator_.__dict__['steps'][0][1].__dict__['steps'][1][1].get_feature_names()

In [18]:
ngrams_weights = sorted(zip(ngrams, weights), key=lambda x: -abs(x[1]))

In [19]:
for ng, w in ngrams_weights[:20]:
    print(f'{ng:>20} : {w:.2f}')

                krim : 5.38
             ukraine : 3.33
                nato : 2.91
             artikel : -2.91
            russland : 2.83
                kiew : 2.42
              westen : 2.28
                  eu : 2.07
               danke : -1.94
              kosovo : 1.87
              russen : 1.79
          sanktionen : 1.69
              putsch : 1.67
       demonstranten : 1.57
            annexion : 1.56
           regierung : 1.56
          faschisten : 1.52
         deutschland : -1.49
              durfte : 1.38
                 usa : 1.37
