# Imports

In [1]:
import pandas as pd
import re
import numpy as np
from sklearn.metrics import f1_score
from string import punctuation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn import linear_model
from sklearn.model_selection import cross_val_score
from  sklearn.model_selection import cross_validate
from sklearn.metrics import accuracy_score
from collections import Counter
import tensorflow as tf
from tensorflow.keras import backend as K
import matplotlib.pyplot as plt
from keras.optimizers import SGD
from keras.layers import Dense
from keras.models import Sequential
%matplotlib inline

# Google Drive

In [2]:
# Mount Google Drive at /gdrive so that files stored there can be read by path.
# This import only exists inside the Google Colab runtime.
from google.colab import drive 
drive.mount('/gdrive')

Mounted at /gdrive


# Data

In [3]:
# Column labels handed to pd.read_csv via `names=` when the affiliations file is loaded.
names = ['aff', 'variation']

In [4]:
# Load the tab-separated affiliations file from the mounted Drive.
# index_col=0 makes the first column the index; the data columns are labelled from `names`.
aff = pd.read_csv('/gdrive/My Drive/current_affiliations.tsv', sep='\t', index_col=0, names=names)

In [5]:
aff

Unnamed: 0,aff,variation
0,MPSU,Moscow Pedagogical State University
0,MPSU,"Moscow Pedagogical State University, Moscow, R..."
0,MPSU,MPSU
0,MPSU,"ИВ РАН, МПГУ, Москва, Россия"
0,MPSU,Московский государственный педагогический унив...
...,...,...
10000,Unknown,Хельсинки
10000,Unknown,Чехия
10000,Unknown,Днепропетровск
10000,Unknown,Нью-Йорк


In [6]:
# How many rows (name variations) each canonical affiliation has.
values = aff['aff'].value_counts()

In [7]:
# Affiliations that have two or fewer variations in the dataframe.
rare_labels = values[values <= 2].index
rare_aff = aff[aff['aff'].isin(rare_labels)]

# Two extra copies of the rare rows, to be appended to the main dataset.
double_rare_aff = pd.concat([rare_aff] * 2)

# After appending, affiliations that had a single variation have 3 rows,
# and those that had 2 variations have 6 rows.
aff = pd.concat([aff, double_rare_aff])

In [8]:
# Also double the whole dataset so that it can be split in the desired
# proportions while every class is still represented in each part.
# NOTE(review): every row now has an exact duplicate and the train/val/test
# split is done afterwards, so identical strings can end up in both the
# training and the evaluation splits. The reported scores may therefore be
# optimistic -- confirm this is intended.
aff = pd.concat([aff, aff])

In [9]:
# Map every distinct original index label to a consecutive integer (0, 1, 2, ...)
# in order of first appearance; these integers are used as class ids.
new_ind = {label: position for position, label in enumerate(dict.fromkeys(aff.index))}

# Reverse lookup: consecutive class id -> original index label.
old_ind = {position: label for label, position in new_ind.items()}

# Store the class id of every row in its own column and show the result.
aff['new_ind'] = aff.index.map(new_ind)
aff

Unnamed: 0,aff,variation,new_ind
0,MPSU,Moscow Pedagogical State University,0
0,MPSU,"Moscow Pedagogical State University, Moscow, R...",0
0,MPSU,MPSU,0
0,MPSU,"ИВ РАН, МПГУ, Москва, Россия",0
0,MPSU,Московский государственный педагогический унив...,0
...,...,...,...
2543,"Murom Institute of Vladimir State University, ...",Муромский институт (филиал,379
2543,"Murom Institute of Vladimir State University, ...",Муромский институт,379
2544,"JINR Laboratory of High Energy Physics, Dubana...",Институт физики высоких энергий,380
2545,"Center of applied communication, Moscow, Russia",Центр прикладных коммуникаций,381


In [10]:
# Inputs are the raw affiliation strings; targets are the consecutive class ids.
X, y = aff['variation'], aff['new_ind']

In [11]:
# Hold out 20% of all rows for testing, keeping class proportions (stratified).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
# Take 25% of the remaining rows (0.25 * 0.8 = 20% of all rows) for validation.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.25, stratify=y_train, random_state=42
)

In [12]:
# Number of distinct classes in the train, test and validation targets:
# every class is represented in each of the splits.
tuple(len(split.unique()) for split in (y_train, y_test, y_val))

(385, 385, 385)

# Logistic regression classifier

In [149]:
# Bag of character n-grams (1 to 4 characters long), capped at 1000 features.
vect = CountVectorizer(analyzer='char', ngram_range=(1, 4), max_features=1000)

# Learn the n-gram vocabulary on the training strings only,
# then encode the test strings with that same vocabulary.
X_train_cv = vect.fit_transform(X_train)
X_test_cv = vect.transform(X_test)

In [150]:
%%time
# Fit a logistic regression on the character n-gram counts.
# max_iter=5000 caps the number of solver iterations.
reg_model = linear_model.LogisticRegression(max_iter=5000)
reg_model.fit(X_train_cv, y_train)

CPU times: user 9min 24s, sys: 6min 21s, total: 15min 46s
Wall time: 8min 35s


In [None]:
# Predict class ids for the held-out test strings and collect the per-class
# and averaged metrics as a nested dict.
y_pred = reg_model.predict(X_test_cv)
cl_rep = classification_report(y_true=y_test, y_pred=y_pred, output_dict=True)

In [159]:
cl_rep['accuracy']

0.9730941704035875

In [160]:
cl_rep['weighted avg']

{'f1-score': 0.9712596259646796,
 'precision': 0.9761144031603675,
 'recall': 0.9730941704035875,
 'support': 892}

In [161]:
cl_rep['macro avg']

{'f1-score': 0.9723856083449645,
 'precision': 0.9742814977230562,
 'recall': 0.9782251837456335,
 'support': 892}

# NN dataset

In [13]:
# Character frequencies over all affiliation strings (iterating a string yields its characters).
vocab = Counter(ch for text in aff.variation for ch in text)

In [14]:
# Reserved ids: 0 for padding, 1 for characters not present in the vocabulary.
char2id = {'UNK': 1, 'PAD': 0}

# Every character seen in the data gets the next free id (2, 3, ...),
# in the order in which it appears in `vocab`.
char2id.update((ch, idx) for idx, ch in enumerate(vocab, start=len(char2id)))

In [15]:
# Inverse lookup table: integer id -> character (or the 'UNK' / 'PAD' marker).
id2char = {idx: symbol for symbol, idx in char2id.items()}

In [16]:
# Encode every affiliation string as an array of character ids
# (id 1 is used for any character missing from char2id).
X = [
    np.array([char2id.get(ch, 1) for ch in text])
    for text in aff.variation
]

In [84]:
# Length, in characters, of the longest encoded affiliation string.
MAX_LEN = max(map(len, X))
MAX_LEN

232

In [87]:
def prepare_dataset(data, max=232):
    """Turn an iterable of strings into a fixed-width matrix of character ids.

    Args:
        data: iterable of strings; each string is encoded character by character.
        max: number of columns in the result, passed to `pad_sequences` as `maxlen`.

    Returns:
        The 2-D integer array produced by
        `tf.keras.preprocessing.sequence.pad_sequences`, one row per input string.
        Characters missing from the module-level `char2id` mapping are encoded as id 1.
    """
    encoded = [
        np.array([char2id.get(ch, 1) for ch in text])
        for text in data
    ]
    return tf.keras.preprocessing.sequence.pad_sequences(encoded, maxlen=max)

In [88]:
# Encode and pad every split to MAX_LEN, the same length the Embedding layers
# are given as `input_length`, instead of relying on the hard-coded default of
# prepare_dataset.
X_train_rn = prepare_dataset(X_train, max=MAX_LEN)
X_val_rn = prepare_dataset(X_val, max=MAX_LEN)
X_test_rn = prepare_dataset(X_test, max=MAX_LEN)

# Class ids run from 0 to len(new_ind) - 1. Fixing the one-hot width explicitly
# guarantees that all three target matrices have the same number of columns even
# if the highest class id happens to be missing from one of the splits
# (to_categorical otherwise infers the width from the largest id it sees).
n_classes = len(new_ind)
y_train_rn = tf.keras.utils.to_categorical(y_train, num_classes=n_classes)
y_val_rn = tf.keras.utils.to_categorical(y_val, num_classes=n_classes)
y_test_rn = tf.keras.utils.to_categorical(y_test, num_classes=n_classes)

In [57]:
X_train_rn.shape, y_train.shape

((2673, 232), (2673,))

# CNN

In [143]:
# Character-level CNN: embedding -> 1-D convolution -> dense classifier.

# Size the output layer from the one-hot targets instead of a hard-coded 385.
n_classes = y_train_rn.shape[1]

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=len(char2id),
                              input_length=MAX_LEN, output_dim=10),
    tf.keras.layers.Conv1D(kernel_size=8, filters=4, strides=1),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(600, activation="relu"),
    tf.keras.layers.Dense(n_classes, activation='softmax'),
])

# Use the optimizer from tf.keras (the same package the model is built with)
# and the `learning_rate` argument; `lr` is a deprecated alias.
opt = tf.keras.optimizers.SGD(learning_rate=0.0005, momentum=0.9)

# Stop when the validation loss has not improved for 3 epochs. Validation data
# is passed to fit(), and monitoring the training loss would not detect overfitting.
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)

model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])

In [59]:
model.summary()

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 232, 10)           1690      
_________________________________________________________________
conv1d_8 (Conv1D)            (None, 225, 4)            324       
_________________________________________________________________
flatten_8 (Flatten)          (None, 900)               0         
_________________________________________________________________
dense_16 (Dense)             (None, 600)               540600    
_________________________________________________________________
dense_17 (Dense)             (None, 385)               231385    
Total params: 773,999
Trainable params: 773,999
Non-trainable params: 0
_________________________________________________________________


In [144]:
# Train for at most 25 epochs, evaluating on the validation split after each epoch.
# batch_size=1 means the weights are updated after every single sample;
# `callback` (EarlyStopping) may end training before epoch 25.
model.fit(X_train_rn, y_train_rn, 
          validation_data=(X_val_rn, y_val_rn),
          batch_size=1,
          epochs=25, 
          callbacks=[callback])

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<tensorflow.python.keras.callbacks.History at 0x7f3ccee03210>

In [None]:
# `Sequential.predict_classes` was removed in TensorFlow 2.6. Taking the arg-max
# of the predicted softmax probabilities gives the same class ids and works on
# every TensorFlow 2.x version.
y_pred_rn = np.argmax(model.predict(X_test_rn), axis=-1)

# Compare against the integer class ids (not the one-hot matrix) and keep the
# metrics as a nested dict.
cl_rep = classification_report(y_test, y_pred_rn, output_dict=True)

In [146]:
cl_rep['accuracy']

0.8396860986547086

In [147]:
cl_rep['weighted avg']

{'f1-score': 0.8323198895352768,
 'precision': 0.849427681114571,
 'recall': 0.8396860986547086,
 'support': 892}

In [148]:
cl_rep['macro avg']

{'f1-score': 0.8916653769971935,
 'precision': 0.8967536105351231,
 'recall': 0.9110761942602569,
 'support': 892}

# LSTM

In [128]:
# Character-level LSTM: embedding -> LSTM -> dense classifier.

# Size the output layer from the one-hot targets instead of a hard-coded 385.
n_classes = y_train_rn.shape[1]

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=len(char2id),
                              input_length=MAX_LEN, output_dim=10),
    # The LSTM is not the first layer, so its input shape comes from the
    # Embedding output, (MAX_LEN, 10). The former
    # `input_shape=(MAX_LEN, len(vocab))` argument did not describe that input
    # and has been dropped.
    # The LSTM returns only its last state, shape (batch, 8), so the Flatten
    # layer that used to follow it was a no-op and has been removed.
    tf.keras.layers.LSTM(8),
    tf.keras.layers.Dense(600, activation="relu"),
    tf.keras.layers.Dense(n_classes, activation='softmax'),
])

# Use the optimizer from tf.keras (the same package the model is built with)
# and the `learning_rate` argument; `lr` is a deprecated alias.
opt = tf.keras.optimizers.SGD(learning_rate=0.01, momentum=0.9)

# Stop when the validation loss has not improved for 3 epochs. Validation data
# is passed to fit(), and monitoring the training loss would not detect overfitting.
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)

model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])

In [129]:
# Train for at most 25 epochs, evaluating on the validation split after each epoch.
# batch_size=1 means the weights are updated after every single sample;
# `callback` (EarlyStopping) may end training before epoch 25.
model.fit(X_train_rn, y_train_rn, 
          validation_data=(X_val_rn, y_val_rn),
          batch_size=1,
          epochs=25,
          callbacks=[callback])

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25


<tensorflow.python.keras.callbacks.History at 0x7f3cd4c93a90>

In [None]:
# `Sequential.predict_classes` was removed in TensorFlow 2.6. Taking the arg-max
# of the predicted softmax probabilities gives the same class ids and works on
# every TensorFlow 2.x version.
y_pred_rn = np.argmax(model.predict(X_test_rn), axis=-1)

# Compare against the integer class ids (not the one-hot matrix) and keep the
# metrics as a nested dict.
cl_rep = classification_report(y_test, y_pred_rn, output_dict=True)

In [131]:
cl_rep['accuracy']

0.21188340807174888

In [135]:
cl_rep['weighted avg']

{'f1-score': 0.16380753801660777,
 'precision': 0.17779992911852302,
 'recall': 0.21188340807174888,
 'support': 892}

In [136]:
cl_rep['macro avg']

{'f1-score': 0.15315795864172788,
 'precision': 0.14827972517418644,
 'recall': 0.203209103250829,
 'support': 892}