#import

In [1]:
import pandas as pd
import re
import numpy as np
from sklearn.metrics import f1_score
from string import punctuation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn import linear_model
from sklearn.model_selection import cross_val_score
from  sklearn.model_selection import cross_validate

#Google drive

In [2]:
from google.colab import drive 
drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


#Data

In [3]:
names = ['ind', 'aff', 'variation']

In [4]:
aff = pd.read_csv('/gdrive/My Drive/current_affiliations.tsv', sep='\t', index_col=0, names=names)

In [5]:
aff

Unnamed: 0_level_0,aff,variation
ind,Unnamed: 1_level_1,Unnamed: 2_level_1
0,MPSU,Moscow Pedagogical State University
0,MPSU,"Moscow Pedagogical State University, Moscow, R..."
0,MPSU,MPSU
0,MPSU,"ИВ РАН, МПГУ, Москва, Россия"
0,MPSU,Московский государственный педагогический унив...
...,...,...
10000,Unknown,Хельсинки
10000,Unknown,Чехия
10000,Unknown,Днепропетровск
10000,Unknown,Нью-Йорк


#Classifier

In [None]:
vect = CountVectorizer(analyzer='char', ngram_range=(1,4))
X = vect.fit_transform(aff['variation'])
y = aff.index
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [None]:
reg_model = linear_model.LogisticRegression(max_iter=5000)
reg_model.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=5000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
y_pred = reg_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         0
           4       1.00      1.00      1.00         1
           7       0.83      0.83      0.83         6
          10       1.00      1.00      1.00         3
          11       1.00      1.00      1.00         1
          13       0.00      0.00      0.00         0
          15       0.00      0.00      0.00         0
          16       1.00      1.00      1.00         1
          21       0.00      0.00      0.00         2
          23       0.50      1.00      0.67         1
          25       1.00      1.00      1.00         1
          34       1.00      1.00      1.00         1
          38       1.00      1.00      1.00         1
          39       1.00      1.00      1.00         3
          43       0.00      0.00      0.00         2
          44       1.00      1.00      1.00         1
          48       0.50    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [9]:
values = aff.index.value_counts()

In [None]:
len(values[values>2].index)

115

In [None]:
len(values)

385

In [10]:
aff2 = aff[aff.index.isin(values[values>2].index)] # только аффилиации, которые встретились больше 3 раз включительно

In [11]:
vect = CountVectorizer(analyzer='char', ngram_range=(1,4))
X = vect.fit_transform(aff2['variation'])
y = aff2.index
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [12]:
reg_model = linear_model.LogisticRegression(max_iter=5000)
reg_model.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=5000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [18]:
y_pred = reg_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           4       1.00      1.00      1.00         1
           7       1.00      0.71      0.83         7
          11       1.00      1.00      1.00         1
          14       1.00      1.00      1.00         1
          15       0.67      1.00      0.80         2
          16       1.00      1.00      1.00         1
          17       1.00      1.00      1.00         1
          18       1.00      0.50      0.67         2
          22       1.00      1.00      1.00         1
          23       1.00      1.00      1.00         1
          24       1.00      1.00      1.00         1
          39       1.00      1.00      1.00         1
          42       1.00      1.00      1.00         1
          50       0.00      0.00      0.00         1
          63       1.00      1.00      1.00         1
          71       1.00      1.00      1.00         1
          73       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# NN

In [6]:
!pip install pandas scikit-learn matplotlib



In [7]:
from sklearn.metrics import accuracy_score
from collections import Counter
import tensorflow as tf
from tensorflow.keras import backend as K
import matplotlib.pyplot as plt
from keras.optimizers import SGD
from keras.layers import Dense
from keras.models import Sequential
%matplotlib inline

In [8]:
vocab = Counter()

for af in aff.variation:
    vocab.update(af)

In [9]:
char2id = {'UNK':1, 'PAD':0}

for char in vocab:
    char2id[char] = len(char2id)

In [10]:
id2char = {i:char for char, i in char2id.items()}

In [70]:
X = []

for word in aff.variation:
    ids = np.array([char2id.get(token, 1) for token in word])
    X.append(ids)

In [76]:
X = tf.keras.preprocessing.sequence.pad_sequences(X, maxlen=MAX_LEN)

In [80]:
new_ind = dict()
i = 0
for index in aff.index:
  if index not in new_ind.keys():
    new_ind[index] = i
    i += 1

old_ind = dict()
for key, value in new_ind.items():
  old_ind[value]=key

aff['new_ind'] = aff.index
aff['new_ind'] = aff.new_ind.map(new_ind)
aff

Unnamed: 0_level_0,aff,variation,new_ind
ind,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,MPSU,Moscow Pedagogical State University,0
0,MPSU,"Moscow Pedagogical State University, Moscow, R...",0
0,MPSU,MPSU,0
0,MPSU,"ИВ РАН, МПГУ, Москва, Россия",0
0,MPSU,Московский государственный педагогический унив...,0
...,...,...,...
10000,Unknown,Хельсинки,384
10000,Unknown,Чехия,384
10000,Unknown,Днепропетровск,384
10000,Unknown,Нью-Йорк,384


In [21]:
MAX_LEN = max(len(x) for x in X)
MAX_LEN

232

In [22]:
len(char2id)

169

In [81]:
y = tf.keras.utils.to_categorical(aff.new_ind)

In [83]:
X.shape, y.shape

((1556, 232), (1556, 385))

In [98]:
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(input_dim=len(char2id), 
                                    input_length=MAX_LEN, output_dim=100))
model.add(tf.keras.layers.Conv1D(kernel_size=8, filters=4, strides=1))
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(600, activation="relu"))
model.add(tf.keras.layers.Dense(385, activation='softmax'))
opt = SGD(lr=0.01, momentum=0.9)
model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])

In [89]:
model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 232, 100)          16900     
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 225, 4)            3204      
_________________________________________________________________
flatten_5 (Flatten)          (None, 900)               0         
_________________________________________________________________
dense_10 (Dense)             (None, 600)               540600    
_________________________________________________________________
dense_11 (Dense)             (None, 385)               231385    
Total params: 792,089
Trainable params: 792,089
Non-trainable params: 0
_________________________________________________________________


In [99]:
model.fit(X, y, 
          validation_split=0.25,
          batch_size=128,
          epochs=25)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<tensorflow.python.keras.callbacks.History at 0x7f955326dad0>