## Model Training

In [1]:
## All imports
# Deep-learning, data-handling and NLP libraries.
import tensorflow as tf
import pandas as pd
import nltk
# Side effect at import time: fetches the NLTK 'punkt' package (the download call
# only reports "already up-to-date" when the package is present).
nltk.download('punkt')
import numpy as np
from gensim.models import Word2Vec
# scikit-learn: data splitting, classical models / vectoriser, and evaluation metrics.
# NOTE(review): Word2Vec, RandomForestClassifier, TfidfVectorizer, DecisionTreeClassifier,
# confusion_matrix and ConfusionMatrixDisplay are not referenced in the cells visible
# here - confirm whether they are still needed.
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay

# Global TensorFlow switch: run tf.function code eagerly instead of as compiled graphs.
# NOTE(review): this usually slows training considerably; presumably enabled for
# debugging or to work around a graph-mode issue - confirm it is still required.
tf.config.run_functions_eagerly(True)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ravih\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Load the player/news index. The `news_file` converter strips the surrounding
# brackets, removes the single quotes, collapses doubled backslashes into single
# ones and splits on ", ", so each cell becomes a list of path strings.
files = pd.read_csv(
    'player_news_set.csv',
    converters={
        "news_file": lambda raw: (
            raw.strip("[]")
               .replace("'", "")
               .replace('\\\\', '\\')
               .split(", ")
        )
    },
).drop(columns=['Unnamed: 0'])
files

Unnamed: 0,player_name,news_file,Transfered
0,Abraham,[D:\Academics\UMN-MSBA\Term 3\MSBA 6460 - Adva...,1
1,Ake,[D:\Academics\UMN-MSBA\Term 3\MSBA 6460 - Adva...,0
2,Allan,[D:\Academics\UMN-MSBA\Term 3\MSBA 6460 - Adva...,1
3,Alli,[D:\Academics\UMN-MSBA\Term 3\MSBA 6460 - Adva...,0
4,Ampadu,[D:\Academics\UMN-MSBA\Term 3\MSBA 6460 - Adva...,1
...,...,...,...
138,Wright,[D:\Academics\UMN-MSBA\Term 3\MSBA 6460 - Adva...,1
139,Xhaka,[D:\Academics\UMN-MSBA\Term 3\MSBA 6460 - Adva...,0
140,Zaha,[D:\Academics\UMN-MSBA\Term 3\MSBA 6460 - Adva...,0
141,Ziyech,[D:\Academics\UMN-MSBA\Term 3\MSBA 6460 - Adva...,0


In [3]:
# Class balance of the target: number of rows per distinct `Transfered` value.
files['Transfered'].value_counts()

0    93
1    50
Name: Transfered, dtype: int64

## Collecting training set and getting the articles

In [4]:
def getAllNews(pathList):
    """Concatenate the first line of every file in ``pathList`` into one string.

    Each contribution is prefixed with a single space, so the result starts
    with a space whenever ``pathList`` is non-empty. The first line keeps its
    trailing newline if the file has one.

    Args:
        pathList: iterable of file paths to read, in order.

    Returns:
        str: the space-prefixed first lines joined together; ``''`` when
        ``pathList`` is empty. An empty file contributes only the separating
        space instead of raising ``IndexError``.

    Raises:
        OSError: if any path cannot be opened.
    """
    # NOTE(review): only the first line of each file is used - presumably every
    # article is stored on a single line; confirm against the data files.
    # NOTE(review): files are opened with the platform default encoding, as before.
    pieces = []
    for path in pathList:
        with open(path) as f:
            # readline() returns '' for an empty file and avoids loading the
            # whole file just to keep its first line.
            first_line = f.readline()
        pieces.append(' ' + first_line)
    # Join once at the end rather than growing a string inside the loop.
    return ''.join(pieces)

In [5]:
# Swap each player's list of article paths for the aggregated article text.
# Note: this cell consumes `news_file`, so it cannot be re-run without first
# re-running the cell that loads `files`.
files = (
    files
    .assign(agg_news=files['news_file'].apply(getAllNews))
    .drop(columns=['news_file'])
)
files

Unnamed: 0,player_name,Transfered,agg_news
0,Abraham,1,The west London giants have accumulated 12 po...
1,Ake,0,The Spaniard has hardly played for the Cityze...
2,Allan,1,The 33-year-old has not played a single minut...
3,Alli,0,PSG's newly appointed manager Mauricio Pochet...
4,Ampadu,1,The Welshman spent the previous season on loa...
...,...,...,...
138,Wright,1,The Gunners wanted to sign both Aouar and Tho...
139,Xhaka,0,Arsenal have looked strong in the opening few...
140,Zaha,0,Arsenal pushed through a deal to sign struggl...
141,Ziyech,0,The duo picked up hamstring injuries during t...


## Modeling

### Train-test split

In [6]:
# Hold out 30 rows for testing. The split is stratified on `Transfered` and uses a
# fixed seed so it is repeatable.
training_X, test_X, training_y, test_y = train_test_split(
    files[['player_name', 'agg_news']],
    files['Transfered'],
    test_size=30,
    stratify=files['Transfered'],
    random_state=420,
)

In [7]:
def _one_hot_binary(labels):
    """One-hot encode binary labels: 0 -> [1, 0], 1 -> [0, 1].

    Args:
        labels: iterable of labels, each equal to 0 or 1.

    Returns:
        np.ndarray: integer array with one row per label and two columns.

    Raises:
        ValueError: if any label is not 0 or 1. Dropping such labels silently
            would leave the encoded targets misaligned with the features.
    """
    label_array = np.asarray(labels)
    if not np.isin(label_array, (0, 1)).all():
        raise ValueError("labels must contain only 0 and 1")
    # Row k of the 2x2 identity matrix is the one-hot vector for class k.
    return np.eye(2, dtype=int)[label_array.astype(int)]


training_y_2col = _one_hot_binary(training_y)
testing_y_2col = _one_hot_binary(test_y)
# The test array was originally stored under the misspelled name `testing_y_3col`
# (leaving `testing_y_2col` a plain list); keep the old name as an alias so any
# cell that still refers to it keeps working.
testing_y_3col = testing_y_2col

### Setting up the model

In [8]:
from transformers import *
from transformers import BertTokenizer, TFBertModel, BertConfig
import tensorflow_hub as hub
import tensorflow_text as text



In [9]:
# TF Hub handles for the uncased English BERT preprocessing model and encoder.
BERT_PREPROCESS_URL = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
BERT_ENCODER_URL = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4"

# Load both once at module level; buildBERTModel() reuses these layer objects.
bert_preprocess = hub.KerasLayer(BERT_PREPROCESS_URL)
bert_encoder = hub.KerasLayer(BERT_ENCODER_URL)

In [30]:
def buildBERTModel():
    """Build the (uncompiled) BERT + LSTM text classifier.

    The model takes a batch of raw strings, passes them through the
    module-level ``bert_preprocess`` and ``bert_encoder`` TF Hub layers, and
    feeds the encoder's ``sequence_output`` through an LSTM, dropout, a
    regularised dense layer and a 2-unit sigmoid output layer.

    Returns:
        tf.keras.Model: maps a string tensor of shape ``(batch,)`` to scores
        of shape ``(batch, 2)``.
    """
    # Scalar string input: each example is a single piece of text.
    text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')

    # Tokenise / pad the raw text into the dict of tensors the encoder expects.
    preprocessing_layer = hub.KerasLayer(bert_preprocess, name='preprocessing')
    encoder_inputs = preprocessing_layer(text_input)

    # trainable=False: only the layers stacked on top of the encoder are trained.
    encoder = hub.KerasLayer(bert_encoder, trainable=False, name='BERT_encoder')
    outputs = encoder(encoder_inputs)

    # Use the full output sequence so the LSTM can summarise it into one vector.
    net = outputs['sequence_output']
    net = tf.keras.layers.LSTM(64, name = 'LSTM')(net)
    net = tf.keras.layers.Dropout(0.5)(net)
    net = tf.keras.layers.Dense(64, activation='relu', kernel_regularizer=tf.keras.regularizers.l1_l2(.01))(net)
    # NOTE(review): the two sigmoid units are independent, so the scores need not
    # sum to 1. Softmax is the usual choice for mutually exclusive classes; the
    # compile cell pairs this layer with BinaryCrossentropy, so confirm before
    # changing either side.
    net = tf.keras.layers.Dense(2, activation='sigmoid', name='classifier')(net)
    return tf.keras.Model(text_input, net)

In [31]:
# Instantiate the classifier and configure training: Adam optimiser,
# binary cross-entropy loss, accuracy reported during fitting.
bert_model = buildBERTModel()
bert_model.compile(
    optimizer='adam',
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=['accuracy'],
)

In [32]:
# Print the layer-by-layer architecture (output shapes and parameter counts).
bert_model.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text (InputLayer)              [(None,)]            0           []                               
                                                                                                  
 preprocessing (KerasLayer)     {'input_word_ids':   0           ['text[0][0]']                   
                                (None, 128),                                                      
                                 'input_mask': (Non                                               
                                e, 128),                                                          
                                 'input_type_ids':                                                
                                (None, 128)}                                                

In [40]:
# Train for 5 epochs on the aggregated article text against the one-hot labels,
# one example per batch (batch_size = 1).
bert_model.fit(x = training_X['agg_news'], y = training_y_2col, epochs=5, batch_size = 1)

Epoch 1/5




Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1bb01da9ee0>

## Predictions

In [46]:
# Score the held-out articles; the classifier head has 2 units, so each row of
# `predictions` holds two class scores.
predictions = bert_model.predict(x = test_X['agg_news'])



In [52]:
# Macro-averaged F1 on the test set; argmax over axis 1 picks the higher-scoring
# column as the predicted label (column 0 -> label 0, column 1 -> label 1).
f1_score(test_y, np.argmax(predictions, axis = 1), average = 'macro')

0.6527777777777779

In [48]:
# Test-set accuracy, using the argmax of the two class scores as the predicted label.
accuracy_score(test_y, np.argmax(predictions, axis = 1))

0.6666666666666666

In [49]:
# Test-set precision with scikit-learn's default settings
# (presumably binary averaging with label 1 as the positive class - confirm).
precision_score(test_y, np.argmax(predictions, axis = 1))

0.5

In [50]:
# Test-set recall with scikit-learn's default settings
# (presumably binary averaging with label 1 as the positive class - confirm).
recall_score(test_y, np.argmax(predictions, axis = 1))

0.7