In [22]:
import os
import pandas as pd
import csv
import string
import nltk
import numpy as np
import copy
import pickle
import re
import math
from IPython.display import display, HTML

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from collections import Counter
from num2words import num2words

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [31]:
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')

[nltk_data] Downloading package punkt to /home/vlad/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/vlad/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/vlad/nltk_data...


True

In [10]:
print(os.listdir('data/isw'))

['reports_processed.csv', 'reports.csv']


In [11]:
data_path = 'data/isw/reports_processed.csv'

In [12]:
data = pd.read_csv(data_path, on_bad_lines='skip', sep=';')

In [13]:
display(data.head())
display(data.info())

Unnamed: 0,date,url,page_title,content_title,content_html,content_text
0,2022-02-25,https://www.understandingwar.org/backgrounder/...,Russia-Ukraine Warning Update: Russian Offensi...,Russia-Ukraine Warning Update: Russian Offensi...,"<div class=""field field-name-body field-type-t...",\nRussian forces entered major Ukrainian citie...
1,2022-02-26,https://www.understandingwar.org/backgrounder/...,Russia-Ukraine Warning Update: Russian Offensi...,Russia-Ukraine Warning Update: Russian Offensi...,"<div class=""field field-name-body field-type-t...",\nRussian forces’ main axes of advance in the ...
2,2022-02-27,https://www.understandingwar.org/backgrounder/...,Russia-Ukraine Warning Update: Russian Offensi...,Russia-Ukraine Warning Update: Russian Offensi...,"<div class=""field field-name-body field-type-t...",\nThe Russian military has likely recognized t...
3,2022-02-28,https://www.understandingwar.org/backgrounder/...,"Russian Offensive Campaign Assessment, Februar...","Russian Offensive Campaign Assessment, Februar...","<div class=""field field-name-body field-type-t...",\nThe Russian military is reorganizing its mil...
4,2022-03-01,https://www.understandingwar.org/backgrounder/...,"Russian Offensive Campaign Assessment, March 1...","Russian Offensive Campaign Assessment, March 1","<div class=""field field-name-body field-type-t...",\nRussian forces are completing the reinforcem...


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 332 entries, 0 to 331
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   date           332 non-null    object
 1   url            332 non-null    object
 2   page_title     332 non-null    object
 3   content_title  332 non-null    object
 4   content_html   332 non-null    object
 5   content_text   332 non-null    object
dtypes: object(6)
memory usage: 15.7+ KB


None

In [14]:
content = data.iloc[1]['content_text']
HTML(content)

In [36]:
def remove_one_letter_word(data):
    words = word_tokenize(str(data))
    
    new_text = ""
    for word in words:
        if len(word) > 1:
            new_text = new_text + " " + word
    return new_text

def convert_lower_case(data):
    return np.char.lower(data)

def remove_stop_words(data):
    stop_words = set(stopwords.words('english'))
    stop_stop_words = {"no", "not"}
    stop_words = stop_words - stop_stop_words
    
    words = word_tokenize(str(data))
    
    new_text = ""
    for word in words:
        if word not in stop_words and len(word) > 1:
            new_text = new_text + " " + word
    return new_text 

def remove_punctuation(data):
    symbols = "!\"#$%^&*—()_-=+@:;?<>`{|}[\]~\n"
    
    for i in range(len(symbols)):
        data = np.char.replace(data, symbols[i], ' ')
        data = np.char.replace(data, "  ", " ")
    data = np.char.replace(data, ',', '')
    return data

def remove_apostrophe(data):
    return np.char.replace(data, "'", "")


In [37]:
def stemming(data):
    stemmer = PorterStemmer()
    
    tokens = word_tokenize(str(data))
    new_text = ""
    for token in tokens:
        new_text = new_text + " " + stemmer.stem(token)
    return new_text

def lemmatizing(data):
    lemmatizer = WordNetLemmatizer()
    
    tokens = word_tokenize(str(data))
    new_text = ""
    for token in tokens:
        new_text = new_text + " " + lemmatizer.lemmatize(token)
    return new_text

In [38]:
def convert_numbers(data):
    tokens = word_tokenize(str(data))
    new_text = ""
    for token in tokens:
        if token.isdigit():
            if int(token) < 100000000000:
                token = num2words(token)
            else:
                token = ''
        new_text = new_text + " " + token
    new_text = np.char.replace(new_text, "-", " ")
    return new_text

def remove_url_from_string(data):
    words = word_tokenize(str(data))
    
    new_text = ""
    for word in words:
        word = re.sub(r'^https?:\/\/.*[\r\n]*', '', str(word), flags=re.MULTILINE)
        word = re.sub(r'^http?:\/\/.*[\r\n]*', '', str(word), flags=re.MULTILINE)
        
        new_text = new_text + " " + word
    return new_text

In [39]:
def preprocess(data, word_root_algo="lemm"):
    data = remove_one_letter_word(data)
    data = remove_url_from_string(data)
    data = convert_lower_case(data)
    data = remove_punctuation(data)
    data = remove_apostrophe(data)
    data = remove_stop_words(data)
    data = convert_numbers(data)
    data = stemming(data)
    data = remove_punctuation(data)
    data = convert_numbers(data)
    
    if word_root_algo == "lemm":
        data = lemmatizing(data)
    else:
        data = stemming(data)
        
    data = remove_punctuation(data)
    data = remove_stop_words(data)
    
    return data

In [40]:
data['content_text_lemm'] = data['content_text'].apply(lambda x: preprocess(x, "lemm"))

In [41]:
data['content_text_stem'] = data['content_text'].apply(lambda x: preprocess(x, "stem"))

In [42]:
data.head(5)

Unnamed: 0,date,url,page_title,content_title,content_html,content_text,content_text_lemm,content_text_stem
0,2022-02-25,https://www.understandingwar.org/backgrounder/...,Russia-Ukraine Warning Update: Russian Offensi...,Russia-Ukraine Warning Update: Russian Offensi...,"<div class=""field field-name-body field-type-t...",\nRussian forces entered major Ukrainian citie...,russian forc enter major ukrainian citi inclu...,russian forc enter major ukrainian citi inclu...
1,2022-02-26,https://www.understandingwar.org/backgrounder/...,Russia-Ukraine Warning Update: Russian Offensi...,Russia-Ukraine Warning Update: Russian Offensi...,"<div class=""field field-name-body field-type-t...",\nRussian forces’ main axes of advance in the ...,russian forc main axe advanc last twenti four...,russian forc main axe advanc last twenti four...
2,2022-02-27,https://www.understandingwar.org/backgrounder/...,Russia-Ukraine Warning Update: Russian Offensi...,Russia-Ukraine Warning Update: Russian Offensi...,"<div class=""field field-name-body field-type-t...",\nThe Russian military has likely recognized t...,russian militari like recogn initi expect lim...,russian militari like recogn initi expect lim...
3,2022-02-28,https://www.understandingwar.org/backgrounder/...,"Russian Offensive Campaign Assessment, Februar...","Russian Offensive Campaign Assessment, Februar...","<div class=""field field-name-body field-type-t...",\nThe Russian military is reorganizing its mil...,russian militari reorgan militari effort atte...,russian militari reorgan militari effort atte...
4,2022-03-01,https://www.understandingwar.org/backgrounder/...,"Russian Offensive Campaign Assessment, March 1...","Russian Offensive Campaign Assessment, March 1","<div class=""field field-name-body field-type-t...",\nRussian forces are completing the reinforcem...,russian forc complet reinforc resuppli troop ...,russian forc complet reinforc resuppli troop ...


In [43]:
docs = data['content_text_lemm'].tolist()


In [44]:
len(docs)

332

In [45]:
cv = CountVectorizer(max_df=0.98, min_df=2)
word_count_vector = cv.fit_transform(docs)

word_count_vector.shape

(332, 6524)

In [49]:
with open("./data/isw/count_vectorizer_v1.pkl", 'wb') as handle:
    pickle.dump(cv, handle)

In [51]:
tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
tfidf_transformer.fit(word_count_vector)

In [53]:
with open("./data/isw/tfidf_transformer_v1.pkl", 'wb') as handle:
    pickle.dump(tfidf_transformer, handle)

In [54]:
df_idf = pd.DataFrame(tfidf_transformer.idf_, index=cv.get_feature_names_out(), columns=["idf_weights"])

df_idf.sort_values(by=["idf_weights"])

Unnamed: 0,idf_weights
attack,1.021245
war,1.021245
luhansk,1.021245
arm,1.024317
posit,1.024317
...,...
jahongir,5.709530
starodubivka,5.709530
jail,5.709530
kashlyhach,5.709530


In [55]:
tf_idf_vector = tfidf_transformer.transform(word_count_vector)