In [1]:
# Install and update spaCy 
!pip install -U spacy

# Download the english language model
!python -m spacy download en

Collecting spacy
  Downloading spacy-3.5.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.6/6.6 MB[0m [31m131.6 MB/s[0m eta [36m0:00:00[0m
Collecting spacy-legacy<3.1.0,>=3.0.11
  Downloading spacy_legacy-3.0.12-py2.py3-none-any.whl (29 kB)
Collecting pathy>=0.10.0
  Downloading pathy-0.10.1-py3-none-any.whl (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.9/48.9 KB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: spacy-legacy, pathy, spacy
  Attempting uninstall: spacy-legacy
    Found existing installation: spacy-legacy 3.0.10
    Not uninstalling spacy-legacy at /shared-libs/python3.9/py/lib/python3.9/site-packages, outside environment /root/venv
    Can't uninstall 'spacy-legacy'. No files were found to uninstall.
  Attempting uninstall: pathy
    Found existing installation: pathy 0.6.2
    Not uninstalling pathy at /shared-libs/python3.9/py

In [2]:
import os
import csv
import pandas as pd
import numpy as np

from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.pipeline import Pipeline

In [3]:
# Read the frame generated in PrepareData and discard its first column
# (presumably the row index that was saved with the CSV -- TODO confirm).
df = pd.read_csv('df.csv')
df = df.drop(columns=df.columns[0])

df

Unnamed: 0,Filename,CGMech,content
0,ABB Group Annual Report 2015_English-2.txt,8,\n \n The ABB Group Annual Report 2015 \n \n \...
1,ABB_02.txt,8,ABB Group Annual Report 2002\n \n Financial re...
2,ABB_03.txt,8,Important information regarding the ABB 2003 A...
3,ABB_04.txt,8,ABB Annual Report 2004 \n \n Financial review ...
4,ABB_05.pdf.txt,9,i\n \n D\n e\n s\n g\n n\n e\n d\n \n \n \n b...
...,...,...,...
1694,WISeKey_2017.pdf.txt,9,ANNUAL REPORT 2017\n \n O I N \n \n - DIGI...
1695,WiSeKey-Annual-Report-2018.pdf.txt,9,Annual Report 2018\n \n ARCHITECTING A WISER W...
1696,ZurRoseGroup-2017-EN-Vollbericht.pdf.txt,9,Annual Report 2017\n \n Zur Rose Group \n \n ...
1697,ZurRoseGroup-2018-EN-Annual-Report.pdf.txt,9,Annual Report 2018\n \n Zur Rose Group \n \n ...


In [4]:
#Group CGMech by categories: 'low', 'middle', 'high'
def categorize(value):
    """Translate a CGMech score into a coarse category label.

    The value is first converted with int(). Scores 1-5 map to 'low',
    6-8 to 'middle' and 9-10 to 'high'; any other integer gives 'unknown'.
    """
    score = int(value)
    # (lowest score, highest score, label) -- both bounds are inclusive
    bands = ((1, 5, 'low'), (6, 8, 'middle'), (9, 10, 'high'))
    for lowest, highest, label in bands:
        if lowest <= score <= highest:
            return label
    return 'unknown'

# Derive the label column "cat" by running categorize over every CGMech score
cgmech_categories = df['CGMech'].apply(categorize)
df['cat'] = cgmech_categories

print(df)


                                        Filename  CGMech  \
0     ABB Group Annual Report 2015_English-2.txt       8   
1                                     ABB_02.txt       8   
2                                     ABB_03.txt       8   
3                                     ABB_04.txt       8   
4                                 ABB_05.pdf.txt       9   
...                                          ...     ...   
1694                        WISeKey_2017.pdf.txt       9   
1695          WiSeKey-Annual-Report-2018.pdf.txt       9   
1696    ZurRoseGroup-2017-EN-Vollbericht.pdf.txt       9   
1697  ZurRoseGroup-2018-EN-Annual-Report.pdf.txt       9   
1698  ZurRoseGroup-2019-EN-Annual-Report.pdf.txt       9   

                                                content     cat  
0     \n \n The ABB Group Annual Report 2015 \n \n \...  middle  
1     ABB Group Annual Report 2002\n \n Financial re...  middle  
2     Important information regarding the ABB 2003 A...  middle  
3     ABB Annua

In [5]:
# Class balance: absolute count and rounded percentage share of each category
n_rows = len(df)
cat_counts = df['cat'].value_counts()

print(cat_counts)
print('Total: ', n_rows)
print()

# Share of all rows in percent, rounded to whole numbers
cat_percentages = (cat_counts / n_rows * 100).round(0)

print("Percentage:")
print(cat_percentages)


high      912
middle    645
low       142
Name: cat, dtype: int64
Total:  1699

Percentage:
high      54.0
middle    38.0
low        8.0
Name: cat, dtype: float64


In [6]:
# Target selection -- keep exactly one of the two assignments active
# (remove the leading ## from the one you want):
#   * the CGMech proxy score [1-10]
##ylabels = df.CGMech
#   * the grouped category ['low','middle','high']
ylabels = df['cat']

# Raw document text is the only input feature
X = df['content']

# Hold out 20% of the documents for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    ylabels,
    test_size=0.2,
    random_state=1232,
)



In [7]:
# Base rate: test accuracy of a model that always predicts the most frequent
# training label. No features are needed for that, so None is passed as X.
dummy = DummyClassifier(strategy='most_frequent').fit(None, y_train)
baserate = dummy.score(None, y_test)
baserate

0.5558823529411765

In [8]:
# Estimator used as the final pipeline step -- keep exactly one assignment
# active (remove the leading ## from the line you want).

classifier = LogisticRegression()
##classifier = RandomForestClassifier()
##classifier = SVC(kernel='linear', C=1, probability=True)

In [23]:
import spacy

# Lazily-created spaCy pipeline shared by every spacyTokens call. Loading the
# model is expensive and the tokenizer is invoked once per document, so the
# model must not be reloaded on each call.
_SPACY_NLP = None


def _get_spacy_nlp():
    """Return the shared 'en_core_web_sm' pipeline, loading it on first use."""
    global _SPACY_NLP
    if _SPACY_NLP is None:
        nlp = spacy.load('en_core_web_sm')  # load English language model
        # increase max length of document (len(logitech09.pdf.txt) is over 5M)
        nlp.max_length = 6000000
        _SPACY_NLP = nlp
    return _SPACY_NLP


def spacyTokens(SpacyString):
    """Tokenize a string (content of a .txt doc) into cleaned word tokens.

    Parameters
    ----------
    SpacyString : str
        Raw document text.

    Returns
    -------
    list of str
        Lower-cased, whitespace-stripped lemmas of every token that is purely
        alphabetic and not a stop word, in document order.
    """
    doc = _get_spacy_nlp()(SpacyString)
    cleaned_tokens = []
    for token in doc:
        # A purely alphabetic token can be neither digit-only nor punctuation,
        # so is_alpha alone covers the former is_digit / is_punct checks.
        if token.is_stop or not token.is_alpha:
            continue
        cleaned_token = token.lemma_.lower().strip()
        if cleaned_token:
            cleaned_tokens.append(cleaned_token)
    return cleaned_tokens

# Assemble the pipeline: TF-IDF features built from spacyTokens output feed the chosen classifier
tfidf_vector = TfidfVectorizer(tokenizer=spacyTokens)
pipeline_steps = [
    ('vectorizer', tfidf_vector),
    ('classifier', classifier),
]
pipe = Pipeline(pipeline_steps)


In [24]:
# Fit the whole pipeline (vectorizer step, then classifier step) on the training split.
print("fitting...")
pipe.fit(X_train, y_train)

fitting...


In [35]:
# Extract tokens and their model coefficients from the fitted pipeline

version=1
treshold=0.01 # tokens whose coefficient magnitude is not above this are left out of the top_* lists

vectorizer = pipe.named_steps['vectorizer']
classifier = pipe.named_steps['classifier']

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when available and fall back on old releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()

# Requires an estimator that exposes coef_ (e.g. LogisticRegression or a linear SVC).
# NOTE(review): for a model with more than two classes coef_ holds one row per
# class, so row 0 only describes classifier.classes_[0] -- confirm this is the
# class the token lists are meant to explain.
coefs = classifier.coef_[0]

# Filtering uses the raw coefficient; rounding is applied only to the stored value.
top_positive_tokens = [(name, round(coef, 2)) for name, coef in zip(feature_names, coefs) if coef > treshold]
top_negative_tokens = [(name, round(coef, 2)) for name, coef in zip(feature_names, coefs) if coef < -treshold]
all_tokens = [(name, round(coef, 2)) for name, coef in zip(feature_names, coefs)]
print("Top positive tokens:", top_positive_tokens)
print("Top negative tokens:", top_negative_tokens)



Top positive tokens: [('aaa', 0.02), ('aachen', 0.03), ('aaland', 0.01), ('aatb', 0.02), ('ab', 0.02), ('abbvie', 0.02), ('abcdefg', 0.05), ('abcp', 0.04), ('abdelhamid', 0.02), ('abegg', 0.04), ('abend', 0.01), ('abgeschlossene', 0.02), ('abicipar', 0.12), ('abidjan', 0.01), ('abingworth', 0.06), ('ablation', 0.01), ('abolish', 0.02), ('abolition', 0.01), ('abril', 0.01), ('abrogation', 0.01), ('absence', 0.04), ('absent', 0.02), ('absentee', 0.01), ('absolute', 0.03), ('absorber', 0.03), ('abstention', 0.01), ('abuse', 0.01), ('abzugeben', 0.03), ('ac', 0.08), ('aca', 0.01), ('acacia', 0.03), ('academy', 0.05), ('accarda', 0.04), ('accelerate', 0.04), ('accelerated', 0.06), ('accelerator', 0.02), ('accell', 0.04), ('access', 0.24), ('accessarena', 0.01), ('accessible', 0.02), ('accessibleroche', 0.01), ('accident', 0.1), ('accommodation', 0.01), ('accompanying', 0.01), ('accordance', 0.07), ('accordingly', 0.02), ('accordion', 0.01), ('accountable', 0.02), ('accredit', 0.01), ('accre

In [37]:
def _write_token_csv(path, rows):
    """Write (token, coefficient) pairs to `path` as CSV under a 'Token,Coefficient' header.

    The file is opened as UTF-8 explicitly so tokens with non-ASCII characters
    are written the same way regardless of the platform's default encoding.
    """
    with open(path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['Token', 'Coefficient'])
        writer.writerows(rows)


# One file per token list: top positive, top negative and all tokens.
# `version` is appended to each name so earlier exports are not overwritten.
for prefix, rows in (
    ('top_positive_tokens_', top_positive_tokens),
    ('top_negative_tokens_', top_negative_tokens),
    ('all_tokens_', all_tokens),
):
    _write_token_csv(prefix + str(version) + '.csv', rows)

In [27]:
# Evaluate on the held-out 20% of the dataset: let the fitted pipeline
# predict labels for X_test so they can be compared with the real ones.

print('making predictions..')
y_pred = pipe.predict(X_test)

making predictions..


In [28]:
def evaluate(true, pred):
    """Print the confusion matrix, accuracy and weighted precision/recall/F1.

    `true` holds the reference labels and `pred` the predicted labels; both
    are passed unchanged to the scikit-learn metric functions. Nothing is
    returned -- the report is written to stdout.
    """
    # Weighted-average scores are computed up front, in this order
    precision, recall, f1 = [
        scorer(true, pred, average='weighted')
        for scorer in (precision_score, recall_score, f1_score)
    ]

    matrix_text = f"CONFUSION MATRIX:\n{confusion_matrix(true, pred)}"
    print(matrix_text)

    accuracy_text = f"ACCURACY SCORE:\n{accuracy_score(true, pred):.4f}"
    print(accuracy_text)

    print(
        "CLASSIFICATION REPORT:\n"
        f"\tPrecision: {precision:.4f}\n"
        f"\tRecall: {recall:.4f}\n"
        f"\tF1_Score: {f1:.4f}"
    )

    
# Print the evaluation report for the test-set predictions against the true labels.
evaluate(y_test, y_pred)

CONFUSION MATRIX:
[[170   0  19]
 [  4  11  10]
 [ 51   1  74]]
ACCURACY SCORE:
0.7500
CLASSIFICATION REPORT:
	Precision: 0.7536
	Recall: 0.7500
	F1_Score: 0.7397


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=961bc5f7-68db-4917-95e0-61c59b88476b' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>