In [None]:
# Install and update spaCy 
!pip install -U spacy

# Download the english language model
!python -m spacy download en

!pip install -U openpyxl

Collecting spacy
  Downloading spacy-3.5.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.6/6.6 MB[0m [31m84.2 MB/s[0m eta [36m0:00:00[0m
Collecting pathy>=0.10.0
  Downloading pathy-0.10.1-py3-none-any.whl (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.9/48.9 KB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
Collecting thinc<8.2.0,>=8.1.8
  Downloading thinc-8.1.10-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (924 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m924.9/924.9 KB[0m [31m86.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting spacy-legacy<3.1.0,>=3.0.11
  Downloading spacy_legacy-3.0.12-py2.py3-none-any.whl (29 kB)
Installing collected packages: spacy-legacy, pathy, thinc, spacy
  Attempting uninstall: spacy-legacy
    Found existing installation: spacy-legacy 3.0.10
    Not uninstalling spacy-legacy at /shared-libs/python3.9/py/lib

In [2]:
import os
import csv
import pandas as pd
import numpy as np
import math

from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [3]:
version = 17

#Settings: remove # to select, add # to unselect

## Select classifier

###Logistic Regression
#classifier=LogisticRegression()
###Random Forest Classifier
#classifier = RandomForestClassifier()
###Linear Support Vector Classification
classifier = LinearSVC()
#

## Ignore tokens? 

###No
IgnoreLimit = 0
###Yes, ignore token with coef under 0.05
#IgnoreLimit = 0.005
#

## Ignore most frequent tokens?

###No
#max_df=1
###Ignore 0.5% most frequent tokens
#max_df=0.995
#



In [4]:
#Load the df generated in PrepareData
df = pd.read_csv('NewDf.csv')
del df[df.columns[0]]

df

Unnamed: 0,Filename,CGMech,content
0,ABB Group Annual Report 2015_English-2.txt,8,\n \n The ABB Group Annual Report 2015 \n \n \...
1,ABB_02.txt,8,ABB Group Annual Report 2002\n \n Financial re...
2,ABB_03.txt,8,Important information regarding the ABB 2003 A...
3,ABB_04.txt,8,ABB Annual Report 2004 \n \n Financial review ...
4,ABB_05.pdf.txt,9,i\n \n D\n e\n s\n g\n n\n e\n d\n \n \n \n b...
...,...,...,...
1569,WISeKey_2017.pdf.txt,9,ANNUAL REPORT 2017\n \n O I N \n \n - DIGI...
1570,WiSeKey-Annual-Report-2018.pdf.txt,9,Annual Report 2018\n \n ARCHITECTING A WISER W...
1571,ZurRoseGroup-2017-EN-Vollbericht.pdf.txt,9,Annual Report 2017\n \n Zur Rose Group \n \n ...
1572,ZurRoseGroup-2018-EN-Annual-Report.pdf.txt,9,Annual Report 2018\n \n Zur Rose Group \n \n ...


In [5]:
#Group CGMech by categories: 'low', 'middle', 'high'
def categorize(value):
    value=int(value)
    if value >= 1 and value <= 5:
        return 'low'
    elif value >= 6 and value <= 8:
        return 'middle'
    elif value >= 9 and value <= 10:
        return 'high'
    else:
        return 'unknown'

# apply function to create new "cat" column
df['cat'] = df['CGMech'].apply(categorize)

#split dataset: 80%train, 20% test
X = df.content
ylabels=df.CGMech
X_train, X_test, y_train, y_test = train_test_split(X, ylabels, test_size=0.2, random_state=1232)


In [6]:
#visualize categories
cat_counts = df['cat'].value_counts()

print(cat_counts)
cat_percentages = cat_counts / len(df) * 100
print('Total: ', len(df))
print()
cat_percentages=cat_percentages.round(0)

print("Percentage:")
print(cat_percentages)

#Baserate
dummy = DummyClassifier(strategy='most_frequent')
dummy.fit(None, y_train)
baserate = dummy.score(None, y_test)
print("Baserate: ")
print(baserate)

high      817
middle    619
low       138
Name: cat, dtype: int64
Total:  1574

Percentage:
high      52.0
middle    39.0
low        9.0
Name: cat, dtype: float64
Baserate: 
0.23809523809523808


In [7]:
import spacy
from sklearn.base import BaseEstimator, TransformerMixin
import joblib


 # 2h11


 
class TextPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.processed_data = None
        self.sp = None

    def fit(self, X, y=None):
        self.sp = spacy.load('en_core_web_sm')
        self.sp.max_length = 6000000
        return self

    def transform(self, X):
        processed_data = []
        for text in X:
            doc = self.sp(text)
            tokens = []
            for token in doc:
                if not token.is_stop and token.is_alpha and not token.is_digit and not token.is_punct:
                    cleaned_token = token.lemma_.lower().strip()
                    if cleaned_token:
                        tokens.append(cleaned_token)
            processed_data.append(tokens)
        
        self.processed_data = processed_data
        return self.processed_data

    def save_processed_data(self, filename):
        joblib.dump(self.processed_data, filename)


# Step 1: Preprocess the text in X_train to extract tokens and include spaCy entities
preprocessor = TextPreprocessor()
X_train_processed = preprocessor.fit_transform(X_train)

# Save the preprocessed data to a file
preprocessor.save_processed_data("preprocessed_data.pkl")


In [8]:
pipe = Pipeline([
    ('preprocessor', preprocessor),  # Reuse the preprocessor
    ('vectorizer', TfidfVectorizer(tokenizer=lambda tokens: [token.text for token in tokens])),
    ('classifier', LinearSVC())
])

In [16]:
import pickle
from collections import Counter
import spacy

# Load the spaCy model
nlp = spacy.load('en_core_web_sm')

# Open the preprocessed_Data.pkl file
with open("/datasets/deepnote/CGMech/preprocessed_data.pkl", 'rb') as file:
    preprocessed_data = pickle.load(file)

# Create a dictionary to store the top 20 most frequent tokens for each entity type
top_tokens_by_entity_type = {}

# Count the frequency of each token for each entity type
for tokens in preprocessed_data:
    # Join the tokens back into a string
    text = ' '.join(tokens[1:])
    
    # Process the text with spaCy if it's not empty
    if text:
        doc = nlp(text)
        
        # Check if the document contains tokens
        if len(doc) > 0:
            # Extract the entity type from the first token
            entity_type = doc[0].ent_type_
            
            # Count the frequency of each token (excluding the entity type)
            token_counts = Counter([token.text for token in doc[1:] if not token.is_stop and token.is_alpha])
            
            # Get the top 20 most frequent tokens
            top_tokens = token_counts.most_common(20)
            
            # Add the list of top tokens to the dictionary
            top_tokens_by_entity_type[entity_type] = [token for token, count in top_tokens]

# Print the results
for entity_type, top_tokens in top_tokens_by_entity_type.items():
    print(f"Entity Type: {entity_type}")
    print("Top 20 most frequent tokens:")
    for token in top_tokens:
        print(token)
    print()


Entity Type: 
Top 20 most frequent tokens:
c
synthes
e
o
b
group
r
financial
f
share
asset
year
n
board
l
december
million
report
value
d

Entity Type: ORG
Top 20 most frequent tokens:
dormakaba
financial
year
group
share
chf
board
committee
member
compensation
report
directors
statement
holding
ag
executive
asset
end
audit
company

Entity Type: DATE
Top 20 most frequent tokens:
dottikon
es
group
ag
year
net
annual
condense
management
report
chf
development
income
head
board
e
share
member
balance
chemical

Entity Type: PERSON
Top 20 most frequent tokens:
group
financial
chf
gavazzi
carlo
share
asset
income
year
statement
total
board
net
consolidated
cost
march
liability
company
cash
value

Entity Type: GPE
Top 20 most frequent tokens:
implenia
chf
year
board
group
financial
construction
share
business
directors
statement
company
million
total
asset
member
project
work
remuneration
switzerland



<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=961bc5f7-68db-4917-95e0-61c59b88476b' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>