In [18]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split

# Location of the exported bookmarks database (a JSON file).
BOOKMARKS_PATH = "s:/Projects/bookmark_processor/data/bookmarks.db.json"

try:
    # The context manager closes the file handle even when parsing fails;
    # a bare open(...) passed straight to json.load is never closed explicitly.
    with open(BOOKMARKS_PATH, encoding='utf-8') as bookmarks_file:
        # Build the frame and index it by the 'key' field in one expression,
        # so no in-place mutation is needed.
        marks = pd.DataFrame(json.load(bookmarks_file)).set_index(['key'])
except json.decoder.JSONDecodeError as e:
    # need to troubleshoot the error, adjust data or code
    print(f"Failed to load existing bookmarks file: {e}")
    # Re-raise instead of calling exit(): inside a notebook exit() asks the
    # kernel to shut down, whereas re-raising stops execution at this cell
    # and keeps the full traceback available for troubleshooting.
    raise

### Split the data into training and test sets

In [19]:
# Keep only the rows whose category is A, F, E or O, then drop every row
# that has a missing value in any column.
marks = marks[marks['category'].isin(['A', 'F', 'E', 'O'])].dropna()

# Summarise the remaining rows: column names, non-null counts and dtypes.
marks.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1848 entries, 3e203ea3cbc1f9f5d0202ef4e1343c32bfd441ba335900f27a26cb4457b801ec to b6a3c64e90531028090c29583c6c9d9b66c1fef2d29eb40500a8c28b45401af8
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   bmkUri        1848 non-null   object 
 1   title         1848 non-null   object 
 2   dateAdded     1848 non-null   float64
 3   siteTitle     1848 non-null   object 
 4   siteIcon      1848 non-null   object 
 5   category      1848 non-null   object 
 6   domain        1848 non-null   object 
 7   articleTitle  1848 non-null   object 
dtypes: float64(1), object(7)
memory usage: 129.9+ KB


In [20]:
# Number of bookmarks per category label, most frequent first.
marks.loc[:, 'category'].value_counts()

category
A    1014
F     451
O     212
E     171
Name: count, dtype: int64

In [21]:
# Every column except the label is a candidate feature.
keys = marks.columns.drop('category')
print(keys.values)

# Feature frame (all non-label columns) and label series.
X = marks[keys.values]
Y = marks['category']

# Fixed random_state keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state=42)

# Print the shape of each split, each followed by one blank line.
print(
    *(f"{split.shape}\n" for split in (X_train, X_test, y_train, y_test)),
    sep="\n",
)



['bmkUri' 'title' 'dateAdded' 'siteTitle' 'siteIcon' 'domain'
 'articleTitle']
(1386, 7)

(462, 7)

(1386,)

(462,)



In [22]:


from sklearn.base import BaseEstimator, TransformerMixin


class ColumnSelector(BaseEstimator, TransformerMixin):
    """Transformer that picks a single column out of its input.

    Intended as the first step of a sub-pipeline so that the following
    step only sees the selected column.

    Parameters
    ----------
    column_name
        Key used to index the input in ``transform`` (for a pandas
        DataFrame, the column label).
    """

    def __init__(self, column_name):
        self.column_name = column_name

    def fit(self, X, y=None):
        """Nothing to learn; return this transformer unchanged."""
        return self

    def transform(self, X):
        """Return ``X[self.column_name]``."""
        selected = X[self.column_name]
        return selected
    
    

In [23]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion

# Each text column gets its own sub-pipeline (select the column, then fit a
# separate TF-IDF vectorizer on it). FeatureUnion concatenates the resulting
# feature blocks, which are then fed to a logistic-regression classifier.
pipeline = Pipeline(steps=[
    ('union', FeatureUnion(transformer_list=[
        (
            step_name,
            Pipeline(steps=[
                ('selector', ColumnSelector(source_column)),
                ('tfidf', TfidfVectorizer()),
            ]),
        )
        # (union step name, column it reads from)
        for step_name, source_column in (
            ('web_page_title', 'title'),
            ('url', 'bmkUri'),
            ('web_site_title', 'siteTitle'),
            ('domain', 'domain'),
        )
    ])),
    ('clf', LogisticRegression()),
])

In [24]:
# Fit the vectorizers and the classifier on the training split only.
pipeline.fit(X=X_train, y=y_train)

In [25]:
from sklearn.metrics import classification_report

# Predict a category for every held-out bookmark.
y_pred = pipeline.predict(X_test)

# Per-class precision / recall / F1 against the true test labels.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           A       0.86      0.93      0.89       269
           E       0.92      0.59      0.72        37
           F       0.80      0.85      0.83       109
           O       0.76      0.53      0.62        47

    accuracy                           0.84       462
   macro avg       0.83      0.73      0.77       462
weighted avg       0.84      0.84      0.84       462

