# Required Libraries

In [2]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics.pairwise import cosine_similarity

from pprint import pprint
import numpy as np

# Dataset
Load the dataset and print the classes

In [5]:
# Load the Reuters dataset
newsgroups_train = fetch_20newsgroups(subset='train', shuffle=True, random_state=42, remove=('headers', 'footers', 'quotes'))
newsgroups_test = fetch_20newsgroups(subset='test', shuffle=True, random_state=42, remove=('headers', 'footers', 'quotes'))

classes = list(newsgroups_train.target_names)
pprint(classes)

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']


Let's see the first example. It's a letter form.

In [6]:
print("SHAPE ", newsgroups_train.filenames.shape)
print("----------------------------------------")
print(newsgroups_train.data[0])
print("----------------------------------------")
print(newsgroups_train.target[0], "=>", classes[newsgroups_train.target[0]])

SHAPE  (11314,)
----------------------------------------
I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.
----------------------------------------
7 => rec.autos


# Preprocessing

Preprocess every example: remove stopwords and compute the Tf-idf vector representation.

In [7]:
# Extract features from the dataset
vectorizer = TfidfVectorizer(stop_words='english')
X_train = vectorizer.fit_transform(newsgroups_train.data)
X_test = vectorizer.transform(newsgroups_test.data)
y_train = newsgroups_train.target
y_test = newsgroups_test.target

Let's see what's inside after the transformation

In [8]:
print(X_train[0])
print(X_train.shape)
print(len(newsgroups_train.data), len(vectorizer.vocabulary_))

  (0, 59071)	0.10043853867312116
  (0, 57250)	0.1063473585616558
  (0, 41874)	0.224548896412017
  (0, 49800)	0.11869932893481257
  (0, 46690)	0.12504220873599214
  (0, 73174)	0.16142029533900565
  (0, 99608)	0.09418459052541318
  (0, 84050)	0.16329311028814825
  (0, 37208)	0.1434127293323407
  (0, 62594)	0.13037295035007848
  (0, 87913)	0.25808578247347563
  (0, 54493)	0.06961997844491917
  (0, 23430)	0.12937103288512333
  (0, 77676)	0.12197186951739486
  (0, 81450)	0.1461308934288897
  (0, 24583)	0.19644480500804062
  (0, 16806)	0.1407774554706102
  (0, 83208)	0.11339406589538423
  (0, 76269)	0.08978258481915573
  (0, 34742)	0.17300821242559045
  (0, 24108)	0.24723134514216435
  (0, 25437)	0.10548299054214269
  (0, 11174)	0.20599311323287353
  (0, 35902)	0.1266709604197344
  (0, 9843)	0.20797700857530224
  (0, 55606)	0.13822596989753821
  (0, 57247)	0.1352084247105906
  (0, 84312)	0.16368392505928514
  (0, 34741)	0.14847880131844235
  (0, 31927)	0.10526008886822914
  (0, 80420)	0.1270

Each dimension corresponds to a word in the original text. For example:

In [9]:
print(vectorizer.get_feature_names_out()[25717], X_train[0, 25717])
print(vectorizer.get_feature_names_out()[80420], X_train[0, 80420])

car 0.46579831435138974
saw 0.127069039671221


# Rocchio Model

Definition of the model computation formula. Definition of the test method.

beta e gamma =1 significa che hanno le stesse righe.

In [10]:
# Define the Rocchio Algorithm function
def rocchio_model(X_train, y_train, beta=1, gamma=1):

  # Calculate the centroid for each class
  centroids = np.zeros((len(np.unique(y_train)), X_train.shape[1]))
  for i, label in enumerate(np.unique(y_train)):
    centroids[i] = X_train[y_train == label].mean(axis=0)

  # Calculate the prototype for each class
  prototypes = np.zeros((len(np.unique(y_train)), X_train.shape[1]))
  for i, label in enumerate(np.unique(y_train)):
    prototypes[i] = beta * (X_train[y_train == label].sum(axis=0) - centroids[i]) / X_train[y_train == label].getnnz() \
                    - gamma * X_train[y_train != label].sum(axis=0) / X_train[y_train != label].getnnz()

  return prototypes

def test(rocchio_model, X_test):
  # Make predictions on the test set
  y_pred = []
  for doc in X_test:
    similarities = np.zeros(len(rocchio_model))
    for i, prototype in enumerate(rocchio_model):
      similarities[i] = cosine_similarity(doc.toarray().flatten().reshape(1, -1), prototype.flatten().reshape(1, -1))

    # store only the value of the class maximized by the similarity
    y_pred.append(np.argmax(similarities))

  return y_pred

## Test the model

Test the model with default parameters. Notice that for sake of brevity, the test set is limited to the first 500 (`X_test[:500]`) instances. You will need to evaluate the models on the entire test set.

In [11]:
# get the model
rocchio_prototypes = rocchio_model(X_train, y_train)
print(rocchio_prototypes.shape)

# Make predictions on the test set
y_pred = test(rocchio_prototypes, X_test[:500])

# Print the classification report and confusion matrix
print(classification_report(y_test[:500], y_pred, target_names=newsgroups_test.target_names))

(20, 101322)
                          precision    recall  f1-score   support

             alt.atheism       0.22      0.48      0.30        21
           comp.graphics       0.44      0.76      0.56        21
 comp.os.ms-windows.misc       0.60      0.35      0.44        26
comp.sys.ibm.pc.hardware       0.46      0.56      0.51        34
   comp.sys.mac.hardware       0.68      0.74      0.70        34
          comp.windows.x       0.79      0.73      0.76        26
            misc.forsale       0.72      0.59      0.65        22
               rec.autos       0.72      0.82      0.77        28
         rec.motorcycles       0.76      0.76      0.76        33
      rec.sport.baseball       0.82      0.92      0.87        25
        rec.sport.hockey       0.96      0.85      0.90        27
               sci.crypt       0.79      0.55      0.65        20
         sci.electronics       0.44      0.67      0.53        24
                 sci.med       0.79      0.48      0.59       

Test the model with `beta=1, gamma=10`. The negative examples are more important.

In [12]:
# get the model
prototypes = rocchio_model(X_train, y_train, beta=1, gamma=10)

# Make predictions on the test set
y_pred = test(prototypes, X_test[:500])

# Print the classification report and confusion matrix
print(classification_report(y_test[:500], y_pred, target_names=newsgroups_test.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.17      0.29      0.21        21
           comp.graphics       0.57      0.76      0.65        21
 comp.os.ms-windows.misc       0.67      0.46      0.55        26
comp.sys.ibm.pc.hardware       0.49      0.62      0.55        34
   comp.sys.mac.hardware       0.60      0.74      0.66        34
          comp.windows.x       0.79      0.73      0.76        26
            misc.forsale       0.56      0.64      0.60        22
               rec.autos       0.73      0.79      0.76        28
         rec.motorcycles       0.76      0.79      0.78        33
      rec.sport.baseball       0.81      0.88      0.85        25
        rec.sport.hockey       0.86      0.93      0.89        27
               sci.crypt       0.76      0.65      0.70        20
         sci.electronics       0.71      0.42      0.53        24
                 sci.med       0.80      0.70      0.74        23
         

Test the model with `beta=10, gamma=1`. Positive examples are more important here.

In [None]:
# get the model
prototypes = rocchio_model(X_train, y_train, beta=10, gamma=1)

# Make predictions on the test set
y_pred = test(prototypes, X_test[:500])

# Print the classification report and confusion matrix
print(classification_report(y_test[:500], y_pred, target_names=newsgroups_test.target_names))

## EXERCISE: Find the best parameters

Now it's your turn. Find the best parameters `beta` and `gamma` in order to maximize somehow the overall F1-measure.  

**Hint**: use a *grid search* by defining 2 `for` loops and save the values of the params; then plot the result in a graphic, discuss it brefly and write some conclusions.

1. Run the Rocchio model over a set of different parameter pair (beta, gamma)
2. Plot as shown in the previous practice lesso the F1 measure achieved with the diffferent settings (useplt or seaborn)
3. Try to modify the code (Rocchio model) to select class specific pairs (beta_i, gamma_i) for each individual i-th class

In [None]:
# fill the lists with plausible values
betas_list = [0.1, 5]
gammas_list = [0.1, 5]

# define a variable to store the evaluations, like a list of lists or a map
f1s = []
for beta in betas_list:
    for gamma in gammas_list:
        rocchio_prototypes = rocchio_model(X_train, y_train, beta=beta, gamma=gamma)
        # predict on all the test set
        y_pred = test(prototypes, X_test)
        # compute here the f1 measure, then store it into the f1s list
        # WARNING: you need a way to remember which were the values of beta and gamma
        # and the F1 in order to make the plot 
        # Print the classification report and confusion matrix
        print(classification_report(y_test[:500], y_pred, target_names=newsgroups_test.target_names
    

In [12]:
# here make a plot as shown in the previous practice lesson with plt or seaborn.