# Baseline Model for Question 1 (data from 2013, 2018, 2020)
## Dummy Classifier model

### Dependencies

In [1]:
#load dependencies
import pandas as pd
import numpy as np

from sklearn.dummy import DummyClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import recall_score, precision_score

In [2]:
#load in preprocessed train and validation datas
#your path here

X_train_Q1 = pd.read_csv('data/X_train_pp.csv')
X_valid_Q1 = pd.read_csv('data/X_valid_pp.csv')

y_train_Q1 = pd.read_csv('data/y_train.csv')
y_valid_Q1 = pd.read_csv('data/y_valid.csv')

### Tfid Vectorizer Representation

In [3]:
#Tfid Vectorizer Representation

def tfid_vectorizer(train, valid):
    tfid = TfidfVectorizer() 
    X = tfid.fit_transform(train)
    X_valid = tfid.transform(valid)
    return X, X_valid

### All labels (themes & sub-themes)

In [4]:
#Vectorize X_train and convert Y_train to an array

X_train, X_valid = tfid_vectorizer(X_train_Q1['Comment'].values.astype('U'), 
                                    X_valid_Q1['Comment'].values.astype('U')) #had to convert type 
Y_train = (np.array(y_train_Q1))

In [6]:
# Dummy model
dummy_clf = DummyClassifier()
dummy_clf.fit(X_train, Y_train)
dummy_clf.predict(X_train)

# Scores ALL LABELS
print("Training Score for Dummy Classifier:", 
      dummy_clf.score(X_train, Y_train))
print("Validation Score for Dummy Classifier:",
     dummy_clf.score(X_valid, np.array(y_valid_Q1)))

y_pred = dummy_clf.predict(X_valid)
print("Validation Recall for Dummy Classifier:",
     recall_score(y_pred, np.array(y_valid_Q1), average= 'micro'))
print("Validation Precision for Dummy Classifier:",
     precision_score(y_pred, np.array(y_valid_Q1), average= 'micro'))

Training Score for Dummy Classifier: 9.637625289128758e-05
Validation Score for Dummy Classifier: 0.00038550501156515033
Validation Recall for Dummy Classifier: 0.08258928571428571
Validation Precision for Dummy Classifier: 0.0821363280229825


In [7]:
# the prediction is zero for all values
print("shape:", y_pred.shape)
print("\nnumber of predictions per label:")
np.sum(y_pred, axis=0)

shape: (2594, 75)

number of predictions per label:


array([323., 323., 224., 361., 221., 263., 208., 249., 410., 569., 360.,
       109.,  21.,  14., 207.,  70.,  19.,  36.,  61.,  49., 170.,  43.,
        30.,  20.,  38.,  64.,  94.,  31.,  16., 190.,  48.,  95.,  32.,
        68.,  83.,  58.,   8.,  98., 128.,  35.,  34.,  38.,  75.,  12.,
        63.,  15.,  28.,  49., 120.,  61.,  20., 145.,  72., 123.,  63.,
        31.,  47., 298.,  70., 119.,  61.,  62.,  62.,  99.,  65.,  41.,
        55.,  84.,   4.,  27.,  64.,  49.,  14.,   0.,   0.])

### Themes

In [8]:
# target THEMES
Y_train_themes = np.concatenate((np.array(y_train_Q1)[:,:12], np.array(y_train_Q1)[:,-1:]), axis=1)
Y_valid_themes = np.concatenate((np.array(y_valid_Q1)[:,:12], np.array(y_valid_Q1)[:,-1:]), axis=1)
Y_train_themes.shape

(10376, 13)

In [9]:
# Dummy model
dummy_clf_themes = DummyClassifier()
dummy_clf_themes.fit(X_train, Y_train_themes)
dummy_clf_themes.predict(X_train)

# Scores THEMES
print("Training Score for Dummy Classifier:", 
      dummy_clf_themes.score(X_train, Y_train_themes))
print("Validation Score for Dummy Classifier:",
     dummy_clf_themes.score(X_valid, Y_valid_themes))

y_pred_themes = dummy_clf_themes.predict(X_valid)
print("Validation Recall for Dummy Classifier:",
      recall_score(y_pred_themes, Y_valid_themes, average= 'micro'))
print("Validation Precision for Dummy Classifier:",
     precision_score(y_pred_themes, Y_valid_themes, average= 'micro'))

Training Score for Dummy Classifier: 0.028527370855821126
Validation Score for Dummy Classifier: 0.026599845797995375
Validation Recall for Dummy Classifier: 0.12954672635702294
Validation Precision for Dummy Classifier: 0.12797125483692648


### Sub-themes

In [10]:
# target SUB-THEMES
Y_train_sub = (np.array(y_train_Q1)[:,12:])
Y_valid_sub = (np.array(y_valid_Q1)[:,12:])
Y_train_sub.shape

(10376, 63)

In [11]:
# Dummy model
dummy_clf_sub = DummyClassifier()
dummy_clf_sub.fit(X_train, Y_train_sub)
dummy_clf_sub.predict(X_train)

# Scores THEMES
print("Training Score for Dummy Classifier:", 
      dummy_clf_sub.score(X_train, Y_train_sub))
print("Validation Score for Dummy Classifier:",
     dummy_clf_sub.score(X_valid, Y_valid_sub))

y_pred_sub = dummy_clf_sub.predict(X_valid)
print("Validation Recall for Dummy Classifier:",
      recall_score(y_pred_sub, Y_valid_sub, average= 'micro'))
print("Validation Precision for Dummy Classifier:",
     precision_score(y_pred_sub, Y_valid_sub, average= 'micro'))

Training Score for Dummy Classifier: 0.0047224363916730915
Validation Score for Dummy Classifier: 0.007324595219737857
Validation Recall for Dummy Classifier: 0.037163814180929094
Validation Precision for Dummy Classifier: 0.03762376237623762
