# Baseline Model for Question 1 (data from 2013, 2018, 2020)
## Dummy Classifier model

### Dependencies

In [1]:
#load dependencies
import pandas as pd
import numpy as np

from sklearn.dummy import DummyClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import recall_score, precision_score

In [2]:
#load in preprocessed train and validation datas
#your path here

X_train_Q1 = pd.read_csv('data/X_train_pp.csv')
X_valid_Q1 = pd.read_csv('data/X_valid_pp.csv')

y_train_Q1 = pd.read_csv('data/y_train.csv')
y_valid_Q1 = pd.read_csv('data/y_valid.csv')

### Tfid Vectorizer Representation

In [3]:
#Tfid Vectorizer Representation

def tfid_vectorizer(train, valid):
    tfid = TfidfVectorizer() 
    X = tfid.fit_transform(train)
    X_valid = tfid.transform(valid)
    return X, X_valid

### All labels (themes & sub-themes)

In [4]:
#Vectorize X_train and convert Y_train to an array

X_train, X_valid = tfid_vectorizer(X_train_Q1['Comment'].values.astype('U'), 
                                    X_valid_Q1['Comment'].values.astype('U')) #had to convert type 
Y_train = (np.array(y_train_Q1))

In [5]:
# Dummy model
dummy_clf = DummyClassifier(strategy="stratified")
dummy_clf.fit(X_train, Y_train)
dummy_clf.predict(X_train)

# Scores ALL LABELS
print("Training Score for Dummy Classifier:", 
      dummy_clf.score(X_train, Y_train))
print("Validation Score for Dummy Classifier:",
     dummy_clf.score(X_valid, np.array(y_valid_Q1)))

y_pred = dummy_clf.predict(X_valid)
print("Validation Recall for Dummy Classifier:",
     recall_score(y_pred, np.array(y_valid_Q1), average= 'micro'))
print("Validation Precision for Dummy Classifier:",
     precision_score(y_pred, np.array(y_valid_Q1), average= 'micro'))

Training Score for Dummy Classifier: 0.00019275250578257516
Validation Score for Dummy Classifier: 0.00038550501156515033
Validation Recall for Dummy Classifier: 0.08233117483811286
Validation Precision for Dummy Classifier: 0.08135283363802559


In [6]:
# the prediction is zero for all values
print("shape:", y_pred.shape)
print("\nnumber of predictions per label:")
np.sum(y_pred, axis=0)

shape: (2594, 75)

number of predictions per label:


array([326., 326., 227., 342., 218., 259., 199., 251., 377., 618., 352.,
       120.,  24.,  16., 190.,  76.,  16.,  35.,  52.,  45., 179.,  57.,
        27.,  15.,  34.,  66., 100.,  41.,  14., 176.,  37., 106.,  25.,
        82.,  74.,  36.,   8.,  95., 108.,  21.,  37.,  54.,  82.,  20.,
        76.,  18.,  36.,  43., 115.,  39.,  26., 140.,  79., 139.,  68.,
        35.,  36., 263.,  57., 104.,  58.,  82.,  61., 104.,  78.,  46.,
        55.,  72.,   1.,  33.,  89.,  42.,   8.,   0.,   1.])

### Themes

In [7]:
# target THEMES
Y_train_themes = np.concatenate((np.array(y_train_Q1)[:,:12], np.array(y_train_Q1)[:,-1:]), axis=1)
Y_valid_themes = np.concatenate((np.array(y_valid_Q1)[:,:12], np.array(y_valid_Q1)[:,-1:]), axis=1)
Y_train_themes.shape

(10376, 13)

In [8]:
# Dummy model
dummy_clf_themes = DummyClassifier(strategy="stratified")
dummy_clf_themes.fit(X_train, Y_train_themes)
dummy_clf_themes.predict(X_train)

# Scores THEMES
print("Training Score for Dummy Classifier:", 
      dummy_clf_themes.score(X_train, Y_train_themes))
print("Validation Score for Dummy Classifier:",
     dummy_clf_themes.score(X_valid, Y_valid_themes))

y_pred_themes = dummy_clf_themes.predict(X_valid)
print("Validation Recall for Dummy Classifier:",
      recall_score(y_pred_themes, Y_valid_themes, average= 'micro'))
print("Validation Precision for Dummy Classifier:",
     precision_score(y_pred_themes, Y_valid_themes, average= 'micro'))

Training Score for Dummy Classifier: 0.030454895913646876
Validation Score for Dummy Classifier: 0.026599845797995375
Validation Recall for Dummy Classifier: 0.13574165298303228
Validation Precision for Dummy Classifier: 0.1370923161967938


### Sub-themes

In [9]:
# target SUB-THEMES
Y_train_sub = (np.array(y_train_Q1)[:,12:])
Y_valid_sub = (np.array(y_valid_Q1)[:,12:])
Y_train_sub.shape

(10376, 63)

In [10]:
# Dummy model
dummy_clf_sub = DummyClassifier(strategy="stratified")
dummy_clf_sub.fit(X_train, Y_train_sub)
dummy_clf_sub.predict(X_train)

# Scores THEMES
print("Training Score for Dummy Classifier:", 
      dummy_clf_sub.score(X_train, Y_train_sub))
print("Validation Score for Dummy Classifier:",
     dummy_clf_sub.score(X_valid, Y_valid_sub))

y_pred_sub = dummy_clf_sub.predict(X_valid)
print("Validation Recall for Dummy Classifier:",
      recall_score(y_pred_sub, Y_valid_sub, average= 'micro'))
print("Validation Precision for Dummy Classifier:",
     precision_score(y_pred_sub, Y_valid_sub, average= 'micro'))

Training Score for Dummy Classifier: 0.00481881264456438
Validation Score for Dummy Classifier: 0.0050115651503469544
Validation Recall for Dummy Classifier: 0.04234122042341221
Validation Precision for Dummy Classifier: 0.04207920792079208
