In [2]:
import sys
sys.path.append('../utilities/')
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
import torch
from sentence_transformers import SentenceTransformer
from joblib import dump
from openai import OpenAI
from tqdm import tqdm
from mmd import MMD
import re
from sklearn.feature_extraction.text import CountVectorizer




# **Sentence Transformer**

In [4]:
sentence_transformer = SentenceTransformer('all-mpnet-base-v2')

# **Data Pre-processing**

In [3]:
train_df = pd.read_csv('../data/initial_datasets/speeches/speeches_train.csv')
test_df = pd.read_csv('../data/initial_datasets/speeches/speeches_test.csv')

In [20]:
df = pd.read_csv('../data/initial_datasets/speeches/speeches.csv')

In [14]:
train_df, test_df = train_test_split(df, test_size=0.5)

In [5]:
# Rename 'label' to 'affiliation' and create new 'label' column
for df_ in [train_df, test_df]:
    df_.rename(columns={'label': 'affiliation'}, inplace=True)
    df_['label'] = df_['affiliation'].map({'Republican': -1, 'Democrat': 1})

In [6]:
# Drop the 'affiliation' column from both train and test dataframes
train_df = train_df.drop(columns=['affiliation'])
test_df = test_df.drop(columns=['affiliation'])
train_df.head(), test_df.head()

(                                                text  label
 0  Thank you very much. Appreciate it. Thank you ...     -1
 1  I LOVE GEORGIA. I LOVE BEING WITH YOU. THIS IS...     -1
 2  BEAUTIFUL. WOULD A CROWD THIS ONE IS. YOU HAVE...     -1
 3  Well, hello, Iowa. (Applause.) Thank you. Than...     -1
 4  WILL THANK YOU VICE PRESIDENT PANS, THANK YOU ...     -1,
                                                 text  label
 0  On this holiest of Jewish religious observance...     -1
 1  THANK YOU. YOU HAVE NO IDEA HOW GREAT IT IS TO...      1
 2  IT LOOKS LIKE IT WILL BE A LONG NIGHT, BUT I F...      1
 3  So we're going to Wisconsin. We have a big cro...     -1
 4  Well, thank you, Jim Bridenstine. Thank you fo...     -1)

In [8]:
test_df

Unnamed: 0,text,label
0,On this holiest of Jewish religious observance...,-1
1,THANK YOU. YOU HAVE NO IDEA HOW GREAT IT IS TO...,1
2,"IT LOOKS LIKE IT WILL BE A LONG NIGHT, BUT I F...",1
3,So we're going to Wisconsin. We have a big cro...,-1
4,"Well, thank you, Jim Bridenstine. Thank you fo...",-1
...,...,...
247,Thank you all for coming out on this blustery ...,-1
248,"Well, hello, Kentucky! (Applause.) Thank you f...",-1
249,HELLO FOLKS HOW ARE YOU? THANK YOU. THANK YOU....,1
250,"Thank you, Secretary Chao. Thank you for those...",-1


In [7]:
train_df

Unnamed: 0,text,label
0,Thank you very much. Appreciate it. Thank you ...,-1
1,I LOVE GEORGIA. I LOVE BEING WITH YOU. THIS IS...,-1
2,BEAUTIFUL. WOULD A CROWD THIS ONE IS. YOU HAVE...,-1
3,"Well, hello, Iowa. (Applause.) Thank you. Than...",-1
4,"WILL THANK YOU VICE PRESIDENT PANS, THANK YOU ...",-1
...,...,...
246,"MR. PRESIDENT, THANK YOU, THANK YOU, THANK YOU...",1
247,"THANK YOU VERY MUCH . THANK YOU, LET'S SEE . I...",-1
248,YOU JUST GOT TO OF THE GREATEST TRADE DEALS. A...,-1
249,"Well, Chip, thank you very much. And, my fello...",-1


In [9]:
X_test = np.array(sentence_transformer.encode(test_df['text'].to_list()))

In [10]:
y_test = test_df['label']

In [50]:
X_test.shape

(252, 768)

In [51]:
y_test

0     -1
1      1
2      1
3     -1
4     -1
      ..
247   -1
248   -1
249    1
250   -1
251   -1
Name: label, Length: 252, dtype: int64

In [28]:
model = svm.SVC(kernel='linear', probability=True, class_weight='balanced')
model.fit(X_train, y_train)

y_pred = model.predict(X_train)
train_acc = accuracy_score(y_pred, y_train)
print(f'Train acc: {train_acc}')

y_pred = model.predict(X_test)
test_acc = accuracy_score(y_pred, y_test)
print(f'Test acc: {test_acc}')

Train acc: 0.9482071713147411
Test acc: 0.8928571428571429


In [43]:
test_df

Unnamed: 0,text,affiliation,label
0,On this holiest of Jewish religious observance...,Republican,-1
1,THANK YOU. YOU HAVE NO IDEA HOW GREAT IT IS TO...,Democrat,1
2,"IT LOOKS LIKE IT WILL BE A LONG NIGHT, BUT I F...",Democrat,1
3,So we're going to Wisconsin. We have a big cro...,Republican,-1
4,"Well, thank you, Jim Bridenstine. Thank you fo...",Republican,-1
...,...,...,...
247,Thank you all for coming out on this blustery ...,Republican,-1
248,"Well, hello, Kentucky! (Applause.) Thank you f...",Republican,-1
249,HELLO FOLKS HOW ARE YOU? THANK YOU. THANK YOU....,Democrat,1
250,"Thank you, Secretary Chao. Thank you for those...",Republican,-1


In [29]:
print(precision_score(y_test, y_pred))
print(recall_score(y_test, y_pred))

0.8541666666666666
0.8631578947368421


In [32]:
y_prob = model.predict_proba(X_test)[:, 1]
print(roc_auc_score(y_test, y_prob))

0.9516258799865907


# **Active Learning**

In [11]:
nums = [10, 50, 100, 200]

In [12]:
temp_df = train_df

In [55]:
train_df = temp_df.sample(n=nums[0]) 

X_train = np.array(sentence_transformer.encode(train_df['text'].to_list()))
y_train = train_df['label']

model = svm.SVC(kernel='linear', probability=True, class_weight='balanced')
model.fit(X_train, y_train)

y_pred = model.predict(X_train)
train_acc = accuracy_score(y_pred, y_train)

print(n)
print(f'Train acc: {train_acc}')

y_pred = model.predict(X_test)
test_acc = accuracy_score(y_pred, y_test)
print(f'Test acc: {test_acc}')

print(precision_score(y_test, y_pred))
print(recall_score(y_test, y_pred))

y_prob = model.predict_proba(X_test)[:, 1]
print(roc_auc_score(y_test, y_prob))

10
Train acc: 1.0


TypeError: Labels in y_true and y_pred should be of the same type. Got y_true=['Democrat' 'Republican'] and y_pred=[-1  1]. Make sure that the predictions provided by the classifier coincides with the true labels.

In [54]:
temp_df = train_df

In [13]:
for n in nums:

    train_df = temp_df.sample(n=n) 

    X_train = np.array(sentence_transformer.encode(train_df['text'].to_list()))
    y_train = train_df['label']

    model = svm.SVC(kernel='linear', probability=True, class_weight='balanced')
    model.fit(X_train, y_train)

    y_pred = model.predict(X_train)
    train_acc = accuracy_score(y_pred, y_train)

    print(n)
    print(f'Train acc: {train_acc}')

    y_pred = model.predict(X_test)
    test_acc = accuracy_score(y_pred, y_test)
    print(f'Test acc: {test_acc}')

    print(precision_score(y_test, y_pred))
    print(recall_score(y_test, y_pred))

    y_prob = model.predict_proba(X_test)[:, 1]
    print(roc_auc_score(y_test, y_prob))


10
Train acc: 1.0
Test acc: 0.6468253968253969
0.6
0.18947368421052632
0.45598390881662754
50
Train acc: 0.96
Test acc: 0.8134920634920635
0.7181818181818181
0.8315789473684211
0.8991619175326853
50
Train acc: 0.96
Test acc: 0.8134920634920635
0.7181818181818181
0.8315789473684211
0.8991619175326853
100
Train acc: 0.97
Test acc: 0.8452380952380952
0.7641509433962265
0.8526315789473684
0.9257794166946027
100
Train acc: 0.97
Test acc: 0.8452380952380952
0.7641509433962265
0.8526315789473684
0.9257794166946027
200
Train acc: 0.96
Test acc: 0.8849206349206349
0.8367346938775511
0.8631578947368421
0.9437479047938317
200
Train acc: 0.96
Test acc: 0.8849206349206349
0.8367346938775511
0.8631578947368421
0.9437479047938317
