# Preparing Data

Import relevant packages

In [None]:
import pandas as pd # Data handling
import numpy as np

In [None]:
# Mount Google Drive (only needed when the data is kept on Google Drive)
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
# force_remount=True is passed so an existing mount is remounted
drive.mount(DRIVE_MOUNT_POINT, force_remount=True)

Mounted at /content/drive


In [None]:
# Load the data frame that holds every text together with its previously
# extracted embedding.
ALL_EMBEDDINGS_PATH = "/content/drive/MyDrive/STAR Research/all_embeddings.parquet"
df = pd.read_parquet(ALL_EMBEDDINGS_PATH)

KeyboardInterrupt: 

# Logistic Regression Model with sklearn

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import StratifiedKFold

In [None]:
# Encodes the author names as integer class labels
le = LabelEncoder()

# Z-score standardisation of the embeddings (mean=0, std=1)
scaler = StandardScaler()

# Stratified 4-fold cross-validation; shuffling is seeded so the splits repeat
skf = StratifiedKFold(shuffle=True, n_splits=4, random_state=42)

# Logistic regression with the L-BFGS solver
# (the cross-validation cell builds its own instance per fold)
model = LogisticRegression(solver='lbfgs')

In [None]:
# Select a specific corpus
dataset = df[df['corpus'] == 'Reddit']

n = 400  # Number of authors to sample
all_authors = dataset['author'].unique()  # All distinct authors in the corpus

# Fail early with a clear message rather than pandas' generic sampling error
if n > len(all_authors):
    raise ValueError(
        f"Requested {n} authors but the corpus only contains {len(all_authors)}"
    )

# Wrap the authors in a Series so .sample can be used. The draw is seeded
# (same seed as the cross-validation splitter) so that the author subset,
# and therefore every downstream result, is reproducible across runs.
random_sample = pd.Series(all_authors).sample(n, random_state=42)
df_subset = dataset[dataset['author'].isin(random_sample)]  # Rows of the sampled authors

# Stack the per-row vectors of the 'embedding' column into one matrix
X = np.stack(df_subset['embedding'])

# fit assigns a unique integer to each author;
# transform converts the author strings to a NumPy array of those integers
y = le.fit_transform(df_subset['author'])

Embedding dimensions 105 and 921 (out of the 1024 dimensions) are the ones that occur most frequently when counting which dimension carries the highest-magnitude weight across texts.

In [None]:
# Reddit subset stored for the dimension-105 test
# (the folder name suggests a two-author subset — TODO confirm)
reddit_105 = pd.read_parquet(
    "/content/drive/MyDrive/STAR Research/Reddit 2 author tests/reddit_105.parquet"
)

# Stack the per-row vectors of the 'embedding' column into one matrix
X = np.stack(reddit_105['embedding'])

# fit_transform learns one integer id per distinct author and returns the
# author column encoded as a NumPy array of those ids
y = le.fit_transform(reddit_105['author'])

In [None]:
# Reddit subset stored for the dimension-921 test
# (the folder name suggests a two-author subset — TODO confirm)
reddit_921 = pd.read_parquet(
    "/content/drive/MyDrive/STAR Research/Reddit 2 author tests/reddit_921.parquet"
)

# Stack the per-row vectors of the 'embedding' column into one matrix
X = np.stack(reddit_921['embedding'])

# fit_transform learns one integer id per distinct author and returns the
# author column encoded as a NumPy array of those ids
y = le.fit_transform(reddit_921['author'])

In [None]:
# Stratified k-fold evaluation of the logistic regression on X / y.
# The commented-out lines are a disabled wrapper that would repeat the whole
# cross-validation `steps` times and average the accuracy.
#step_accuracy=[]
#steps=100
#for _ in range(steps):

# Predictions and true labels pooled over every validation fold
all_folds_preds = []
all_folds_true = []

for train_idx, val_idx in skf.split(X, y):
    X_train, y_train = X[train_idx], y[train_idx]
    X_val, y_val = X[val_idx], y[val_idx]

    # The scaler is fitted on the training fold only and then applied to the
    # validation fold, so validation data does not influence the scaling.
    X_train = scaler.fit_transform(X_train)
    X_val = scaler.transform(X_val)

    # A new model is created for every fold
    model = LogisticRegression(solver='lbfgs', max_iter=3000)
    model.fit(X_train, y_train)

    val_preds=model.predict(X_val)
    all_folds_preds.extend(val_preds)
    all_folds_true.extend(y_val)
  #step_accuracy.append(accuracy_score(all_folds_true, all_folds_preds))
#mean_accuracy=np.mean(step_accuracy)
#print("n:", n, "step accuracy:", step_accuracy)
# Accuracy over the pooled predictions of all folds.
# NOTE: after the loop, model, X_train, y_train, X_val and y_val hold the values
# of the LAST fold only; the weight-inspection and KNN cells read them.
print("accuracy:", accuracy_score(all_folds_true, all_folds_preds))

accuracy: 1.0


Interpreting weights:

In [None]:
# Magnitude of every logistic-regression coefficient.
# coef_ has one row per author for a multiclass fit, but a single row
# (shape (1, n_features)) for a two-class fit; columns are embedding dimensions.
abs_weights = np.abs(model.coef_)

# Disabled export of the signed and absolute weights.
# NOTE(review): abs_weights is a NumPy array, so it would need wrapping in a
# DataFrame before .to_parquet could be called.
#weights = pd. DataFrame(model.coef_)
#weights.to_parquet(f'/content/drive/MyDrive/STAR Research/reddit_author_weights_scaled.parquet')
#abs_weights.to_parquet(f'/content/drive/MyDrive/STAR Research/reddit_abs_weights_scaled.parquet')

In [None]:
# Shape check: (rows, embedding dimensions); a two-class fit yields a single row
print(abs_weights.shape)

(1, 1024)


In [None]:
# Show the absolute weights (NumPy abbreviates long arrays with "...")
print(abs_weights)

[[0.01034487 0.00439147 0.00151255 ... 0.01406698 0.00473588 0.00112861]]


In [None]:
#abs_weights.T - rows are dimensions, columns are authors

In [None]:
# Same shape check as a bare expression, shown as the cell's output value
abs_weights.shape

(1, 1024)

In [None]:
# First row of the weight matrix: the weight of every dimension for the first
# author/class (for a two-class fit this is the only row)
print(abs_weights[0])

[0.01034487 0.00439147 0.00151255 ... 0.01406698 0.00473588 0.00112861]


# Highest-magnitude weight across the 1024 dimensions

In [None]:
# For each row of abs_weights (one weight vector), find the index of the
# dimension with the largest magnitude. keepdims=True keeps the reduced axis,
# so the result has one column and as many rows as abs_weights.
max_weights = abs_weights.argmax(axis=-1, keepdims=True)
print(max_weights.shape)

(1, 1)


In [None]:
# Index of the highest-magnitude dimension for each row of abs_weights
print(max_weights)

[[361]]


In [None]:
# Largest absolute weight in the first row, then the dimension it belongs to
print(abs_weights[0].max())
print(abs_weights[0].argmax())

0.017866942386391676
361


In [None]:
# Count how often each dimension appears as a row's highest-magnitude weight:
#   unique_max     - the distinct dimension indices (sorted)
#   unique_indices - first position of each one in the flattened max_weights
#   counts         - number of occurrences of each one
unique_max, unique_indices, counts = np.unique(
    max_weights, return_counts=True, return_index=True
)
# Positions in unique_max ordered from least to most frequent (stable sort)
counts.argsort(stable=True)

# Note from an earlier run: positions 21 and 215 of unique_max had a frequency of 6

array([0])

In [None]:
# Occurrence counts in ascending order (the highest frequency comes last)
np.sort(counts)

array([1])

In [None]:
# Positions in unique_max whose count equals the highest observed frequency.
# The previous hard-coded value 6 was the maximum of one particular run and
# matches nothing on other data; counts.max() works for any run.
np.where(counts == counts.max())

(array([], dtype=int64),)

In [None]:
# Dimensions that most often carry a row's highest-magnitude weight.
# Selecting by the maximum count replaces the hard-coded positions 21 and 215,
# which were only valid for the 400-author run and raise IndexError whenever
# unique_max is shorter.
most_frequent_dims = unique_max[counts == counts.max()]
print(*most_frequent_dims)

IndexError: index 21 is out of bounds for axis 0 with size 1

In [None]:
# Absolute weight of dimension 105 for every row (author/class) of the model
print(abs_weights[:, 105])

In [None]:
# Absolute weight of dimension 921 for every row (author/class) of the model
print(abs_weights[:, 921])

In [None]:
# Largest absolute weight on dimension 105 and the row (author index) holding it
print(abs_weights[:, 105].max())
print(abs_weights[:, 105].argmax())
# Note from an earlier run: the author at index 378 had the highest weight on dimension 105

In [None]:
# Largest absolute weight on dimension 921 and the row (author index) holding it
print(abs_weights[:, 921].max())
print(abs_weights[:, 921].argmax())
# Note from an earlier run: the author at index 214 had the highest weight on dimension 921

In [None]:
# Smallest absolute weight on dimension 105 and the row (author index) holding it
print(abs_weights[:, 105].min())
print(abs_weights[:, 105].argmin())
# Note from an earlier run: the author at index 275 had the lowest weight on dimension 105

In [None]:
# Smallest absolute weight on dimension 921 and the row (author index) holding it
print(abs_weights[:, 921].min())
print(abs_weights[:, 921].argmin())
# Note from an earlier run: the author at index 180 had the lowest weight on dimension 921

# KNN Classifier


In [None]:
from sklearn.neighbors import KNeighborsClassifier
# accuracy_score and classification_report are already imported with the
# logistic-regression imports.

# 3-nearest-neighbour classifier using the cosine metric
knn = KNeighborsClassifier(metric='cosine', n_neighbors=3)

# NOTE(review): X_train, y_train, X_val and y_val are the standardised arrays
# left behind by the LAST fold of the cross-validation loop, so that cell must
# have been run first and this is a single-fold evaluation.
predictions = knn.fit(X_train, y_train).predict(X_val)
accuracy = accuracy_score(y_val, predictions)


In [None]:
# Encoded author labels predicted by the KNN classifier for X_val
print(predictions)

In [None]:
# Accuracy of the KNN predictions against y_val
print(accuracy)