In [1]:
pip install pandas scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [2]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import pandas as pd

def train_linear_regression_model(X_train, y_train):
    model = LinearRegression()
    model.fit(X_train, y_train)
    return model

# Load data
df = pd.read_csv("embedded_dataset_deberta.csv")
df = df.dropna(subset=["label"])

# Use one feature
X = df[["embedding_0"]].values
y = df["label"].values

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model = train_linear_regression_model(X_train, y_train)

# Predictions
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# Output results
print("Train Predictions:", y_train_pred[:5])
print("Test Predictions:", y_test_pred[:5])


Train Predictions: [2.3583618  2.26897099 2.35871357 2.3729983  2.37665259]
Test Predictions: [2.38788876 2.42262009 2.40487585 2.3569433  2.36478988]


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, r2_score
import numpy as np

# Load dataset
df = pd.read_csv("embedded_dataset_deberta.csv")
df = df.dropna(subset=["label"])

# Use one embedding feature (embedding_0)
X = df[["embedding_0"]].values
y = df["label"].values

# Split into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Calculate metrics
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mape = mean_absolute_percentage_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("MSE:", mse)
print("RMSE:", rmse)
print("MAPE:", mape)
print("R² Score:", r2)


MSE: 0.519312349012836
RMSE: 0.7206332971857712
MAPE: 0.36429633067533557
R² Score: -0.002986062652017507


In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Load dataset
df = pd.read_csv("embedded_dataset_deberta.csv")
df = df.dropna(subset=["label"])

# Select all embedding columns
X = df[[col for col in df.columns if col.startswith("embedding_")]].values
y = df["label"].values

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)
print("First 5 predictions on test set:", y_pred[:5])


First 5 predictions on test set: [2.52006721 2.67136752 2.43934959 2.84266422 2.9987155 ]


In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Load and clean dataset
df = pd.read_csv("embedded_dataset_deberta.csv")
df = df.dropna(subset=["label"])

# Binary classification: Ensure labels are 0 or 1
X = df[[col for col in df.columns if col.startswith("embedding_")]].values
y = df["label"].astype(int).values  # Ensure integer labels

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train logistic regression classifier
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

# Predict
y_pred = clf.predict(X_test)
print("Predictions:", y_pred[:5])


Predictions: [3 3 3 3 3]


In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Load dataset
df = pd.read_csv("embedded_dataset_deberta.csv")
df = df.dropna(subset=["label"])

# Features and target
X = df[[col for col in df.columns if col.startswith("embedding_")]].values
y = df["label"].astype(int).values

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train classifier
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Classification report
y_pred = model.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)


              precision    recall  f1-score   support

           1       0.67      0.02      0.04        92
           2       0.45      0.33      0.38       213
           3       0.59      0.83      0.69       358

    accuracy                           0.56       663
   macro avg       0.57      0.39      0.37       663
weighted avg       0.55      0.56      0.50       663



In [7]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Load dataset
df = pd.read_csv("embedded_dataset_deberta.csv")
df = df.dropna(subset=["label"])

X = df[[col for col in df.columns if col.startswith("embedding_")]].values

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply PCA to reduce dimensions to 2
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

print("PCA output shape:", X_pca.shape)
print("First 5 PCA components:\n", X_pca[:5])


PCA output shape: (3311, 2)
First 5 PCA components:
 [[  5.3229746    2.46619942]
 [  5.09184631   4.46762507]
 [ -0.95173992   8.10320349]
 [-12.05146419  -1.09633618]
 [  3.91290967  -3.2813958 ]]


In [8]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report

# Load data
df = pd.read_csv("embedded_dataset_deberta.csv")
df = df.dropna(subset=["label"])

X = df[[col for col in df.columns if col.startswith("embedding_")]].values
y = df["label"].astype(int).values

# Standardize
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply PCA
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)

# Train classifier
model = LogisticRegression()
model.fit(X_train, y_train)

# Predict and report
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           1       0.00      0.00      0.00        92
           2       0.40      0.01      0.02       213
           3       0.54      0.99      0.70       358

    accuracy                           0.54       663
   macro avg       0.31      0.33      0.24       663
weighted avg       0.42      0.54      0.38       663



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
