In [1]:
import numpy as np
import pandas as pd
import ast
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import time

In [2]:
# Load the sarcasm dataset into a DataFrame.
# NOTE(review): this is an absolute, user-specific Windows path; it has to be
# adjusted before the notebook can run on any other machine.
dataset_path = "C:\\Users\\guded\\OneDrive\\Desktop\\INFOSYS\\train-balanced-sarcasm.csv"
df = pd.read_csv(dataset_path)

In [3]:
# Helper function to clean and convert embeddings from strings to numpy arrays
def clean_and_convert_embedding(embedding_str):
    """Parse a string-encoded embedding into a numpy array.

    Accepts the bracketed text form of a (possibly nested) numeric array whose
    elements are separated by any mix of spaces, newlines and commas, for
    example "[ 0.1 -0.2   0.3]" or "[0.1, -0.2, 0.3]". Any amount of padding
    after an opening bracket or before a closing bracket is tolerated.

    Args:
        embedding_str: Text representation of the embedding.

    Returns:
        numpy.ndarray built from the parsed numbers.

    Raises:
        Exception: Whatever the parsing step raised (e.g. SyntaxError or
            ValueError from ast.literal_eval, or AttributeError when the
            input is not a string). The offending value is printed first and
            the original exception is re-raised unchanged.
    """
    try:
        # Treat commas as plain whitespace and isolate every bracket as its
        # own token, so the amount of padding around brackets stops mattering.
        spaced = (
            embedding_str.replace(',', ' ')
            .replace('[', ' [ ')
            .replace(']', ' ] ')
        )
        joined = ','.join(spaced.split())
        # The join also put separators just inside each bracket; remove them
        # so the text becomes a valid Python list literal.
        literal = joined.replace('[,', '[').replace(',]', ']')
        return np.array(ast.literal_eval(literal))
    except Exception:
        print(f"Error parsing embedding: {embedding_str}")
        # Bare raise keeps the original exception and traceback intact.
        raise

In [4]:
# Convert the string-encoded embeddings in the DataFrame into numpy arrays.
# The column is overwritten in place, so values that are already arrays are
# passed through unchanged: re-running this cell without reloading df would
# otherwise hand arrays to the string parser and fail.
df['word2vec_embeddings'] = df['word2vec_embeddings'].apply(
    lambda value: value
    if isinstance(value, np.ndarray)
    else clean_and_convert_embedding(value)
)

In [5]:
# Sanity check: show the first few converted embeddings
embedding_preview = df['word2vec_embeddings'].head()
print(embedding_preview)

0    [-0.249028446, -0.112276957, -0.00595155568, 0...
1    [-0.06435088, -0.14805339, 0.32883312, 0.62132...
2    [0.1968625, 0.05365723, 0.03638186, 0.10795132...
3    [-0.14819673, 0.18820012, 0.08938915, 0.426148...
4    [0.21537142, 0.49409585, 1.07777257, 0.8793004...
Name: word2vec_embeddings, dtype: object


In [6]:
# Work on a random subset of the rows; adjust sample_size as needed.
# The fixed random_state makes the same rows come back on every run.
sample_size = 10000
df_sampled = df.sample(n=sample_size, random_state=42)

In [7]:
# Features: stack the per-row embedding arrays into one 2-D matrix
# (np.vstack requires every embedding to have the same length).
embedding_column = df_sampled['word2vec_embeddings']
X = np.vstack(embedding_column.values)

# Target: the label column as a plain array, in the same row order as X
label_column = df_sampled['label']
y = label_column.values

In [8]:
# Reduce the dimensionality of the embeddings to 20 principal components.
# random_state pins the randomized SVD solver that PCA may select
# automatically, so X_reduced (and every metric derived from it) is
# repeatable across runs, matching the seed used by the other steps.
# NOTE(review): the projection is fitted on all sampled rows, before the
# train/test split in a later cell, so test rows influence the components;
# fit on the training rows only if a strictly held-out evaluation is needed.
pca = PCA(n_components=20, random_state=42)
X_reduced = pca.fit_transform(X)

In [9]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
split_parts = train_test_split(
    X_reduced,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

In [10]:
# Build the (not yet fitted) logistic-regression model.
# max_iter caps the solver iterations; random_state fixes its seed.
log_reg_params = {"max_iter": 1000, "random_state": 42}
log_reg_classifier = LogisticRegression(**log_reg_params)

In [11]:
# Measure the time taken to train the Logistic Regression classifier.
# perf_counter() is a monotonic, high-resolution clock intended for timing
# short intervals; time.time() follows the system wall clock, which can be
# adjusted mid-run and may be too coarse for a fit this short.
# Only the difference end_time - start_time is meaningful (in seconds).
start_time = time.perf_counter()
log_reg_classifier.fit(X_train, y_train)
end_time = time.perf_counter()

In [12]:
# Calculate training time: elapsed seconds between the two timestamps taken
# immediately before and after the classifier's fit() call
training_time = end_time - start_time
print(f"Training time: {training_time} seconds")

Training time: 0.023777484893798828 seconds


In [13]:
# Predict on the test set with the fitted classifier; y_pred is later
# compared against y_test by the evaluation metrics
y_pred = log_reg_classifier.predict(X_test)

In [14]:
# Score the predictions against the held-out labels.
# The three metrics are independent of each other and are only printed later.
confusion_mat = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

In [15]:
# Report the accuracy_score computed from y_test and y_pred
print("Accuracy:", accuracy)

Accuracy: 0.6165


In [16]:
# Print the text table from classification_report (per-class precision,
# recall, f1-score and support); the leading "\n" emits a blank line first
print("\nClassification Report:\n", classification_rep)


Classification Report:
               precision    recall  f1-score   support

           0       0.62      0.82      0.71      1127
           1       0.60      0.35      0.45       873

    accuracy                           0.62      2000
   macro avg       0.61      0.59      0.58      2000
weighted avg       0.61      0.62      0.59      2000



In [17]:
# Print the confusion matrix for y_test vs. y_pred. Rows are the true labels
# (each row sum equals that class's support in the classification report),
# columns are the predicted labels
print("\nConfusion Matrix:\n", confusion_mat)


Confusion Matrix:
 [[925 202]
 [565 308]]
