In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Load the train and test datasets
train_data = pd.read_csv('./data/train.csv')
test_data = pd.read_csv('./data/test.csv')

# Prepare the data
X_train = train_data['tweet']
y_train = train_data['label']
X_test = test_data['tweet']

# Text vectorization
vectorizer = TfidfVectorizer(stop_words='english', max_features=10000)
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Model training
model = LogisticRegression(max_iter=1000)
model.fit(X_train_vectorized, y_train)

# Since there are no labels for the test set provided, we tested on the train set for demonstration
y_pred = model.predict(X_train_vectorized)

# Evaluate the model
accuracy = accuracy_score(y_train, y_pred)
report = classification_report(y_train, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", report)

Accuracy: 0.9567611538702209
Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98     29720
           1       0.95      0.40      0.57      2242

    accuracy                           0.96     31962
   macro avg       0.96      0.70      0.77     31962
weighted avg       0.96      0.96      0.95     31962

