# Step 1: Install & Import

In [None]:
!pip install scikit-learn pandas numpy



In [None]:
import pandas as pd
import numpy as np
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Step 2: Load Dataset

In [None]:
# Training tweets (this frame carries the `target` column used later).
train = pd.read_csv("train.csv")
print("Train shape:", train.shape)

# Tweets to predict on for the submission file.
test = pd.read_csv("test.csv")
print("Test shape:", test.shape)

Train shape: (7613, 5)
Test shape: (3263, 4)


# Step 3: Preprocess Tweets

In [None]:
def clean_text(text):
    """Normalize a raw tweet into lowercase text with noise tokens removed.

    Steps, in order: lowercase, drop tokens starting with "http", drop
    @mentions, drop the "#" of hashtags (the tag word is kept), strip ASCII
    punctuation, drop digit runs, and trim surrounding whitespace. Interior
    whitespace is left untouched.

    Parameters
    ----------
    text : str or None
        Raw tweet. ``None`` and float NaN (e.g. a missing CSV cell loaded by
        pandas) are treated as an empty tweet; any other non-string value is
        converted with ``str()`` before cleaning.

    Returns
    -------
    str
        The cleaned text, possibly empty.
    """
    # Missing values would otherwise crash on `.lower()`. NaN is the only
    # float that is not equal to itself, so no extra import is needed.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)

    text = text.lower()  # lowercase
    text = re.sub(r"http\S+", "", text)  # remove links
    # Mentions must go before punctuation stripping, which would delete the
    # "@" and leave the username behind as an ordinary word.
    text = re.sub(r"@\w+", "", text)  # remove mentions
    text = re.sub(r"#", "", text)  # remove the hashtag symbol, keep the word
    text = text.translate(str.maketrans("", "", string.punctuation))  # remove punctuation
    text = re.sub(r"\d+", "", text)  # remove numbers
    return text.strip()

In [None]:
# Add a cleaned copy of each tweet alongside the raw `text` column.
train["clean_text"] = train["text"].map(clean_text)
test["clean_text"] = test["text"].map(clean_text)

# Step 4: Split Data

In [None]:
# Features are the cleaned tweets; labels are the `target` column.
X, y = train["clean_text"], train["target"]

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 5: TF-IDF Vectorizer

In [None]:
# Fit the vectorizer on the training split only (capped at 5000 features),
# then reuse that fitted vectorizer for the validation and test tweets.
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_val_tfidf, X_test_tfidf = (
    tfidf.transform(docs) for docs in (X_val, test["clean_text"])
)

# Step 6: Train Model

In [None]:
# Fit a logistic-regression classifier on the TF-IDF training features.
# The solver is capped at 200 iterations.
model = LogisticRegression(max_iter=200)
model.fit(X_train_tfidf, y_train)

# Step 7: Evaluate

In [None]:
# Score the model on the held-out validation split.
val_preds = model.predict(X_val_tfidf)
print(
    "Validation Accuracy:",
    accuracy_score(y_true=y_val, y_pred=val_preds),
)

Validation Accuracy: 0.8069599474720945


# Step 8: Predict on Test

In [None]:
# Predict a label for every test tweet from its TF-IDF features.
test_preds = model.predict(X_test_tfidf)

# Step 9: Submission File

In [None]:
# Pair each test id with its predicted label.
submission = test[["id"]].assign(target=test_preds)

# Write without the index so the file holds exactly the two columns.
submission.to_csv("submission.csv", index=False)
print("Submission file saved as submission.csv")

Submission file saved as submission.csv


In [None]:
# Optional: mount Google Drive when running on Colab.
# `google.colab` only exists inside Colab, so an unconditional import makes
# this cell raise ModuleNotFoundError elsewhere and breaks "Run All".
try:
    from google.colab import drive
except ImportError:
    drive = None
    print("google.colab not available; skipping Google Drive mount")
else:
    drive.mount('/content/gdrive')

Mounted at /content/gdrive
