In [1]:
from google.colab import drive

# Attach Google Drive at /content/drive so later cells can read files stored there.
drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


## Helper Function for Text Cleaning:

Implement a helper function, following the Text Preprocessing notebook, and complete the pipeline below.

# Build a Text Cleaning Pipeline

In [None]:
def text_cleaning_pipeline(dataset, rule = "lemmatize"):
  """
  Clean one raw text string and return it as a space-joined string of tokens.

  Steps, in order: lowercase, strip URLs, strip emojis, strip @mentions,
  replace every remaining non-letter character with a space, split on
  whitespace, drop English stopwords, then lemmatize or stem each token.

  Args:
    dataset: The raw text to clean (a single string, e.g. one tweet).
    rule: "lemmatize" (default) to lemmatize each token, or "stem" to stem
      each token. Any other value prints a hint and leaves the tokens
      un-normalized.

  Returns:
    The cleaned tokens joined by single spaces.

  Note:
    The NLTK "stopwords" and "wordnet" corpora must already be downloaded.
  """
  # Imported locally so this cell does not depend on imports made in later cells.
  import re

  stop_words, lemmatizer, stemmer = _cleaning_resources()

  # Convert the input to lowercase.
  data = dataset.lower()
  # Remove URLs (http://..., https://... and www....).
  data = re.sub(r"https?://\S+|www\.\S+", " ", data)
  # Remove emojis (pictographs, dingbats, joiners and variation selectors).
  data = re.sub("[\U0001F000-\U0001FAFF\u2600-\u27BF\u200D\uFE0F]", " ", data)
  # Remove all other unwanted characters: @mentions first, then anything that
  # is not a lowercase letter or whitespace. Replacing with a space (rather
  # than nothing) keeps neighbouring words from being glued together.
  data = re.sub(r"@\w+", " ", data)
  data = re.sub(r"[^a-z\s]", " ", data)
  # Create tokens.
  tokens = data.split()
  # Remove stopwords:
  tokens = [token for token in tokens if token not in stop_words]
  if rule == "lemmatize":
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
  elif rule == "stem":
    tokens = [stemmer.stem(token) for token in tokens]
  else:
    print("Pick between lemmatize or stem")


  return " ".join(tokens)


def _cleaning_resources():
  """Return (stopword set, lemmatizer, stemmer), building them on first use only."""
  cached = getattr(_cleaning_resources, "_cached", None)
  if cached is None:
    from nltk.corpus import stopwords
    from nltk.stem import PorterStemmer, WordNetLemmatizer

    # Built once and stored on the function so per-row calls do not reload the corpus.
    cached = (
        frozenset(stopwords.words("english")),
        WordNetLemmatizer(),
        PorterStemmer(),
    )
    _cleaning_resources._cached = cached
  return cached


# Text Classification using Machine Learning Models


### 📝 Instructions: Trump Tweet Sentiment Classification

1. **Load the Dataset**  
   Load the dataset named `"trump_tweet_sentiment_analysis.csv"` using `pandas`. Ensure the dataset contains at least two columns: `"text"` (the tweet) and a label column (named `"Sentiment"` in this file).

2. **Text Cleaning and Tokenization**  
   Apply a text preprocessing pipeline to the `"text"` column. This should include:
   - Lowercasing the text  
   - Removing URLs, mentions, punctuation, and special characters  
   - Removing stopwords  
   - Tokenization (optional: stemming or lemmatization)
   - Complete the `text_cleaning_pipeline` function above so that it performs these steps.

3. **Train-Test Split**  
   Split the cleaned and tokenized dataset into **training** and **testing** sets using `train_test_split` from `sklearn.model_selection`.

4. **TF-IDF Vectorization**  
   Import and use the `TfidfVectorizer` from `sklearn.feature_extraction.text` to transform the training and testing texts into numerical feature vectors.

5. **Model Training and Evaluation**  
   Import **Logistic Regression** (or any machine learning model of your choice) from `sklearn.linear_model`. Train it on the TF-IDF-embedded training data, then evaluate it using the test set.  
   - Print the **classification report** using `classification_report` from `sklearn.metrics`.


In [2]:
# Step 1: Load The Dataset
# --------------------------------------------
import pandas as pd
import numpy as np

# Read the Trump tweet sentiment CSV from the mounted Drive folder
df = pd.read_csv('/content/drive/MyDrive/trum_tweet_sentiment_analysis.csv')

# Show the first rows and the column index
print("Dataset Sample:\n", df.head())
print("\nColumns in dataset:", df.columns)

# Count missing entries per column
print("\nMissing values:\n", df.isna().sum())

# Restrict to the two modelling columns and discard rows missing either value
df = df.loc[:, ['text', 'Sentiment']].dropna()


Dataset Sample:
                                                 text  Sentiment
0  RT @JohnLeguizamo: #trump not draining swamp b...          0
1  ICYMI: Hackers Rig FM Radio Stations To Play A...          0
2  Trump protests: LGBTQ rally in New York https:...          1
3  "Hi I'm Piers Morgan. David Beckham is awful b...          0
4  RT @GlennFranco68: Tech Firm Suing BuzzFeed fo...          0

Columns in dataset: Index(['text', 'Sentiment'], dtype='object')

Missing values:
 text         0
Sentiment    0
dtype: int64


In [3]:
# Step 2: Text Cleaning Pipeline
# --------------------------------------------
import re
import nltk
# Fetch the NLTK resources used by the cleaning helper, in the original order
for resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Shared lemmatizer instance and English stopword set, built once for all rows
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Helper function to clean text
def text_cleaning_pipeline(text, rule='lemmatize'):
    """Clean one tweet and return it as a space-joined string of tokens.

    Steps, in order: lowercase, strip URLs, strip @mentions and #hashtags,
    strip punctuation/emojis/symbols, split on whitespace, drop English
    stopwords, then lemmatize or stem each token.

    Uses the module-level ``re`` import, ``stop_words`` set and ``lemmatizer``
    instance defined earlier in this cell.

    Args:
        text: The raw tweet. ``None`` or a float NaN (a missing cell) yields
            an empty string.
        rule: ``'lemmatize'`` (default) or ``'stem'``. Any other value prints
            a hint and returns the tokens without normalization.

    Returns:
        The cleaned tokens joined by single spaces.
    """
    # A missing value has nothing to clean; NaN is the only float for which x != x.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    # Convert to lowercase
    text = text.lower()

    # Remove URLs. The dot after "www" is escaped so that only literal "www."
    # prefixes count as URLs; unescaped, it matched any character and deleted
    # ordinary words that merely begin with "www".
    text = re.sub(r'http\S+|www\.\S+', '', text)

    # Remove mentions and hashtags (the whole @name / #tag token)
    text = re.sub(r'@\w+|#\w+', '', text)

    # Remove punctuation, emojis and other symbols; letters, digits and
    # underscores are kept because they are word characters.
    text = re.sub(r'[^\w\s]', '', text)

    # Tokenize on whitespace and remove stopwords
    tokens = [word for word in text.split() if word not in stop_words]

    # Lemmatization or Stemming
    if rule == 'lemmatize':
        tokens = [lemmatizer.lemmatize(word) for word in tokens]
    elif rule == 'stem':
        # Imported here because the notebook does not import a stemmer at the top.
        from nltk.stem import PorterStemmer
        stemmer = PorterStemmer()
        tokens = [stemmer.stem(word) for word in tokens]
    else:
        print("Invalid rule. Choose 'lemmatize' or 'stem'")

    return " ".join(tokens)

# Clean every tweet with the lemmatizing pipeline and store the result in a new column
df['clean_text'] = df['text'].apply(text_cleaning_pipeline, rule='lemmatize')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [4]:
# Step 3: Train-Test Split
# --------------------------------------------
from sklearn.model_selection import train_test_split

# Features are the cleaned tweets; targets are the sentiment labels
X, y = df['clean_text'], df['Sentiment']

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [5]:
# Step 4: TF-IDF Vectorization
# --------------------------------------------
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer with max_features set to 10,000
tfidf = TfidfVectorizer(max_features=10_000)
# Fit on the training texts only, then reuse that fitted vectorizer for the test texts
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [6]:
# Step 5: Model Training and Evaluation
# --------------------------------------------
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Fit a default-configured logistic regression on the TF-IDF training matrix
model = LogisticRegression().fit(X_train_tfidf, y_train)

# Predict labels for the held-out tweets
y_pred = model.predict(X_test_tfidf)

# Report per-class precision, recall and F1 on the test split
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))


Classification Report:

              precision    recall  f1-score   support

           0       0.95      0.96      0.96    248563
           1       0.93      0.90      0.91    121462

    accuracy                           0.94    370025
   macro avg       0.94      0.93      0.93    370025
weighted avg       0.94      0.94      0.94    370025

