In [None]:
# Mount Google Drive at /content/drive so files stored there can be read by path.
# NOTE: `google.colab` is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

## Helper Function for Text Cleaning:

Implement a helper function as described in the Text Preprocessing notebook, and use it to complete the following pipeline.

# Build a Text Cleaning Pipeline

In [None]:
# Required Libraries
import re
import string
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Fetch the NLTK data packages used below: the stopword list plus the WordNet
# and omw-1.4 corpora that back the WordNet lemmatizer.
for nltk_resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_resource)

# Text Cleaning Function
def text_cleaning_pipeline(dataset, rule="lemmatize"):
    """Clean raw text documents ahead of bag-of-words style vectorization.

    Each document is lowercased, stripped of URLs, non-ASCII characters
    (which includes emojis), @mentions, #hashtags, punctuation and digits,
    split on whitespace, filtered against NLTK's English stopword list and
    finally lemmatized or stemmed.

    Args:
        dataset: Iterable of documents (list, pandas Series, ...). A single
            string is treated as one document rather than being iterated
            character by character. ``None``/NaN entries are cleaned to an
            empty string so the output stays aligned with the input; any
            other non-string entry is converted with ``str()``.
        rule: ``"lemmatize"`` (WordNet lemmatizer) or ``"stem"`` (Porter
            stemmer). Any other value prints a single notice and leaves the
            tokens un-normalized.

    Returns:
        list[str]: One cleaned, space-joined string per input document, in
        input order.
    """
    # A bare string would otherwise be walked one character at a time.
    if isinstance(dataset, str):
        dataset = [dataset]

    # Resolve the token normalizer once, up front, so an unsupported `rule`
    # is reported a single time instead of once per document.
    if rule == "lemmatize":
        normalize = WordNetLemmatizer().lemmatize
    elif rule == "stem":
        normalize = PorterStemmer().stem
    else:
        print("Pick between lemmatize or stem")
        normalize = None

    stop_words = set(stopwords.words('english'))

    # Loop-invariant patterns: compile once rather than per document.
    url_pattern = re.compile(r'http\S+|www\S+|https\S+')
    # Mentions/hashtags are listed before the punctuation class so that the
    # whole "@user" / "#tag" token is removed, not just its leading symbol.
    noise_pattern = re.compile(r'@\w+|#\w+|[%s]' % re.escape(string.punctuation))
    digit_pattern = re.compile(r'\d+')

    cleaned_texts = []
    for text in dataset:
        # Keep one output per input even for missing / non-string values.
        if text is None or (isinstance(text, float) and text != text):
            text = ""
        elif not isinstance(text, str):
            text = str(text)

        # Lowercase
        data = text.lower()
        # Remove URLs
        data = url_pattern.sub('', data)
        # Remove emojis and any other non-ASCII characters
        data = data.encode('ascii', 'ignore').decode('ascii')
        # Remove mentions, hashtags, punctuation, digits
        data = noise_pattern.sub('', data)
        data = digit_pattern.sub('', data)
        # Tokenize on whitespace and remove stopwords
        tokens = [word for word in data.split() if word not in stop_words]
        # Lemmatize or stem (skipped when `rule` was not recognized)
        if normalize is not None:
            tokens = [normalize(word) for word in tokens]
        cleaned_texts.append(" ".join(tokens))

    return cleaned_texts

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


# Text Classification using Machine Learning Models


### 📝 Instructions: Trump Tweet Sentiment Classification

1. **Load the Dataset**  
   Load the dataset named `"trump_tweet_sentiment_analysis.csv"` using `pandas`. Ensure the dataset contains at least two columns: `"text"` and `"label"`.

2. **Text Cleaning and Tokenization**  
   Apply a text preprocessing pipeline to the `"text"` column. This should include:
   - Lowercasing the text  
   - Removing URLs, mentions, punctuation, and special characters  
   - Removing stopwords  
   - Tokenization (optional: stemming or lemmatization)
   - Complete the `text_cleaning_pipeline` function defined above so that it performs these steps.

3. **Train-Test Split**  
   Split the cleaned and tokenized dataset into **training** and **testing** sets using `train_test_split` from `sklearn.model_selection`.

4. **TF-IDF Vectorization**  
   Import and use the `TfidfVectorizer` from `sklearn.feature_extraction.text` to transform the training and testing texts into numerical feature vectors.

5. **Model Training and Evaluation**  
   Import **Logistic Regression** (or any machine learning model of your choice) from `sklearn.linear_model`. Train it on the TF-IDF-embedded training data, then evaluate it using the test set.  
   - Print the **classification report** using `classification_report` from `sklearn.metrics`.


In [None]:


# Load Dataset
df = pd.read_csv("/content/drive/MyDrive/AI&ML-Level6/Week - 8 - Getting Started with NLP - Text Pre - processing and Text Representations/trum_tweet_sentiment_analysis.csv")  # Replace with your file path if needed

# Check Column Names
print("Original Columns:", df.columns.tolist())

# Normalize column names (trim whitespace, lowercase) so headers such as
# 'Sentiment' match the lowercase names checked below.
df.columns = df.columns.str.strip().str.lower()
print("Normalized Columns:", df.columns.tolist())

# 🧼 Check for required columns: keep only text + label, dropping rows where
# either value is missing. A 'sentiment' column is accepted as the label.
if 'text' in df.columns and 'label' in df.columns:
    df = df[['text', 'label']].dropna()
elif 'text' in df.columns and 'sentiment' in df.columns:
    # Chained rename returns a new frame instead of mutating a derived
    # selection in place.
    df = df[['text', 'sentiment']].dropna().rename(columns={'sentiment': 'label'})
else:
    raise KeyError("Dataset must contain 'text' and 'label' or 'sentiment' columns.")

# Clean Text: add the cleaned column via assign() so a new frame is produced
# rather than writing into the filtered selection.
df = df.assign(cleaned_text=text_cleaning_pipeline(df['text'], rule="lemmatize"))

# Hold out 20% of the cleaned tweets for evaluation; the fixed seed keeps the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['cleaned_text'],
    df['label'],
    random_state=42,
    test_size=0.2,
)

# TF-IDF features: the vocabulary (capped at 5000 terms) is learned from the
# training texts only, then reused unchanged to encode the test texts.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Fit a logistic regression classifier on the training vectors
# (fit() returns the estimator itself, so it can be chained).
model = LogisticRegression().fit(X_train_vec, y_train)

# Score the held-out set and print per-class precision / recall / F1.
y_pred = model.predict(X_test_vec)
print("Classification Report:\n", classification_report(y_test, y_pred), sep="\n")


Original Columns: ['text', 'Sentiment']
Normalized Columns: ['text', 'sentiment']
Classification Report:

              precision    recall  f1-score   support

           0       0.94      0.95      0.95    248563
           1       0.90      0.87      0.88    121462

    accuracy                           0.93    370025
   macro avg       0.92      0.91      0.91    370025
weighted avg       0.93      0.93      0.93    370025

