In [None]:
from google.colab import drive

# Directory inside the Colab VM where Google Drive is attached
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Helper Function for Text Cleaning:

Implement a helper function as described in the Text Preprocessing notebook, and use it to complete the following pipeline.

# Build a Text Cleaning Pipeline

In [None]:
import nltk
# Fetch the NLTK data packages named 'stopwords', 'punkt' and 'wordnet'.
nltk.download('stopwords')
nltk.download('punkt')
# Last expression of the cell, so its return value is what the notebook echoes.
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Extra tokenizer data package; presumably needed by word_tokenize on newer
# NLTK releases in addition to 'punkt' — TODO confirm against the installed version.
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

# Shared word normalizers: created once here and reused by the cleaning pipeline
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# Stopword sets keyed by language. Filled lazily so that `stopwords.words` is
# called once per language instead of once per cleaned document.
_STOPWORD_SETS = {}


def _stopword_set(language="english"):
    """Return the stopword set for `language`, loading it from NLTK on first use."""
    cached = _STOPWORD_SETS.get(language)
    if cached is None:
        cached = frozenset(stopwords.words(language))
        _STOPWORD_SETS[language] = cached
    return cached


def text_cleaning_pipeline(dataset, rule="lemmatize"):
    """
    Clean a single piece of text.

    Steps, in order: lowercase, strip URLs, strip non-ASCII characters
    (which covers emojis), strip everything that is not a letter, digit or
    whitespace, tokenize, drop English stopwords, then lemmatize or stem
    each remaining token.

    Args:
    - dataset (str): Input text to be cleaned.
    - rule (str): Either 'lemmatize' or 'stem' to choose the token
      normalization method. Any other value emits a warning and the tokens
      are returned without normalization.

    Returns:
    - str: The cleaned tokens joined by single spaces.
    """

    # Lowercase first so the stopword comparison below is case-insensitive
    data = dataset.lower()

    # Remove URLs
    data = re.sub(r'http\S+|www\S+|https\S+', '', data)

    # Remove emojis and any other non-ASCII characters
    data = re.sub(r'[^\x00-\x7F]+', '', data)

    # Remove punctuation / special characters (keep letters, digits, whitespace)
    data = re.sub(r'[^a-zA-Z0-9\s]', '', data)

    # Tokenize the text into words
    tokens = word_tokenize(data)

    # Remove stopwords, using the cached set rather than rebuilding it per call
    stop_words = _stopword_set()
    tokens = [word for word in tokens if word not in stop_words]

    # Apply stemming or lemmatization
    if rule == "lemmatize":
        tokens = [lemmatizer.lemmatize(word) for word in tokens]
    elif rule == "stem":
        tokens = [stemmer.stem(word) for word in tokens]
    else:
        # Still best-effort (tokens are returned un-normalized), but reported
        # through the warnings machinery so it can be filtered and is not
        # printed once per document when the function is mapped over a column.
        import warnings
        warnings.warn("Pick between 'lemmatize' or 'stem' for the rule")

    # Join the tokens back into a single string
    return " ".join(tokens)


# Text Classification using Machine Learning Models


### 📝 Instructions: Trump Tweet Sentiment Classification

1. **Load the Dataset**  
   Load the dataset named `"trump_tweet_sentiment_analysis.csv"` using `pandas`. Ensure the dataset contains at least two columns: `"text"` (the tweet) and a sentiment label column (named `"Sentiment"` in the file loaded below).

2. **Text Cleaning and Tokenization**  
   Apply a text preprocessing pipeline to the `"text"` column. This should include:
   - Lowercasing the text  
   - Removing URLs, mentions, punctuation, and special characters  
   - Removing stopwords  
   - Tokenization (optional: stemming or lemmatization)
   - Complete the `text_cleaning_pipeline` function above so that it performs these steps.

3. **Train-Test Split**  
   Split the cleaned and tokenized dataset into **training** and **testing** sets using `train_test_split` from `sklearn.model_selection`.

4. **TF-IDF Vectorization**  
   Import and use the `TfidfVectorizer` from `sklearn.feature_extraction.text` to transform the training and testing texts into numerical feature vectors.

5. **Model Training and Evaluation**  
   Import **Logistic Regression** (or any machine learning model of your choice) from `sklearn.linear_model`. Train it on the TF-IDF-embedded training data, then evaluate it using the test set.  
   - Print the **classification report** using `classification_report` from `sklearn.metrics`.


**Load the Dataset**

In [None]:
import pandas as pd

# Location of the tweet sentiment CSV on the mounted Google Drive
DATASET_PATH = "/content/drive/MyDrive/AI and ML/trum_tweet_sentiment_analysis.csv"
df = pd.read_csv(DATASET_PATH)

# Quick sanity check: which columns were read, and how many nulls each holds
column_names = df.columns
missing_per_column = df.isnull().sum()
print(column_names)
print(missing_per_column)


Index(['text', 'Sentiment'], dtype='object')
text         0
Sentiment    0
dtype: int64


**Text Cleaning and Tokenization**

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK data packages this cell relies on
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Shared word normalizers: created once here and reused by the cleaning function
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# Stopword sets keyed by language. Filled lazily so that `stopwords.words` is
# called once per language instead of once per cleaned tweet.
_STOPWORD_SETS = {}


def _stopword_set(language="english"):
    """Return the stopword set for `language`, loading it from NLTK on first use."""
    cached = _STOPWORD_SETS.get(language)
    if cached is None:
        cached = frozenset(stopwords.words(language))
        _STOPWORD_SETS[language] = cached
    return cached


# Function for text cleaning and preprocessing
def text_cleaning_pipeline(dataset, rule="lemmatize"):
    """
    Clean a single tweet / text string.

    Steps, in order: lowercase, strip URLs, strip @mentions, strip everything
    that is not a letter, digit or whitespace, tokenize, drop English
    stopwords, then lemmatize or stem each remaining token.

    Args:
    - dataset (str): Input text to be cleaned.
    - rule (str): Either 'lemmatize' or 'stem' to choose the token
      normalization method. Any other value emits a warning and the tokens
      are returned without normalization.

    Returns:
    - str: The cleaned tokens joined by single spaces.
    """
    # Lowercase first so the stopword comparison below is case-insensitive
    data = dataset.lower()

    # Remove URLs, mentions, punctuation, and special characters.
    # Mentions must go before the punctuation pass, which would strip the '@'
    # and leave the username behind as an ordinary word.
    data = re.sub(r'http\S+|www\S+|https\S+', '', data)  # Remove URLs
    data = re.sub(r'@\w+', '', data)  # Remove mentions (@username)
    data = re.sub(r'[^a-zA-Z0-9\s]', '', data)  # Remove punctuation/special characters

    # Tokenization
    tokens = word_tokenize(data)

    # Removing stopwords, using the cached set rather than rebuilding it per call
    stop_words = _stopword_set()
    tokens = [word for word in tokens if word not in stop_words]

    # Apply lemmatization or stemming
    if rule == "lemmatize":
        tokens = [lemmatizer.lemmatize(word) for word in tokens]
    elif rule == "stem":
        tokens = [stemmer.stem(word) for word in tokens]
    else:
        # Still best-effort (tokens are returned un-normalized), but reported
        # through the warnings machinery so it can be filtered and is not
        # printed once per row when the function is applied to a whole column.
        import warnings
        warnings.warn("Pick between 'lemmatize' or 'stem' for the rule")

    return " ".join(tokens)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Clean every tweet; extra keyword arguments given to Series.apply are
# forwarded to the cleaning function, so no wrapper lambda is needed
df['cleaned_text'] = df['text'].apply(text_cleaning_pipeline, rule='lemmatize')

# Preview the first rows, now including the cleaned column
preview = df.head()
print(preview)


                                                text  Sentiment  \
0  RT @JohnLeguizamo: #trump not draining swamp b...          0   
1  ICYMI: Hackers Rig FM Radio Stations To Play A...          0   
2  Trump protests: LGBTQ rally in New York https:...          1   
3  "Hi I'm Piers Morgan. David Beckham is awful b...          0   
4  RT @GlennFranco68: Tech Firm Suing BuzzFeed fo...          0   

                                        cleaned_text  
0  rt trump draining swamp taxpayer dollar trip a...  
1  icymi hacker rig fm radio station play antitru...  
2    trump protest lgbtq rally new york bbcworld via  
3  hi im pier morgan david beckham awful donald t...  
4  rt tech firm suing buzzfeed publishing unverif...  


**Train-Test Split**

In [None]:
from sklearn.model_selection import train_test_split

TEST_FRACTION = 0.2  # share of rows held out for evaluation (80% train, 20% test)
SPLIT_SEED = 42      # fixed seed so the split is reproducible

# Model inputs (cleaned tweet text) and targets (sentiment labels)
X = df['cleaned_text']
y = df['Sentiment']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)


**TF-IDF Vectorization**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF Vectorization
# The vectorizer is fitted on the training texts only; the test texts are
# transformed with that already-fitted vectorizer rather than being refitted.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)  # Fit and transform training data
X_test_tfidf = tfidf_vectorizer.transform(X_test)  # Transform test data


**Model Training and Evaluation**

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Fit logistic regression on the TF-IDF training matrix; fit() hands back the
# estimator itself, so construction and training are chained
model = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)

# Predict sentiment labels for the held-out split
y_pred = model.predict(X_test_tfidf)

# Per-class precision / recall / F1 summary
report = classification_report(y_test, y_pred)
print(report)


              precision    recall  f1-score   support

           0       0.96      0.97      0.97    248563
           1       0.94      0.91      0.93    121462

    accuracy                           0.95    370025
   macro avg       0.95      0.94      0.95    370025
weighted avg       0.95      0.95      0.95    370025

