# Task 7: Introduction to Natural Language (Text) Processing

## Section 1: Setup and Sample Dataset

### **Task 1**: Import Libraries and Sample Data
*Instruction*: Import the necessary libraries and define a sample dataset for sentiment classification.

In [1]:
import pandas as pd
import numpy as np
import nltk
import string

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix

nltk.download('punkt')
nltk.download('stopwords')

# Sample data: each review is kept next to its sentiment label
# (1 = positive, 0 = negative) so the pairing is visible at a glance.
labelled_reviews = [
    ('I love this movie. It was fantastic!', 1),
    ('Terrible acting and horrible plot.', 0),
    ('An excellent film with great characters.', 1),
    ('Worst movie I have ever seen.', 0),
    ('Absolutely wonderful! A must-watch.', 1),
    ('It was okay, nothing special.', 1),
    ('Bad movie, waste of time.', 0),
    ('Pretty good, I liked it.', 1),
    ('Not great, but not terrible.', 0),
    ('Awful! Never again.', 0),
]

# Column-oriented view of the pairs above, in the same row order.
data = {
    'text': [review for review, _ in labelled_reviews],
    'label': [sentiment for _, sentiment in labelled_reviews],
}

df = pd.DataFrame(data)
df.head()
import pandas as pd
import numpy as np
import nltk
import string

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix

# Download NLTK resources (if not already downloaded)
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Section 2: Text Preprocessing

### **Task 2**: Clean the Text

*Instruction*: Lowercase the text, remove punctuation and stopwords, and tokenize each sentence into words.


In [4]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources this cell relies on, in a fixed order.
# 'punkt' and 'punkt_tab' are tokenizer data packages and 'stopwords' is a corpus;
# a package that is already installed is reported as up-to-date.
# NOTE(review): 'punkt_tab' is presumably what word_tokenize needs in the
# installed NLTK version - confirm before removing either tokenizer package.
for resource_name in ('punkt', 'stopwords', 'punkt_tab'):
    nltk.download(resource_name)

# English stop words, held in a set for the membership checks in preprocess().
stop_words = set(stopwords.words('english'))

def preprocess(text):
    """
    Normalise raw text into a space-separated string of non-stop-word tokens.

    The text is lowercased, every character listed in ``string.punctuation``
    is deleted, the result is split with ``word_tokenize``, and tokens found
    in the module-level ``stop_words`` set are discarded.

    Args:
        text (str): The input text.

    Returns:
        str: The surviving tokens joined by single spaces.
    """
    # Translation table that deletes punctuation characters outright
    # (so "must-watch" becomes "mustwatch", not two words).
    punctuation_table = str.maketrans('', '', string.punctuation)
    normalized = text.lower().translate(punctuation_table)

    kept_tokens = []
    for token in word_tokenize(normalized):
        if token in stop_words:
            continue  # Skip stop words
        kept_tokens.append(token)

    return ' '.join(kept_tokens)

# Store the preprocessed version of every review next to the raw text.
df['cleaned'] = df['text'].map(preprocess)

# Show raw and cleaned text side by side for the first rows.
df.loc[:, ['text', 'cleaned']].head()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Unnamed: 0,text,cleaned
0,I love this movie. It was fantastic!,love movie fantastic
1,Terrible acting and horrible plot.,terrible acting horrible plot
2,An excellent film with great characters.,excellent film great characters
3,Worst movie I have ever seen.,worst movie ever seen
4,Absolutely wonderful! A must-watch.,absolutely wonderful mustwatch


## Section 3: Text Vectorization

### **Task 3**: Convert Text to Numerical Features

*Instruction*: Use both Bag of Words and TF-IDF vectorization to convert the cleaned text.


In [5]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Each vectorizer is constructed and fitted exactly once. The previous version of
# this section repeated the same code twice, so the second fit silently replaced
# the first with an identical result.
#
# NOTE(review): both vectorizers are fitted on every row of df['cleaned'], i.e.
# before the train/test split performed in Section 4, so the learned vocabulary
# (and the TF-IDF weights) also see the rows that are later held out for testing.

# Bag of Words: token counts per document
cv = CountVectorizer()
X_bow = cv.fit_transform(df['cleaned'])

# TF-IDF: term-frequency / inverse-document-frequency weights per document
tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(df['cleaned'])

## Section 4: Train a Classifier

### **Task 4**: Sentiment Classification with Naive Bayes

*Instruction*: Split the dataset, train a classifier using both feature sets, and evaluate the performance.

In [6]:
# Hold out 30% of the rows for evaluation; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, df['label'], test_size=0.3, random_state=42)

# Fit Naive Bayes on the TF-IDF training rows and predict the held-out rows.
model = MultinomialNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
# zero_division=0: when a class is never predicted its precision is undefined.
# Reporting it as 0 explicitly yields the same table without the repeated
# undefined-metric warnings that the default setting emits.
print("\nClassification Report:\n", classification_report(y_test, y_pred, zero_division=0))
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix

def fit_and_report_nb(X_train, X_test, y_train, y_test, heading):
    """
    Train a MultinomialNB classifier and print its evaluation on the test split.

    Args:
        X_train: Feature matrix used for fitting.
        X_test: Feature matrix used for evaluation.
        y_train: Labels aligned with ``X_train``.
        y_test: Labels aligned with ``X_test``.
        heading (str): Line printed before the confusion matrix and report.

    Returns:
        tuple: ``(model, y_pred)`` - the fitted classifier and its predictions
        for ``X_test``.
    """
    model = MultinomialNB()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    print(heading)
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    # zero_division=0: a class that is never predicted has undefined precision.
    # Reporting it as 0 explicitly keeps the table identical while avoiding the
    # repeated undefined-metric warnings of the default setting.
    print("\nClassification Report:\n", classification_report(y_test, y_pred, zero_division=0))
    return model, y_pred


# Split data for TF-IDF features (same test size and seed as the Bag-of-Words
# split below, so both use the same random_state)
X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(X_tfidf, df['label'], test_size=0.3, random_state=42)

# Train and evaluate with TF-IDF
model_tfidf, y_pred_tfidf = fit_and_report_nb(
    X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf, "TF-IDF Results:"
)

# Split data for Bag-of-Words features
X_train_bow, X_test_bow, y_train_bow, y_test_bow = train_test_split(X_bow, df['label'], test_size=0.3, random_state=42)

# Train and evaluate with Bag-of-Words
model_bow, y_pred_bow = fit_and_report_nb(
    X_train_bow, X_test_bow, y_train_bow, y_test_bow, "\nBag-of-Words Results:"
)

Confusion Matrix:
 [[0 2]
 [0 1]]

Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.33      1.00      0.50         1

    accuracy                           0.33         3
   macro avg       0.17      0.50      0.25         3
weighted avg       0.11      0.33      0.17         3

TF-IDF Results:
Confusion Matrix:
 [[0 2]
 [0 1]]

Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.33      1.00      0.50         1

    accuracy                           0.33         3
   macro avg       0.17      0.50      0.25         3
weighted avg       0.11      0.33      0.17         3


Bag-of-Words Results:
Confusion Matrix:
 [[1 1]
 [0 1]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.50      0.67         2
           1       0.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Section 5: Mini Challenge – Classify Your Own Text

### **Task 5**:  User Input Prediction

*Instruction*: Write a function that lets the user enter some text and returns the trained model's sentiment prediction for it.


In [7]:
def predict_sentiment(text):
    """
    Classify a piece of text as "Positive" or "Negative".

    The text is cleaned with ``preprocess``, vectorized with the fitted
    ``tfidf`` vectorizer, and scored by ``model``.

    Args:
        text (str): Raw text to classify.

    Returns:
        str: "Positive" if the predicted label is 1, otherwise "Negative".
    """
    features = tfidf.transform([preprocess(text)])
    predicted_label = model.predict(features)[0]
    if predicted_label == 1:
        return "Positive"
    return "Negative"

# Try it out
predict_sentiment("The movie was so good and exciting!")
def predict_sentiment(text, classifier=None, vectorizer=None):
    """
    Classify a piece of text as "Positive" or "Negative".

    Args:
        text (str): Raw text to classify; it is cleaned with ``preprocess``
            before being vectorized.
        classifier: Fitted classifier exposing ``predict``. Defaults to
            ``model_tfidf`` when omitted.
        vectorizer: Fitted vectorizer exposing ``transform``; it should be the
            one whose features ``classifier`` was trained on (for example
            ``cv`` together with ``model_bow``). Defaults to ``tfidf`` when
            omitted.

    Returns:
        str: "Positive" if the predicted label is 1, otherwise "Negative".

    NOTE(review): in the recorded run the input "5" is reported as Positive.
    Presumably no token of such input is known to the vectorizer, so the
    result does not reflect the text itself - confirm and decide whether such
    input should be rejected.
    """
    if classifier is None:
        classifier = model_tfidf  # Use the TF-IDF model
    if vectorizer is None:
        vectorizer = tfidf

    cleaned = preprocess(text)
    vectorized = vectorizer.transform([cleaned])
    prediction = classifier.predict(vectorized)
    return "Positive" if prediction[0] == 1 else "Negative"

# Try it out: read one line from the user and report the predicted sentiment.
user_input = input("Enter some text: ")
sentiment = predict_sentiment(user_input)
print(f"Sentiment: {sentiment}")

Enter some text: 5
Sentiment: Positive
