In [1]:
import pandas as pd 
import numpy as np


In [2]:
!pip install nltk




[notice] A new release of pip is available: 24.0 -> 25.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
import nltk

In [4]:
import nltk 
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tag import pos_tag

import re

In [5]:
df = pd.read_csv('train.tsv', sep='\t')
df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [6]:
df.columns

Index(['PhraseId', 'SentenceId', 'Phrase', 'Sentiment'], dtype='object')

In [7]:
df.isnull().sum()

PhraseId      0
SentenceId    0
Phrase        0
Sentiment     0
dtype: int64

## Tokenization 

In [8]:
df['Phrase'].isnull().sum()

0

In [None]:
# Required download 
nltk.download('punkt')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\aarad\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


0    [A, series, of, escapades, demonstrating, the,...
1    [A, series, of, escapades, demonstrating, the,...
2                                          [A, series]
3                                                  [A]
4                                             [series]
Name: tokens, dtype: object

In [None]:
df.columns

In [13]:
df['tokens'] = df['Phrase'].apply(word_tokenize)
df['tokens'].head(5)

0    [A, series, of, escapades, demonstrating, the,...
1    [A, series, of, escapades, demonstrating, the,...
2                                          [A, series]
3                                                  [A]
4                                             [series]
Name: tokens, dtype: object

## POS Tagging

In [15]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\aarad\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [16]:
df['pos_tags'] = df['tokens'].apply(pos_tag)

df[['Phrase', 'tokens', 'pos_tags']].head(5)

Unnamed: 0,Phrase,tokens,pos_tags
0,A series of escapades demonstrating the adage ...,"[A, series, of, escapades, demonstrating, the,...","[(A, DT), (series, NN), (of, IN), (escapades, ..."
1,A series of escapades demonstrating the adage ...,"[A, series, of, escapades, demonstrating, the,...","[(A, DT), (series, NN), (of, IN), (escapades, ..."
2,A series,"[A, series]","[(A, DT), (series, NN)]"
3,A,[A],"[(A, DT)]"
4,series,[series],"[(series, NN)]"


## Stop Words Removal

In [17]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aarad\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [18]:
df['tokens_no_stopwords'] = df['tokens'].apply(lambda x: [word for word in x if word.lower() not in stopwords.words('english')])
df[['Phrase', 'tokens', 'tokens_no_stopwords']].head(5)

Unnamed: 0,Phrase,tokens,tokens_no_stopwords
0,A series of escapades demonstrating the adage ...,"[A, series, of, escapades, demonstrating, the,...","[series, escapades, demonstrating, adage, good..."
1,A series of escapades demonstrating the adage ...,"[A, series, of, escapades, demonstrating, the,...","[series, escapades, demonstrating, adage, good..."
2,A series,"[A, series]",[series]
3,A,[A],[]
4,series,[series],[series]


## Stemming

In [19]:
stemmer = PorterStemmer()

df['stems'] = df['tokens_no_stopwords'].apply(lambda x: [stemmer.stem(word) for word in x])
df[['Phrase', 'tokens_no_stopwords', 'stems']].head(5)

Unnamed: 0,Phrase,tokens_no_stopwords,stems
0,A series of escapades demonstrating the adage ...,"[series, escapades, demonstrating, adage, good...","[seri, escapad, demonstr, adag, good, goos, al..."
1,A series of escapades demonstrating the adage ...,"[series, escapades, demonstrating, adage, good...","[seri, escapad, demonstr, adag, good, goos]"
2,A series,[series],[seri]
3,A,[],[]
4,series,[series],[seri]


## Lemmatization

In [20]:
nltk.download('wordnet')
nltk.download('omw-1.4')


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\aarad\AppData\Roaming\nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\aarad\AppData\Roaming\nltk_data...


True

In [21]:
lemmatizer = WordNetLemmatizer()

df['lemmas'] = df['tokens_no_stopwords'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])
df[['Phrase', 'tokens_no_stopwords', 'lemmas']].head(5)

Unnamed: 0,Phrase,tokens_no_stopwords,lemmas
0,A series of escapades demonstrating the adage ...,"[series, escapades, demonstrating, adage, good...","[series, escapade, demonstrating, adage, good,..."
1,A series of escapades demonstrating the adage ...,"[series, escapades, demonstrating, adage, good...","[series, escapade, demonstrating, adage, good,..."
2,A series,[series],[series]
3,A,[],[]
4,series,[series],[series]


## TF_IDF

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [24]:
df['processed_text'] = df['tokens_no_stopwords'].apply(lambda x: ' '.join(x))

In [25]:
tfidf = TfidfVectorizer(max_features=1000)
tfidf_matrix = tfidf.fit_transform(df['processed_text'])

print("TF-IDF matrix shape:", tfidf_matrix.shape)
print("Sample feature names:", tfidf.get_feature_names_out()[:10])


TF-IDF matrix shape: (156060, 1000)
Sample feature names: ['10' '20' '2002' '90' 'ability' 'able' 'across' 'act' 'acted' 'acting']


## Text Classification

In [27]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [28]:
X = tfidf_matrix
y = df['Sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = MultinomialNB()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.5461040625400487
Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.01      0.03      1416
           1       0.50      0.08      0.14      5527
           2       0.55      0.96      0.70     15639
           3       0.53      0.21      0.30      6707
           4       0.58      0.03      0.06      1923

    accuracy                           0.55     31212
   macro avg       0.58      0.26      0.25     31212
weighted avg       0.55      0.55      0.45     31212



Here’s a concise theory for each step in your notebook:

---

### **1. Tokenization**
**What:** Splitting text into individual words (tokens).  
**Why:** Makes it easier to analyze and process text data at the word level for further NLP tasks.

---

### **2. POS Tagging**
**What:** Assigning part-of-speech tags (noun, verb, adjective, etc.) to each token.  
**Why:** Helps understand the grammatical structure and meaning of sentences, useful for advanced NLP tasks.

---

### **3. Stop Words Removal**
**What:** Removing common words like "the", "is", "and" that do not carry significant meaning.  
**Why:** Reduces noise and focuses analysis on more meaningful words.

---

### **4. Stemming**
**What:** Reducing words to their root form (e.g., "playing" → "play").  
**Why:** Groups similar words together, improving the effectiveness of text analysis.

---

### **5. Lemmatization**
**What:** Converting words to their base or dictionary form (lemma), considering context.  
**Why:** More accurate than stemming, helps in normalizing words for better analysis.

---

### **6. TF-IDF Calculation**
**What:** Converts text into numerical features based on word importance (Term Frequency-Inverse Document Frequency).  
**Why:** Represents text data in a way that can be used for machine learning models.

---

### **7. Text Classification**
**What:** Using the processed features to train a machine learning model to predict sentiment.  
**Why:** Automates the task of classifying text (e.g., positive/negative sentiment) based on learned patterns.

---

**Summary:**  
These steps transform raw text into structured, meaningful features suitable for machine learning, enabling automated text analysis and classification.