In [2]:
import pandas as pd
import nltk
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics



In [3]:
# Read the tab-separated phrase/sentiment file into a DataFrame
data = pd.read_csv(filepath_or_buffer='data7.tsv', delimiter='\t')
# Bare last expression so the notebook renders the (truncated) frame
data

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2
...,...,...,...,...
156055,156056,8544,Hearst 's,2
156056,156057,8544,forced avuncular chortles,1
156057,156058,8544,avuncular chortles,3
156058,156059,8544,avuncular,2


In [4]:
# Preview the five leading rows to check column names and value formats
data.head(n=5)


Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [5]:
# Get dataset information: prints the index range, each column's non-null
# count and dtype, and the frame's memory usage
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156060 entries, 0 to 156059
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   PhraseId    156060 non-null  int64 
 1   SentenceId  156060 non-null  int64 
 2   Phrase      156060 non-null  object
 3   Sentiment   156060 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 4.8+ MB


In [6]:
# Tally how many phrases carry each sentiment label, most frequent label first
data['Sentiment'].value_counts(sort=True, ascending=False)

Sentiment
2    79582
3    32927
1    27273
4     9206
0     7072
Name: count, dtype: int64

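data['Sentiment']: This column contains five categorical sentiment labels, 0 through 4 (see the counts above). On the five-point scale used by the Rotten Tomatoes phrase dataset that shares this schema, 0 = negative, 1 = somewhat negative, 2 = neutral, 3 = somewhat positive, 4 = positive (confirm against the data source). The classes are imbalanced: label 2 alone covers 79,582 of the 156,060 phrases, about 51%.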




In [7]:
# Tokenizer that keeps runs of ASCII letters and digits; every other character
# (punctuation, symbols, whitespace) acts as a separator and is dropped
tokenizer = RegexpTokenizer(r'[a-zA-Z0-9]+')


Only letters (a-z, A-Z) and numbers (0-9) are kept.
Symbols, punctuation, and special characters (e.g., ! @ # $ % & *) are removed.

In [8]:
# Text preprocessing using CountVectorizer:
# convert each phrase into a sparse vector of token counts.
#
# token_pattern=None is passed because a custom tokenizer is supplied:
# scikit-learn ignores token_pattern in that case and emits a UserWarning
# unless it is explicitly set to None. The fitted vocabulary is unaffected.
cv = CountVectorizer(
    lowercase=True,
    stop_words='english',
    tokenizer=tokenizer.tokenize,
    token_pattern=None,
)
text_counts = cv.fit_transform(data['Phrase'])
print(text_counts)




<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 598944 stored elements and shape (156060, 14988)>
  Coords	Values
  (0, 11671)	1
  (0, 4517)	1
  (0, 3444)	1
  (0, 294)	1
  (0, 5735)	2
  (0, 5751)	1
  (0, 5512)	1
  (0, 9065)	1
  (0, 593)	1
  (0, 584)	1
  (0, 12673)	1
  (1, 11671)	1
  (1, 4517)	1
  (1, 3444)	1
  (1, 294)	1
  (1, 5735)	1
  (1, 5751)	1
  (2, 11671)	1
  (4, 11671)	1
  (5, 4517)	1
  (5, 3444)	1
  (5, 294)	1
  (5, 5735)	1
  (5, 5751)	1
  (7, 4517)	1
  :	:
  (156050, 11305)	1
  (156050, 9054)	1
  (156051, 11305)	1
  (156051, 9054)	1
  (156052, 11305)	1
  (156053, 11281)	1
  (156053, 1281)	1
  (156053, 5252)	1
  (156053, 6156)	1
  (156053, 1006)	1
  (156053, 2271)	1
  (156054, 11281)	1
  (156054, 5252)	1
  (156054, 6156)	1
  (156054, 1006)	1
  (156054, 2271)	1
  (156055, 11281)	1
  (156055, 6156)	1
  (156056, 5252)	1
  (156056, 1006)	1
  (156056, 2271)	1
  (156057, 1006)	1
  (156057, 2271)	1
  (156058, 1006)	1
  (156059, 2271)	1


lowercase=True → Converts all text to lowercase

stop_words='english' → Removes common English words like "the", "and", "is"

tokenizer=tokenizer.tokenize → Uses the RegexpTokenizer defined above to drop symbols and extract words.

fit: Learns vocabulary from the 'Phrase' column in the dataset.

transform: Converts the text into a numerical matrix.

text_counts: Stores the sparse matrix representation of text.

In [9]:
# Hold out 30% of the rows for testing (70% train); the fixed random_state
# makes the shuffle reproducible across runs.
# NOTE(review): text_counts was vectorised on the full dataset before this
# split, and phrases sharing a SentenceId can land on both sides of it, so the
# test score may be optimistic — confirm whether a grouped split is wanted.
X_train, X_test, y_train, y_test = train_test_split(
    text_counts,
    data['Sentiment'],
    test_size=0.3,
    random_state=1,
)

In [10]:
# Fit a multinomial Naive Bayes model on the training count vectors
clf = MultinomialNB()
clf.fit(X=X_train, y=y_train)

In [11]:
# Predict a sentiment label for every held-out phrase
predicted = clf.predict(X=X_test)

In [12]:
# Share of held-out phrases whose predicted label equals the true label
accuracy = metrics.accuracy_score(y_true=y_test, y_pred=predicted)
print("MultinomialNB Accuracy:", accuracy)

MultinomialNB Accuracy: 0.6049169122986885


Term Frequency (TF) → Measures how often a word appears in a document.

Inverse Document Frequency (IDF) → Reduces the weight of words that appear frequently across all documents (e.g., "the", "is", "and").

In [13]:
# Build a TF-IDF representation of the phrases with the vectorizer's default
# settings (no custom tokenizer or stop-word list is passed here)
tfidf = TfidfVectorizer()
text_tfidf = tfidf.fit_transform(raw_documents=data['Phrase'])

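TfidfVectorizer:
Converts text to lowercase.
Removes stopwords (if specified). None are specified in the cell above, so stop words are kept here; together with the vectorizer's default tokenization this gives a vocabulary of 15,240 terms, versus 14,988 for the CountVectorizer configured earlier.
Applies TF-IDF transformation.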

In [14]:
# Print transformed text feature matrix: the sparse-matrix summary followed by
# (row, column) coordinates and their TF-IDF weights
print(text_tfidf)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 972099 stored elements and shape (156060, 15240)>
  Coords	Values
  (0, 11837)	0.1761994204821687
  (0, 9227)	0.27061683772839323
  (0, 4577)	0.278538658922562
  (0, 3490)	0.2485059095620638
  (0, 13505)	0.17690005957760713
  (0, 288)	0.251134096800077
  (0, 13503)	0.08982508036989033
  (0, 14871)	0.1354415412970302
  (0, 7217)	0.17522921677393963
  (0, 5821)	0.2625302862532789
  (0, 5323)	0.20344769269023563
  (0, 5837)	0.22883807138484064
  (0, 529)	0.1614381914318891
  (0, 5595)	0.265796263188737
  (0, 12424)	0.1381592967010513
  (0, 14888)	0.28701927784529135
  (0, 9204)	0.19301332592202286
  (0, 602)	0.26341877863818697
  (0, 1879)	0.11034437734762885
  (0, 9085)	0.1898515417082945
  (0, 593)	0.22068902883834374
  (0, 13681)	0.07615285026452821
  (0, 8807)	0.1353879543646446
  (0, 12857)	0.12785637560254456
  (1, 11837)	0.29125926935214375
  :	:
  (156050, 625)	0.2115725833396903
  (156050, 11465)	0.670263619653983
  (1

In [15]:
# NLTK preprocessing walkthrough on two sample documents:
# tokenise -> POS-tag -> drop stop words -> stem / lemmatise -> TF-IDF.
#
# No `pip install` is run here: nltk must already be installed, because the
# notebook's first cell imports it before this cell is reached. Install
# dependencies in the environment before starting the kernel.
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.probability import FreqDist
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Download required NLTK data files (tokenizer models, POS tagger models,
# stop-word lists and WordNet), in the same order as before
for resource in (
    'punkt',
    'averaged_perceptron_tagger',
    'stopwords',
    'wordnet',
    'punkt_tab',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(resource)

# Sample documents
documents = [
    "Natural Language Processing is an interesting field of study. It involves text analysis, linguistics, and machine learning.",
    "Text preprocessing is an essential step in NLP. It includes tokenization, stop word removal, stemming, and lemmatization."
]

# Tokenization: lowercase each document, then split it into word tokens
tokenized_docs = [word_tokenize(doc.lower()) for doc in documents]

# POS tagging: one list of (token, tag) pairs per document
pos_tagged_docs = [nltk.pos_tag(tokens) for tokens in tokenized_docs]

# Stop word removal (punctuation tokens are not stop words and stay in)
stop_words = set(stopwords.words('english'))
filtered_docs = [
    [word for word in tokens if word not in stop_words]
    for tokens in tokenized_docs
]

# Stemming of the stop-word-filtered tokens
stemmer = PorterStemmer()
stemmed_docs = [[stemmer.stem(word) for word in tokens] for tokens in filtered_docs]

# Lemmatization of the stop-word-filtered tokens.
# NOTE(review): lemmatize() is called without a POS argument, so the tags in
# pos_tagged_docs are not used and verb forms such as "includes" / "involves"
# come through unchanged — confirm whether POS-aware lemmatization is wanted.
lemmatizer = WordNetLemmatizer()
lemmatized_docs = [[lemmatizer.lemmatize(word) for word in tokens] for tokens in filtered_docs]

# Convert list of words back to sentences for TF-IDF calculation
processed_docs = [" ".join(doc) for doc in lemmatized_docs]

# TF-IDF calculation over the lemmatized documents
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(processed_docs)
feature_names = vectorizer.get_feature_names_out()
tfidf_array = tfidf_matrix.toarray()

# Display the TF-IDF score of every vocabulary term, document by document
for i, doc in enumerate(tfidf_array):
    print(f"Document {i+1} TF-IDF scores:")
    for word, score in zip(feature_names, doc):
        print(f"{word}: {score:.4f}")
    print("\n")


Defaulting to user installation because normal site-packages is not writeable


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger_eng.zip.


Document 1 TF-IDF scores:
analysis: 0.2948
essential: 0.0000
field: 0.2948
includes: 0.0000
interesting: 0.2948
involves: 0.2948
language: 0.2948
learning: 0.2948
lemmatization: 0.0000
linguistics: 0.2948
machine: 0.2948
natural: 0.2948
nlp: 0.0000
preprocessing: 0.0000
processing: 0.2948
removal: 0.0000
stemming: 0.0000
step: 0.0000
stop: 0.0000
study: 0.2948
text: 0.2098
tokenization: 0.0000
word: 0.0000


Document 2 TF-IDF scores:
analysis: 0.0000
essential: 0.2948
field: 0.0000
includes: 0.2948
interesting: 0.0000
involves: 0.0000
language: 0.0000
learning: 0.0000
lemmatization: 0.2948
linguistics: 0.0000
machine: 0.0000
natural: 0.0000
nlp: 0.2948
preprocessing: 0.2948
processing: 0.0000
removal: 0.2948
stemming: 0.2948
step: 0.2948
stop: 0.2948
study: 0.0000
text: 0.2098
tokenization: 0.2948
word: 0.2948


