In [None]:
!pip install nltk --quiet

In [None]:
import pandas as pd
import string
import nltk # Library for Text Preprocessing
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

In [None]:
nltk.download('punkt')  # For Tokenization
nltk.download('punkt_tab')  # Tokenizer tables that recent NLTK releases (3.9+) load for word/sentence tokenization
nltk.download('stopwords')  # For Stopwords
nltk.download('wordnet')  # For Lemmatization
nltk.download('averaged_perceptron_tagger') # For POS Tagging
nltk.download('averaged_perceptron_tagger_eng')  # English tagger model that recent NLTK releases (3.9+) load for pos_tag

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [None]:
# Sample text
text = "hello this is srishti's laptop @sit,pune."

> **Tokenization** is the process of splitting text into smaller units, typically words or sentences.

> **Word Tokenization**: Splits text into individual words. This is often the first step in text processing.
Example: "Hello world!" becomes ['Hello', 'world', '!'] — punctuation is kept as a separate token.

> **Sentence Tokenization**: Splits text into sentences. Useful for text analysis at the sentence level.
Example: "Hello world! How are you?" becomes ['Hello world!', 'How are you?'].

>**Why**: Tokenization helps in breaking down the text into manageable chunks for further analysis.

In [None]:
# Tokenization: split the same sample text at sentence level and at word level
sentences = sent_tokenize(text)
words = word_tokenize(text)

In [None]:
words

['hello',
 'this',
 'is',
 'srishti',
 "'s",
 'laptop',
 '@',
 'sit',
 ',',
 'pune',
 '.']

In [None]:
sentences

["hello this is srishti's laptop @sit,pune."]

>**Lowercasing** converts all characters in the text to lowercase.

> **Why**: Ensures uniformity by treating "Word" and "word" as the same word, which is important for consistent analysis and avoiding duplicate entries.

> **Removing punctuation** and **special characters** involves filtering out symbols like commas, periods, and other non-alphanumeric characters.

> **Why**: Punctuation and special characters may not contribute significant meaning to the text analysis and can be removed to simplify the data.

In [None]:
# Keep only purely alphanumeric tokens (this drops pieces such as "'s", "@", "," and ".")
# and lowercase the ones that remain.
words = [token.lower() for token in filter(str.isalnum, words)]
words

['hello', 'this', 'is', 'srishti', 'laptop', 'sit', 'pune']

>**Stop words** are common words (e.g., "the," "is," "in") that are often removed from text data because they carry less meaningful information in the context of text analysis.

>**Why**: Removing stop words can reduce the size of the data and improve the efficiency of text processing, focusing on more meaningful terms.

In [None]:
# Removing stop words: build the lookup set once, then keep every token that is not in it
stop_words = set(stopwords.words('english'))
filtered_words = list(filter(lambda token: token not in stop_words, words))
filtered_words

['hello', 'srishti', 'laptop', 'sit', 'pune']

>**Lemmatization** is a more sophisticated approach than stemming. It reduces words to their base or dictionary form (lemma) based on their context in the sentence.

>**Why**: Lemmatization provides a more accurate normalization of words, as it considers the part of speech and context, unlike stemming which may produce non-words.
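Example:     
"running" → "run" (verb)   
"better" → "good" (adjective)   
Note: NLTK's `WordNetLemmatizer.lemmatize` assumes a noun unless a `pos` argument is given, so these two results require `pos='v'` and `pos='a'` respectively.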

In [None]:
# Lemmatization: lemmatize is called with the token only, so its default part of speech applies
lemmatizer = WordNetLemmatizer()
lemmatized_words = list(map(lemmatizer.lemmatize, filtered_words))
lemmatized_words

['hello', 'srishti', 'laptop', 'sit', 'pune']

> **Stemming** reduces words to their root form. For example, "running," "runner," and "runs" might all be reduced to "run."

> **Why**: Stemming helps in grouping similar words under a common root, which can be useful for tasks like text classification and information retrieval.
Example: "running" → "run"

In [None]:
# Stemming: run the Porter stemmer over the already-lemmatized tokens
stemmer = PorterStemmer()
stemmed_words = list(map(stemmer.stem, lemmatized_words))
stemmed_words

['hello', 'srishti', 'laptop', 'sit', 'pune']

> **POS tagging** assigns grammatical labels to words in a text, such as noun, verb, adjective, etc.

> **Why**: POS tagging provides insights into the syntactic structure of sentences, which can be useful for more complex tasks like named entity recognition and syntactic parsing.

In [None]:
# POS Tagging
# Tag the complete token sequence of the original sentence rather than the
# stop-word-stripped list: the tagger uses neighbouring tokens as context, and
# removing stop words and punctuation first takes that context away.
tagged_words = pos_tag(word_tokenize(text))
tagged_words

[('hello', 'NN'),
 ('srishti', 'JJ'),
 ('laptop', 'JJ'),
 ('sit', 'NN'),
 ('pune', 'NN')]

In [None]:
df = pd.read_csv("Reddit_Data.csv")
df.head()

Unnamed: 0,clean_comment,category
0,family mormon have never tried explain them t...,1
1,buddhism has very much lot compatible with chr...,1
2,seriously don say thing first all they won get...,-1
3,what you have learned yours and only yours wha...,0
4,for your own benefit you may want read living ...,1


In [None]:
# Renaming the column 'clean_comment' to 'comments'
# (rebinding the result instead of inplace=True avoids hidden in-place mutation)
df = df.rename(columns={"clean_comment": "comments"})

In [None]:
df.head()

Unnamed: 0,comments,category
0,family mormon have never tried explain them t...,1
1,buddhism has very much lot compatible with chr...,1
2,seriously don say thing first all they won get...,-1
3,what you have learned yours and only yours wha...,0
4,for your own benefit you may want read living ...,1


In [None]:
# Any missing comment is NaN at this point; a bare astype(str) would turn it into the
# literal text "nan", which would later be tokenized as a real word. Blank it out first.
df["comments"] = df["comments"].fillna("").astype(str)

In [None]:
# Initialize preprocessors
# Built once at module level; preprocess_text below reads stop_words and lemmatizer
# as globals, so they are not re-created for every row.
stop_words = set(stopwords.words('english'))  # set: constant-time membership tests
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [None]:
# Preprocessing function
def preprocess_text(text):
  """Normalise one raw comment into a space-separated string of lemmas.

  Pipeline: tokenize -> keep alphanumeric tokens and lowercase them ->
  drop tokens found in the module-level `stop_words` -> lemmatize with the
  module-level `lemmatizer`.

  Args:
    text: The raw comment as a string.

  Returns:
    The surviving lemmas joined by single spaces (an empty string when no
    token survives the filters).
  """
  # Tokenization
  tokens = word_tokenize(text)

  # Lowercasing and removing punctuation (non-alphanumeric tokens are dropped)
  tokens = [token.lower() for token in tokens if token.isalnum()]

  # Removing stop words
  filtered_words = [token for token in tokens if token not in stop_words]

  # Lemmatization
  # NOTE: the previous version also stemmed every token here, but the stemmed
  # list was never used in the return value, so that pass was pure overhead.
  lemmatized_words = [lemmatizer.lemmatize(word) for word in filtered_words]

  return ' '.join(lemmatized_words)

In [None]:
df["comments"] = df["comments"].apply(preprocess_text)

In [None]:
df["comments"].head()

Unnamed: 0,comments
0,family mormon never tried explain still stare ...
1,buddhism much lot compatible christianity espe...
2,seriously say thing first get complex explain ...
3,learned want teach different focus goal wrappi...
4,benefit may want read living buddha living chr...


In [None]:
# Map the numeric sentiment codes to readable labels.
# The result is assigned back rather than using replace(..., inplace=True) on
# df["category"]: that form mutates an intermediate Series, triggers a
# chained-assignment FutureWarning on pandas 2.x, and leaves df unchanged once
# copy-on-write is the default (pandas 3.0).
df["category"] = df["category"].replace({-1.0: "Negative", 0.0: "Neutral", 1.0: "Positive"})

In [None]:
X = df["comments"].to_numpy()
y = df["category"].to_numpy()

In [None]:
df.head()

Unnamed: 0,comments,category
0,family mormon never tried explain still stare ...,Positive
1,buddhism much lot compatible christianity espe...,Positive
2,seriously say thing first get complex explain ...,Negative
3,learned want teach different focus goal wrappi...,Neutral
4,benefit may want read living buddha living chr...,Positive


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

>**TF-IDF** is a statistical measure used to evaluate the importance of a word in a document relative to a collection of documents (corpus).

> **Term Frequency (TF)**: Measures how frequently a term occurs in a document.    
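> **Inverse Document Frequency (IDF)**: Measures how informative a term is: terms that appear in many documents of the corpus get a low weight, rare terms get a high weight. The TF-IDF score of a term in a document is the product of its TF and its IDF.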

In [None]:
# Learn the vocabulary and IDF weights from the training split only, then
# project both splits with that same fitted vectorizer.
tfif = TfidfVectorizer()
tfif.fit(X_train)
X_train = tfif.transform(X_train)
X_test = tfif.transform(X_test)

In [None]:
model = MultinomialNB()
model.fit(X_train, y_train)

In [None]:
y_pred = model.predict(X_test)

In [None]:
print(f"Accuracy: {accuracy_score(y_test, y_pred)*100:.2f}%")

Accuracy: 54.30%
