In [1]:
pip install fuzzywuzzy

Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0
Note: you may need to restart the kernel to use updated packages.


# 1. Sentiment Analysis

In [1]:
# Sentiment Analysis using IMDb dataset
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

In [3]:
# Read the IMDb reviews CSV into a DataFrame and preview the first rows.
# NOTE: the preview shows an extra 'Unnamed: 0' column coming from the file;
# only the 'review' and 'sentiment' columns are used further down.
imdb_data = pd.read_csv(filepath_or_buffer='NLP dataset/IMDB Dataset.csv')
imdb_data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
# Hold out 20% of the reviews for evaluation; the fixed random_state keeps
# the split identical from run to run.
train_data, test_data, train_labels, test_labels = train_test_split(
    imdb_data['review'],
    imdb_data['sentiment'],
    test_size=0.2,
    random_state=42,
)

In [5]:
# Turn the raw review text into token-count vectors.
# The vocabulary is fitted on the training reviews only; the test reviews are
# then encoded with that same fitted vectorizer.
vectorizer = CountVectorizer()
train_vectors = vectorizer.fit_transform(raw_documents=train_data)
test_vectors = vectorizer.transform(raw_documents=test_data)

In [6]:
# Fit a multinomial Naive Bayes model on the training count vectors and
# their sentiment labels.
classifier = MultinomialNB()
classifier.fit(X=train_vectors, y=train_labels)

In [7]:
# Run the trained classifier over the held-out review vectors.
predictions = classifier.predict(X=test_vectors)

In [9]:
# Display the predicted sentiment labels for the test set
# (the notebook abbreviates the array when it is shown).
predictions

array(['positive', 'positive', 'negative', ..., 'negative', 'negative',
       'negative'], dtype='<U8')

In [8]:
# Compare the predicted labels against the true test labels and report
# the accuracy score.
accuracy = accuracy_score(y_true=test_labels, y_pred=predictions)
print(f'Sentiment Analysis Accuracy: {accuracy}')

Sentiment Analysis Accuracy: 0.8488


### Prediction on Unseen Text

In [12]:
# Two hand-written reviews that are not part of the dataset.
unseen_data = [
    "I loved the movie! The plot was amazing.",
    "The acting was terrible, and the movie was boring.",
]

# Encode them with the vectorizer that was already fitted on the training
# reviews (transform only, no re-fitting).
unseen_vectors = vectorizer.transform(raw_documents=unseen_data)

In [13]:
# Classify the hand-written reviews.
# NOTE: this rebinds `predictions`, which previously held the test-set
# predictions used by the accuracy cell.
predictions = classifier.predict(X=unseen_vectors)

In [17]:
# Print each hand-written review (first 50 characters) next to its
# predicted sentiment, followed by a blank separator line.
for review, predicted_label in zip(unseen_data, predictions):
    if predicted_label == 'positive':
        sentiment = 'Positive'
    else:
        sentiment = 'Negative'
    print(f'Review: {review[:50]}... \nPredicted Sentiment: {sentiment}')
    print()

Review: I loved the movie! The plot was amazing.... 
Predicted Sentiment: Positive

Review: The acting was terrible, and the movie was boring.... 
Predicted Sentiment: Negative



# 2. Text Classification

In [18]:
# Text Classification using 20 Newsgroups dataset
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [19]:
# Load the 20 Newsgroups corpus with subset='all', asking the loader to
# remove headers, footers and quotes from every post.
newsgroups_data = fetch_20newsgroups(
    subset='all',
    remove=('headers', 'footers', 'quotes'),
)


In [20]:
# Hold out 20% of the posts for evaluation with a fixed seed.
# NOTE: these names were also used for the IMDb split; from here on they
# refer to the newsgroups data.
train_data, test_data, train_labels, test_labels = train_test_split(
    newsgroups_data.data,
    newsgroups_data.target,
    test_size=0.2,
    random_state=42,
)


In [21]:
# Encode the posts as TF-IDF vectors: fit on the training posts only, then
# apply the fitted vectorizer to the test posts.
# NOTE: `vectorizer` and the *_vectors names are rebound here, replacing the
# count-based objects from the sentiment section.
vectorizer = TfidfVectorizer()
train_vectors = vectorizer.fit_transform(raw_documents=train_data)
test_vectors = vectorizer.transform(raw_documents=test_data)

In [22]:
# Fit a support vector classifier (SVC with its default settings) on the
# TF-IDF training vectors.
# NOTE: `classifier` is rebound here, replacing the Naive Bayes model.
classifier = SVC()
classifier.fit(X=train_vectors, y=train_labels)


In [27]:
# Predict a category id for every held-out post, then display the
# resulting array of integer ids.
predictions = classifier.predict(X=test_vectors)
predictions

array([ 9, 12, 14, ...,  0,  0, 12])

In [24]:
# Score the predicted category ids against the true test labels and
# report the accuracy.
accuracy = accuracy_score(y_true=test_labels, y_pred=predictions)
print(f'Text Classification Accuracy: {accuracy}')

Text Classification Accuracy: 0.716710875331565


In [29]:
# Display predicted categories for a sample of the test documents.
# Looping over every document in the test split floods the notebook with
# output, so only the first MAX_PREVIEW_DOCS predictions are printed.
MAX_PREVIEW_DOCS = 10
for document, label_index in zip(test_data[:MAX_PREVIEW_DOCS], predictions[:MAX_PREVIEW_DOCS]):
    # `predictions` holds integer category ids; look up the newsgroup name.
    category = newsgroups_data.target_names[label_index]
    # Show only the first 50 characters of each document as a preview.
    print(f'Document: {document[:50]}... \nPredicted Category: {category}')
    print()


Document: 


	The runner can leave his base at any time.  If... 
Predicted Category: rec.sport.baseball

Document: 
Well, it's not an FTP site, but I got an 800 numb... 
Predicted Category: sci.electronics

Document: Hi,
    I was reading through "The Spaceflight Han... 
Predicted Category: sci.space

Document: I was a graduate student in the early 1980s, and w... 
Predicted Category: talk.politics.misc

Document: FREE-ENERGY TECHNOLOGY
                       by R... 
Predicted Category: sci.space

Document: but whoever listens to me will live in safety and ... 
Predicted Category: rec.motorcycles

Document: The xgolf program was an April Fool's joke <sigh>.... 
Predicted Category: sci.space

Document: 
We also cannot fail to note the intense suffering... 
Predicted Category: soc.religion.christian

Document: The chemicals are gone Thanks for all the response... 
Predicted Category: sci.electronics

Document: 
... 
Predicted Category: rec.autos

Document: 

Maybe I should point out tha

# 