<a href="https://www.kaggle.com/code/lestiessam/beginner-in-nlp?scriptVersionId=240489527" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
import nltk

def format_sentence(sent):
    """Convert a sentence into a bag-of-words feature dictionary.

    The sentence is split with ``nltk.word_tokenize`` and every distinct
    token becomes a key mapped to ``True``.
    """
    return dict.fromkeys(nltk.word_tokenize(sent), True)

format_sentence("Life is beautiful so Enjo everymoment you have.")

In [None]:
# Read the positive tweets line by line and pair each feature dict with 'pos'.
pos = []
with open("/kaggle/input/sentimental-analysis-nlp/pos_tweets.txt") as f:
    pos.extend([format_sentence(line), 'pos'] for line in f)

# Inspect the first labelled example.
pos[0]

In [None]:
# Read the negative tweets line by line and pair each feature dict with 'neg'.
neg = []
with open("/kaggle/input/sentimental-analysis-nlp/neg_tweets.txt") as f:
    neg.extend([format_sentence(line), 'neg'] for line in f)

# Inspect the first labelled example.
neg[0]

In [None]:
# The first 90% of each class makes up the training set.
pos_cut = int(0.9 * len(pos))
neg_cut = int(0.9 * len(neg))
training = pos[:pos_cut] + neg[:neg_cut]

In [None]:
# Hold out the last 10% of each class for evaluation. These slices are the
# complement of the first-90% slices used for training, so no test tweet was
# seen while fitting and both 'pos' and 'neg' examples are represented.
test = pos[int(0.9 * len(pos)):] + neg[int(0.9 * len(neg)):]

# Building a Classifier

In [None]:
from nltk.classify import NaiveBayesClassifier

# Fit a Naive Bayes model on the [feature dict, label] pairs in `training`.
classifier = NaiveBayesClassifier.train(training)

In [None]:
# Print the features the trained classifier considers most informative.
classifier.show_most_informative_features()

# Classification

In [None]:
example1 = "this workshop is awesome."
example2 = "This workshop is not good"

# Featurize each sample sentence and print the label the classifier predicts.
for example in (example1, example2):
    print(classifier.classify(format_sentence(example)))

# Accuracy

In [None]:
from nltk.classify.util import accuracy

# Score the classifier against the labelled examples in `test`.
print(accuracy(classifier, test))

In [None]:
import re
# The pattern is lowercase and compiled without re.IGNORECASE, so matching
# it against the capitalised 'Python' fails and False is printed.
re1 = re.compile('python')
print(re1.match('Python') is not None)

# NLTK Parts of Speech Tagger

In [None]:
# Fetch the English averaged-perceptron tagger resource (presumably the model
# nltk.pos_tag loads in the next cell).
nltk.download('averaged_perceptron_tagger_eng')

In [None]:
import nltk 

# Tokenize the sentence, then attach a part-of-speech tag to every token.
text = nltk.word_tokenize("Python is an awesome language!")
nltk.pos_tag(text)

In [None]:
import nltk
# Download the tagset help resources, then print the help entry for the
# 'JJ' tag.
nltk.download('tagsets_json')
nltk.download('tagsets')
nltk.help.upenn_tagset('JJ')

In [None]:
from nltk.corpus import brown

# Tagged and untagged views of the Brown corpus 'news' category.
brown_tagged_sents = brown.tagged_sents(categories='news')
brown_sents = brown.sents(categories='news')
# Train a unigram tagger on the tagged sentences and tag sentence 2007.
# That sentence comes from the same 'news' category the tagger was trained on.
unigram_tagger = nltk.UnigramTagger(brown_tagged_sents)
unigram_tagger.tag(brown_sents[2007])

# Bigram Models

In [None]:
# Train a bigram tagger on the same tagged 'news' sentences and tag
# sentence 2007 for comparison with the unigram result.
bigram_tagger = nltk.BigramTagger(brown_tagged_sents)
bigram_tagger.tag(brown_sents[2007])

# Normalizing Text

In [None]:
raw = "OMG, Natural Language Processing is so cool and I'm really enjoying this workshop!"
# Tokenize the raw text and lowercase every token in a single pass.
tokens = [token.lower() for token in nltk.word_tokenize(raw)]
tokens

# Stemming

In [None]:
# Stem every normalized token with the Lancaster stemmer.
lancaster = nltk.LancasterStemmer()
stems = list(map(lancaster.stem, tokens))
stems

In [None]:
# Stem the same tokens with the Porter stemmer for comparison.
porter = nltk.PorterStemmer()
stem = list(map(porter.stem, tokens))
stem

# Lemmatization

In [None]:
from nltk import WordNetLemmatizer

# Lemmatize each lowercased, whitespace-separated word of the sentence.
lemma = WordNetLemmatizer()
text = "Women in technology are amazing at coding"
ex = text.lower().split()
lemmas = list(map(lemma.lemmatize, ex))
lemmas

# Intermediate Natural Language Processing

In [None]:
from nltk.corpus import wordnet as wn

In [None]:
# List the WordNet synsets found for the word 'motorcar'.
print(wn.synsets('motorcar'))

In [None]:
# Print the lemma names that belong to the synset 'car.n.01'.
print(wn.synset('car.n.01').lemma_names())

In [None]:
# Print the definition text of the synset 'car.n.01'.
print(wn.synset('car.n.01').definition())

# SentiWordNet

In [None]:
from nltk.corpus import sentiwordnet as swn
# Look up the SentiWordNet entry for the synset 'car.n.03'.
# NOTE(review): the variable is named `cat` but the synset requested is
# 'car.n.03' — confirm which word was intended.
cat = swn.senti_synset('car.n.03')

In [None]:
# Positivity score of the selected senti-synset.
cat.pos_score()

In [None]:
# Negativity score of the selected senti-synset.
cat.neg_score()

In [None]:
# Objectivity score of the selected senti-synset.
cat.obj_score()

In [None]:
# Show the senti-synset's representation string. Use the built-in repr():
# unicode_repr() was a Python 2 compatibility helper and is not defined on
# SentiSynset in current NLTK releases, where calling it raises AttributeError.
repr(cat)

# Chunking

In [None]:
import nltk
# A sentence that is already tokenized and tagged, as (word, POS tag) pairs.
sentence = [("the", "DT"), ("little", "JJ"), ("yellow", "JJ"), ("dog", "NN"), ("barked", "VBD"), ("at", "IN"), ("the", "DT"), ("cat", "NN")]

In [None]:
# Chunk grammar: an NP is an optional determiner, then any number of
# adjectives, then a noun.
pattern = "NP: {<DT>?<JJ>*<NN>}"

In [None]:
# Build a regular-expression chunk parser from the NP grammar.
NPChunker = nltk.RegexpParser(pattern)

In [None]:
# Apply the chunker to the tagged sentence.
result = NPChunker.parse(sentence)

In [None]:
# Display the chunked sentence. The bare attribute `result.draw` never called
# the method, so nothing was rendered; draw() itself opens a Tk window, which
# needs a GUI display. Printing the parse result shows the same structure as
# bracketed text.
print(result)

# Named Entity Extraction

In [None]:
import spacy 
import pandas as pd

In [None]:
!python -m spacy download en_core_web_sm

In [None]:
# Load the 'en_core_web_sm' spaCy pipeline.
nlp = spacy.load('en_core_web_sm')

In [None]:
review = "Columbia University was founded in 1754 as King's College by royal charter of King George II of England. It is the oldest institution of higher learning in the state of New York and the fifth oldest in the United States. Controversy preceded the founding of the College, with various groups competing to determine its location and religious affiliation. Advocates of New York City met with success on the first point, while the Anglicans prevailed on the latter. However, all constituencies agreed to commit themselves to principles of religious liberty in establishing the policies of the College. In July 1754, Samuel Johnson held the first classes in a new schoolhouse adjoining Trinity Church, located on what is now lower Broadway in Manhattan. There were eight students in the class. At King's College, the future leaders of colonial society could receive an education designed to 'enlarge the Mind, improve the Understanding, polish the whole Man, and qualify them to support the brightest Characters in all the elevated stations in life.'' One early manifestation of the institution's lofty goals was the establishment in 1767 of the first American medical school to grant the M.D. degree."

In [None]:
# Run the spaCy pipeline over the review text.
doc = nlp(review)

In [None]:
# Collect the verbatim text of every sentence spaCy detected.
sentences = [span.orth_ for span in doc.sents]
sentence_count = len(sentences)
print(f"There were {sentence_count} sentences found.")

In [None]:
# Pair each noun chunk's text with the text of the head of its root token.
nounphrases = [[chunk.text, chunk.root.head.orth_] for chunk in doc.noun_chunks]
# The message previously read "noun phases"; corrected to "noun phrases".
print(f"There were {len(nounphrases)} noun phrases found.")

In [None]:
# Materialise the named entities found in the document as a list.
entities = [*doc.ents]
entity_total = len(entities)
print(f"There were {entity_total} entities found")

In [None]:
# Keep only the entities labelled as organisations or people and tabulate them.
wanted_labels = ('ORG', 'PERSON')
orgs_and_people = [ent.text for ent in entities if ent.label_ in wanted_labels]
pd.DataFrame(orgs_and_people)

In [None]:
import nltk
import re
content = "Starbucks has not been doing well lately"

In [None]:
# Tokenize the sentence and tag each token with its part of speech.
tokenized = nltk.word_tokenize(content)
tagged = nltk.pos_tag(tokenized)
print(tagged)

In [None]:
# Fetch the named-entity chunker resource (presumably the model nltk.ne_chunk
# loads in the next cell).
nltk.download('maxent_ne_chunker_tab')

In [None]:
# Group the tagged tokens into named-entity chunks.
namedEnt = nltk.ne_chunk(tagged)
# Display the result. The bare attribute `namedEnt.draw` never called the
# method, and draw() itself opens a Tk window that needs a GUI display, so
# print the chunk structure as bracketed text instead.
print(namedEnt)

In [None]:
# Filler pattern: text containing the word "in", unless that "in" is followed
# later by something ending in "ing" (negative lookahead).
IN = re.compile(r'.*\bin\b(?!\b.+ing)')
# The loop variable is named `ieer_doc` rather than `doc` so this cell does
# not overwrite the spaCy `doc` object created earlier in the notebook.
for ieer_doc in nltk.corpus.ieer.parsed_docs('NYT_19980315'):
    # Extract ORG-LOC pairs whose connecting text matches the IN pattern.
    for rel in nltk.sem.relextract.extract_rels('ORG', 'LOC', ieer_doc, corpus='ieer', pattern=IN):
        print(nltk.sem.relextract.rtuple(rel))

# Sentiment Analysis

In [None]:
import urllib.request

In [None]:
# Locations of the unlabelled test texts and the labelled training data.
# NOTE(review): the two files live under different dataset directories
# ('sentiment' vs 'sentimental-analysis-nlp') — confirm both are attached.
test_file = '/kaggle/input/sentiment/test_data.csv'
train_file = '/kaggle/input/sentimental-analysis-nlp/train_data.csv'

In [None]:
import pandas as pd

# Load the test texts: tab-delimited, no header row, and quoting=3
# (csv.QUOTE_NONE) so quote characters are kept as literal text.
test_data_df = pd.read_csv(test_file, header=None, delimiter='\t', quoting=3)
# Name the single column.
test_data_df.columns = ['Text']

In [None]:
# Load the training data with the same parsing options: tab-delimited, no
# header row, quote characters kept as literal text.
train_data_df = pd.read_csv(train_file, header=None, delimiter='\t', quoting=3)
# First column is the label, second the text.
train_data_df.columns = ["Sentiment","Text"]

In [None]:
# Preview the first rows of the test data.
test_data_df.head()

In [None]:
# Preview the first rows of the training data.
train_data_df.head()

**Preparing the Data**

In [None]:
import re
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.porter import PorterStemmer

In [None]:
# Module-level Porter stemmer instance; tokenize() passes it to stem_tokens().
stemmer = PorterStemmer()
def stem_tokens(tokens, stemmer):
    """Return a list holding the stem of every token, in input order.

    ``stemmer`` is any object exposing a ``stem(token)`` method.
    """
    return list(map(stemmer.stem, tokens))

In [None]:
def tokenize(text):
    """Split ``text`` into stemmed word tokens.

    Every character that is not an ASCII letter is replaced by a space
    before tokenizing, so digits and punctuation never reach the stemmer.
    """
    letters_only = re.sub("[^a-zA-Z]", " ", text)
    return stem_tokens(nltk.word_tokenize(letters_only), stemmer)

In [None]:
# Bag-of-words vectorizer. Tokenization is delegated to tokenize(), which
# strips non-letters and stems; input is lowercased, the built-in English
# stop-word list is applied, and the vocabulary is capped at 85 features.
vectorizer = CountVectorizer(
    analyzer = 'word',
    tokenizer = tokenize,
    lowercase = True,
    stop_words = 'english',
    max_features = 85
)

In [None]:
# Learn the vocabulary and build the count matrix over the training texts
# followed by the test texts, so the first len(train_data_df) rows are the
# training rows and the remainder are the test rows.
# NOTE(review): the vocabulary is fitted on the test text as well — confirm
# this is intended.
features = vectorizer.fit_transform(
    train_data_df.Text.tolist() + test_data_df.Text.tolist()
)

In [None]:
# Convert the feature matrix into a dense array.
features_nd = features.toarray()

In [None]:
from sklearn.model_selection import train_test_split

# Split the labelled rows 85/15 into training and validation parts, with a
# fixed seed so the split is reproducible.
n_train = len(train_data_df)
X_train, X_test, y_train, y_test = train_test_split(
    features_nd[:n_train],
    train_data_df.Sentiment,
    train_size=0.85,
    random_state=1234,
)

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression classifier with default settings.
log_model = LogisticRegression()

In [None]:
# Fit the model on the training part of the split.
log_model = log_model.fit(X=X_train, y=y_train)

In [None]:
# Predict labels for the validation part of the split.
y_pred = log_model.predict(X_test)

In [None]:
# Display the predicted labels.
y_pred

**Accuracy**

In [None]:
from sklearn.metrics import classification_report
# Print the classification report comparing the true validation labels with
# the predictions.
print(classification_report(y_test, y_pred))

**Retraining**

In [None]:
# Refit a fresh model on every labelled row, then predict the unlabelled test
# rows that follow them in the feature matrix.
n_train = len(train_data_df)
log_model = LogisticRegression().fit(X=features_nd[:n_train], y=train_data_df.Sentiment)
test_pred = log_model.predict(features_nd[n_train:])

In [None]:
# Display the predictions for the test rows.
test_pred

In [None]:
import random

# Print up to 50 randomly chosen test texts next to their predicted sentiment.
# The sample size is capped because random.sample raises ValueError when asked
# for more items than the population holds.
sample_size = min(50, len(test_pred))
spl = random.sample(range(len(test_pred)), sample_size)
# `spl` holds row positions, so select the texts positionally with .iloc to
# match the positional indexing used on test_pred.
for text, sentiment in zip(test_data_df.Text.iloc[spl], test_pred[spl]):
    print(sentiment, text)