# NLP

### import required packages

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk

### load the data

In [34]:
reviews = [
    "this is a good restaurant. when I reached here, the staff greeted me with smile.",
    "the service here is not so good",
    "the staff is of helping nature",
    "this restaurant is worst",
    "very very bad",
    "these restaurants are recommended"
]

### pre-processing

#### sentence tokenization

In [35]:
from nltk import sent_tokenize

sentences = []
for review in reviews:
    sentences.extend(sent_tokenize(review))
sentences

['this is a good restaurant.',
 'when I reached here, the staff greeted me with smile.',
 'the service here is not so good',
 'the staff is of helping nature',
 'this restaurant is worst',
 'very very bad',
 'these restaurants are recommended']

#### word tokenization

In [8]:
from nltk.tokenize import word_tokenize

all_words = []
for sentence in sentences:
    words = word_tokenize(sentence)
    all_words.extend([word.lower() for word in words])
len(all_words)

38

In [10]:
def get_counts(words):
    count_dictionary = {}
    for word in words:
        if count_dictionary.get(word) is None:
            count_dictionary[word] = 1
        else:
            count_dictionary[word] += 1
    print(count_dictionary)

get_counts(all_words)

{'this': 2, 'is': 4, 'a': 1, 'good': 2, 'restaurant': 2, '.': 2, 'when': 1, 'i': 1, 'reached': 1, 'here': 2, ',': 1, 'the': 3, 'staff': 2, 'greeted': 1, 'me': 1, 'with': 1, 'smile': 1, 'service': 1, 'not': 1, 'so': 1, 'of': 1, 'helping': 1, 'nature': 1, 'worst': 1, 'very': 2, 'bad': 1}


#### removing stop words

In [12]:
from nltk.corpus import stopwords

# get the pre-defined stopwords from english language
stop_words_english = stopwords.words('english')
len(stop_words_english)

179

In [15]:
# get the pre-defined stopwords from  arabic
stop_words_arabic = stopwords.words('arabic')
len(stop_words_arabic)

754

In [19]:
# get the pre-defined stopwords from germany
stop_words_germany = stopwords.words('german')
len(stop_words_germany)

232

In [42]:
from nltk.tokenize import word_tokenize

# remove the stop words
all_words = []

# remove the special symbols
words_to_remove = [',', '.', '//', '?', '!', '@', '#', '$', '%', '^', '*', "'", '"']
words_to_remove.extend(stop_words_english)

for sentence in sentences:
    words = word_tokenize(sentence.lower())
    all_words.extend([word.strip() for word in words if word.strip() not in words_to_remove])
    
all_words = list(set(all_words))
all_words

['recommended',
 'restaurant',
 'restaurants',
 'worst',
 'nature',
 'service',
 'smile',
 'bad',
 'good',
 'greeted',
 'helping',
 'staff',
 'reached']

#### lemmatization

In [43]:
from nltk import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# create a lemmatizer
lemmatizer = WordNetLemmatizer()

# remove the stop words
all_words = []

# remove the special symbols
words_to_remove = [',', '.', '//', '?', '!', '@', '#', '$', '%', '^', '*', "'", '"']
words_to_remove.extend(stop_words_english)

for sentence in sentences:
    words = word_tokenize(sentence.lower())
    words = [lemmatizer.lemmatize(word.strip()) for word in words if word.strip() not in words_to_remove]
    all_words.extend(set(words))

# find the unique words
all_words = list(set(all_words))
len(all_words)

12