In [None]:
import pandas as pd
import numpy as np
import nltk
import re
from nltk.tokenize import word_tokenize, sent_tokenize, TweetTokenizer, RegexpTokenizer
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
import string


In [None]:
# Fetch the NLTK resources this notebook relies on: the Punkt tokenizer data
# ('punkt', 'punkt_tab'), the stop-word lists, and WordNet for the lemmatizer.
# The last call is the cell's final expression, so its return value is shown.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Sample text for the vectorizer demos: two paragraphs about Dubai. The
# bracketed citation markers ([11], [12], [13]) are part of the raw string.
# NOTE(review): the source of this text is not recorded — add provenance.
para="""
Founded in the early 18th century as a pearling and fishing settlement, Dubai became a regional trade hub in the 20th century after declaring itself a free port (1901) and extending the Creek (1961).[11] Modest oil revenue helped accelerate Dubai's development from the 1960s to the 1990s when the city started to diversify its economy.[11] In 2018, oil production contributed less than 1% to the emirate's GDP.[12]

Rapid construction since the 1990s has produced one of the world's densest skylines,[13] including the world's tallest building, the Burj Khalifa. Extensive land-reclamation projects have added more than 300 kilometres (190 mi) of artificial coastline. The city has a large real estate market, especially in the luxury segment.
"""

In [None]:
# print() renders the embedded newlines; a bare `para` would display the
# escaped repr ('\n...') instead.
print(para)


Founded in the early 18th century as a pearling and fishing settlement, Dubai became a regional trade hub in the 20th century after declaring itself a free port (1901) and extending the Creek (1961).[11] Modest oil revenue helped accelerate Dubai's development from the 1960s to the 1990s when the city started to diversify its economy.[11] In 2018, oil production contributed less than 1% to the emirate's GDP.[12]

Rapid construction since the 1990s has produced one of the world's densest skylines,[13] including the world's tallest building, the Burj Khalifa. Extensive land-reclamation projects have added more than 300 kilometres (190 mi) of artificial coastline. The city has a large real estate market, especially in the luxury segment.



In [None]:
# Split the paragraph into sentences. Because each citation marker follows the
# full stop, the recorded output shows it at the start of the NEXT sentence
# (e.g. "[11] Modest oil revenue ...") rather than on the one it annotates.
sentences=sent_tokenize(para)
sentences

['\nFounded in the early 18th century as a pearling and fishing settlement, Dubai became a regional trade hub in the 20th century after declaring itself a free port (1901) and extending the Creek (1961).',
 "[11] Modest oil revenue helped accelerate Dubai's development from the 1960s to the 1990s when the city started to diversify its economy.",
 "[11] In 2018, oil production contributed less than 1% to the emirate's GDP.",
 "[12]\n\nRapid construction since the 1990s has produced one of the world's densest skylines,[13] including the world's tallest building, the Burj Khalifa.",
 'Extensive land-reclamation projects have added more than 300 kilometres (190 mi) of artificial coastline.',
 'The city has a large real estate market, especially in the luxury segment.']

In [None]:
# One WordNetLemmatizer instance, shared by the corpus-cleaning loop and by
# stemmer() further down.
lemmatizer=WordNetLemmatizer()

In [None]:
# Build the cleaned corpus: one normalized string per sentence.
# The stop-word set is created once up front; building it inside the
# comprehension would re-read the list for every single word.
english_stopwords=set(stopwords.words('english'))

corpus=[]
for sentence in sentences:
  # Replace every non-letter with a space, lower-case, and split on whitespace.
  # Digits are stripped too, so "18th" leaves the fragment "th" behind.
  words=re.sub('[^a-zA-Z]',' ',sentence).lower().split()
  # Drop stop words, then lemmatize what remains.
  lemmas=[lemmatizer.lemmatize(word) for word in words if word not in english_stopwords]
  corpus.append(' '.join(lemmas))

In [None]:
# Inspect the cleaned corpus: one space-joined string of lemmas per sentence.
corpus

['founded early th century pearling fishing settlement dubai became regional trade hub th century declaring free port extending creek',
 'modest oil revenue helped accelerate dubai development city started diversify economy',
 'oil production contributed less emirate gdp',
 'rapid construction since produced one world densest skyline including world tallest building burj khalifa',
 'extensive land reclamation project added kilometre mi artificial coastline',
 'city large real estate market especially luxury segment']

**Count Vectorizer**

In [None]:
# ngram_range=(1,2): count single words (unigrams) and adjacent word pairs
# (bigrams), e.g. both 'accelerate' and 'accelerate dubai'.
cv=CountVectorizer(ngram_range=(1,2))

In [None]:
# Learn the n-gram vocabulary from corpus and build the sentence-by-feature
# count matrix; then show the dense count row for the first sentence.
x=cv.fit_transform(corpus)
x[0].toarray()

array([[0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 2, 1, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
        1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 2, 2, 1, 1, 0, 0, 0]])

In [None]:
# N-gram for each column of x, in column order.
cv.get_feature_names_out()

array(['accelerate', 'accelerate dubai', 'added', 'added kilometre',
       'artificial', 'artificial coastline', 'became', 'became regional',
       'building', 'building burj', 'burj', 'burj khalifa', 'century',
       'century declaring', 'century pearling', 'city', 'city large',
       'city started', 'coastline', 'construction', 'construction since',
       'contributed', 'contributed less', 'creek', 'declaring',
       'declaring free', 'densest', 'densest skyline', 'development',
       'development city', 'diversify', 'diversify economy', 'dubai',
       'dubai became', 'dubai development', 'early', 'early th',
       'economy', 'emirate', 'emirate gdp', 'especially',
       'especially luxury', 'estate', 'estate market', 'extending',
       'extending creek', 'extensive', 'extensive land', 'fishing',
       'fishing settlement', 'founded', 'founded early', 'free',
       'free port', 'gdp', 'helped', 'helped accelerate', 'hub', 'hub th',
       'including', 'including world', 

In [None]:
# (number of sentences, number of n-gram features)
x.shape

(6, 121)

In [None]:
# Densify the count matrix into a labelled table: one row per sentence, one
# column per n-gram, cell = number of occurrences.
text_feature=pd.DataFrame(x.toarray(), columns=cv.get_feature_names_out())
text_feature

Unnamed: 0,accelerate,accelerate dubai,added,added kilometre,artificial,artificial coastline,became,became regional,building,building burj,...,started diversify,tallest,tallest building,th,th century,trade,trade hub,world,world densest,world tallest
0,0,0,0,0,0,0,1,1,0,0,...,0,0,0,2,2,1,1,0,0,0
1,1,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,1,1,...,0,1,1,0,0,0,0,2,1,1
4,0,0,1,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# A single row sliced from the matrix stays 2-D: (1, number of features).
(x[0].toarray()).shape

(1, 121)

In [None]:

# Module-level copies of NLTK's English stop-word list and the ASCII
# punctuation characters. rem_stopwords_punctuation() below builds its own
# local values and does not read these two names.
stopword=stopwords.words('english')

punctuation=string.punctuation
def rem_stopwords_punctuation(text):
  """Lower-case ``text`` and drop English stop words and punctuation tokens.

  The text is tokenized with ``word_tokenize``. A token is kept only if it is
  not in NLTK's English stop-word list and is not made up entirely of
  ``string.punctuation`` characters (so ",", "..." and "``" are all removed,
  while "land-reclamation" and "'s" are kept).

  Args:
    text: Raw input string.

  Returns:
    The surviving tokens joined with single spaces ('' for empty input).
  """
  # Sets give constant-time membership tests. The text is lower-cased first so
  # capitalized words such as "The" can match the stop-word entries.
  stopword=set(stopwords.words('english'))
  punctuation=set(string.punctuation)
  tokens=word_tokenize(text.lower())
  # Apply both filters in a single pass. Filtering `tokens` twice into the same
  # variable would let the second result overwrite the first and silently
  # bring the stop words back.
  cleaned_text=[
    word for word in tokens
    if word not in stopword
    and not all(char in punctuation for char in word)
  ]
  return ' '.join(cleaned_text)

In [None]:
# Run the cleaner over the whole paragraph (not sentence by sentence).
new=rem_stopwords_punctuation (para)

In [None]:
# Inspect the cleaned paragraph string.
new

"founded in the early 18th century as a pearling and fishing settlement dubai became a regional trade hub in the 20th century after declaring itself a free port 1901 and extending the creek 1961 11 modest oil revenue helped accelerate dubai 's development from the 1960s to the 1990s when the city started to diversify its economy 11 in 2018 oil production contributed less than 1 to the emirate 's gdp 12 rapid construction since the 1990s has produced one of the world 's densest skylines 13 including the world 's tallest building the burj khalifa extensive land-reclamation projects have added more than 300 kilometres 190 mi of artificial coastline the city has a large real estate market especially in the luxury segment"

In [None]:
def stemmer(text):
  """Lemmatize every word of ``text`` and return the result as one string.

  Despite its name, this function applies the notebook's shared ``lemmatizer``
  rather than a stemmer. The text is split into sentences with
  ``sent_tokenize`` and each sentence into words with ``word_tokenize``.

  Args:
    text: Input string.

  Returns:
    The lemmatized tokens of all sentences, in order, joined with single
    spaces ('' for empty input).
  """
  # Collect the lemmas so the caller actually receives them; calling
  # lemmatize() without keeping its return value has no effect.
  lemmas=[
    lemmatizer.lemmatize(word)
    for sent in sent_tokenize(text)
    for word in word_tokenize(sent)
  ]
  return ' '.join(lemmas)

In [None]:
# Run the lemmatization pass over the cleaned paragraph.
stemmer(new)

**TF-IDF Vectorizer**

In [None]:
# NOTE(review): scratch calculation with unexplained constants; log(5/6) is
# negative. Presumably a hand check of an IDF-style term — confirm which
# formula is intended. scikit-learn's default smoothed IDF is
# ln((1 + n) / (1 + df)) + 1, which is never negative.
np.log((5/6))

np.float64(-0.1823215567939546)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Same unigram + bigram setup as the CountVectorizer above, but producing
# TF-IDF weights instead of raw counts.
tf_vect=TfidfVectorizer(ngram_range=(1,2))

In [None]:
# Fit on the same cleaned corpus. This rebinds `x`, so the CountVectorizer
# matrix created earlier is no longer reachable under that name.
x=tf_vect.fit_transform(corpus)
x.toarray()

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.15308278, 0.15308278, 0.        , 0.        ,
        0.        , 0.        , 0.30616556, 0.15308278, 0.15308278,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.15308278, 0.15308278,
        0.15308278, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.12553007, 0.15308278, 0.        ,
        0.15308278, 0.15308278, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.15308278,
        0.15308278, 0.        , 0.        , 0.15308278, 0.15308278,
        0.15308278, 0.15308278, 0.15308278, 0.15308278, 0.        ,
        0.        , 0.        , 0.15308278, 0.15308278, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.  

In [None]:
# Mapping from each n-gram to its column index in the TF-IDF matrix
# (e.g. 'accelerate' -> 0).
tf_vect.vocabulary_

{'founded': 50,
 'early': 35,
 'th': 114,
 'century': 12,
 'pearling': 83,
 'fishing': 48,
 'settlement': 104,
 'dubai': 32,
 'became': 6,
 'regional': 99,
 'trade': 116,
 'hub': 57,
 'declaring': 24,
 'free': 52,
 'port': 85,
 'extending': 44,
 'creek': 23,
 'founded early': 51,
 'early th': 36,
 'th century': 115,
 'century pearling': 14,
 'pearling fishing': 84,
 'fishing settlement': 49,
 'settlement dubai': 105,
 'dubai became': 33,
 'became regional': 7,
 'regional trade': 100,
 'trade hub': 117,
 'hub th': 58,
 'century declaring': 13,
 'declaring free': 25,
 'free port': 53,
 'port extending': 86,
 'extending creek': 45,
 'modest': 76,
 'oil': 78,
 'revenue': 101,
 'helped': 55,
 'accelerate': 0,
 'development': 28,
 'city': 15,
 'started': 110,
 'diversify': 30,
 'economy': 37,
 'modest oil': 77,
 'oil revenue': 80,
 'revenue helped': 102,
 'helped accelerate': 56,
 'accelerate dubai': 1,
 'dubai development': 34,
 'development city': 29,
 'city started': 17,
 'started diversi

In [None]:
# N-gram for each TF-IDF column, in column order.
tf_vect.get_feature_names_out()

array(['accelerate', 'accelerate dubai', 'added', 'added kilometre',
       'artificial', 'artificial coastline', 'became', 'became regional',
       'building', 'building burj', 'burj', 'burj khalifa', 'century',
       'century declaring', 'century pearling', 'city', 'city large',
       'city started', 'coastline', 'construction', 'construction since',
       'contributed', 'contributed less', 'creek', 'declaring',
       'declaring free', 'densest', 'densest skyline', 'development',
       'development city', 'diversify', 'diversify economy', 'dubai',
       'dubai became', 'dubai development', 'early', 'early th',
       'economy', 'emirate', 'emirate gdp', 'especially',
       'especially luxury', 'estate', 'estate market', 'extending',
       'extending creek', 'extensive', 'extensive land', 'fishing',
       'fishing settlement', 'founded', 'founded early', 'free',
       'free port', 'gdp', 'helped', 'helped accelerate', 'hub', 'hub th',
       'including', 'including world', 