<a href="https://colab.research.google.com/github/poojakanchala/Data-Science/blob/main/NLP_Basics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [17]:
import pandas as pd

# Sample product reviews containing emoji and punctuation, each with a sentiment label.
# The emoji are written as real Unicode characters (not mis-decoded bytes) so that the
# later emoji-to-text step has actual emoji to convert.
data = {
    "review": [
        "Absolutely loved this product! 😍 Works perfectly & exceeded expectations!",
        "Worst purchase ever... 😡 Totally disappointed & waste of money!!!",
        "Great value for the price 👍 but packaging was a bit damaged 😕"
    ],
    "sentiment": ["positive", "negative", "positive"]
}

# Build the working DataFrame: one row per review
df = pd.DataFrame(data)
df

Unnamed: 0,review,sentiment
0,Absolutely loved this product! üòç Works perfect...,positive
1,Worst purchase ever... üò° Totally disappointed ...,negative
2,Great value for the price üëç but packaging was ...,positive


**Data Preprocessing Pipeline**

In [18]:
#### Preprocessing

## Step 1: lower-case every review so the later steps are case-insensitive
lowercased_reviews = df['review'].str.lower()
df['lowercase'] = lowercased_reviews
df.head()

Unnamed: 0,review,sentiment,lowercase
0,Absolutely loved this product! üòç Works perfect...,positive,absolutely loved this product! üòç works perfect...
1,Worst purchase ever... üò° Totally disappointed ...,negative,worst purchase ever... üò° totally disappointed ...
2,Great value for the price üëç but packaging was ...,positive,great value for the price üëç but packaging was ...


In [19]:
## Step 2: strip everything that is not a letter or whitespace
import re

# Anything other than ASCII letters and whitespace (digits, punctuation, emoji, '&', ...)
NON_LETTER_PATTERN = re.compile(r'[^a-zA-Z\s]')
# Runs of whitespace, including the gaps left behind where symbols were removed
WHITESPACE_RUN_PATTERN = re.compile(r'\s+')

# The vectorised .str accessor passes missing values through as NaN instead of
# raising the way re.sub does when it is handed a non-string.
df['clean_specials'] = (
    df['lowercase']
    .str.replace(NON_LETTER_PATTERN, '', regex=True)
    .str.replace(WHITESPACE_RUN_PATTERN, ' ', regex=True)
    .str.strip()
)
df

Unnamed: 0,review,sentiment,lowercase,clean_specials
0,Absolutely loved this product! üòç Works perfect...,positive,absolutely loved this product! üòç works perfect...,absolutely loved this product works perfectly...
1,Worst purchase ever... üò° Totally disappointed ...,negative,worst purchase ever... üò° totally disappointed ...,worst purchase ever totally disappointed was...
2,Great value for the price üëç but packaging was ...,positive,great value for the price üëç but packaging was ...,great value for the price but packaging was a...


In [20]:
## Step 3: split each cleaned review into a list of word tokens
import nltk
from nltk.tokenize import word_tokenize

# word_tokenize relies on the 'punkt_tab' resource, so fetch it first
nltk.download('punkt_tab')

tokenized_reviews = df['clean_specials'].apply(word_tokenize)
df['token'] = tokenized_reviews
df.head()

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Unnamed: 0,review,sentiment,lowercase,clean_specials,token
0,Absolutely loved this product! üòç Works perfect...,positive,absolutely loved this product! üòç works perfect...,absolutely loved this product works perfectly...,"[absolutely, loved, this, product, works, perf..."
1,Worst purchase ever... üò° Totally disappointed ...,negative,worst purchase ever... üò° totally disappointed ...,worst purchase ever totally disappointed was...,"[worst, purchase, ever, totally, disappointed,..."
2,Great value for the price üëç but packaging was ...,positive,great value for the price üëç but packaging was ...,great value for the price but packaging was a...,"[great, value, for, the, price, but, packaging..."


In [27]:
# Fetch the NLTK stop-word lists used by the next step
import nltk

STOPWORDS_RESOURCE = 'stopwords'
nltk.download(STOPWORDS_RESOURCE)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [28]:
# Step 4: drop stop words (very common words such as "this", "the", "was")
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))


def _without_stopwords(tokens):
    """Return the tokens that are not in the English stop-word set, in order."""
    return [token for token in tokens if token not in stop_words]


df['no_stopwords'] = df['token'].apply(_without_stopwords)
df

Unnamed: 0,review,sentiment,lowercase,clean_specials,token,no_stopwords
0,Absolutely loved this product! üòç Works perfect...,positive,absolutely loved this product! üòç works perfect...,absolutely loved this product works perfectly...,"[absolutely, loved, this, product, works, perf...","[absolutely, loved, product, works, perfectly,..."
1,Worst purchase ever... üò° Totally disappointed ...,negative,worst purchase ever... üò° totally disappointed ...,worst purchase ever totally disappointed was...,"[worst, purchase, ever, totally, disappointed,...","[worst, purchase, ever, totally, disappointed,..."
2,Great value for the price üëç but packaging was ...,positive,great value for the price üëç but packaging was ...,great value for the price but packaging was a...,"[great, value, for, the, price, but, packaging...","[great, value, price, packaging, bit, damaged]"


In [38]:
# Step 5: emoji/emoticon conversion, lemmatisation and stemming
# NOTE: `emoji` is a third-party package and must be installed in the kernel's environment.
import emoji
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer

# WordNetLemmatizer looks words up in the WordNet corpus, so fetch it first
nltk.download('wordnet')

# Emoji -> text: replace each emoji in the original review with its ':name:' alias
df['emoji_to_text'] = df['review'].apply(emoji.demojize)

# Emoticons -> text
emoticon_dict = {":-)": "smile", ":-D": "laugh", ":-(": "sad"}


def replace_emoticons(text):
    """Replace every emoticon listed in ``emoticon_dict`` with its word.

    Args:
        text: The string to process.

    Returns:
        ``text`` with each occurrence of each key of ``emoticon_dict``
        replaced by the corresponding value.
    """
    for emoticon, meaning in emoticon_dict.items():
        text = text.replace(emoticon, meaning)
    return text


df['emoji_to_text'] = df['emoji_to_text'].apply(replace_emoticons)

# Lemmatisation: map each remaining token to its dictionary form
lemmatizer = WordNetLemmatizer()
df['lemmatized'] = df['no_stopwords'].apply(
    lambda tokens: [lemmatizer.lemmatize(word) for word in tokens]
)

# Stemming: applied to the lemmatised tokens, not to the raw tokens
stmmer = PorterStemmer()
df['stemmed'] = df['lemmatized'].apply(
    lambda tokens: [stmmer.stem(word) for word in tokens]
)
df



Unnamed: 0,review,sentiment,lowercase,clean_specials,token,no_stopwords,stemmed,lemmatized,emoticons,emojis,emoji_to_text
0,Absolutely loved this product! üòç Works perfect...,positive,absolutely loved this product! üòç works perfect...,absolutely loved this product works perfectly...,"[absolutely, loved, this, product, works, perf...","[absolutely, loved, product, works, perfectly,...","[absolut, love, product, work, perfectli, exce...","[absolutely, loved, product, work, perfectly, ...",Absolutely loved this product! :smiling_face_w...,"[Absolutely, loved, this, product!, :smiling_f...",Absolutely loved this product! :smiling_face_w...
1,Worst purchase ever... üò° Totally disappointed ...,negative,worst purchase ever... üò° totally disappointed ...,worst purchase ever totally disappointed was...,"[worst, purchase, ever, totally, disappointed,...","[worst, purchase, ever, totally, disappointed,...","[worst, purchas, ever, total, disappoint, wast...","[worst, purchase, ever, totally, disappointed,...",Worst purchase ever... :enraged_face: Totally ...,"[Worst, purchase, ever..., :enraged_face:, Tot...",Worst purchase ever... :enraged_face: Totally ...
2,Great value for the price üëç but packaging was ...,positive,great value for the price üëç but packaging was ...,great value for the price but packaging was a...,"[great, value, for, the, price, but, packaging...","[great, value, price, packaging, bit, damaged]","[great, valu, price, packag, bit, damag]","[great, value, price, packaging, bit, damaged]",Great value for the price :thumbs_up: but pack...,"[Great, value, for, the, price, :thumbs_up:, b...",Great value for the price :thumbs_up: but pack...


In [39]:
# List the columns that the DataFrame currently holds
df.columns

Index(['review', 'sentiment', 'lowercase', 'clean_specials', 'token',
       'no_stopwords', 'stemmed', 'lemmatized', 'emoticons', 'emojis',
       'emoji_to_text'],
      dtype='object')

**Convert into numeric data**

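This can be done in two ways:
1. TF-IDF vectorizer: weights each term by how often it occurs in a document and how rare it is across all documents:

   $$\text{tfidf}(t, d) = \text{tf}(t, d) \cdot \log\frac{N}{\text{df}(t)}$$

   - $\text{tf}(t, d)$ – term frequency: how many times term $t$ occurs in document $d$
   - $N$ – total number of documents
   - $\text{df}(t)$ – number of documents that contain term $t$

   Note: the output is a sparse matrix (a matrix in which most entries are zero). scikit-learn's `TfidfVectorizer` uses a smoothed variant of this formula by default and L2-normalizes each row, so its values differ from the textbook formula above.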

2. Count vectorizer (`CountVectorizer`):
   The value in each cell is the number of times that word occurs in that document.

In [45]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Learn the vocabulary and IDF weights from the cleaned reviews, then show the
# resulting document-term matrix as a dense array.
tf = TfidfVectorizer()
tfidf_matrix = tf.fit_transform(df['clean_specials'])
tfidf_matrix.toarray()


array([[0.35355339, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.35355339, 0.35355339, 0.        , 0.        ,
        0.35355339, 0.        , 0.        , 0.        , 0.35355339,
        0.        , 0.35355339, 0.        , 0.        , 0.35355339,
        0.        , 0.        , 0.        , 0.        , 0.35355339,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.35355339,
        0.35355339, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.35355339, 0.35355339, 0.        , 0.        ,
        0.        , 0.        , 0.35355339, 0.        , 0.        ,
        0.35355339, 0.        , 0.        , 0.35355339, 0.        ,
        0.35355339],
       [0.        , 0.31622777, 0.31622777, 0.31622777, 0.        ,
        0.        , 0.        , 0.        , 0.31622777, 0.31622777,
        0.        , 0.        , 0.        , 0.31622777, 0.        ,
        0.31622777, 0.        , 0.        , 0.31622777, 0.        ,
      

In [43]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the cleaned reviews, then show the per-document
# word counts as a dense array.
cv = CountVectorizer()
count_matrix = cv.fit_transform(df['clean_specials'])
count_matrix.toarray()

array([[1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0,
        0, 0, 1, 0],
       [0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0,
        0, 1, 0, 1],
       [0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1,
        1, 0, 0, 0]])

In [47]:
from sklearn.feature_extraction.text import TfidfVectorizer

# ngram_range=(1, 2) makes the vocabulary contain single words (unigrams) and
# pairs of adjacent words (bigrams), so some local word order is captured.
tf = TfidfVectorizer(ngram_range=(1, 2))

# Keep the matrix in a variable: only a cell's last expression is displayed, so a
# bare transform(...) call on this line would be computed and then thrown away.
tfidf_ngram_matrix = tf.fit_transform(df['clean_specials'])

# Vocabulary learned from the reviews, in the column order of the matrix
tf.get_feature_names_out()

array(['absolutely', 'absolutely loved', 'bit', 'bit damaged', 'but',
       'but packaging', 'damaged', 'disappointed', 'disappointed waste',
       'ever', 'ever totally', 'exceeded', 'exceeded expectations',
       'expectations', 'for', 'for the', 'great', 'great value', 'loved',
       'loved this', 'money', 'of', 'of money', 'packaging',
       'packaging was', 'perfectly', 'perfectly exceeded', 'price',
       'price but', 'product', 'product works', 'purchase',
       'purchase ever', 'the', 'the price', 'this', 'this product',
       'totally', 'totally disappointed', 'value', 'value for', 'was',
       'was bit', 'waste', 'waste of', 'works', 'works perfectly',
       'worst', 'worst purchase'], dtype=object)