## **TEXT PREPROCESSING**

In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, LatentDirichletAllocation
import gensim.downloader as api

In [None]:
# Mount Google Drive, then read the annotated dataset from it
# (point csv_path at your own copy of the CSV file).
from google.colab import drive
drive.mount('/content/drive')
csv_path = '/content/drive/MyDrive/question identification.csv'
data = pd.read_csv(csv_path)
data.head()

Mounted at /content/drive


Unnamed: 0.1,Unnamed: 0,sentence,label
0,0,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,sentence
1,1,"Born and raised in Houston, Texas, she perform...",sentence
2,2,"Managed by her father, Mathew Knowles, the gro...",sentence
3,3,Their hiatus saw the release of Beyoncé's debu...,sentence
4,4,Following the disbandment of Destiny's Child i...,sentence


In [None]:
#### Text preprocessing
# Preview only: lowercase the sentence stored under index label 3 (the fourth row).
data.loc[3, 'sentence'].lower()

'their hiatus saw the release of beyoncé\'s debut album, dangerously in love (2003), which established her as a solo artist worldwide, earned five grammy awards and featured the billboard hot 100 number-one singles "crazy in love" and "baby boy".'

In [None]:
data['sentence'].str.lower()  # preview: lowercases every sentence; the result is displayed but not stored here

Unnamed: 0,sentence
0,beyoncé giselle knowles-carter (/biːˈjɒnseɪ/ b...
1,"born and raised in houston, texas, she perform..."
2,"managed by her father, mathew knowles, the gro..."
3,their hiatus saw the release of beyoncé's debu...
4,following the disbandment of destiny's child i...
...,...
235105,hi 11-09-teensuser3
235106,join
235107,"hi, 11-09-teensuser197."
235108,"not that i know of, 11-09-teensuser98"


In [None]:
data['sentence'] = data['sentence'].str.lower()  # store the lowercased text back into the 'sentence' column

In [None]:
### removal of html tags
import re
def remove_html_tags(text):
  """Return `text` with every `<...>` span deleted.

  The pattern is non-greedy, so each match stops at the nearest `>`.
  `.` does not match a newline here, so a span broken across lines is kept.
  """
  return re.sub('<.*?>', '', text)

In [None]:
# Strip HTML tags and store the cleaned text back into the column; applying
# the function without assigning the result leaves `data` unchanged for every
# later step.
data['sentence'] = data['sentence'].apply(remove_html_tags)
data['sentence']

Unnamed: 0,sentence
0,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...
1,"Born and raised in Houston, Texas, she perform..."
2,"Managed by her father, Mathew Knowles, the gro..."
3,Their hiatus saw the release of Beyoncé's debu...
4,Following the disbandment of Destiny's Child i...
...,...
235105,hi 11-09-teensUser3
235106,JOIN
235107,"Hi, 11-09-teensUser197."
235108,"Not that I know of, 11-09-teensUser98"


In [None]:
##### Remove punctuation from the dataset
import string
string.punctuation  # display the ASCII punctuation characters provided by the string module

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [None]:
exclude = string.punctuation  # the characters that remove_punc() deletes

In [None]:
def remove_punc(text):
  """Return `text` with every character listed in `exclude` deleted.

  A single `str.translate` pass replaces one `str.replace` scan per
  punctuation character, which matters when the function is applied to
  every row of the dataset. The result is the same string as before.
  """
  # The third argument of str.maketrans maps each listed character to None,
  # which makes translate() drop it.
  return text.translate(str.maketrans('', '', exclude))

In [None]:
text = 'string. with. punctuation?'  # sample input for trying remove_punc

In [None]:
 print(remove_punc(text))  # sanity check; the saved output reads: string with punctuation

string with punctuation


In [None]:
data['sentence'] = data['sentence'].apply(remove_punc)  # strip punctuation from every row and store the result

In [None]:
data['sentence']  # inspect the column after punctuation removal

Unnamed: 0,sentence
0,beyoncé giselle knowlescarter biːˈjɒnseɪ beeyo...
1,born and raised in houston texas she performed...
2,managed by her father mathew knowles the group...
3,their hiatus saw the release of beyoncés debut...
4,following the disbandment of destinys child in...
...,...
235105,hi 1109teensuser3
235106,join
235107,hi 1109teensuser197
235108,not that i know of 1109teensuser98


In [None]:
# Lookup table of chat abbreviations (upper-case keys) and their expansions.
res = dict(
    LOL='laughing out loud',
    LMAO='laughing my ass off',
    ROFL='rolling on the floor laughing',
    IMHO='in my humble opinion',
)

# `chat` is the name chat_conversion() reads; it refers to the same dict object.
chat = res

In [None]:
#### Expand chat abbreviations (lol, lmao, rofl, etc.) into their full phrases
def chat_conversion(text):
    """Return `text` with every word that has an entry in `chat` expanded.

    The text is split on whitespace and each word is upper-cased for the
    lookup, so matching ignores case. Words without an entry are kept as
    they are, and the words are re-joined with single spaces.
    """
    return " ".join(chat.get(word.upper(), word) for word in text.split())


In [None]:
chat_conversion('IMHO he is the best')  # quick check: 'IMHO' should come back expanded


'in my humble opinion he is the best'

In [None]:
data['sentence'] = data['sentence'].apply(chat_conversion)  # expand abbreviations in every row and store the result

In [None]:
data['sentence']  # inspect the column after chat-abbreviation expansion

Unnamed: 0,sentence
0,beyoncé giselle knowlescarter biːˈjɒnseɪ beeyo...
1,born and raised in houston texas she performed...
2,managed by her father mathew knowles the group...
3,their hiatus saw the release of beyoncés debut...
4,following the disbandment of destinys child in...
...,...
235105,hi 1109teensuser3
235106,join
235107,hi 1109teensuser197
235108,not that i know of 1109teensuser98


In [None]:
data['sentence'][224736]  # spot-check one row; the saved output shows an expanded abbreviation

'laughing my ass off'

In [None]:
## Spelling correction
from textblob import TextBlob

In [None]:
# Deliberately misspelled sample used to try out TextBlob's spelling correction.
incorrect_text = 'ceertain conditionas duriing seveal ggenerations aree moodified in the saame maner.'

textBlb = TextBlob(incorrect_text)

# correct() is called on the blob and its `.string` is shown as the cell output.
textBlb.correct().string

'certain conditions during several generations are modified in the same manner.'

In [None]:
# NOTE(review): `new_doc` is not referenced again in the visible cells and no
# correct() call is made on it, so spelling correction does not appear to be
# applied to the dataset — confirm whether this cell is still needed.
all_text = ' '.join(data['sentence'].astype(str).tolist())  # Join all sentences into a single string
new_doc = TextBlob(all_text)                                  # Create a TextBlob object from the combined text


In [None]:
#Removal of stop words
import nltk
nltk.download('stopwords')  # fetch the stop-word corpus (the saved log shows it unpacked under /root/nltk_data)

from nltk.corpus import stopwords
stopwords.words('english')  # display the English stop-word list


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [None]:
def _english_stopwords():
  """Return the NLTK English stop words as a frozenset, loading them once."""
  cached = getattr(_english_stopwords, '_cache', None)
  if cached is None:
    # stopwords.words() builds a fresh list on every call; load it a single
    # time and keep it as a set so each membership test is a hash lookup
    # instead of a scan of the whole list.
    cached = frozenset(stopwords.words('english'))
    _english_stopwords._cache = cached
  return cached


def remove_stopwords(text):
  """Return `text` with English stop words removed.

  The text is split on whitespace, every word that appears in the NLTK
  English stop-word list is dropped, and the remaining words are joined with
  single spaces. Matching is exact and case-sensitive, so a word followed by
  punctuation (e.g. "it,") is not treated as a stop word.

  Removed words are dropped outright rather than replaced by empty strings,
  so the result contains no runs of consecutive spaces.
  """
  stop_words = _english_stopwords()
  return " ".join(word for word in text.split() if word not in stop_words)

In [None]:
# Try the function on a sample sentence.
remove_stopwords('probably my all-time favorite movie, a story of selflessness, sacrifice and dedication to a noble cause, but it\'s not preachy or boring. it just never gets old, despite my having seen it some 15 or more times')

'probably  all-time favorite movie,  story  selflessness, sacrifice  dedication   noble cause,    preachy  boring.   never gets old, despite   seen   15   times'

In [None]:
# NOTE(review): the result is only displayed, not assigned back, so
# data['sentence'] still contains its stop words in later cells. The list
# includes words such as 'what' and 'how' — confirm whether keeping them is
# intended before storing this result.
data['sentence'].apply(remove_stopwords)

Unnamed: 0,sentence
0,beyoncé giselle knowlescarter biːˈjɒnseɪ beeyo...
1,born raised houston texas performed variou...
2,managed father mathew knowles group became ...
3,hiatus saw release beyoncés debut album dan...
4,following disbandment destinys child june 2...
...,...
235105,hi 1109teensuser3
235106,join
235107,hi 1109teensuser197
235108,know 1109teensuser98


## **TOKENISATION**

In [None]:
import nltk
nltk.download('punkt_tab')  # tokenizer data package fetched before the NLTK tokenizers are used
from nltk.tokenize import word_tokenize, sent_tokenize

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [None]:
sen1 = 'I am going to visit delhi!'  # sample sentence for the tokenizer
word_tokenize(sen1)  # the saved output shows '!' split off as its own token


['I', 'am', 'going', 'to', 'visit', 'delhi', '!']

In [None]:
!pip install spacy
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
import spacy

nlp = spacy.load("en_core_web_sm")

# Ensure all elements in 'sentence' are strings:
data['sentence'] = data['sentence'].astype(str)

# Apply tokenization. Only the token texts are needed, so call the pipeline's
# tokenizer directly instead of nlp(x), which would also run every other
# pipeline component on each row.
data['tokens'] = data['sentence'].apply(lambda x: [token.text for token in nlp.tokenizer(x)])

# Display the result:
data[['sentence', 'tokens']].head()

Unnamed: 0,sentence,tokens
0,beyoncé giselle knowlescarter biːˈjɒnseɪ beeyo...,"[beyoncé, giselle, knowlescarter, biːˈjɒnseɪ, ..."
1,born and raised in houston texas she performed...,"[born, and, raised, in, houston, texas, she, p..."
2,managed by her father mathew knowles the group...,"[managed, by, her, father, mathew, knowles, th..."
3,their hiatus saw the release of beyoncés debut...,"[their, hiatus, saw, the, release, of, beyoncé..."
4,following the disbandment of destinys child in...,"[following, the, disbandment, of, destinys, ch..."





Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.



In [None]:
# Write `data` to Drive; index=False leaves the row index out of the file.
# NOTE(review): CSV has no list type, so the 'tokens' lists are presumably
# stored as text and need parsing when the file is read back — confirm
# against whatever loads this file.
data.to_csv('/content/drive/MyDrive/tokenized_data.csv', index=False)