# LangQuest-2025

## 1. Regex

In [None]:
import re
import pandas as pd
import numpy as np

In [None]:
chat1 = '12345678912, abc@xyz.com, 9810235533'
chat2 = '(123)-567-8912, abc_80@xyz.com, 9810235533'
chat3 = 'yes, phone : 12345678912, email: abc@xyz.com'

In [None]:
pattern = r'(\(\d{3}\)-\d{3}-\d{4})|\d{10}'
matches = re.findall(pattern, chat3)
matches

['']

##### ? marks the preceeding character or group optional (0 or 1 occurance) and '.' matches nay single character except a newline(\n)

In [None]:
pattern = r'(\(?\d{3}\)?-?\d{3}-?\d{4})'
matches = re.findall(pattern, chat2)
matches

['(123)-567-8912', '9810235533']

##### For email id:
##### (*) means as many occurances as possible (can be zero as well)

In [None]:
pat = r'[a-zA-Z0-9_]*@[a-zA-Z]*\.[a-zA-Z]*'
matches = re.findall(pat, chat2)
matches

['abc_80@xyz.com']

##### to match non didgit or special characters(^)

In [None]:
chat1 = '#123456'
chat2 = '.123456'
chat3 = '/234567'

In [None]:
pat = r'[^\d]*\d*'
mat = re.findall(pat, chat1)
mat

['#123456', '']

In [None]:
pat = r'[^\d]*(\d*)'
mat = re.findall(pat, chat1)
mat

['123456', '']

to match exactly what you're looking for in text use parenthesis to specify() in the following way:

In [None]:
chat = 'age 50'
pat = r'age \d+'
mat = re.findall(pat, chat)
mat

['age 50']

to only extract the age (number)

In [None]:
chat = 'age 50'
pat = r'age (\d+)'
mat = re.findall(pat, chat)
mat

['50']

## 2. Tokenization using Spacy
1. Word Tokenization: breaking down a sentence into words
2. Sentence Tokenization: breaking down a paragraph into sentences

In [2]:
import spacy

##### using a blank pipeline

Blank pipeline always has a word tokenizer by default so we son't need to add any extra attributes in it

In [None]:
# word tokenization
nlp = spacy.blank("en")
doc = nlp("Dr. Sharma is a surgeon. He is really good at what he does.")
for token in doc:
    print(token)

Dr.
Sharma
is
a
surgeon
.
He
is
really
good
at
what
he
does
.


In [None]:
text = '''Dayton high school, 8th grade students information
========================================================

Name         birthday         email
-----        ---------        ------
Virat        5 June, 1882     virat@kohli.com
Maria        12 April, 2001   maria@sharapova.com
Serena       24 June, 1998    serena@williams.com
Joe          1 May, 1997      joe@root.com
'''

extracting emails without using regex using (.like_email)
> Note: REGEX can also be used but Spacy makes your work much more easier in some cases

In [None]:
doc = nlp(text)
emails = [token.text for token in doc if token.like_email]

print(emails)

['virat@kohli.com', 'maria@sharapova.com', 'serena@williams.com', 'joe@root.com']


In [None]:
for token in doc:
    print(token.text)

Dayton
high
school
,
8th
grade
students
information


=
=
=
=
=
=
=
=
=
=
=
=
=
=
=
=
=
=
=
=
=
=
=
=
=
=
=
=
=
=
=
=
=
=
=
=
=
=
=
=
=
=
=
=
=
=
=
=
=
=
=
=
=
=
=
=



Name
        
birthday
        
email


-----
       
---------
       
------


Virat
       
5
June
,
1882
    
virat@kohli.com


Maria
       
12
April
,
2001
  
maria@sharapova.com


Serena
      
24
June
,
1998
   
serena@williams.com


Joe
         
1
May
,
1997
     
joe@root.com




**Using a blank pipeline if we try to perform sentence tokenization in spacy that is not possible hence we can either add sentencizer in the blank pipeline as shown below or simply use a pretrained pipeline using the following command:**
> nlp = spacy.load("en_core_web")

In [3]:
nlp = spacy.blank("en")


In [5]:
import spacy

nlp = spacy.blank("en")
nlp.add_pipe("sentencizer")

text = nlp("Dr. Strange loves India. Hulk loves Delhi.")
for sent in text.sents:
    print(sent)


Dr. Strange loves India.
Hulk loves Delhi.


**cannot be used for sentence tokenization**

In [None]:
sentencizer = nlp.add_pipe("sentencizer")  # Add a simple sentence segmenter in the blank pipeline
doc = nlp("This is sentence one. This is sentence two!")

for sent in doc.sents:
    print(sent.text)

This is sentence one.
This is sentence two!


In [None]:
nlp = spacy.blank('en')
nlp.pipe_names

[]

### **Use a built existing pipeline:**


In [None]:
nlp = spacy.load(r"en_core_web_sm")
text = nlp("Dr.Strange loves India. Hulk loves Delhi.")
for sent in text.sents:
    print(sent)

Dr.Strange loves India.
Hulk loves Delhi.


In [None]:
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [None]:
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x24681b2ecf0>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x24681b2dfd0>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x24680d69930>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x246819f0d10>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x246ffd5bb90>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x24680d6bbc0>)]

## 3. Stemming:
1. talking -> talk
2. eating -> eat
3. ate -> eat
4. adjustable -> adjust**

**basically uses fixed rules such as remove able, ing etc to derive a base word
LEMMATIZATION:
uses knowledge of language (linguistic knowledge) to derive a base word
ability -> ability in lemmatization but
ability -> abil in stemming**


**as you can notice words like ate and ability aren't being converted to their true base forms so the need for a more advanced technique called lemmatization arises**

In [None]:
import nltk
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
words = ['eating', 'eats', 'eat', 'ate', 'adjustable', 'rafting', 'ability', 'meeting']
for word in words:
    print(word, " | ", stemmer.stem(word))

eating  |  eat
eats  |  eat
eat  |  eat
ate  |  ate
adjustable  |  adjust
rafting  |  raft
ability  |  abil
meeting  |  meet


## 4. Lemmatization:
> **using the pretrained nlp pipeline for lemmatization using the command (.lemma_) all words are converted to their true base forms**

In [None]:
nlp = spacy.load(r"en_core_web_sm")

In [None]:
doc = nlp("eating eats eat ate adjustable rafting ability eating better")
for token in doc:
    print(token, " | ", token.lemma_)

eating  |  eat
eats  |  eat
eat  |  eat
ate  |  eat
adjustable  |  adjustable
rafting  |  rafting
ability  |  ability
eating  |  eat
better  |  well


In [None]:
doc = nlp('Mando talked for 3 ghours although talking isn\'t his thing he became talkative')
for token in doc:
    print(token, "|", token.lemma_)

Mando | Mando
talked | talk
for | for
3 | 3
ghours | ghour
although | although
talking | talk
is | be
n't | not
his | his
thing | thing
he | he
became | become
talkative | talkative


## 5. Part of Speech Tagging

In [None]:
doc = nlp("Captain America ate 100$ of samosa. Then he said I can do this all day.")
for token in doc:
    print(token, ' | ', token.pos_)

Captain  |  PROPN
America  |  PROPN
ate  |  VERB
100  |  NUM
$  |  NUM
of  |  ADP
samosa  |  PROPN
.  |  PUNCT
Then  |  ADV
he  |  PRON
said  |  VERB
I  |  PRON
can  |  AUX
do  |  VERB
this  |  PRON
all  |  DET
day  |  NOUN
.  |  PUNCT


to see the tense of the word you can use tag_ fucntion

In [None]:
doc = nlp("Wow! Dr. Strange made 265 million $ on the very first day")
for token in doc:
    print(token, "|", token.pos_, '|', spacy.explain(token.pos_),
          "|", token.tag_, "|", spacy.explain(token.tag_))

Wow | INTJ | interjection | UH | interjection
! | PUNCT | punctuation | . | punctuation mark, sentence closer
Dr. | PROPN | proper noun | NNP | noun, proper singular
Strange | PROPN | proper noun | NNP | noun, proper singular
made | VERB | verb | VBD | verb, past tense
265 | NUM | numeral | CD | cardinal number
million | NUM | numeral | CD | cardinal number
$ | NUM | numeral | CD | cardinal number
on | ADP | adposition | IN | conjunction, subordinating or preposition
the | DET | determiner | DT | determiner
very | ADV | adverb | RB | adverb
first | ADJ | adjective | JJ | adjective (English), other noun-modifier (Chinese)
day | NOUN | noun | NN | noun, singular or mass


**Word2VEC Textual Representation**


In [11]:
!python -m spacy download en_core_web_md


Collecting en-core-web-md==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl (33.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.5/33.5 MB[0m [31m33.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: en-core-web-md
Successfully installed en-core-web-md-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [14]:
import spacy

nlp = spacy.load("en_core_web_md")

sentence = "TensorQuest is an exciting machine learning competition"

doc = nlp(sentence.lower())

word_vectors = {token.text: token.vector for token in doc}

print("Vector representation of 'machine':\n", word_vectors['machine'])


Vector representation of 'machine':
 [-0.72883    0.20718   -0.0033379 -0.0027673 -0.17204    0.023277
  0.1297    -0.2112     0.32876    0.67447    0.10047   -0.30559
  0.11213    0.22959   -0.32997    0.1389    -0.57289    2.523
 -0.32921    0.06045    0.23895    0.1091     0.19358   -0.1765
  0.11583    0.63204   -0.13644   -0.24354    0.20061   -0.50244
  0.40537   -0.38688    0.73784    0.093937  -0.30643    0.045874
  0.097915  -0.082114   0.13082   -0.039022   0.088084  -0.27023
 -0.077658  -0.0045355  0.18986   -0.063083  -0.138      0.40474
 -0.16199   -0.10953    0.22923   -0.67634   -0.65763   -0.044595
 -0.12119    0.071167   0.25993   -0.27052   -0.22474   -0.13818
  0.20692    0.87604   -0.35257   -0.1498     0.72804    0.68768
  0.19993    0.084733  -0.2234     0.11301    0.29895   -0.090119
  0.038172  -0.32912    0.014221  -0.36335    0.5898     0.10467
  0.16549    0.47199    0.078939  -0.19985    0.84014   -0.2277
 -0.22907   -0.26243   -0.32598    1.0146    -0.07923

## 6. Named Entity Recognititon (NER)

**identifies and classifies entities like people, organizations, locations, dates, and more, within unstructured text.**


In [None]:
doc = nlp("Tesla Inc. is going to acquire twitter for $45 billion")
for ent in doc.ents:
    print(ent.text, ' | ', ent.label_ , " | ", spacy.explain(ent.label_))

Tesla Inc.  |  ORG  |  Companies, agencies, institutions, etc.
$45 billion  |  MONEY  |  Monetary values, including unit


# Text Classification Model

In [None]:
import pandas as pd
import numpy as np

In [None]:
df = pd.read_csv(r'C:\Users\Maanya Verma\Downloads\bbc-text.csv\bbc-text.csv')
df

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...
...,...,...
2220,business,cars pull down us retail figures us retail sal...
2221,politics,kilroy unveils immigration policy ex-chatshow ...
2222,entertainment,rem announce new glasgow concert us band rem h...
2223,politics,how political squabbles snowball it s become c...


encoding the category column

In [None]:
df["category_encoded"], category_mapping = pd.factorize(df["category"])
print(category_mapping)


Index(['tech', 'business', 'sport', 'entertainment', 'politics'], dtype='object')


In [None]:
df

Unnamed: 0,category,text,category_encoded
0,tech,tv future in the hands of viewers with home th...,0
1,business,worldcom boss left books alone former worldc...,1
2,sport,tigers wary of farrell gamble leicester say ...,2
3,sport,yeading face newcastle in fa cup premiership s...,2
4,entertainment,ocean s twelve raids box office ocean s twelve...,3
...,...,...,...
2220,business,cars pull down us retail figures us retail sal...,1
2221,politics,kilroy unveils immigration policy ex-chatshow ...,4
2222,entertainment,rem announce new glasgow concert us band rem h...,3
2223,politics,how political squabbles snowball it s become c...,4


In [None]:
df.shape

(2225, 3)

**We will now take a look at how the Product categories are mapped and then also create couple dictionaries from the same for future reference**

In [None]:
category_id_df = df[['category', 'category_encoded']].drop_duplicates().sort_values(by = 'category_encoded').reset_index(drop = 1)
category_id_df

Unnamed: 0,category,category_encoded
0,tech,0
1,business,1
2,sport,2
3,entertainment,3
4,politics,4


## Preprocessing

**When it comes to NLP, getting rid of stopwords is one of the most important steps. It ensures we get rid of the most frequent but usually useless words, e.g. "the", "a", "an", etc. to eliminate any bias they might cause.
We have many methods to eliminate stop words - many NLP libraries like sklearn have their own stop words but it is usually considered a good idea to use stop words from the NLTK library. We shall do the same.**

In [None]:
import spacy
nlp = spacy.load(r"en_core_web_sm")

In [None]:
def preprocess_text(text):
    doc = nlp(text.lower())  # Convert to lowercase and process with spaCy
    tokens = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]
    return " ".join(tokens)  # Return cleaned text

df["clean_text"] = df["text"].apply(preprocess_text)

What this does:
1. text.lower()
Converts all text to lowercase to ensure consistency (e.g., "Apple" and "apple" are treated the same).

2. doc = nlp(text.lower())
Passes the text through spaCy's NLP pipeline, which performs:

Tokenization → Splits text into words.

POS tagging → Identifies parts of speech (nouns, verbs, etc.).

Lemmatization → Converts words to their base form (e.g., "running" → "run").

Stopword detection → Identifies common words like "the," "is," "and."

3. token.lemma_ → Gets the lemma (base form) of each word.

if not token.is_stop → Removes stopwords (e.g., "the," "and," "is").

if not token.is_punct → Removes punctuation marks.



In [None]:
df

Unnamed: 0,category,text,category_encoded,clean_text
0,tech,tv future in the hands of viewers with home th...,0,tv future hand viewer home theatre system pl...
1,business,worldcom boss left books alone former worldc...,1,worldcom boss leave book worldcom boss ber...
2,sport,tigers wary of farrell gamble leicester say ...,2,tiger wary farrell gamble leicester rush m...
3,sport,yeading face newcastle in fa cup premiership s...,2,yeade face newcastle fa cup premiership newcas...
4,entertainment,ocean s twelve raids box office ocean s twelve...,3,ocean s raid box office ocean s crime caper ...
...,...,...,...,...
2220,business,cars pull down us retail figures us retail sal...,1,car pull retail figure retail sale fall 0.3 ja...
2221,politics,kilroy unveils immigration policy ex-chatshow ...,4,kilroy unveil immigration policy ex chatshow h...
2222,entertainment,rem announce new glasgow concert us band rem h...,3,rem announce new glasgow concert band rem anno...
2223,politics,how political squabbles snowball it s become c...,4,political squabble snowball s commonplace argu...


## Feature Engineering

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df["clean_text"])  # Convert text into TF-IDF features
y = df["category"]  # Target labels


In [None]:
X.shape

(2225, 23352)

In [None]:
X[:5, :5]

<5x5 sparse matrix of type '<class 'numpy.float64'>'
	with 2 stored elements in Compressed Sparse Row format>

In [None]:
print(X)
print(y)

  (0, 14425)	0.041945871300314255
  (0, 8852)	0.02676235457041553
  (0, 10984)	0.023134929761185002
  (0, 17433)	0.033190033434919185
  (0, 14137)	0.03020988013540774
  (0, 15835)	0.03008699910638888
  (0, 22904)	0.03537450107129553
  (0, 21145)	0.0625378658149778
  (0, 4047)	0.021022622725503065
  (0, 15733)	0.040430913842023644
  (0, 19740)	0.03209161559275845
  (0, 12057)	0.04583939162678847
  (0, 16535)	0.03039790812720782
  (0, 2014)	0.024801402350642918
  (0, 9118)	0.04457129151119338
  (0, 3188)	0.02725532589810651
  (0, 4590)	0.021975115945528475
  (0, 13909)	0.033190033434919185
  (0, 1107)	0.03479625256605947
  (0, 17872)	0.05538824708072302
  (0, 8958)	0.040204730350824294
  (0, 17665)	0.04539339686530647
  (0, 11220)	0.04681601260588735
  (0, 17356)	0.04140753482112253
  (0, 66)	0.0262612629546789
  :	:
  (2224, 16224)	0.051274360662524915
  (2224, 22899)	0.08260332559989264
  (2224, 2920)	0.12973302469506673
  (2224, 21900)	0.06912933786675873
  (2224, 5850)	0.140220039831

## Train-Test Split

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


## Model Selection & Training (Naive Bayes)

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
model = MultinomialNB()
model.fit(X_train, y_train)

In [None]:
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.9595505617977528
Classification Report:
                precision    recall  f1-score   support

     business       0.94      0.95      0.95       101
entertainment       1.00      0.89      0.94        81
     politics       0.92      0.98      0.95        83
        sport       0.99      1.00      0.99        98
         tech       0.95      0.98      0.96        82

     accuracy                           0.96       445
    macro avg       0.96      0.96      0.96       445
 weighted avg       0.96      0.96      0.96       445



# Sentiment Analysis

## Importing necessary libraries and loading the dataset

In [None]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import BertTokenizer, BertForSequenceClassification

In [None]:
df = pd.read_csv("/kaggle/input/sentimenttest1/sentimenttest(1).csv.xls")  # Load review dataset
df

Unnamed: 0,review
0,I love this product! It works perfectly.
1,Absolutely terrible experience. Do not buy!
2,Great quality and fast shipping. Highly recomm...
3,The material feels cheap and broke in a week.
4,Decent product for the price. Not the best but...
5,Amazing! I'm really happy with this purchase.
6,Horrible customer service. Took ages to respond.
7,Super comfortable and looks great!
8,The battery dies too quickly. Expected better.
9,Very satisfied. Would buy again.


##  Load a Pre-trained Sentiment Analysis Model

* We use "nlptown/bert-base-multilingual-uncased-sentiment", a BERT-based model that predicts sentiment on a scale of 1 to 5.
* AutoTokenizer.from_pretrained(model_name): Loads the corresponding tokenizer to process text into tokenized input.
* AutoModelForSequenceClassification.from_pretrained(model_name): Loads the pre-trained BERT model fine-tuned for sentiment classification.

In [None]:
model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

## Move Model to GPU (If Available)

* Checks if a GPU (CUDA) is available. If yes, it uses GPU; otherwise, it falls back to CPU.
* Moves the model to the selected device to optimize computation speed.

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

##  Tokenize the Review Text

Key Parameters:

* .tolist(): Converts the Pandas column (df["review"]) to a Python list for processing.
* add_special_tokens=True: Adds [CLS] (start) and [SEP] (end) tokens required by BERT.
* max_length=128: Limits the review length to 128 tokens (truncates longer texts and pads shorter ones).
* padding="max_length": Ensures all inputs are padded to the same length (128 tokens).
* return_attention_mask=True: Generates an attention mask to differentiate real tokens from padding.
* return_tensors="pt": Converts everything into PyTorch tensors for processing.
* truncation=True: Cuts off longer texts beyond 128 tokens.

In [None]:
encodings = tokenizer(
    df["review"].tolist(),   # Convert DataFrame column to list
    add_special_tokens=True, # Add [CLS] and [SEP] tokens
    max_length=128,          # Truncate or pad to 128 tokens
    padding="max_length",
    return_attention_mask=True,
    return_tensors="pt",
    truncation=True
)

In [None]:
input_ids = encodings["input_ids"].to(device)
attention_mask = encodings["attention_mask"].to(device)


## Get Model Predictions

* torch.no_grad(): Disables gradient computation (saves memory & speeds up inference).
* outputs = model(...): Passes tokenized input through the BERT model.
* logits = outputs.logits: Extracts logits (raw model outputs before applying softmax).

In [None]:
with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask)
    logits = outputs.logits


In [None]:
df["predicted_sentiment"] = logits.argmax(axis=1).cpu().numpy() + 1


In [None]:
df = df[["review", "predicted_sentiment"]]


In [None]:
df.to_csv("predicted_reviews.csv", index=False)
print(df.head(10))

                                              review  predicted_sentiment
0           I love this product! It works perfectly.                    5
1        Absolutely terrible experience. Do not buy!                    1
2  Great quality and fast shipping. Highly recomm...                    5
3      The material feels cheap and broke in a week.                    2
4  Decent product for the price. Not the best but...                    3
5      Amazing! I'm really happy with this purchase.                    5
6   Horrible customer service. Took ages to respond.                    1
7                 Super comfortable and looks great!                    5
8     The battery dies too quickly. Expected better.                    2
9                   Very satisfied. Would buy again.                    5
