<a href="https://colab.research.google.com/github/sinhajiya/NLP/blob/main/N-Gram.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Loading the libraries

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from imblearn.under_sampling import RandomUnderSampler


# Question 1

## Cloning GitHub repo for dataset

In [2]:
!git clone https://github.com/islnlp/Assignment_1_2025

fatal: destination path 'Assignment_1_2025' already exists and is not an empty directory.


## Loading data and preprocessing

In [3]:
def load_data(name, root_dir="/content/Assignment_1_2025"):
  """Read the train and validation CSVs of one dataset and drop rows without text.

  Args:
    name: Name of the dataset sub-directory inside `root_dir` (e.g. 'hate').
    root_dir: Directory that contains the dataset folders. The default is the
      path the notebook originally hard-coded; pass another location to run
      outside that environment.

  Returns:
    A `(train, val)` tuple of DataFrames read from `train.csv` and `val.csv`,
    with every row whose 'Sentence' value is missing removed.
  """
  dataset_dir = os.path.join(root_dir, name)
  train = pd.read_csv(os.path.join(dataset_dir, "train.csv"))
  val = pd.read_csv(os.path.join(dataset_dir, "val.csv"))
  # Rows without a sentence cannot be vectorised later, so discard them here.
  train = train.dropna(subset=['Sentence'])
  val = val.dropna(subset=['Sentence'])
  return train, val

In [4]:
def preprocess_text(Sentence):
    """Normalise a raw sentence to lowercase ASCII letters separated by single spaces.

    Steps, in order:
      1. Convert to lower case.
      2. Remove http/https URLs.
      3. Collapse runs of two or more dots into a single dot.
      4. Delete every character that is neither an ASCII letter nor whitespace.
      5. Collapse whitespace runs into one space and strip both ends.

    Whitespace is normalised last on purpose: deleting digits and punctuation
    in step 4 can leave doubled, leading or trailing spaces behind
    (e.g. "a - b" would otherwise become "a  b").

    Args:
        Sentence: The text to clean (a `str`).

    Returns:
        The cleaned string.
    """
    url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*(),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    Sentence = Sentence.lower()
    Sentence = re.sub(url_pattern, "", Sentence)
    # Kept to match the documented pipeline; the remaining dot is removed below anyway.
    Sentence = re.sub(r"\.{2,}", ".", Sentence)
    # NOTE(review): characters are deleted, not replaced by a space, so words joined
    # only by punctuation are merged ("a.b" -> "ab"). Confirm this is intended.
    Sentence = re.sub(r"[^a-zA-Z\s]", "", Sentence)
    Sentence = re.sub(r"\s+", " ", Sentence).strip()
    return Sentence

In [5]:
def load_and_preprocess_data(name):
  """Load one dataset, print a before/after summary, and add the cleaned text column.

  The train and validation frames come from `load_data(name)`. Each frame gets a
  new 'Sentence_preprocessed' column holding `preprocess_text` applied to the
  string form of 'Sentence'. Shapes, the first rows of both frames (before and
  after cleaning) and the training class distribution are printed.

  Args:
    name: Dataset name forwarded to `load_data`.

  Returns:
    The `(train, val)` DataFrames including the new column.
  """
  train, val = load_data(name)

  def show_heads():
    # Same two-frame preview, printed once before and once after cleaning.
    print(f"Train head: \n {train.head()}\n")
    print(f"Val head: \n {val.head()}\n")

  print(f"Loading and preprocessing the {name} data... \n")
  print(f"Train shape: {train.shape}, Val shape: {val.shape}\n")
  print("The dataset before preprocessing...\n")
  show_heads()
  print(f"Class distribution of training data: {Counter(train['Tag'])}")

  # Clean the training frame first, then the validation frame.
  for frame in (train, val):
    frame["Sentence_preprocessed"] = frame["Sentence"].astype(str).apply(preprocess_text)

  print("The dataset after preprocessing...\n")
  show_heads()

  return train, val

## Feature Extraction

Using n-gram counts (unigrams, bigrams, and trigrams) as bag-of-n-grams
features.

In [6]:
def extract_features(train, val):
  """Build unigram-to-trigram count features for the train and validation frames.

  A `CountVectorizer` with `ngram_range=(1, 3)` is fitted on
  `train['Sentence_preprocessed']` only and then applied to
  `val['Sentence_preprocessed']`, so both matrices share the training
  vocabulary. Matrix shapes, a random vocabulary sample and the number of
  unigrams, bigrams and trigrams are printed for inspection.

  Args:
    train: DataFrame with 'Sentence_preprocessed' and 'Tag' columns.
    val: DataFrame with the same two columns.

  Returns:
    `(train_features, val_features, y_train, y_val)`, where the labels are the
    'Tag' columns of the respective frames.
  """
  vectorizer = CountVectorizer(ngram_range=(1, 3))
  train_features = vectorizer.fit_transform(train['Sentence_preprocessed'])
  y_train = train['Tag']
  val_features = vectorizer.transform(val['Sentence_preprocessed'])
  y_val = val['Tag']
  print(f"The shape of training data: {train_features.shape}\n")
  print(f"The shape of validation data: {val_features.shape}\n")

  vocab = vectorizer.get_feature_names_out()
  # Sampling without replacement raises ValueError when more items are requested
  # than exist, so cap the sample size for small vocabularies.
  sample_size = min(20, len(vocab))
  random_sample = np.random.choice(vocab, sample_size, replace=False)
  print(f"Printing a sample of the vocabulary: \n{random_sample}\n")

  # Group the vocabulary by n-gram length in one pass (vocabulary order is kept).
  grams_by_size = {1: [], 2: [], 3: []}
  for term in vocab:
    size = len(term.split())
    if size in grams_by_size:
      grams_by_size[size].append(term)
  unigrams, bigrams, trigrams = (grams_by_size[size] for size in (1, 2, 3))

  print(f"\n Unigrams: {len(unigrams)}, Bigrams: {len(bigrams)}, Trigrams: {len(trigrams)}\n")
  print(f"Sample Unigrams: {unigrams[:5]} \n")
  print(f"Sample Bigrams: {bigrams[:5]} \n")
  print(f"Sample Trigrams: {trigrams[:5]} \n")
  print(f"The total count of the grams (features) generated: {len(unigrams) + len(bigrams) + len(trigrams)}")

  return train_features, val_features, y_train,  y_val

## Model training

In [7]:
def train_model(X, y, isclassbalanced = False):
  """Tune and fit a Multinomial Naive Bayes classifier, optionally undersampling first.

  When `isclassbalanced` is False the data are passed through a
  `RandomUnderSampler` (sampling_strategy='auto', random_state=42) and the class
  distribution is printed before and after. The `alpha` smoothing parameter is
  then selected by a 5-fold grid search scored with macro F1.

  Args:
    X: Feature matrix.
    y: Labels aligned with `X`.
    isclassbalanced: Set to True to skip undersampling and train on `X`, `y` as given.

  Returns:
    The best estimator found by the grid search.
  """
  if isclassbalanced:
    features, labels = X, y
  else:
    print(f"Class distribution before resampling: {Counter(y)}")
    sampler = RandomUnderSampler(sampling_strategy='auto', random_state=42)
    features, labels = sampler.fit_resample(X, y)
    print(f"Class distribution after resampling: {Counter(labels)}")

  alpha_grid = {'alpha': [0.1, 0.5, 1.0, 5.0, 10.0]}
  search = GridSearchCV(
    MultinomialNB(),
    alpha_grid,
    cv=5,
    scoring='f1_macro',
    verbose=2,
    n_jobs=-1,
  )
  search.fit(features, labels)
  print(f"Best Parameters for the Naive Bayes model is {search.best_params_}")
  return search.best_estimator_

In [8]:
def test_model(model, X_val, y_val):
  y_pred = model.predict(X_val)
  print(f"Validation Accuracy: {accuracy_score(y_val, y_pred)} \n")
  print(f"Classification Report:\n {classification_report(y_val, y_pred)}")

## Hate Dataset

In [24]:
train, val = load_and_preprocess_data('hate')

Loading and preprocessing the hate data... 

Train shape: (3660, 2), Val shape: (457, 2)

The dataset before preprocessing...

Train head: 
                                             Sentence  Tag
0  #hariyana mey ek week mey teen bachchiyo ke Sa...    1
1  indira Gandhi ko marne wala sikh.Rajiv Gandhi ...    1
2  ishliye corruption ke jariye sab ki khoon choo...    1
3  Pakistaniyon ko aisi news Maamul sa ho Gaya ha...    0
4  Apne national anthem ko change karo and yeh li...    1

Val head: 
                                             Sentence  Tag
0          Sahee bolay ho kal Pakistani ka rape hoga    0
1                    rape. Pyaar rape se kum nhi hai    0
2  ye log mandir me hi q jakar rape krte kaya bi ...    0
3  Delhi-Dehradun train mein 1seat par baithne k ...    0
4  #GadhaAkallesh ne Kairana/Sahibabad/Buxar se H...    1

Class distribution of training data: Counter({0: 2307, 1: 1353})
The dataset after preprocessing...

Train head: 
                                   

In [25]:
X_train, X_val, y_train, y_val = extract_features(train, val)

The shape of training data: (3660, 120493)

The shape of validation data: (457, 120493)

Printing a sample of the vocabulary: 
['pr mery' 'diya chosne ko' 'chutiyaape faila' 'dahej' 'fawad teri baji'
 'emaan ap' 'to rape bina' 'ga hamare chief' 'polity ka' 'andhe bhakt'
 'pe kiya' 'ki daring ct' 'jaane waali baat' 'sena tab kidhar' 'bachayi'
 'hai toh khud' 'she was' 'gupta ko bura' 'palo ma mehsoos'
 'vikas hate nai']


 Unigrams: 12910, Bigrams: 48170, Trigrams: 59413

Sample Unigrams: ['aa', 'aaa', 'aaaj', 'aaaleee', 'aaap'] 

Sample Bigrams: ['aa gae', 'aa gai', 'aa gaya', 'aa gaye', 'aa gayi'] 

Sample Trigrams: ['aa gae hai', 'aa gai ghar', 'aa gaya bharat', 'aa gaye bakwas', 'aa gaye hain'] 

The total count of the grams (features) generated: 120493


In [26]:
Counter(y_train)

Counter({1: 1353, 0: 2307})

In [31]:
# Minority-to-majority class ratio, computed from the labels instead of
# hand-copied counts so it stays correct if the data change.
class_counts = Counter(y_train)
min(class_counts.values()) / max(class_counts.values())

0.5864759427828349

Since the classes are imbalanced, call `train_model` with `isclassbalanced=False` so that the majority class is randomly undersampled before training:

In [12]:
# isclassbalanced=False makes train_model undersample the training data before
# fitting; evaluation uses the validation split exactly as extracted.
model = train_model(X_train, y_train, isclassbalanced=False)
test_model(model, X_val, y_val)

Class distribution before resampling: Counter({0: 2307, 1: 1353})
Class distribution after resampling: Counter({0: 1353, 1: 1353})
Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best Parameters for the Naive Bayes model is {'alpha': 1.0}
Validation Accuracy: 0.5776805251641138 

Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.50      0.62       309
           1       0.41      0.73      0.53       148

    accuracy                           0.58       457
   macro avg       0.60      0.62      0.57       457
weighted avg       0.67      0.58      0.59       457



## Humor Dataset

In [36]:
train, val = load_and_preprocess_data('humor')

Loading and preprocessing the humor data... 

Train shape: (2360, 2), Val shape: (295, 2)

The dataset before preprocessing...

Train head: 
                                             Sentence  Tag
0  Jyotiraditya Scindia is like "Rassi jal gayee ...    1
1                Ishant Sharma ko bahut late utaara.    1
2           .@twinitisha neeche plug nikla hua hai..    0
3  Aaj agar India final me hota to kam se kam New...    0
4  3 stages of life of Mechanical Engineer:\n\n1)...    1

Val head: 
                                             Sentence  Tag
0  .@OfficeOfRG #HappyBdayPM wala hashtag use kar...    0
1  "Main kuch bhi dekh sakta hun bas teri ankhon ...    1
2  No rainbow pic cause waise bhi rangeeley kism ...    0
3  Hamein aur jeene ki khwahish na hoti, agar rum...    1
4  Ladkiyo ke parts pe comment karke gaali khaane...    1

Class distribution of training data: Counter({1: 1407, 0: 953})
The dataset after preprocessing...

Train head: 
                                   

In [37]:
X_train, X_val, y_train, y_val = extract_features(train, val)

The shape of training data: (2360, 54826)

The shape of validation data: (295, 54826)

Printing a sample of the vocabulary: 
['tag kar' 'kaam kar rahe' 'to kabhi cooker' 'aadhar kar do' 'ho jati'
 'se iss' 'par sense' 'khade hota' 'dono saath main' 'kaun sa'
 'wahi chale aate' 'walo chahe jitna' 'gaand lagi hai' 'ki tareef'
 'viagra kaun' 'sabha ki' 'par boner kaayam' 'bacha' 'na kare' 'kuch nami']


 Unigrams: 7160, Bigrams: 22652, Trigrams: 25014

Sample Unigrams: ['aa', 'aaaaaj', 'aaaeee', 'aabaad', 'aabadi'] 

Sample Bigrams: ['aa aa', 'aa aap', 'aa aayegi', 'aa baithenge', 'aa bhi'] 

Sample Trigrams: ['aa aa aa', 'aa aa haath', 'aa aap aapt', 'aa aayegi he', 'aa baithenge batein'] 

The total count of the grams (features) generated: 54826


In [38]:
Counter(y_train)

Counter({1: 1407, 0: 953})

In [39]:
# Minority-to-majority class ratio, computed from the labels instead of
# hand-copied counts so it stays correct if the data change.
class_counts = Counter(y_train)
min(class_counts.values()) / max(class_counts.values())

0.6773276474769012

In [16]:
# isclassbalanced=False makes train_model undersample the training data before
# fitting; evaluation uses the validation split exactly as extracted.
model = train_model(X_train, y_train, isclassbalanced=False)
test_model(model, X_val, y_val)

Class distribution before resampling: Counter({1: 1407, 0: 953})
Class distribution after resampling: Counter({0: 953, 1: 953})
Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best Parameters for the Naive Bayes model is {'alpha': 1.0}
Validation Accuracy: 0.6440677966101694 

Classification Report:
               precision    recall  f1-score   support

           0       0.58      0.43      0.49       119
           1       0.67      0.79      0.73       176

    accuracy                           0.64       295
   macro avg       0.63      0.61      0.61       295
weighted avg       0.63      0.64      0.63       295



## Sarcasm Dataset

In [32]:
train, val = load_and_preprocess_data('sarcasm')


Loading and preprocessing the sarcasm data... 

Train shape: (4200, 2), Val shape: (525, 2)

The dataset before preprocessing...

Train head: 
                                             Sentence  Tag
0  MashaAllah, jo log meri Black display peh chor...    1
1  Sanjeev, Suchitra Sen, and Gulzar.  #SanjeevKu...    0
2  Politics to harami hai par tu sabse bada haram...    0
3  aaj cricket khiladi neelam honge... khair khil...    0
4  #PAKvWXI  Hamay cricket fever mai Etna mast Na...    0

Val head: 
                                             Sentence  Tag
0  Aek tarf pakistan ke dushman pakistan ke khila...    0
1  Lalu ne bihar ki seva karte hue kitne ghotaale...    0
2  Politics me har admi besharam ho jata hai.... ...    0
3    Kudakudhinge dhuvasthamee?  #Maldives #Politics    0
4  Kya ram-rahim naam ke gunde ke bhakto ko manma...    0

Class distribution of training data: Counter({0: 3797, 1: 403})
The dataset after preprocessing...

Train head: 
                                 

In [33]:
X_train, X_val, y_train, y_val = extract_features(train, val)

The shape of training data: (4200, 131081)

The shape of validation data: (525, 131081)

Printing a sample of the vocabulary: 
['politics nanga bhi' 'tu chillati' 'ko desh ka' 'ko pareshaan'
 'trailer tomorrowpictwittercomrjsggufr' 'haye kyoon pighalne'
 'hai jo aap' 'ban ke baad' 'konse jhande' 'ko mila talaq'
 'parterrorist ko support' 'dekh lijiye tripletalaq' 'pehna triple talaq'
 'dudh' 'bas vote' 'paas jaana' 'ki puja' 'aata hai pr' 'aur bhi khelna'
 'ruk gya is']


 Unigrams: 14536, Bigrams: 52114, Trigrams: 64431

Sample Unigrams: ['aa', 'aaa', 'aaaaaaaaaa', 'aaaaaarceeeeeeebeeeeeee', 'aaaj'] 

Sample Bigrams: ['aa aisa', 'aa chuke', 'aa gai', 'aa gaya', 'aa gayasalman'] 

Sample Trigrams: ['aa aisa ho', 'aa chuke hai', 'aa gai excuse', 'aa gai hai', 'aa gaya aaj'] 

The total count of the grams (features) generated: 131081


In [35]:
Counter(y_train) #Highly imbalanced data

Counter({1: 403, 0: 3797})

In [19]:
# The sarcasm training labels are heavily skewed (3797 vs 403), so let
# train_model undersample before fitting; evaluation uses the validation
# split exactly as extracted.
model = train_model(X_train, y_train, isclassbalanced=False)
test_model(model, X_val, y_val)

Class distribution before resampling: Counter({0: 3797, 1: 403})
Class distribution after resampling: Counter({0: 403, 1: 403})
Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best Parameters for the Naive Bayes model is {'alpha': 0.5}
Validation Accuracy: 0.9352380952380952 

Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.95      0.96       474
           1       0.62      0.84      0.72        51

    accuracy                           0.94       525
   macro avg       0.80      0.89      0.84       525
weighted avg       0.95      0.94      0.94       525

