# import files

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np


# dataset Load

In [2]:
# Replace 'your_dataset.xlsx' with the actual file name
file_path = 'Cooking and Makeup Video.xlsx'
df = pd.read_excel(file_path)

# Display the first few rows to verify the data
df.head()


Unnamed: 0,Comment,React,Type
0,খুব সুন্দর হয়েছে আপু,1,Positive
1,আমার অনেক কষ্ট হয় মেকআপ করতে অনেক সুন্দর লাগছ...,0,Positive
2,"আপু মেকআপ খুব সুন্দর হয়েছে, আমার কাছে অনেক ভাল...",2,Positive
3,আপু পেছনের দেয়ালের রং টার জন্য লাইট টা বেশি ফ...,8,Positive
4,মাস-আল্লাহ খুব সুন্দর হয়েছে কিউট আপু,0,Positive


# Preprossing

In [3]:
def text_to_word_list(text):
    text = text.split()
    return text

def replace_strings(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           u"\u00C0-\u017F"          #latin
                           u"\u2000-\u206F"          #generalPunctuations
                               
                           "]+", flags=re.UNICODE)
    english_pattern=re.compile('[a-zA-Z0-9]+', flags=re.I)
    #latin_pattern=re.compile('[A-Za-z\u00C0-\u00D6\u00D8-\u00f6\u00f8-\u00ff\s]*',)
    
    text=emoji_pattern.sub(r'', text)
    text=english_pattern.sub(r'', text)

    return text

def remove_punctuations(my_str):
    # define punctuation
    punctuations = '''````£|¢|Ñ+-*/=EROero৳০১২৩৪৫৬৭৮৯012–34567•89।!()-[]{};:'"“\’,<>./?@#$%^&*_~‘—॥”‰🤣⚽️✌�￰৷￰'''
    
    no_punct = ""
    for char in my_str:
        if char not in punctuations:
            no_punct = no_punct + char

    # display the unpunctuated string
    return no_punct



def joining(text):
    out=' '.join(text)
    return out

def preprocessing(text):
    out=remove_punctuations(replace_strings(text))
    return out

# stop world remouve

In [4]:
# Read stop words from a text file
# Ensure each stop word is on a new line in the text file
file_path = "stopwold.txt"

with open(file_path, 'r', encoding='utf-8') as file:
    stop = [line.strip() for line in file]

# Display the first few stop words to verify
print("First few stop words:")
print(stop[:10])


First few stop words:
['আপু', 'না', 'ভালো', 'খুব', 'অনেক', 'কি', 'আমি', 'আমার', 'সুন্দর', 'করে']


In [5]:
# Define the stopword removal function
def stopword_removal(text):    
    words = str(text).split()  # Split the text into words
    filtered_words = [word for word in words if word not in stop]  # Remove stop words
    return ' '.join(filtered_words)  # Join the remaining words back into a single string

# Apply stop word removal to the 'Comments' column
df['cleanComments'] = df['Comment'].apply(stopword_removal)

# Display the first few rows to verify
df.head()

Unnamed: 0,Comment,React,Type,cleanComments
0,খুব সুন্দর হয়েছে আপু,1,Positive,
1,আমার অনেক কষ্ট হয় মেকআপ করতে অনেক সুন্দর লাগছ...,0,Positive,আপু।
2,"আপু মেকআপ খুব সুন্দর হয়েছে, আমার কাছে অনেক ভাল...",2,Positive,"হয়েছে, লেগেছে,আপনাকে আপু।"
3,আপু পেছনের দেয়ালের রং টার জন্য লাইট টা বেশি ফ...,8,Positive,পেছনের দেয়ালের পড়েছে। সবসময়ের ️️️
4,মাস-আল্লাহ খুব সুন্দর হয়েছে কিউট আপু,0,Positive,মাস-আল্লাহ


# Stemming

In [6]:
# Import necessary libraries
import logging
from bangla_stemmer.stemmer import stemmer

# Suppress logs from the Bangla Stemmer library
logging.getLogger('bangla_stemmer').setLevel(logging.CRITICAL)

# Define the stemming function
def stem_text(text):
    stmr = stemmer.BanglaStemmer()  # Initialize the stemmer
    words = str(text).split()  # Split the text into words
    stemmed_words = stmr.stem(words)  # Perform stemming
    return ' '.join(stemmed_words)  # Join the stemmed words back into a single string

# Apply stemming to the 'Comments' column
df['stemmedComments'] = df['cleanComments'].apply(stem_text)

# Display the first few rows to verify the result
df.head()

applied second rules..
applied fourth rules..
applied first rules..
applied second rules..
applied fourth rules..
applied second rules..
applied fourth rules..
applied fourth rules..
applied second rules..
applied fourth rules..
applied second rules..
applied fourth rules..
applied fourth rules..
applied first rules..
applied first rules..
applied fourth rules..
applied second rules..
applied fourth rules..
applied first rules..
applied fourth rules..
applied first rules..
applied fourth rules..
applied second rules..
applied fourth rules..
applied first rules..
applied second rules..
applied first rules..
applied first rules..
applied fourth rules..
applied first rules..
applied second rules..
applied fourth rules..
applied first rules..
applied first rules..
applied first rules..
applied first rules..
applied first rules..
applied fourth rules..
applied second rules..
applied fourth rules..
applied first rules..
applied second rules..
applied fourth rules..
applied fourth rules..
app

Unnamed: 0,Comment,React,Type,cleanComments,stemmedComments
0,খুব সুন্দর হয়েছে আপু,1,Positive,,
1,আমার অনেক কষ্ট হয় মেকআপ করতে অনেক সুন্দর লাগছ...,0,Positive,আপু।,আপু।
2,"আপু মেকআপ খুব সুন্দর হয়েছে, আমার কাছে অনেক ভাল...",2,Positive,"হয়েছে, লেগেছে,আপনাকে আপু।","হয়েছে, লেগেছে,আপনা আপু।"
3,আপু পেছনের দেয়ালের রং টার জন্য লাইট টা বেশি ফ...,8,Positive,পেছনের দেয়ালের পড়েছে। সবসময়ের ️️️,পেছন দেয়াল পড়েছে। সবসময় ️️️
4,মাস-আল্লাহ খুব সুন্দর হয়েছে কিউট আপু,0,Positive,মাস-আল্লাহ,মাস-আল্লাহ


# Lemmatization

In [7]:
# Import necessary modules
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import nltk

# Download WordNet data (if not already downloaded)
nltk.download('wordnet')
nltk.download('omw-1.4')  # Download multilingual WordNet support (optional)

# Initialize the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

# Define the lemmatization function
def lemmatize_text_wordnet(text):
    # Split the text into words, lemmatize each word, and rejoin
    words = str(text).split()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(lemmatized_words)

# Apply lemmatization to the stemmed comments
df['lemmatizedComments'] = df['stemmedComments'].apply(lemmatize_text_wordnet)

# Display the first few rows to verify the result
df.head()



[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Unnamed: 0,Comment,React,Type,cleanComments,stemmedComments,lemmatizedComments
0,খুব সুন্দর হয়েছে আপু,1,Positive,,,
1,আমার অনেক কষ্ট হয় মেকআপ করতে অনেক সুন্দর লাগছ...,0,Positive,আপু।,আপু।,আপু।
2,"আপু মেকআপ খুব সুন্দর হয়েছে, আমার কাছে অনেক ভাল...",2,Positive,"হয়েছে, লেগেছে,আপনাকে আপু।","হয়েছে, লেগেছে,আপনা আপু।","হয়েছে, লেগেছে,আপনা আপু।"
3,আপু পেছনের দেয়ালের রং টার জন্য লাইট টা বেশি ফ...,8,Positive,পেছনের দেয়ালের পড়েছে। সবসময়ের ️️️,পেছন দেয়াল পড়েছে। সবসময় ️️️,পেছন দেয়াল পড়েছে। সবসময় ️️️
4,মাস-আল্লাহ খুব সুন্দর হয়েছে কিউট আপু,0,Positive,মাস-আল্লাহ,মাস-আল্লাহ,মাস-আল্লাহ


# Changing Labels to Numbers

In [8]:
# Find unique labels in the 'Prediction' column
unique_labels = df['Type'].unique()
print("Unique labels in 'Prediction' column:")
print(unique_labels)
# Create a mapping of labels to numbers
label_to_number = {label: idx for idx, label in enumerate(unique_labels)}

# Print the mapping
print("\nMapping of labels to numbers:")
print(label_to_number)

# Replace labels in the 'Prediction' column with corresponding numbers
df['PredictionNumeric'] = df['Type'].map(label_to_number)
df.head()

Unique labels in 'Prediction' column:
['Positive' 'Neutral' 'Negative' 'neutral']

Mapping of labels to numbers:
{'Positive': 0, 'Neutral': 1, 'Negative': 2, 'neutral': 3}


Unnamed: 0,Comment,React,Type,cleanComments,stemmedComments,lemmatizedComments,PredictionNumeric
0,খুব সুন্দর হয়েছে আপু,1,Positive,,,,0
1,আমার অনেক কষ্ট হয় মেকআপ করতে অনেক সুন্দর লাগছ...,0,Positive,আপু।,আপু।,আপু।,0
2,"আপু মেকআপ খুব সুন্দর হয়েছে, আমার কাছে অনেক ভাল...",2,Positive,"হয়েছে, লেগেছে,আপনাকে আপু।","হয়েছে, লেগেছে,আপনা আপু।","হয়েছে, লেগেছে,আপনা আপু।",0
3,আপু পেছনের দেয়ালের রং টার জন্য লাইট টা বেশি ফ...,8,Positive,পেছনের দেয়ালের পড়েছে। সবসময়ের ️️️,পেছন দেয়াল পড়েছে। সবসময় ️️️,পেছন দেয়াল পড়েছে। সবসময় ️️️,0
4,মাস-আল্লাহ খুব সুন্দর হয়েছে কিউট আপু,0,Positive,মাস-আল্লাহ,মাস-আল্লাহ,মাস-আল্লাহ,0


# Removing Null values

In [9]:
# Display the count of non-null values before dropping nulls
print("Non-null values before dropping:")
print(df.info())

# Drop rows with null values in the 'Comments' or 'Prediction' columns
df = df.dropna(subset=['lemmatizedComments', 'Type'])

# Display the count of non-null values after dropping nulls
print("\nNon-null values after dropping:")
print(df.info())

# Display the first few rows to verify
df.head()

Non-null values before dropping:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Comment             29995 non-null  object
 1   React               29980 non-null  object
 2   Type                30000 non-null  object
 3   cleanComments       30000 non-null  object
 4   stemmedComments     30000 non-null  object
 5   lemmatizedComments  30000 non-null  object
 6   PredictionNumeric   30000 non-null  int64 
dtypes: int64(1), object(6)
memory usage: 1.6+ MB
None

Non-null values after dropping:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Comment             29995 non-null  object
 1   React               29980 non-null  object
 2   Type                30000 non-null

Unnamed: 0,Comment,React,Type,cleanComments,stemmedComments,lemmatizedComments,PredictionNumeric
0,খুব সুন্দর হয়েছে আপু,1,Positive,,,,0
1,আমার অনেক কষ্ট হয় মেকআপ করতে অনেক সুন্দর লাগছ...,0,Positive,আপু।,আপু।,আপু।,0
2,"আপু মেকআপ খুব সুন্দর হয়েছে, আমার কাছে অনেক ভাল...",2,Positive,"হয়েছে, লেগেছে,আপনাকে আপু।","হয়েছে, লেগেছে,আপনা আপু।","হয়েছে, লেগেছে,আপনা আপু।",0
3,আপু পেছনের দেয়ালের রং টার জন্য লাইট টা বেশি ফ...,8,Positive,পেছনের দেয়ালের পড়েছে। সবসময়ের ️️️,পেছন দেয়াল পড়েছে। সবসময় ️️️,পেছন দেয়াল পড়েছে। সবসময় ️️️,0
4,মাস-আল্লাহ খুব সুন্দর হয়েছে কিউট আপু,0,Positive,মাস-আল্লাহ,মাস-আল্লাহ,মাস-আল্লাহ,0


# Feature Extraction with TFIDF

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Define a TF-IDF transformer
max_features_values = [10000]  # Different max_features configurations
tfidf_transformers = {features: TfidfVectorizer(ngram_range=(1, 3), 
                                                lowercase=True, 
                                                max_features=features) 
                      for features in max_features_values}

# Apply TF-IDF transformation for each configuration
tfidf_results = {}

for features, transformer in tfidf_transformers.items():
    print(f"\nApplying TF-IDF with max_features={features}...")
    # Fit-transform on the entire dataset
    tfidf_matrix = transformer.fit_transform(df['lemmatizedComments'].values)
    tfidf_results[features] = tfidf_matrix  # Store the result
    
    # Print a summary
    print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")

# Access one of the TF-IDF matrices
# Example: For max_features=10000
tfidf_10000 = tfidf_results[10000]

# Display the TF-IDF feature names (optional, only for smaller feature sizes)
print("\nSample TF-IDF feature names (for max_features=10000):")
print(tfidf_transformers[10000].get_feature_names_out()[:10])


Applying TF-IDF with max_features=10000...
TF-IDF matrix shape: (30000, 10000)

Sample TF-IDF feature names (for max_features=10000):
['00' '00 ১ঘন' '01' '01868158055' '04' '05' '05 0_x000d_'
 '05 0_x000d_ 1_x000d_' '0_x000d_' '0_x000d_ 0_x000d_']


# DATASET Splitting (80% for train, 20% to test)

In [11]:
from sklearn.model_selection import train_test_split

# Split the dataset into 80% training and 20% testing
X = df['lemmatizedComments']  # Features: Comments
y = df['PredictionNumeric']  # Target: Numeric Prediction

# Perform the split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Print the shapes of the splits to verify
print(f"Training set size: {X_train.shape[0]}")
print(f"Testing set size: {X_test.shape[0]}")

Training set size: 24000
Testing set size: 6000


# Applying LPBoost Algorithm

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

# Split the dataset into training and testing sets
X = df['lemmatizedComments']
y = df['PredictionNumeric']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Apply TF-IDF and train LPBoost in one step
tfidf_transformer = TfidfVectorizer(ngram_range=(1, 3), lowercase=True, max_features=10000)
X_train_tfidf = tfidf_transformer.fit_transform(X_train)
X_test_tfidf = tfidf_transformer.transform(X_test)

# Convert TF-IDF sparse matrix to dense format
X_train_dense = X_train_tfidf.toarray()
X_test_dense = X_test_tfidf.toarray()

# LPBoost-like approach using AdaBoost with a DecisionTreeClassifier as the base estimator
weak_learner = DecisionTreeClassifier(max_depth=1)  # Weak learner for boosting
lpboost_model = AdaBoostClassifier(
    estimator=weak_learner,  # Updated parameter
    n_estimators=100,  # Number of boosting rounds
    random_state=42
)
lpboost_model.fit(X_train_dense, y_train)

# Predict and evaluate
y_pred = lpboost_model.predict(X_test_dense)
print("Accuracy:", accuracy_score(y_test, y_pred))


