## NLP

In [1]:
#import necessary libraries 
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt

In [2]:
#import data
data = pd.read_csv('twitter_training.csv')
data.head()

Unnamed: 0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,"
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


### ABOUT DATASET
#### The twitter sentiment dataset from kaggle is a data containing posts of people's review over a video game  "Borderlands" 
Each row in the dataset typically includes:

ID: A unique identifier for the tweet.

Topic: The subject of the tweet (e.g., "Borderlands").

Sentiment: The sentiment expressed in the tweet (e.g., Positive, Negative, Neutral).

Tweet: The text content of the tweet.

In [3]:
data.shape

(74681, 4)

In [4]:
data[data['Positive'] == 'Negative'].head()

Unnamed: 0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,"
23,2405,Borderlands,Negative,the biggest dissappoinment in my life came out...
24,2405,Borderlands,Negative,The biggest disappointment of my life came a y...
25,2405,Borderlands,Negative,The biggest disappointment of my life came a y...
26,2405,Borderlands,Negative,the biggest dissappoinment in my life coming o...
27,2405,Borderlands,Negative,For the biggest male dissappoinment in my life...


In [5]:
#rename column 
data.columns = ['UserID', 'Borderlands', 'Sentiment',
       'posts']
data.head(2)

Unnamed: 0,UserID,Borderlands,Sentiment,posts
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...


## Creating a traditional NLP, using the above imported dataset

In [6]:
#check for missing values
data.isna().sum()

UserID           0
Borderlands      0
Sentiment        0
posts          686
dtype: int64

### first we preprocess the text data

In [7]:
#!pip install nlk     #installing package

In [8]:
from nltk.tokenize import word_tokenize, sent_tokenize    #importing our tokenization packages from NLTK

#### A little demo to show how NLTK punkt tokenizes data before we proceed with our actual data

In [9]:
import nltk
#nltk.download('punkt')               # Download tokenizer models
text = "baby, how are you doing today? hope you are good"
text1 = "baby how are you doing today hope you are good"
word_token_text, sent_token_text = word_tokenize(text), sent_tokenize(text)
word_token_text1, sent_token_text1 = word_tokenize(text1), sent_tokenize(text1)
word_token_text, word_token_text1 

(['baby',
  ',',
  'how',
  'are',
  'you',
  'doing',
  'today',
  '?',
  'hope',
  'you',
  'are',
  'good'],
 ['baby', 'how', 'are', 'you', 'doing', 'today', 'hope', 'you', 'are', 'good'])

In [10]:
sent_token_text, sent_token_text1

(['baby, how are you doing today?', 'hope you are good'],
 ['baby how are you doing today hope you are good'])

### Now we proceed to tokenize our actual data

In [11]:
data['posts'].dtype

dtype('O')

In [12]:
#change data type in case the data is not of type string or object
data['posts'].astype('str')


0        I am coming to the borders and I will kill you...
1        im getting on borderlands and i will kill you ...
2        im coming on borderlands and i will murder you...
3        im getting on borderlands 2 and i will murder ...
4        im getting into borderlands and i can murder y...
                               ...                        
74676    Just realized that the Windows partition of my...
74677    Just realized that my Mac window partition is ...
74678    Just realized the windows partition of my Mac ...
74679    Just realized between the windows partition of...
74680    Just like the windows partition of my Mac is l...
Name: posts, Length: 74681, dtype: object

In [13]:
data1 = data.copy()

In [14]:
data1

Unnamed: 0,UserID,Borderlands,Sentiment,posts
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...
...,...,...,...,...
74676,9200,Nvidia,Positive,Just realized that the Windows partition of my...
74677,9200,Nvidia,Positive,Just realized that my Mac window partition is ...
74678,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...
74679,9200,Nvidia,Positive,Just realized between the windows partition of...


In [15]:
data['posts'].isna().sum()

686

In [16]:
data['Borderlands'].nunique()

32

In [17]:
data1 = data.dropna()

In [18]:
data1['word_token'] = data1['posts'].apply(lambda x: word_tokenize(x))   #applying tokenizer i.e to split each word in text
data1[['word_token', 'posts']].head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data1['word_token'] = data1['posts'].apply(lambda x: word_tokenize(x))   #applying tokenizer i.e to split each word in text


Unnamed: 0,word_token,posts
0,"[I, am, coming, to, the, borders, and, I, will...",I am coming to the borders and I will kill you...
1,"[im, getting, on, borderlands, and, i, will, k...",im getting on borderlands and i will kill you ...
2,"[im, coming, on, borderlands, and, i, will, mu...",im coming on borderlands and i will murder you...
3,"[im, getting, on, borderlands, 2, and, i, will...",im getting on borderlands 2 and i will murder ...
4,"[im, getting, into, borderlands, and, i, can, ...",im getting into borderlands and i can murder y...


In [19]:
data1.head()

Unnamed: 0,UserID,Borderlands,Sentiment,posts,word_token
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...,"[I, am, coming, to, the, borders, and, I, will..."
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...,"[im, getting, on, borderlands, and, i, will, k..."
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...,"[im, coming, on, borderlands, and, i, will, mu..."
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...,"[im, getting, on, borderlands, 2, and, i, will..."
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...,"[im, getting, into, borderlands, and, i, can, ..."


In [20]:
 data1['posts'].info()

<class 'pandas.core.series.Series'>
Index: 73995 entries, 0 to 74680
Series name: posts
Non-Null Count  Dtype 
--------------  ----- 
73995 non-null  object
dtypes: object(1)
memory usage: 1.1+ MB


### After tokenizing text, we proceed to remove stop words

In [21]:
#import library for stop word removal
from nltk.corpus import stopwords
#nltk.download('stopwords')    #download stopwords dictionary

In [22]:
stop_words = set(stopwords.words('english'))    #load english stopwords
data1['filtered_tokens'] = data1['word_token'].apply(lambda tokens: [word for word in tokens if word not in stop_words])
data1[['posts', 'filtered_tokens']].head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data1['filtered_tokens'] = data1['word_token'].apply(lambda tokens: [word for word in tokens if word not in stop_words])


Unnamed: 0,posts,filtered_tokens
0,I am coming to the borders and I will kill you...,"[I, coming, borders, I, kill, ,]"
1,im getting on borderlands and i will kill you ...,"[im, getting, borderlands, kill, ,]"
2,im coming on borderlands and i will murder you...,"[im, coming, borderlands, murder, ,]"
3,im getting on borderlands 2 and i will murder ...,"[im, getting, borderlands, 2, murder, ,]"
4,im getting into borderlands and i can murder y...,"[im, getting, borderlands, murder, ,]"


### Haven removed stop words, we proceed to stemmatize the filtered(stop_words removed) tokens and return the data to its text form rather than the list it currently exist as

In [23]:
#import stemmatization library
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()


# Function to apply stemming and convert back to text
def stem_text(text):
    tokens = word_tokenize(text)  # Tokenize the text
    stemmed_tokens = [stemmer.stem(word) for word in tokens]  # Apply stemming
    return " ".join(stemmed_tokens)  # Convert back to text


# Apply the function to the 'text' column
data1['stemmed_text'] = data1['posts'].apply(stem_text)

data1[['posts', 'stemmed_text']].head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data1['stemmed_text'] = data1['posts'].apply(stem_text)


Unnamed: 0,posts,stemmed_text
0,I am coming to the borders and I will kill you...,"i am come to the border and i will kill you all ,"
1,im getting on borderlands and i will kill you ...,"im get on borderland and i will kill you all ,"
2,im coming on borderlands and i will murder you...,"im come on borderland and i will murder you all ,"
3,im getting on borderlands 2 and i will murder ...,im get on borderland 2 and i will murder you m...
4,im getting into borderlands and i can murder y...,"im get into borderland and i can murder you all ,"


### Next to do is tag each word in text based on their individual part of speech (POS TAGGING)

In [24]:
#import library for POS tagging
# nltk.download("averaged_perceptron_tagger")


In [25]:
# data1['POS_TAG'] = data1['stemmed_token'].apply(lambda a: nltk.pos_tag(a))     #apply POS tagging to each word 
# data1[['POS_TAG', 'stemmed_token']]

In [26]:
# data[['POS_TAG', 'posts']].head()

### Now we have to convert the stemmed tokenized data back to text, else the data remains as a list of data with each text as a separate entity

### VECTORIZE TEXT DATA 
#### we'd use TF-IDF since we are trying to analysis sentiment
TF-IDF weighs words based on importance, reducing the impact of frequent words like "the", "is", etc.

In [27]:
#import text vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=5000)  #max_features=5000 → Keep only the most important 5,000 words.
x = data1['stemmed_text']
feature_matrix = vectorizer.fit_transform(x)               # Sparse format (efficient!)

# Print TF-IDF matrix
print("Feature names:", vectorizer.get_feature_names_out())
# print(feature_matrix.toarray())  
'''
👆this "feature_matrix.toarray()" converts the sparse matrix to a dense array of 74681 × 25441 = 1.9 billion values, even if most are zeros.
Thereby overwhelming the system and its memory.

to handle this we avoid using ".toarray()", we Keep It as a Sparse Matrix bt leaving it like that. i.e after "vectorizer.fit_transform(data)"

However, If you want to view the data in a DataFrame but avoid .toarray(), use pd.DataFrame.sparse.from_spmatrix():
'''

matrix = feature_matrix  # Check dimensions
matrix

Feature names: ['00' '000' '00016' ... 'zuckerberg' 'безопасно' 'яй']


<73995x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 1121295 stored elements in Compressed Sparse Row format>

In [28]:
matrix.shape

(73995, 5000)

In [29]:
pd.options.display.max_columns = 100
feature_matrix[:1][0].toarray()

array([[0., 0., 0., ..., 0., 0., 0.]])

### Now we proceed to build our model but first we have to assign our x and y values and split into train-test

In [30]:
#assigning x and y
x1, y = matrix, data1['Sentiment']

In [31]:
x1

<73995x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 1121295 stored elements in Compressed Sparse Row format>

In [32]:
x1[:1],y[:1]

(<1x5000 sparse matrix of type '<class 'numpy.float64'>'
 	with 10 stored elements in Compressed Sparse Row format>,
 0    Positive
 Name: Sentiment, dtype: object)

In [33]:
#split data into trian test
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x1,y, random_state = 69, train_size = 0.7)

## Import model (naive bayes)

In [34]:
from sklearn.naive_bayes import MultinomialNB


In [35]:
x_train.shape, y_train.shape

((51796, 5000), (51796,))

In [36]:
x_train

<51796x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 783370 stored elements in Compressed Sparse Row format>

In [37]:
NBmodel = MultinomialNB()
NBmodel.fit(x_train, y_train)

In [38]:
x_valid, x_test, y_valid, y_test = train_test_split(x_test, y_test, train_size = 0.5, random_state =49)

In [39]:
NBmodel.score(x_valid, y_valid)

0.6459140463104784

In [40]:
NBmodel.score(x_train, y_train)

0.6710170669549772

#### Using a simpler model like the logistic regression

In [41]:
#try using logistic regression
from sklearn.linear_model import LogisticRegression
LRmodel= LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=100)
LRmodel.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [42]:
LRmodel.score(x_train, y_train), LRmodel.score(x_valid, y_valid)

(0.7400185342497491, 0.6896116767276331)

#### clearly our model has failed to capture underlying patterns thereby leading to low score for both train and test,
so we trace back to our feature engineering to reprocess our data.
 The issue might be with how text is converted into numerical features. Try the following:

✅ Increase n-grams: Instead of just single words (unigrams), use bigrams or trigrams in your TF-IDF vectorizer.

✅ Try Word Embeddings: Instead of just TF-IDF, try Word2Vec, GloVe, or FastText to capture semantic meaning.

✅ Use Stopword Removal: Removing stopwords helps reduce noise in the data.

In [43]:
#first we change the n-gram TF-IDF parameteer to help the model capture contextual meaning as unigram is the default value and only captures word importance(not context).
vectorizer1 = TfidfVectorizer()  

feature_matrix1 = vectorizer1.fit_transform(x)               # Sparse format (efficient!)

# Print TF-IDF matrix
print("Feature names:", vectorizer1.get_feature_names_out())

matrix1 = feature_matrix1  # Check dimensions
matrix1

Feature names: ['00' '000' '00011' ... 'การออกอากาศของฉ' 'นจาก' 'ℐℓ٥']


<73995x25440 sparse matrix of type '<class 'numpy.float64'>'
	with 1207950 stored elements in Compressed Sparse Row format>

In [44]:
x2 = feature_matrix1

### split the newly generated data

In [45]:
#split data into train, test and validation
x_train1, x_test1, y_train1, y_test1 = train_test_split(x2,y, train_size  = 0.7, random_state =65)
x_test1, x_valid1, y_test1, y_valid1 = train_test_split(x_test1, y_test1)

In [46]:
#check for data imbalance
pd.options.display.max_rows = 53850          #neutral = 18318, negative = 22542, irrelevant = 12990, positive = 20831
data[data['Sentiment']== 'Neutral'].count()

UserID         18318
Borderlands    18318
Sentiment      18318
posts          18108
dtype: int64

In [47]:
LRmodel1 = LogisticRegression(multi_class='multinomial', class_weight = 'balanced', solver = 'lbfgs', penalty = 'l2', max_iter=100)
#the class_weight as balance is used to balance the classes in the data, thereby handling the imbalance that exist
#penalty is used to handle the high dimensionality of the data. Instead of going through the long part of using PCA to reduce data dimensionality
LRmodel.fit(x_train1, y_train1)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [48]:
LRmodel.score(x_train1, y_train1),LRmodel.score(x_valid1, y_valid1)

(0.8172638813808016, 0.7484684684684685)

### Instead of TF-IDF,we will use Word2Vec, GloVe or FastText embeddings with traditional models like SVM, LR as the TF-IDF was unable to observe underlying pattern in the data

In [49]:
#!pip install gensim    #import model for word embeddings

In [50]:
import gensim
from gensim.models import Word2Vec

##### Note: word2Vec takes tokenized text and not stemmed text, so we parse the tokenized text to it

In [51]:
# Train Word2Vec model
word2vec_model = Word2Vec(sentences = data1['filtered_tokens'], vector_size = 100, window=3, min_count=1, workers=4)
'''
EXPLAINING CODE:
"vector_size=100" Defines the size (dimensions) of word embeddings. It sets the number of dimensions for each word vector.
Higher values capture more semantic information but require more training data.
Lower values train faster but may miss details.

"window=5" Defines the context window size. It controls how many words before and after the target word are considered as context.
A larger window (e.g., window=10) captures more context but may introduce noise.
A smaller window (e.g., window=2) focuses on local dependencies (like phrases).



"min_count=1" Defines the minimum number of occurrences a word must have to be included in training
min_count=1 → Include all words (even rare ones).
min_count=5 → Ignore words that appear less than 5 times
why use "min_count" Higher values (e.g., 5, 10): Remove rare words to speed up training and improve generalization.
Lower values (e.g., 1, 2): Keep all words, which may be useful for small datasets.

"workers=4" Defines the number of CPU threads used for training
More workers = Faster training (if your CPU has multiple cores).
Recommended value = Number of CPU cores available.
e.g If you have a quad-core CPU, setting workers=4 uses all 4 cores for training.
If you set workers=1, it will train much slower.
Common values for "workers"  :  workers=1 (single-core, slow)
                                workers=4 (quad-core, recommended)
                                workers=8+ (high-performance machines)

'''





'\nEXPLAINING CODE:\n"vector_size=100" Defines the size (dimensions) of word embeddings. It sets the number of dimensions for each word vector.\nHigher values capture more semantic information but require more training data.\nLower values train faster but may miss details.\n\n"window=5" Defines the context window size. It controls how many words before and after the target word are considered as context.\nA larger window (e.g., window=10) captures more context but may introduce noise.\nA smaller window (e.g., window=2) focuses on local dependencies (like phrases).\n\n\n\n"min_count=1" Defines the minimum number of occurrences a word must have to be included in training\nmin_count=1 → Include all words (even rare ones).\nmin_count=5 → Ignore words that appear less than 5 times\nwhy use "min_count" Higher values (e.g., 5, 10): Remove rare words to speed up training and improve generalization.\nLower values (e.g., 1, 2): Keep all words, which may be useful for small datasets.\n\n"workers=

In [52]:
# Function to convert a sentence to a vector
def sentence_to_vector(tokens, model):
    word_vectors = [model.wv[word] for word in tokens if word in model.wv]
    if len(word_vectors) == 0:
        return np.zeros(100)  # Return zero vector if no words in vocab
    return np.mean(word_vectors, axis=0)  # Average of word vectors

# Convert all sentences to vectors
data1["vector"] = data1["filtered_tokens"].apply(lambda xx: sentence_to_vector(xx, word2vec_model))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data1["vector"] = data1["filtered_tokens"].apply(lambda xx: sentence_to_vector(xx, word2vec_model))


In [53]:
data1.head(2)

Unnamed: 0,UserID,Borderlands,Sentiment,posts,word_token,filtered_tokens,stemmed_text,vector
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...,"[I, am, coming, to, the, borders, and, I, will...","[I, coming, borders, I, kill, ,]","i am come to the border and i will kill you all ,","[0.60630983, 1.0543424, -0.8493102, -0.7768115..."
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...,"[im, getting, on, borderlands, and, i, will, k...","[im, getting, borderlands, kill, ,]","im get on borderland and i will kill you all ,","[1.5475639, 0.74700814, -0.86170673, -0.891654..."


In [54]:
# Prepare Data
xvec = np.vstack(data1["vector"])  #Feature matrix


# Split into Train & Test
x_train3, x_test3, y_train3, y_test3 = train_test_split(xvec, y, test_size=0.3, random_state=42)

In [55]:
#haven proceed our data we fit model 
LRmodel1 = LogisticRegression(multi_class='multinomial', class_weight = 'balanced', solver = 'lbfgs', penalty = 'l2', max_iter=100)
LRmodel1.fit(x_train3, y_train3)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [56]:
LRmodel1.score(x_train3, y_train3),LRmodel1.score(x_test3, y_test3)

(0.4928372847324118, 0.4903824496598946)

## Our traditional NLP functioned better when we used the nltk text features than when we usedd the word2vec

In [57]:
#tokenize data
import nltk   #download tokenize package
# nltk.download('punkt')
# data[['word_token']] = data['im getting on borderlands and i will murder you all ,'].apply(word_tokenize)
# data[['sent_token']] = data['im getting on borderlands and i will murder you all ,'].apply(sent_tokenize)
# print('word-tokenize',word_token )

In [58]:
# !pip show nltk

In [59]:
# !pip show spacy

In [60]:
# import spacy
# nlp = spacy.load("en_core_web_sm")


# text = "Hello world! Tokenizing text data is fun."
# tokens = [token.text for token in nlp(text)]


# print(tokens)
