<h1>Text Classification using Python (e-Participation 2.1)</h1>


<h1><b>INSTALLING PACKAGES</b></h1>

In [11]:
pip install pandas



In [12]:
pip install seaborn



In [13]:
pip install sklearn

Collecting sklearn
  Using cached sklearn-0.0.post10.tar.gz (3.6 kB)
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Preparing metadata (setup.py) ... [?25l[?25herror
[1;31merror[0m: [1mmetadata-generation-failed[0m

[31m×[0m Encountered error while generating package metadata.
[31m╰─>[0m See above for output.

[1;35mnote[0m: This is an issue with the package mentioned above, not pip.
[1;36mhint[0m: See above for details.


In [14]:
pip install scikit-learn



In [15]:
pip install gensim



<h1><b>IMPORTING LIBRARIES</b></h1>

In [None]:
#CELL No. 1

import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

#for text pre-processing
import re, string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

nltk.download('punkt') #divides a text into list of sentences
nltk.download('averaged_perceptron_tagger') #POS tagger
nltk.download('wordnet')

#for model-building
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB

#performance metrics
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
#from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm

# bag of words
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

#for word embedding
import gensim
from gensim.models import Word2Vec #Word2Vec is mostly used for huge datasets




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


<h1><b>LOADING AND EXPLORING THE DATASET</b></h1>

In [None]:
#CELL NO. 2

# Import social media dataset and load to a dataframe
# The CSV is looked up relative to the current working directory.
# Later cells read the 'text' and 'label' columns of this dataframe.

df_uaqte=pd.read_csv('uaqte_balanced_dataset.csv')
print(df_uaqte.shape)  # (number of rows, number of columns)
df_uaqte.head(10)  # preview of the first 10 rows


FileNotFoundError: ignored

In [None]:
#CELL NO. 3
# CLASS DISTRIBUTION – check if dataset is balanced or not

# Labels:
# 0 - negative
# 1 - positive
# 2 - neutral

# Number of rows per distinct value of the 'label' column
x=df_uaqte['label'].value_counts()
print(x)
# Bar chart of those counts: label values on the x-axis, row counts on the y-axis
sns.barplot(x=x.index, y=x)



In [None]:
#CELL NO. 4
# Per-class averages of three simple length statistics computed on the raw 'text' column.

# (display name, label value), in the order the results are printed
_SENTIMENTS = (('Positive', 1), ('Negative', 0), ('Neutral', 2))


def _print_mean_per_label(column):
    """Print the mean of `column` in df_uaqte separately for each sentiment label."""
    for sentiment, label_value in _SENTIMENTS:
        rows = df_uaqte['label'] == label_value
        print('\t%s Comment/Text: ' % sentiment, df_uaqte.loc[rows, column].mean())


#1. WORD-COUNT: number of whitespace-separated tokens
print('Word Count:')
df_uaqte['word_count'] = df_uaqte['text'].apply(lambda raw: len(str(raw).split()))
_print_mean_per_label('word_count')

#2. CHARACTER-COUNT: length of the text as a string
print('\nCharacter Count:')
df_uaqte['char_count'] = df_uaqte['text'].apply(lambda raw: len(str(raw)))
_print_mean_per_label('char_count')

#3. UNIQUE WORD-COUNT: number of distinct whitespace-separated tokens
print('\nUnique Word Count:')
df_uaqte['unique_word_count'] = df_uaqte['text'].apply(lambda raw: len(set(str(raw).split())))
_print_mean_per_label('unique_word_count')


In [None]:
#CELL NO. 5
#Plotting word-count per label/category

# One histogram per sentiment class, side by side in a single figure, so that
# every panel carries the title of the class it actually shows.
# Each entry is (label value, panel title, bar colour).
panels = [
    (1, 'Positive', 'red'),
    (0, 'Negative', 'blue'),
    (2, 'Neutral', 'green'),
]

fig, axes = plt.subplots(1, len(panels), figsize=(15, 4))
for ax, (label_value, title, colour) in zip(axes, panels):
    train_words = df_uaqte[df_uaqte['label'] == label_value]['word_count']
    ax.hist(train_words, color=colour)
    ax.set_title(title)
    ax.set_xlabel('Word count')
    ax.set_ylabel('Number of texts')
fig.suptitle('Words per text')
plt.show()


<h1><b>PRE-PROCESSING</b></h1>
<br>
The next cells demonstrate how to preprocess the dataset by removing punctuation and special characters, cleaning the text, removing stop words, and applying lemmatization.

<h5>1. Simple Text Cleaning</h5>

In [None]:
#CELL NO. 6

#1. Common text preprocessing
text = "   This is a message to be cleaned. It may involve some things like: , ?, :, ''  adjacent spaces and tabs     .  "

#convert to lowercase and remove punctuations and characters and then strip
def preprocess(text):
    """Normalise a raw comment into lowercase words separated by single spaces.

    Steps, in order: lowercase and trim; drop HTML-like tags (``<...>``);
    replace ASCII punctuation with spaces; delete any remaining character that
    is neither a word character nor whitespace (e.g. curly quotes); replace
    digits with spaces; collapse whitespace runs and trim.

    Parameters
    ----------
    text : str
        Raw text. ``None`` and float NaN (the missing-value marker pandas uses
        for empty cells) are treated as an empty string; any other non-string
        value is converted with ``str()``.

    Returns
    -------
    str
        The cleaned text, without leading or trailing whitespace.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text).lower().strip()
    text = re.sub(r'<.*?>', '', text)  # remove HTML tags/markup
    # Replace punctuation with a space so that "word,word" does not become one token.
    # Careful: punctuation can sometimes be useful.
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    text = re.sub(r'[^\w\s]', '', text)  # remaining symbols (non-ASCII punctuation etc.) are deleted
    text = re.sub(r'\d', ' ', text)  # digits carry no sentiment here
    # Digits and punctuation were turned into spaces above, so collapse and trim last.
    return re.sub(r'\s+', ' ', text).strip()

text=preprocess(text)
print(text)  #text is a string

In [None]:
#CELL NO.7

import nltk
# Corpus read by stopwords.words(...) below
nltk.download('stopwords')
# NOTE(review): 'omw-1.4' is presumably required by the WordNet lemmatizer used in a later cell — confirm
nltk.download('omw-1.4')
# Get the list of stopwords for a specific language (e.g., English)
stopwords_list = stopwords.words('english')

# Print the list of stopwords
print(stopwords_list)


In [None]:
#CELL NO.8

# Define a list of common Tagalog stopwords
tagalog_stopwords = [
    'ako', 'alin', 'am', 'amin', 'aming', 'ang', 'ano', 'anumang', 'apat', 'at',
    'atin', 'ating', 'ay', 'bababa', 'bago', 'bakit', 'bawat', 'bilang', 'dahil',
    'dalawa', 'dapat', 'din', 'dito', 'doon', 'gagawin', 'gayunman', 'ginagawa',
    'ginawa', 'ginawang', 'gumawa', 'gusto', 'habang', 'hanggang', 'hindi', 'huwag',
    'iba', 'ibaba', 'ibabaw', 'ibig', 'ikaw', 'ilagay', 'ilalim', 'ilan', 'inyong',
    'isa', 'isang', 'itaas', 'ito', 'iyo', 'iyon', 'iyong', 'ka', 'kahit', 'kailangan',
    'kailanman', 'kami', 'kanila', 'kanilang', 'kanino', 'kanya', 'kanyang', 'kapag',
    'kapwa', 'karamihan', 'katiyakan', 'katulad', 'kaya', 'kaysa', 'ko', 'kong', 'kulang',
    'kumuha', 'kung', 'laban', 'lahat', 'lamang', 'likod', 'lima', 'maaari', 'maaaring',
    'maging', 'mahusay', 'makita', 'marami', 'marapat', 'mga', 'minsan', 'mismo', 'mula',
    'muli', 'na', 'nabanggit', 'naging', 'nagkaroon', 'nais', 'nakita', 'namin', 'napaka',
    'narito', 'nasaan', 'ng', 'nga', 'ngayon', 'ni', 'nila', 'nilang', 'nito', 'niya',
    'niyang', 'noon', 'o', 'pa', 'paano', 'pababa', 'paggawa', 'pagitan', 'pagkakaroon',
    'pagkatapos', 'palabas', 'pamamagitan', 'panahon', 'pangalawa', 'para', 'paraan',
    'pareho', 'pataas', 'pero', 'pumunta', 'pumupunta', 'sa', 'saan', 'sabi', 'sabihin',
    'sarili', 'sila', 'sino', 'siya', 'tatlo', 'tayo', 'tulad', 'tungkol', 'una', 'walang',
    'ito', 'iyan'
]

# Print the list of Tagalog stopwords
print(tagalog_stopwords)


<h5>2. Lexicon-based Text Preprocessing</h5><br>
 a. Stopword removal - removing insignificant words (stopwords): the English list provided by nltk plus the Tagalog list defined above. A few such English words are ‘i’, ‘you’, ‘a’, ‘the’, ‘he’, ‘which’, etc.
<br> b. Stemming - process of slicing the end or the beginning of words with the intention of removing affixes(prefix/suffix)
<br> c. Lemmatization - process of reducing the word to its base form

In [None]:
#CELL NO.9
# LEXICON-BASED TEXT PROCESSING EXAMPLES

#1. STOP WORDS REMOVAL
# Combined English + Tagalog stopwords, built on the first call to stopword().
# Re-running this cell resets the cache; do so after editing tagalog_stopwords.
_COMBINED_STOPWORDS = None

def stopword(string):
    """Remove English and Tagalog stopwords from a text.

    The text is split on whitespace; a word is dropped when its lowercase form
    is in nltk's English stopword list or in `tagalog_stopwords`. The surviving
    words keep their original casing and are joined with single spaces.

    Parameters
    ----------
    string : str
        Text to filter.

    Returns
    -------
    str
        The text without stopwords (empty string if nothing is left).
    """
    global _COMBINED_STOPWORDS
    if _COMBINED_STOPWORDS is None:
        # Loaded once instead of on every call; a frozenset makes each
        # membership test constant-time instead of a scan over a list.
        _COMBINED_STOPWORDS = frozenset(stopwords.words('english')) | frozenset(tagalog_stopwords)
    return ' '.join(word for word in string.split()
                    if word.lower() not in _COMBINED_STOPWORDS)

text=stopword(text)
print(text)

#2. STEMMING

# English Snowball stemmer shared by every call to stemming()
snow = SnowballStemmer('english')

def stemming(string):
    """Return `string` with every token replaced by its Snowball stem.

    The text is tokenized with word_tokenize and the stems are joined back
    with single spaces.
    """
    return " ".join(map(snow.stem, word_tokenize(string)))
text=stemming(text)
print(text)

#3. LEMMATIZATION
# Initialize the lemmatizer
wl = WordNetLemmatizer()

# Helper that maps an NLTK part-of-speech tag to the matching WordNet constant.
# Full list of tags: https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
def get_wordnet_pos(tag):
    """Return the WordNet POS constant for the part-of-speech `tag`.

    Only the first letter of the tag is significant: J -> ADJ, V -> VERB,
    N -> NOUN, R -> ADV. Every other tag falls back to NOUN.
    """
    by_initial = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    return by_initial.get(tag[:1], wordnet.NOUN)

# Lemmatize a sentence token by token, using each token's part of speech
def lemmatizer(string):
    """Return `string` with every token replaced by its WordNet lemma.

    The text is tokenized and POS-tagged with nltk; each tag is translated by
    get_wordnet_pos() before the token is lemmatized. Lemmas are joined with
    single spaces.
    """
    tagged_tokens = nltk.pos_tag(word_tokenize(string))  # (token, POS tag) pairs
    lemmas = []
    for token, pos_tag in tagged_tokens:
        lemmas.append(wl.lemmatize(token, get_wordnet_pos(pos_tag)))
    return " ".join(lemmas)

text = lemmatizer(text)
print(text)


<h5>Final Preprocessing on our Dataset</h5><br>
Applying all the preprocessing functions defined above to the data frame (df_uaqte / uaqte_balanced_dataset.csv)

In [None]:
#CELL NO.10

def finalpreprocess(string):
    """Run the full cleaning pipeline on one text.

    Applies, in order: preprocess() (basic cleaning), stopword() (stopword
    removal) and lemmatizer() (lemmatization). Stemming is not applied.
    """
    cleaned = preprocess(string)
    without_stopwords = stopword(cleaned)
    return lemmatizer(without_stopwords)

# Clean every row of the 'text' column into a new 'clean_text' column
df_uaqte['clean_text'] = df_uaqte['text'].apply(finalpreprocess)

df_uaqte.head(10)


<h1>FEATURE EXTRACTION</h1>
Extracting vectors from text

<b>Splitting the dataset using 80:20 ratio. 80% as training set and 20% as test set</b>

In [None]:
#CELL NO.11

#SPLITTING THE TRAINING DATASET INTO TRAIN AND TEST

# 80% of the rows are used for training, 20% are held out for validation.
# random_state is fixed so that re-running the notebook gives the same split,
# and therefore the same scores in the model cells below.
X_train, X_val, y_train, y_val = train_test_split(df_uaqte["clean_text"],
                                                  df_uaqte["label"],
                                                  test_size=0.2,
                                                  shuffle=True,
                                                  random_state=42)

# Word2Vec runs on tokenized sentences
X_train_tok = [nltk.word_tokenize(doc) for doc in X_train]  #for word2vec
X_val_tok = [nltk.word_tokenize(doc) for doc in X_val]      #for word2vec

print("DONE SPLITTING AND WORK TOKENIZING.")


<b>Extracting features/vectors using Bag-of-words (with TF-IDF) and Word2Vec</b>
   

In [None]:
#CELL NO.12
# create Word2vec model

df_uaqte['clean_text_tok']=[nltk.word_tokenize(i) for i in df_uaqte['clean_text']] #convert preprocessed sentence to tokenized sentence

# The embeddings are learned from the training split only (X_train_tok), for the same
# reason the TF-IDF vectorizer is fitted on the training data only: the held-out texts
# must not influence anything the models are trained on. Held-out words that were never
# seen in training are skipped when a sentence vector is computed.
# min_count=1 keeps every word that occurs at least once in the corpus; a higher value
# (e.g. min_count=2) discards words that occur fewer than that many times in total.
model = Word2Vec(X_train_tok, min_count=1)

w2v = dict(zip(model.wv.index_to_key, model.wv.vectors))  #combination of word and its vector

#for converting sentence to vectors/numbers from word vectors result by Word2Vec
class MeanEmbeddingVectorizer(object):
    """Turn tokenized texts into fixed-length vectors by averaging word vectors.

    Parameters
    ----------
    word2vec : mapping
        Maps each known word to its vector; all vectors are expected to have
        the same length. Must contain at least one entry, because the vector
        length is read from it.

    Attributes
    ----------
    word2vec : the mapping passed in.
    dim : length of one word vector, and of every vector produced by transform().

    Raises
    ------
    ValueError
        If `word2vec` is empty.
    """

    def __init__(self, word2vec):
        if not word2vec:
            raise ValueError("word2vec must contain at least one word vector")
        self.word2vec = word2vec
        # if a text is empty we should return a vector of zeros
        # with the same dimensionality as all the other vectors
        self.dim = len(next(iter(word2vec.values())))

    def fit(self, X, y=None):
        """Do nothing and return self; `y` is optional, as in scikit-learn transformers."""
        return self

    def transform(self, X):
        """Return one averaged vector per text.

        Parameters
        ----------
        X : iterable of iterables of str
            Tokenized texts.

        Returns
        -------
        numpy.ndarray of shape (number of texts, dim)
            Row i is the mean of the vectors of the known words of text i, or
            all zeros when text i contains no known word.
        """
        zero = np.zeros(self.dim)
        rows = [
            np.mean([self.word2vec[w] for w in words if w in self.word2vec]
                    or [zero], axis=0)
            for words in X
        ]
        if not rows:
            # Keep the result two-dimensional even when there is no text at all.
            return np.empty((0, self.dim))
        return np.array(rows)

print("DONE RUNNING.")



In [None]:
#CELL NO. 13

#TF-IDF
# Convert X_train to vectors, since the models can only work with numbers and not words - fit and transform
tfidf_vectorizer = TfidfVectorizer(use_idf=True)
X_train_vectors_tfidf = tfidf_vectorizer.fit_transform(X_train) #tfidf runs on non-tokenized sentences unlike word2vec

# Only transform X_val (do not fit again)
X_val_vectors_tfidf = tfidf_vectorizer.transform(X_val) #Don't fit() your TfidfVectorizer to your test data: it will

#change the word-indexes & weights to match the test data. Rather, fit on the training data, then use the same
#train-data-fitted vectorizer on the test data, to reflect the fact that you are analyzing the test data only based
#on what was learned without it, and to have compatible feature columns in both matrices.

#Word2vec
# Nothing is fitted here: transform() only averages the word vectors already stored in w2v
modelw = MeanEmbeddingVectorizer(w2v)
X_train_vectors_w2v = modelw.transform(X_train_tok)
X_val_vectors_w2v = modelw.transform(X_val_tok)

print("DONE CREATING VECTORS.")


<h1>TRAINING MODELS</h1>

<h3>Multinomial Logistic Regression with TF-IDF</h3>

In [1]:
#CELL NO.14
#FITTING THE CLASSIFICATION MODEL using Logistic Regression(tf-idf)

# `multi_class` is not passed: the argument is deprecated since scikit-learn 1.5, and with
# the lbfgs solver a problem with three or more classes is fitted as a multinomial model
# by default, so the fitted model is the same for this 3-class dataset.
lr_tfidf=LogisticRegression(solver='lbfgs', max_iter=1000)
lr_tfidf.fit(X_train_vectors_tfidf, y_train)  #model

#Predict y value for test dataset
y_predict = lr_tfidf.predict(X_val_vectors_tfidf)

#Generate confusion matrix
# labels=classes_ pins the row/column order to the tick labels of the heatmap below,
# even if a class is missing from the validation split or is never predicted.
conf_matrix = confusion_matrix(y_val, y_predict, labels=lr_tfidf.classes_)

#Print accuracy score and classification report (arguments are y_true, y_pred)
print('Accuracy: %s\n' % metrics.accuracy_score(y_val, y_predict))
print(classification_report(y_val,y_predict))
print('Confusion Matrix: \n',conf_matrix)

# Plot confusion matrix as a heatmap (rows: actual class, columns: predicted class)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
            xticklabels=lr_tfidf.classes_, yticklabels=lr_tfidf.classes_)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()



NameError: name 'LogisticRegression' is not defined

<h3>Naive Bayes with TF-IDF</h3>

In [None]:
#CELL NO.14

#FITTING THE CLASSIFICATION MODEL using Naive Bayes(tf-idf)

nb_tfidf = MultinomialNB()
nb_tfidf.fit(X_train_vectors_tfidf, y_train)  #model

#Predict y value for test dataset
y_predict = nb_tfidf.predict(X_val_vectors_tfidf)

#Generate confusion matrix
# labels=classes_ pins the row/column order to the tick labels of the heatmap below,
# even if a class is missing from the validation split or is never predicted.
conf_matrix = confusion_matrix(y_val, y_predict, labels=nb_tfidf.classes_)

#Print accuracy score and classification report (arguments are y_true, y_pred)
print('Accuracy: %s\n' % metrics.accuracy_score(y_val, y_predict))
print(classification_report(y_val,y_predict))
print('Confusion Matrix: \n',conf_matrix)

# Plot confusion matrix as a heatmap (rows: actual class, columns: predicted class)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
            xticklabels=nb_tfidf.classes_, yticklabels=nb_tfidf.classes_)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()



<h3>Multinomial Logistic Regression with Word2Vec</h3>

In [None]:
#CELL NO.15

#FITTING THE CLASSIFICATION MODEL using Logistic Regression (W2v)

# `multi_class` is not passed: the argument is deprecated since scikit-learn 1.5, and with
# the lbfgs solver a problem with three or more classes is fitted as a multinomial model
# by default, so the fitted model is the same for this 3-class dataset.
lr_w2v=LogisticRegression(solver='lbfgs', max_iter=1000)
lr_w2v.fit(X_train_vectors_w2v, y_train)  #model

#Predict y value for test dataset
y_predict = lr_w2v.predict(X_val_vectors_w2v)

#Generate confusion matrix
# labels=classes_ pins the row/column order to the tick labels of the heatmap below,
# even if a class is missing from the validation split or is never predicted.
conf_matrix = confusion_matrix(y_val, y_predict, labels=lr_w2v.classes_)

#Print accuracy score and classification report (arguments are y_true, y_pred)
print('Accuracy: %s\n' % metrics.accuracy_score(y_val, y_predict))
print(classification_report(y_val,y_predict))
print('Confusion Matrix: \n',conf_matrix)

# Plot confusion matrix as a heatmap (rows: actual class, columns: predicted class)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
            xticklabels=lr_w2v.classes_, yticklabels=lr_w2v.classes_)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()



<h3>Linear SVM with Word2Vec</h3>

In [None]:
#CELL NO.16
#FITTING THE CLASSIFICATION MODEL using Linear SVM (W2v)

# SGDClassifier with hinge loss and an L2 penalty is a linear SVM trained by stochastic
# gradient descent. The `sgd` alias is kept so that code referring to that name still works.
# NOTE(review): max_iter=5 with tol=None means exactly 5 passes over the training data;
# this may be too few for the model to converge — confirm it is intended.
svm_w2v = sgd = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=123, max_iter=5, tol=None)

svm_w2v.fit(X_train_vectors_w2v, y_train)#model

#Predict y value for test dataset
y_predict = svm_w2v.predict(X_val_vectors_w2v)

#Generate confusion matrix
# labels=classes_ pins the row/column order to the tick labels of the heatmap below,
# even if a class is missing from the validation split or is never predicted.
conf_matrix = confusion_matrix(y_val, y_predict, labels=svm_w2v.classes_)

#Print accuracy score and classification report (arguments are y_true, y_pred)
print('Accuracy: %s\n' % metrics.accuracy_score(y_val, y_predict))
print(classification_report(y_val,y_predict))
print('Confusion Matrix: \n',conf_matrix)

# Plot confusion matrix as a heatmap (rows: actual class, columns: predicted class)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
            xticklabels=svm_w2v.classes_, yticklabels=svm_w2v.classes_)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()


<h1>GENERATE PREDICTIONS USING THE BEST CLASSIFIER MODEL</h1>

In [None]:
#CELL NO.17

#Testing it on new dataset with the best model
df_test=pd.read_csv('make_predictions.csv')  #reading the data
df_test['clean_text'] = df_test['text'].apply(finalpreprocess) #same cleaning as the training data
X_test=df_test['clean_text']

X_vector=tfidf_vectorizer.transform(X_test) #converting X_test to vector (transform only, never re-fit)
y_predict = lr_tfidf.predict(X_vector)      #use the trained model on X_vector
# predict_proba returns one column per class. Column 1 alone is the probability of
# class 1 only, which does not describe rows predicted as class 0 or 2, so the
# probability of the predicted class (the row maximum) is stored instead.
y_prob = lr_tfidf.predict_proba(X_vector).max(axis=1)
df_test['predict_prob']= y_prob
df_test['label']= y_predict

print(df_test.head())
final=df_test[['text','label']].reset_index(drop=True)
# NOTE(review): the row index is written as the first CSV column; pass index=False if it is not wanted
final.to_csv('submission.csv')

