# Text Mining Part 2

After pre-processing the text, we are in a better position to perform text mining. Here we will do text classification and topic modeling.


In [0]:
# Install Library
!pip install nltk

In [0]:
# Import Library
import nltk

## **1. Text Classification**

Text classification is the process of assigning tags or categories to text according to its content. It’s one of the fundamental tasks in Natural Language Processing (NLP) with broad applications such as sentiment analysis, topic labeling, spam detection, and intent detection.

#### **1.1. Sentiment Analysis**

Sentiment analysis is the interpretation and classification of emotions (positive, negative and neutral) within text data using text analysis techniques. 

##### **a. Using Predefined Model (Only English)**

***Install Library and Import Module***

In [0]:
# Install Library
! pip install vaderSentiment

In [0]:
# Import Module
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

***Create Sentiment Analysis Function***

In [0]:
# Create Sentiment Analysis Function
# One shared analyzer instance, created once at module level and reused by
# every call to `sentiment_analyzer_scores` below.
analyzer = SentimentIntensityAnalyzer()
def sentiment_analyzer_scores(sentence):
    """Print ``sentence`` next to the polarity scores the analyzer gives it.

    The sentence is left-aligned and padded with ``-`` to a width of 40
    characters, followed by a space and the string form of the score object
    returned by the module-level ``analyzer``.

    Parameters
    ----------
    sentence : str
        Text to score.

    Returns
    -------
    None
        The result is only printed, not returned.
    """
    polarity = analyzer.polarity_scores(sentence)
    padded_sentence = format(sentence, "-<40")
    print(padded_sentence, str(polarity))

***Detect the Text Sentiment***

In [0]:
# English sample sentence to score (kept verbatim, split only for readability)
text_en = (
    'The death toll from the coronavirus has reached 28 in South Korea '
    'with 600 newly confirmed cases, raising the national Itally to 4,812 cases, '
    'the South Korean Centers for Disease Control and Prevention (KCDC) '
    'said in a news release Tuesday.'
)
text_en

In [0]:
# Detect the Text Sentiment
# Prints the English sample text (padded to 40 characters) followed by its polarity scores
sentiment_analyzer_scores(text_en)

##### **b. Using Machine Learning Principle (Customizable)**

***Import Library and Modules***

In [0]:
# Import Library
import pandas as pd 

In [0]:
# Import Modules
from sklearn.feature_extraction.text import CountVectorizer # to create Bag of words
from sklearn.model_selection import train_test_split  # for splitting data
from sklearn.naive_bayes import GaussianNB # to bulid classifier model
from sklearn.preprocessing import LabelEncoder # to convert classes to number 
from sklearn.metrics import accuracy_score # to calculate accuracy

***Import Data***

In [0]:
# Load the training data: a semicolon-delimited CSV hosted on GitHub
df_grab = pd.read_csv(
    'https://raw.githubusercontent.com/dhitology/temporary/master/grab-tweet.csv',
    sep=';',
)
df_grab

In [0]:
# Class balance: number of rows per sentiment value
df_grab['sentiment'].value_counts()

***Set Feature and Target***

Set Feature

In [0]:
# Feature Extraction (Bag of Words)
# Limit the vocabulary to 1500 terms so the dense count matrix stays small.
count_vector = CountVectorizer(max_features = 1500)
grab_feature = count_vector.fit_transform(df_grab['text']).toarray()

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# use `get_feature_names_out()` when available and fall back on older releases.
if hasattr(count_vector, 'get_feature_names_out'):
    grab_feature_names = list(count_vector.get_feature_names_out())
else:
    grab_feature_names = count_vector.get_feature_names()

# One column per vocabulary term, one row per document
grab_feature_matrix = pd.DataFrame(grab_feature, columns=grab_feature_names)
grab_feature_matrix.head()

Set Target

In [0]:
# Map each sentiment class to an integer code
encoder = LabelEncoder()
sentiment_classes = df_grab['sentiment']
grab_label = encoder.fit_transform(sentiment_classes)
grab_label

***Set Training and Testing Data***

In [0]:
# Shuffle and split into 70% training / 30% testing data (fixed seed)
feature_train, feature_test, target_train, target_test = train_test_split(
    grab_feature,
    grab_label,
    test_size=0.3,
    shuffle=True,
    random_state=1,
)

# Report the shape of each split, in the order train/test features, train/test targets
for split in (feature_train, feature_test, target_train, target_test):
    print(split.shape)

***Construct Naive Bayes Sentiment Classifier***

In [0]:
# Fit a Gaussian Naive Bayes classifier on the training split
nb = GaussianNB()
nb.fit(feature_train, target_train)

# Predict the classes of the held-out test split
target_predicted = nb.predict(feature_test)
target_predicted

***Show Accuracy***

In [0]:
# Compare the predicted classes against the true test targets
test_accuracy = accuracy_score(target_test, target_predicted)
print('Test model accuracy: ', test_accuracy)

***Predict Sentiment***

In [0]:
# New statement to classify
new_statement = ['saya tidak suka grab']

# Vectorize it with the already-fitted bag-of-words vocabulary
new_statement_features = count_vector.transform(new_statement).toarray()

# Predict the class index, then map it back to the original sentiment name
predicted_class = nb.predict(new_statement_features)
predict_sentiment = encoder.inverse_transform(predicted_class)
print(new_statement[0], 'sentiment: ', predict_sentiment[0])

#### **1.2. Hate Speech Detection**

Hate speech detection classifies textual content as hate speech or non-hate speech. For hate speech, the method may also identify the targeted characteristics (i.e., the type of hate, such as race or religion).

***Import Library and Modules***

In [0]:
# Import Library
import pandas as pd 

In [0]:
# Import Modules
from sklearn.feature_extraction.text import CountVectorizer # to create Bag of words
from sklearn.model_selection import train_test_split  # for splitting data
from sklearn.naive_bayes import GaussianNB # to bulid classifier model
from sklearn.preprocessing import LabelEncoder # to convert classes to number 
from sklearn.metrics import accuracy_score # to calculate accuracy

***Import Data***

In [0]:
# Load the hate-speech data: a semicolon-delimited CSV hosted on GitHub
df_hs = pd.read_csv(
    'https://raw.githubusercontent.com/dhitology/temporary/master/data_hs.csv',
    sep=";",
)
df_hs

In [0]:
# Class balance: number of rows per label value
df_hs['label'].value_counts()

***Set Feature and Target***

Set Feature

In [0]:
# Feature Extraction (Bag of Words)
# Limit the vocabulary to 1500 terms so the dense count matrix stays small.
count_vector = CountVectorizer(max_features = 1500)
hs_feature = count_vector.fit_transform(df_hs['text']).toarray()

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# use `get_feature_names_out()` when available and fall back on older releases.
if hasattr(count_vector, 'get_feature_names_out'):
    hs_feature_names = list(count_vector.get_feature_names_out())
else:
    hs_feature_names = count_vector.get_feature_names()

# One column per vocabulary term, one row per document
hs_feature_matrix = pd.DataFrame(hs_feature, columns=hs_feature_names)
hs_feature_matrix.head()

Set Target

In [0]:
# Map each hate-speech label to an integer code
encoder = LabelEncoder()
label_classes = df_hs['label']
hs_label = encoder.fit_transform(label_classes)
hs_label

***Set Training and Testing Data***

In [0]:
# Shuffle and split into 70% training / 30% testing data (fixed seed)
feature_train, feature_test, target_train, target_test = train_test_split(
    hs_feature,
    hs_label,
    test_size=0.3,
    shuffle=True,
    random_state=1,
)

# Report the shape of each split, in the order train/test features, train/test targets
for split in (feature_train, feature_test, target_train, target_test):
    print(split.shape)

***Construct Naive Bayes Hate Speech Classifier***

In [0]:
# Fit a Gaussian Naive Bayes classifier on the training split
nb = GaussianNB()
nb.fit(feature_train, target_train)

# Predict the classes of the held-out test split
target_predicted = nb.predict(feature_test)
target_predicted

***Show Accuracy***

In [0]:
# Compare the predicted classes against the true test targets
test_accuracy = accuracy_score(target_test, target_predicted)
print('Test model accuracy: ', test_accuracy)

***Predict Label***

In [0]:
# Input New Statement
new_statement = ['Dia Bangsat Perusak Negara']

# Extract Features using the vocabulary already fitted by `count_vector`
new_statement_features = count_vector.transform(new_statement).toarray()

# Predict the class index, then decode it back to the original label name
predict_label = encoder.inverse_transform(nb.predict(new_statement_features))

# This cell reports a hate-speech label, so the output is tagged 'label'
# (it was previously tagged 'sentiment', copied from the sentiment section).
print(new_statement[0], 'label: ', predict_label[0])

## **2. Topic Modelling**

A topic model is a type of statistical model for discovering the abstract "topics" that occur in a collection of documents. Topic modeling is a frequently used text-mining tool for discovery of hidden semantic structures in a text body. Intuitively, given that a document is about a particular topic, one would expect particular words to appear in the document more or less frequently: "dog" and "bone" will appear more often in documents about dogs, "cat" and "meow" will appear in documents about cats, and "the" and "is" will appear equally in both.

***Install Library, Import Libraries, and Import Modules***

In [0]:
# Install Library
! pip install pyLDAvis

In [0]:
# Import Libraries
import nltk
import os
import numpy as np, pyLDAvis, pyLDAvis.sklearn; pyLDAvis.enable_notebook()

# Import Modules
from __future__ import print_function 
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from matplotlib import pyplot as plt

In [0]:
# Clone Library and Data from Github
! git clone https://github.com/dianrdn/tm

# Set Data Directory
# Work from inside the cloned repository so that later cells can refer to its
# contents by relative name (presumably the data file and the `MyLib` module
# live there — verify against the repository).
# NOTE(review): this changes the process working directory, so re-running the
# cell in the same kernel clones and descends one level deeper — confirm that
# is acceptable.
os.chdir('tm')

***Import Data***

In [0]:
# Fetch the NLTK stop-word corpus
nltk.download('stopwords')

# Name of the file to analyze
data_file = 'berita_batubara.csv'

# Read the documents through the helper module
# NOTE(review): MyLib is presumably shipped with the cloned repository, which
# would be why it is imported here rather than with the other imports — confirm.
import MyLib as TS
Tweets = TS.LoadTxt(data_file)
tweet_total = len(Tweets)
print('Total loaded tweets = {0}'.format(tweet_total))

***Set Number of Topics, Top Topics, Top Words, Max DF, Min DF***

In [0]:
# Number of LDA components, and how many topics / words per topic to print
n_topics, top_topics, top_words = 4, 4, 8

# Document-frequency cut-offs handed to CountVectorizer
max_df, min_df = 0.75, 10

***Feature Extraction (Bag of Words)***

In [0]:
# Feature Extraction (Bag of Words)
# Tokens are lower-cased runs of three or more ASCII letters; `max_df` and
# `min_df` (set in the parameter cell) prune terms by document frequency.
count_vector = CountVectorizer(lowercase = True, token_pattern = r'\b[a-zA-Z]{3,}\b',max_df = max_df, min_df = min_df)
dtm_tf = count_vector.fit_transform(Tweets)

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# use `get_feature_names_out()` when available and fall back on older releases.
# The result is converted to a plain list to match what the old method returned.
if hasattr(count_vector, 'get_feature_names_out'):
    tf_terms = list(count_vector.get_feature_names_out())
else:
    tf_terms = count_vector.get_feature_names()

# Free the raw documents; the following cells only use the count matrix.
# Re-running this cell therefore requires reloading `Tweets` first.
del Tweets

***Show Topic***

In [0]:
# Fit the LDA topic model on the term-count matrix
lda_tf = LatentDirichletAllocation(
    n_components=n_topics,
    learning_method='online',
    random_state=0,
)
lda_tf.fit(dtm_tf)

# Dominant topic (1-based) of every document
vsm_topics = lda_tf.transform(dtm_tf)
doc_topic = [doc_weights.argmax() + 1 for doc_weights in tqdm(vsm_topics)]
print('In total there are {0} major topics, distributed as follows'.format(len(set(doc_topic))))

# Histogram of documents per dominant topic
plt.hist(np.array(doc_topic), alpha=0.5)
plt.show()

# Print the top words of the top topics via the helper module
print('Printing top {0} Topics, with top {1} Words:'.format(top_topics, top_words))
TS.print_Topics(lda_tf, tf_terms, top_topics, top_words)

In [0]:
# Interactively visualizing the Topics, please ignore the Warnings
# Wait a few minutes and then hover the Mouse over the Topics to Explore
# Arguments: the fitted LDA model, the document-term count matrix it was fitted
# on, and the CountVectorizer that produced that matrix.
# NOTE(review): newer pyLDAvis releases appear to have renamed the
# `pyLDAvis.sklearn` module to `pyLDAvis.lda_model` — confirm against the
# installed version; the import cell would need the same change.
pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, count_vector) 