In [None]:
#**** Sentiment Analysis using Dutch Tweets****
# by SELIM SAMETOGLU
# This is a second version, in which I implement the preprocessing with scikit-learn pipelines


## Load dependencies

In [None]:
##### Has to be installed/checked at each runtime
# !python -m spacy download nl_core_news_sm
# !pip install langdetect
# nltk.download('stopwords')

##### to connect to drive
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# load the dependencies
# Utility
import pandas as pd
import numpy as np
import re
# Prepro
from nltk.stem import WordNetLemmatizer
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy
# Plotting
from matplotlib import pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
# sklearn
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report


In [None]:
# Load the dataset (on Colab, Google Drive has to be mounted first)
data = pd.read_json("/content/drive/MyDrive/Collab_data/dutch_tweets_chunk0.json")
# Check the length of the data
# print('number of messages:', len(data))
# data[["sentiment_pattern"]].hist()
# data[["sentiment_pattern"]]
# Dichotomize the sentiment scores: negative -> 0, positive -> 1.
# NOTE(review): scores that are exactly 0 are left at 0 and therefore share
# the label of the negative tweets -- confirm this is intended.
data.loc[data["sentiment_pattern"] < 0, "sentiment_pattern"] = 0
data.loc[data["sentiment_pattern"] > 0, "sentiment_pattern"] = 1
# check with a histogram whether it worked
# data[["sentiment_pattern"]].hist()
# Take only a small portion of the data for better speed. The explicit copy
# detaches the subset from the full frame, so later column assignments on it
# do not operate on a slice of the original (SettingWithCopyWarning).
data = data[:5000].copy()


# Using Custom Transformers and Scikit pipelines for preprocessing

### step 1: Create a custom transformer for filtering out 'Non-Dutch' sentences

In [None]:
# Create a preprocessing step as a custom scikit-learn transformer
from sklearn.base import BaseEstimator, TransformerMixin
from langdetect import detect

class LanguageFilter(BaseEstimator, TransformerMixin):
  """Keep only the rows whose ``full_text`` is detected as Dutch (``'nl'``).

  The transformer is stateless: ``fit`` learns nothing and returns ``self``.
  """

  def __init__(self):
    pass

  def detect_language(self, X, y=None):
    """Return the language code that ``langdetect.detect`` reports for ``X``.

    Parameters:
      X: a single text to classify.
      y: unused.

    Returns:
      The detected language code, or ``'unknown'`` if detection raises.
    """
    try:
        return detect(X)
    except Exception:
        # Deliberately best-effort: one undetectable tweet must not abort the
        # whole run. ``Exception`` (instead of a bare ``except``) still lets
        # KeyboardInterrupt / SystemExit propagate.
        return 'unknown'

  def fit(self, X, y=None):
    """No-op; present for scikit-learn pipeline compatibility."""
    return self

  def transform(self, X, y=None):
    """Return a new frame holding only the rows detected as Dutch.

    Parameters:
      X: DataFrame with a ``full_text`` column.
      y: unused.

    Returns:
      A copy of the matching rows with the same columns as ``X``. The input
      frame is not modified (no temporary ``language`` column is written to it).
    """
    is_dutch = X['full_text'].apply(self.detect_language) == 'nl'
    # Copy so that downstream steps can assign columns without touching a
    # slice of the caller's frame.
    return X[is_dutch].copy()

### step 2: column manipulations and turning statements into lower case

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class CoLo(BaseEstimator, TransformerMixin):
  """Select the text and sentiment columns, rename them, and lower-case the text.

  ``full_text`` becomes ``text`` and ``sentiment_pattern`` becomes ``label``.
  The transformer is stateless: ``fit`` learns nothing.
  """
  def __init__(self):
    pass
  def fit(self, X, y=None):
    """No-op; present for scikit-learn pipeline compatibility."""
    return self
  def transform(self, X, y=None):
    """Return a new two-column frame (``text``, ``label``) with lower-cased text.

    Parameters:
      X: DataFrame with ``full_text`` and ``sentiment_pattern`` columns.
      y: unused.

    Returns:
      A new DataFrame; ``X`` itself is left unchanged.
    """
    # rename() and assign() both return new frames, so nothing is written onto
    # a column-subset slice of X (the source of SettingWithCopyWarning).
    renamed = X[["full_text", "sentiment_pattern"]].rename(
        columns={"full_text": "text", "sentiment_pattern": "label"}
    )
    return renamed.assign(text=renamed["text"].str.lower())


### step 3: clean and remove the stopwords from the text

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
class Cleaner(BaseEstimator, TransformerMixin):
  """Strip Dutch stopwords from the ``text`` column.

  Stateless transformer; ``transform`` rewrites ``X['text']`` on the frame it
  receives and returns that same frame.
  """
  def __init__(self):
    pass

  def fit(self, X, y=None):
    """No-op; present for scikit-learn pipeline compatibility."""
    return self

  def transform(self, X, y=None):
    """Drop every whitespace-separated token that is a stopword and re-join with single spaces."""
    dutch_stopwords = {
        'aan', 'al', 'alles', 'als', 'altijd', 'andere', 'ben', 'bij',
        'daar', 'dan', 'dat', 'de', 'der', 'deze', 'die', 'dit', 'doch',
        'doen', 'door', 'dus', 'een', 'eens', 'en', 'er', 'ge', 'geen',
        'geweest', 'haar', 'had', 'heb', 'hebben', 'heeft', 'hem', 'het',
        'hier', 'hij', 'hoe', 'hun', 'iemand', 'iets', 'ik', 'in', 'is', 'ja',
        'je', 'kan', 'kon', 'kunnen', 'maar', 'me', 'meer', 'men', 'met',
        'mij', 'mijn', 'moet', 'na', 'naar', 'niet', 'niets', 'nog', 'nu',
        'of', 'om', 'omdat', 'onder', 'ons', 'ook', 'op', 'reeds', 'te',
        'tegen', 'toch', 'toen', 'tot', 'u', 'uit', 'uw', 'van', 'veel',
        'voor', 'want', 'waren', 'was', 'wat', 'werd', 'wezen', 'wie', 'wil',
        'worden', 'wordt', 'zal', 'ze', 'zelf', 'zich', 'zij', 'zijn', 'zo',
        'zonder', 'zou',
    }

    def drop_stopwords(sentence):
      kept = (token for token in sentence.split() if token not in dutch_stopwords)
      return " ".join(kept)

    X['text'] = X['text'].apply(drop_stopwords)
    return X


### step 4: remove punctuations

In [None]:
class RemovePunctuations(BaseEstimator, TransformerMixin):
  """Delete every character listed in ``string.punctuation`` from the ``text`` column.

  Stateless transformer; ``transform`` rewrites ``X['text']`` on the frame it
  receives and returns that same frame.
  """
  def __init__(self):
    pass

  def fit(self, X, y=None):
    """No-op; present for scikit-learn pipeline compatibility."""
    return self

  def transform(self, X, y=None):
    """Apply a translation table that maps each punctuation character to ``None``."""
    import string
    # Same table str.maketrans('', '', string.punctuation) builds:
    # code point -> None, i.e. "delete this character".
    strip_table = dict.fromkeys(map(ord, string.punctuation))
    X['text'] = X['text'].str.translate(strip_table)
    return X

### step 5: remove repeating characters

In [None]:
class CleaningRepChars(BaseEstimator, TransformerMixin):
  """Collapse runs of three or more identical characters to a single one.

  Stateless transformer; ``transform`` rewrites ``X['text']`` on the frame it
  receives and returns that same frame.
  """
  def __init__(self):
    pass
  def fit(self, X, y=None):
    """No-op; present for scikit-learn pipeline compatibility."""
    return self
  def transform(self, X, y=None):
    """Replace each character that is followed by 2+ repeats of itself with one occurrence."""
    repeated_run = re.compile("(.)\\1{2,}")

    def collapse(sentence):
      return repeated_run.sub("\\1", sentence)

    X['text'] = X['text'].apply(collapse)
    return X

### step 6: clean and remove URLS


In [None]:
class RemoveURLs(BaseEstimator, TransformerMixin):
  """Replace URLs in the ``text`` column with a single space.

  A URL is anything that starts with ``www.`` or ``http://`` / ``https://``
  and runs up to the next whitespace character. Stateless transformer;
  ``transform`` rewrites ``X['text']`` on the frame it receives and returns
  that same frame.
  """
  def __init__(self):
    pass
  def fit(self, X, y=None):
    """No-op; present for scikit-learn pipeline compatibility."""
    return self
  def transform(self, X, y=None):
    """Blank out every URL in ``X['text']``."""
    # The class must be [^\s] ("not whitespace"); the previous [^s] meant
    # "not the letter s" and cut URLs short at their first "s". The dot after
    # www is escaped so it only matches a literal ".".
    url_pattern = re.compile(r'((www\.[^\s]+)|(https?://[^\s]+))')
    X['text'] = X['text'].apply(lambda a: url_pattern.sub(' ', a))
    return X

### step 7: remove numbers

In [None]:
class RemoveNumbers(BaseEstimator, TransformerMixin):
  """Delete all runs of ASCII digits from the ``text`` column.

  Stateless transformer; ``transform`` rewrites ``X['text']`` on the frame it
  receives and returns that same frame.
  """
  def __init__(self):
    pass
  def fit(self, X, y=None):
    """No-op; present for scikit-learn pipeline compatibility."""
    return self
  def transform(self, X, y=None):
    """Strip every ``[0-9]+`` match from each text."""
    digit_run = re.compile('[0-9]+')

    def strip_digits(sentence):
      return digit_run.sub('', sentence)

    X['text'] = X['text'].apply(strip_digits)
    return X


### step 8: tokenization

In [None]:
class Tokenize(BaseEstimator, TransformerMixin):
  """Tokenize the ``text`` column with NLTK's ``TreebankWordTokenizer``.

  The tokens are stored back as one space-separated string per row.
  Stateless transformer; ``transform`` rewrites ``X['text']`` on the frame it
  receives and returns that same frame.
  """
  def __init__(self):
    pass
  def fit(self, X, y=None):
    """No-op; present for scikit-learn pipeline compatibility."""
    return self
  def transform(self, X, y=None):
    """Replace each text by its tokens joined with single spaces."""
    from nltk.tokenize import TreebankWordTokenizer
    tokenizer = TreebankWordTokenizer()
    # Join the tokens explicitly. Casting the token list with astype(str)
    # stored the Python list repr ("['a', 'b']"), which put brackets, quotes
    # and commas back into the text that later steps consume.
    X['text'] = X['text'].apply(lambda text: " ".join(tokenizer.tokenize(text)))
    return X

### step 9: Lemmatizer


In [None]:
class Lemmatizer(BaseEstimator, TransformerMixin):
  """Replace every token in the ``text`` column by its spaCy lemma.

  Parameters:
    model_name: name of the spaCy pipeline to load; defaults to the Dutch
      ``nl_core_news_sm`` model, which was previously hard-coded.

  ``fit`` learns nothing; ``transform`` rewrites ``X['text']`` on the frame it
  receives and returns that same frame.
  """
  def __init__(self, model_name="nl_core_news_sm"):
    # Stored under the same name as the argument, as scikit-learn's
    # get_params / clone expect.
    self.model_name = model_name
  def fit(self, X, y=None):
    """No-op; present for scikit-learn pipeline compatibility."""
    return self
  def transform(self, X, y=None):
    """Lemmatize each text and re-join the lemmas with single spaces."""
    import spacy
    nlp = spacy.load(self.model_name)
    texts = X['text'].astype(str)
    # nlp.pipe processes the texts as a stream in batches instead of invoking
    # the full pipeline once per row; it yields one Doc per input, in order.
    X['text'] = [" ".join(token.lemma_ for token in doc) for doc in nlp.pipe(texts)]
    return X

In [None]:
from sklearn.pipeline import make_pipeline
# Order matters: URLs must be removed while they are still intact, i.e. before
# RemovePunctuations deletes "://" and "." and before CleaningRepChars
# collapses "www" to "w". RemoveURLs therefore runs right after lower-casing.
pipeline = make_pipeline(
    LanguageFilter(),
    CoLo(),
    RemoveURLs(),
    Cleaner(),
    RemovePunctuations(),
    CleaningRepChars(),
    RemoveNumbers(),
    Tokenize(),
    Lemmatizer(),
)
pipeline

In [None]:
# Run the preprocessing pipeline on a copy, so that in-place edits made by
# pipeline steps cannot leak into the raw `data` frame used further below.
data_preprocessed = pipeline.fit_transform(data.copy())

In [None]:
# Display the frame returned by the preprocessing pipeline for a visual check.
data_preprocessed

In [None]:
# Old code snippet for stemming; may come in handy later.
# from nltk.stem.snowball import DutchStemmer

# st = DutchStemmer()
# def stemming_on_text(dataset):
#    text = [st.stem(word) for word in dataset]
#    return dataset
#data['text']= data['text'].apply(lambda x: stemming_on_text(x))
#data['text'].head()

## WordClouds

In [None]:
# Split the raw tweets by their dichotomized sentiment label:
# label 1 goes to data_pos, label 0 goes to data_neg.
data_pos = data.loc[data['sentiment_pattern'] == 1, ['full_text', 'sentiment_pattern']]

data_neg = data.loc[data['sentiment_pattern'] == 0, ['full_text', 'sentiment_pattern']]


### Word cloud for negative words

In [None]:
# Word cloud of the tweets labelled 0 (attention: built from the raw, unprocessed text!)
# data_neg = data_neg[:9230]
wc_neg = WordCloud(
    max_words=1000,
    width=1600,
    height=800,
    collocations=False,
).generate(" ".join(str(tweet) for tweet in data_neg['full_text']))
plt.figure(figsize=(20, 20))
plt.imshow(wc_neg)


### Word cloud for positive words

In [None]:
# Word cloud of the tweets labelled 1 (attention: built from the raw, unprocessed text!)
# data_pos = data_pos[:17789]
wc_pos = WordCloud(
    max_words=1000,
    width=1600,
    height=800,
    collocations=False,
).generate(" ".join(str(tweet) for tweet in data_pos['full_text']))
plt.figure(figsize=(20, 20))
plt.imshow(wc_pos)


## Modelling

## Prepare the data

In [None]:
# Features (X): the preprocessed tweet text. Target (y): the sentiment label.
X, y = data_preprocessed['text'], data_preprocessed['label']

In [None]:
# Hold out 5% of the rows as the test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.05,
    random_state=42,
)

In [None]:
# Make sure every document is a plain string before vectorizing
# (guards against errors from non-string entries).
X_train, X_test = X_train.astype(str), X_test.astype(str)

### Fit the TF-IDF Vectorizer

In [None]:
# Learn the TF-IDF vocabulary (unigrams and bigrams) from the training split only,
# so that nothing from the test split influences the features.
vectoriser = TfidfVectorizer(
    max_features=500000,
    ngram_range=(1, 2),
)
vectoriser.fit(X_train)

In [None]:
# Report the size of the fitted vocabulary, i.e. how many features
# (unigrams and bigrams, per the vectoriser's ngram_range) were extracted.
print('No. of feature_words: ', len(vectoriser.get_feature_names_out()))

### Transform the data according to TF-IDF vectorizer

In [None]:
# Map both splits onto the TF-IDF features learned from the training data.
# X_train / X_test are rebound from text to the transformed matrices here.
X_train, X_test = (vectoriser.transform(split) for split in (X_train, X_test))

### Model Evaluation

In [None]:
# Define a function for model evaluation
# creds to https://www.analyticsvidhya.com/blog/2021/06/twitter-sentiment-analysis-a-nlp-use-case-for-beginners/
def model_Evaluate(model):
    """Report how a fitted classifier performs on the held-out test set.

    Prints scikit-learn's classification report and draws the confusion matrix
    as an annotated heat map on the current matplotlib figure.

    Reads the notebook-level ``X_test`` and ``y_test``.

    Parameters:
        model: fitted estimator exposing ``predict``.
    """
    # Predict values for Test dataset
    y_pred = model.predict(X_test)
    # Print the evaluation metrics for the dataset.
    print(classification_report(y_test, y_pred))
    # Compute and plot the Confusion matrix
    cf_matrix = confusion_matrix(y_test, y_pred)
    categories = ['Negative', 'Positive']
    # Row-major order of a binary confusion matrix (rows = actual, columns = predicted).
    group_names = ['True Neg', 'False Pos', 'False Neg', 'True Pos']
    group_percentages = ['{0:.2%}'.format(value) for value in cf_matrix.flatten() / np.sum(cf_matrix)]
    # Put the percentage on its own line below the cell name. The separator
    # was a literal "n", which rendered as e.g. "True Negn45.00%".
    labels = [f'{v1}\n{v2}' for v1, v2 in zip(group_names, group_percentages)]
    # The 2x2 reshape assumes exactly two classes.
    labels = np.asarray(labels).reshape(2, 2)
    sns.heatmap(cf_matrix, annot=labels, cmap='Blues', fmt='',
                xticklabels=categories, yticklabels=categories)
    plt.xlabel("Predicted values", fontdict={'size': 14}, labelpad=10)
    plt.ylabel("Actual values", fontdict={'size': 14}, labelpad=10)
    plt.title("Confusion Matrix", fontdict={'size': 18}, pad=20)

### Model 1: Train and evaluate a Bernoulli Naive Bayes model

In [None]:
# Fit Bernoulli Naive Bayes on the TF-IDF training matrix, report its
# test-set metrics, and keep the hard test predictions.
BNBmodel = BernoulliNB().fit(X_train, y_train)
model_Evaluate(BNBmodel)
y_pred1 = BNBmodel.predict(X_test)

In [None]:
# Plot the ROC curve of the Bernoulli Naive Bayes model on the test set.
from sklearn.metrics import roc_curve, auc
# roc_curve needs a continuous score to sweep thresholds over. Hard 0/1
# predictions yield a single operating point and a misleading AUC, so rank by
# the predicted probability of the second class in BNBmodel.classes_
# (label 1, given the 0/1 labels used in this notebook).
fpr, tpr, thresholds = roc_curve(y_test, BNBmodel.predict_proba(X_test)[:, 1])
roc_auc = auc(fpr, tpr)
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=1, label='ROC curve (area = %0.2f)' % roc_auc)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC CURVE')
plt.legend(loc="lower right")
plt.show()

### Model 2: Train and evaluate a Linear Support Vector Classification model

In [None]:
# Fit a linear support vector classifier on the TF-IDF training matrix,
# report its test-set metrics, and keep the hard test predictions.
SVCmodel = LinearSVC().fit(X_train, y_train)
model_Evaluate(SVCmodel)
y_pred2 = SVCmodel.predict(X_test)

In [None]:
# Plot the ROC curve of the linear SVC model on the test set.
from sklearn.metrics import roc_curve, auc
# roc_curve needs a continuous score to sweep thresholds over. Hard 0/1
# predictions yield a single operating point and a misleading AUC. LinearSVC
# has no predict_proba, so rank by the signed distance to the separating
# hyperplane instead.
fpr, tpr, thresholds = roc_curve(y_test, SVCmodel.decision_function(X_test))
roc_auc = auc(fpr, tpr)
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=1, label='ROC curve (area = %0.2f)' % roc_auc)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC CURVE')
plt.legend(loc="lower right")
plt.show()

### Model 3: Train and evaluate a Logistic regression model

In [None]:
# Fit logistic regression on the TF-IDF training matrix, report its
# test-set metrics, and keep the hard test predictions.
LRmodel = LogisticRegression(
    C=2,
    max_iter=1000,
    n_jobs=-1,
).fit(X_train, y_train)
model_Evaluate(LRmodel)
y_pred3 = LRmodel.predict(X_test)

In [None]:
# Plot the ROC curve of the logistic regression model on the test set.
from sklearn.metrics import roc_curve, auc
# roc_curve needs a continuous score to sweep thresholds over. Hard 0/1
# predictions yield a single operating point and a misleading AUC, so rank by
# the predicted probability of the second class in LRmodel.classes_
# (label 1, given the 0/1 labels used in this notebook).
fpr, tpr, thresholds = roc_curve(y_test, LRmodel.predict_proba(X_test)[:, 1])
roc_auc = auc(fpr, tpr)
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=1, label='ROC curve (area = %0.2f)' % roc_auc)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC CURVE')
plt.legend(loc="lower right")
plt.show()

### Overall Interpretation:

In [None]:
# The linear support vector classifier (svc) worked the best