# Importing Libraries for Data Cleaning and Labeling

In [1]:
import pandas as pd
import numpy as np
import json
import re
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from tqdm.notebook import tqdm
from tqdm import tqdm
from ipywidgets import IntProgress
import seaborn as sns

import re
from functools import partial
from collections import Counter
import nltk
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

# Importing Datasets

before heading through the data cleaning and labeling, we need to import the datasets and merging them together. The total number of 250K tweets were extracted from twitter in which about 100K of those are not available at the moment of this analysis. This is due to the fact the data that we are working on, are originally from 2015. Since that time, some tweets might be deleted, the some user accounts might get closed which cause a null return from Twitter API. We initially replace those values with 0 in the raw dataset. This makes the process of dropping those rows easier as follow.
Also the process parliamentarian dataset is imported for labeling the data.

In [57]:
# backup files from twitter API
# Merging all the collected tweets into one DataFrame
Tweets_1 = pd.read_csv('Data/Tweets_1.csv')
Tweets_2 = pd.read_csv('Data/Tweets_2.csv')
Tweets_3 = pd.read_csv('Data/Tweets_3.csv')
Tweets_4 = pd.read_csv('Data/Tweets_4.csv')

df = Tweets_1.append(Tweets_2, ignore_index= True)
df = df.append(Tweets_3, ignore_index= True)
df = df.append(Tweets_4, ignore_index= True)
df.drop('Unnamed: 0', axis = 1, inplace=True)

# Removing the deleted, or generally not available tweets from the Dataset
df = df[df['Full Text'] != '0']


# parliamentarian info and their parties for 
parties_final = pd.read_csv('Data/parties_final.csv')

  df = Tweets_1.append(Tweets_2, ignore_index= True)
  df = df.append(Tweets_3, ignore_index= True)
  df = df.append(Tweets_4, ignore_index= True)


# Split the dataset into train and test sets

Before proceed with any data cleaning process, we wish to exclude a 20% portion of the total data we have to have them separate as a final test set predict the labels and sentiment. This can be done

In [58]:
df_test = df.sample(frac = 0.20, replace = False, random_state=10)
df_train = df.drop(df_test.index)

df_test = df_test.reset_index(drop=True)
df_train = df_train.reset_index(drop=True)

In [59]:
df_train.head()

Unnamed: 0,Tweet ID,Tweet Date,Full Text,Likes_count,Retweet_count,Author name,Author ID,Author Follower,Author Friends,Retweet_status
0,641648154945961984,2015-09-09 16:23:36+00:00,What the heck is going on with the NDP? Doesn'...,1,0,Lynn Carleton,2811644853,1002,381,False
1,655926104289619972,2015-10-19 01:59:04+00:00,RT @APLawToronto: It's not how you start that ...,0,11,Ronnie (Wear a Mask!) Zwierz 🇨🇦,2375728898,3487,339,False
2,643417181410193408,2015-09-14 13:33:04+00:00,Meet @SheriRBenson @NDP_HQ candidate 4 #Saskat...,3,3,ProudPolitics Canada,577727470,1732,1019,False
3,654093831542059009,2015-10-14 00:38:16+00:00,RT @MikeHudema: World now talking about #Harpe...,0,75,Wayne Johnston,550253210,5248,5391,False
4,637043425972371456,2015-08-27 23:26:03+00:00,RT @joeldey: Spotted at Quebec and Terminal in...,0,36,John Coulbourn,270491583,848,806,False


In [77]:
# Get a backup of the train set
df_train.to_csv('Data/df_train_raw.csv')

In [78]:
# Get a backup of the test set
df_test.to_csv('Data/df_test_raw.csv')

# Data Cleaning

the published tweets on Twitter can contain lots of unwanted and unnecessary information for the analysis in this project. some of these information come directly from the tweets and some other are the handlers and other notations that gets visible while extracting the data twitter API .some of them includes the following:
- Lowercase the texts
- twitter return handles
- URL links 
- official hashtags including `#elxn42`

the following code explain the procedure to deal with the mentioned issues. one other important step that is added is to get back the words to their root format (e.g. helping --> help, etc.) to both reduce the dimentionality of the dataset after tokenizing and also help the process of data labeling


In [60]:
# General function for removing unnecessary information that is used in the following functions
def remove_pattern(input_txt, pattern):
    r = re.findall(pattern, input_txt)
    for i in r:
        input_txt = re.sub(i,'', input_txt)
    return input_txt

def clean_tweets(tweets):
    #remove twitter Return handles (RT @xxx:)*
    tweets = np.vectorize(remove_pattern)(tweets, "RT @[\w]*:")

    #remove URL links (httpxxx)*
    tweets = np.vectorize(remove_pattern)(tweets, "https?://[A-Za-z0-9./]*")
    
    #remove official hashtags including elxn42
    tweets = np.core.defchararray.replace(tweets, "#elxn42", "")
    tweets = np.core.defchararray.replace(tweets, "#cdnpoli", "")
    tweets = np.core.defchararray.replace(tweets, "#canada", "")

    return tweets

def lemmatize_text(tweets):
    return [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(tweets)]

def collapse_list_to_string(string_list):
    # This is to join back together the text data into a single string
    return ' '.join(string_list)

Text lemmatization is the process of reducing a word to its base or dictionary form, known as a lemma. The base form or lemma of a word is its most basic, canonical or meaningful form. For example, the lemma of the word "running" is "run", and the lemma of the word "mice" is "mouse".

Lemmatization is often used in natural language processing (NLP) to normalize text and reduce the complexity of words, so that they can be analyzed or compared more easily. It is different from stemming, which is another technique to reduce words to their base form, but does so by simply removing the suffixes from a word, without taking into account the part of speech or context.

Lemmatization involves using a linguistic rule-based system to analyze a word and determine its lemma, based on the word's part of speech (noun, verb, adjective, adverb, etc.) and context. This can involve looking up the word in a dictionary, applying morphological analysis or using machine learning algorithms to predict the correct lemma.

Overall, lemmatization is a powerful tool in natural language processing and is used in various applications such as text classification, information retrieval, sentiment analysis, and more.

Below is the cleaned dataset which are stored in a new column called `Full Text2`.

In [61]:
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()


df_train['Full Text'] = df_train['Full Text'].str.lower()
df_train['Full Text2'] = clean_tweets(df_train['Full Text'])
df_train['Full Text2'] = df_train['Full Text2'].astype(str).apply(lemmatize_text)
df_train['Full Text2'] = df_train['Full Text2'].apply(collapse_list_to_string)
df_train.reset_index(drop=True, inplace=True)

In [62]:
# Adding a column for political party to the data set with none value
# The none values will be replaced in multiple steps until the end of this notebook
df_train['party'] = 'none'

# make three datasets for liberal, conservative and ndp names from the parliament members dataset
liberal_party = parties_final[parties_final['party'] == 'liberal']
conservative_party = parties_final[parties_final['party'] == 'conservative']
ndp_party = parties_final[parties_final['party'] == 'ndp']

The following cell will check every row of our original dataset with the set of names of the above defined datasets for political parties. If any of the tweet texts contains a word from any of the above datasets, then the assumption is that the tweet is talking about that specific party. E.g. having the name **justintrudeau** means the tweet is related to liberal party.
In this way we can use the labeled tweets as another training set to fit a model, and that model predict the political party label of the unlabeled tweets. In other word, we get the help of a semi-supervised learning system to cathegorize our dataset based on the political parties.

**The below code can take at least 10 Hours to run. For the ease of analysis, the result of the following cell is saved and provided in the Data folder of this project**

In [63]:
# This cell will take couple of hours to run

for i in range(df_train.shape[0]):
    if any(ele in df_train.loc[i,'Full Text'] for ele in liberal_party['Name']):
        df_train.loc[i,'party'] = 'liberal'
    elif any(ele in df_train.loc[i,'Full Text'] for ele in conservative_party['Name']):
        df_train.loc[i,'party'] = 'conservative'
    elif any(ele in df_train.loc[i,'Full Text'] for ele in ndp_party['Name']):
        df_train.loc[i,'party'] = 'ndp'

In [64]:

# Save a Backup of the preprocessing to the folder
df_train.to_csv('Data/df_train_labeled.csv')

In [2]:
# Can load the backup file and continue from here on
df_train_labeled = pd.read_csv('Data/df_train_labeled.csv')
df_train_labeled.drop('Unnamed: 0', axis = 1, inplace=True)

# TF-IDF Vectorizer and Model Training

TF-IDF (Term Frequency-Inverse Document Frequency) is a numerical statistic that is used to reflect the importance of a word in a document or corpus (collection of documents). It is a popular technique used in text mining, information retrieval, and natural language processing.

TF-IDF is calculated by multiplying two values: term frequency (TF) and inverse document frequency (IDF).
- Term frequency (TF) is the number of times a word (term) appears in a document. The more times a word appears in a document, the more important it is assumed to be in that document.

- Inverse document frequency (IDF) is a measure of how important a word is in the entire corpus. IDF is calculated by taking the logarithm of the ratio of the total number of documents in the corpus to the number of documents containing the word. The less frequently a word appears in the corpus, the higher its IDF value.

The TF-IDF vectorizer is useful for text classification and other natural language processing tasks because it can convert raw text data into a format that can be used by machine learning algorithms to make predictions. By using TF-IDF scores to represent text documents, machine learning algorithms can more easily identify patterns and relationships between words, and use these to make accurate predictions.

Ensemble learning is a machine learning technique that involves combining multiple models to improve prediction accuracy and robustness. Instead of relying on a single model to make predictions, ensemble learning involves training multiple models on the same dataset and then aggregating their predictions in some way to make a final prediction.

There are several different types of ensemble learning techniques, but two of the most common are:

- Bagging: In bagging (short for bootstrap aggregating), multiple models are trained on different subsets of the data, and their predictions are averaged to make a final prediction. Bagging is commonly used with decision trees, where each model is trained on a random subset of the features, to reduce overfitting.

- Boosting: In boosting, multiple weak models are trained sequentially, with each new model learning from the errors of the previous model. The final prediction is then based on a weighted average of the predictions of all the models. Boosting is commonly used with decision trees, neural networks, and other models.

Ensemble learning can lead to significant improvements in prediction accuracy, especially when used with weak models that are prone to overfitting or underfitting. It can also help to improve model robustness by reducing the impact of individual model errors or biases.

Here we are going with the **Bagging** approach for our label prediction.

**Be careful with running the following code, it will take over 6 hours depending on your computation power, the saved model is provided in the Model folder for your ease**

In [41]:
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
classes = df_train_labeled['party'].values
import pandas as pd

# Split the X and y (target)
X = df_train_labeled[(df_train_labeled['Full Text2'].notna()) & (df_train_labeled['party'] != 'none')]['Full Text2']
y = df_train_labeled[(df_train_labeled['Full Text2'].notna()) & (df_train_labeled['party'] != 'none')]['party']


# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the TF-IDF vectorizer
tfidf = TfidfVectorizer(stop_words = "english")

# Vectorize the training data
X_train_tfidf = tfidf.fit_transform(X_train)

# Define the classifiers
rf = RandomForestClassifier(random_state=42)
mnb = MultinomialNB()
gnb = GaussianNB()

# Define the parameters for GridSearchCV for each classifier
rf_params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [20, 50, 100]
}

mnb_params = {
    'alpha': [0.005, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
}

gnb_params = {
    'var_smoothing': [1e-6, 1e-5, 1e-4]
}

# Define the GridSearchCV object for each classifier
rf_grid = GridSearchCV(rf, rf_params, cv=5)
mnb_grid = GridSearchCV(mnb, mnb_params, cv=5)
gnb_grid = GridSearchCV(gnb, gnb_params, cv=5)

# Fit the GridSearchCV object to the training data for each classifier
rf_grid.fit(X_train_tfidf, y_train)
mnb_grid.fit(X_train_tfidf, y_train)
gnb_grid.fit(X_train_tfidf.toarray(), y_train)

# Print the best parameters and best score for each classifier
print('Random Forest: Best parameters:', rf_grid.best_params_)
print('Random Forest: Best score:', rf_grid.best_score_)
print('Multinomial Naive Bayes: Best parameters:', mnb_grid.best_params_)
print('Multinomial Naive Bayes: Best score:', mnb_grid.best_score_)
print('Gaussian Naive Bayes: Best parameters:', gnb_grid.best_params_)
print('Gaussian Naive Bayes: Best score:', gnb_grid.best_score_)

# Get the best estimator for each classifier
best_rf = rf_grid.best_estimator_
best_mnb = mnb_grid.best_estimator_
best_gnb = gnb_grid.best_estimator_

# Define the voting classifier
voting_clf = VotingClassifier(estimators=[('rf', best_rf), ('mnb', best_mnb), ('gnb', best_gnb)])

# Get the cross validation score of the voting classifier
scores = cross_val_score(voting_clf, tfidf.transform(X).todense(), y, cv=5)

# Print the cross validation scores
print('Cross validation scores:', scores)

# Fit the voting classifier to the training data
voting_clf.fit(X_train_tfidf.todense(), y_train)

# Make predictions on the test data
y_pred = voting_clf.predict(tfidf.transform(X_test).todense())

# Print the classification report and accuracy score
print('Classification report:\n', classification_report(y_test, y_pred))
print('Accuracy:', accuracy_score(y_test, y_pred))

Random Forest, Multinomial Naive Bayes, and Gaussian Naive Bayes are all popular algorithms used in ensemble learning for data labeling. Each of these algorithms has unique characteristics that make it suitable for different types of data and applications.

#### Random Forest:
Random Forest is a type of decision tree-based algorithm that uses an ensemble of decision trees to make predictions. Each decision tree is trained on a random subset of the features and data, making it less prone to overfitting and more robust to outliers. Random Forest can be used for both classification and regression tasks and has proven to be an effective algorithm for a wide range of applications.

#### Multinomial Naive Bayes:
Multinomial Naive Bayes is a probabilistic algorithm that is particularly suited for text classification tasks. It is based on Bayes' theorem and assumes that the probability of each feature value is conditionally independent of all other features given the class variable. This assumption allows the algorithm to work well with high-dimensional datasets such as text data.

#### Gaussian Naive Bayes:
Gaussian Naive Bayes is a variant of Naive Bayes that is used for continuous data. It assumes that the features are normally distributed and independent of each other given the class variable. Gaussian Naive Bayes can be used for both classification and regression tasks and is particularly suited for applications with continuous data such as sensor data or image data.

In an ensemble learning setting, Gaussian Naive Bayes can be combined with other algorithms to improve the accuracy and robustness of the model. One common approach is to use Gaussian Naive Bayes as a base model and combine it with other models such as Random Forest or Multinomial Naive Bayes to create a more diverse ensemble.

Overall, the choice of algorithm(s) for ensemble learning depends on the specific characteristics of the dataset and the application requirements. By combining multiple models, ensemble learning can help to improve the accuracy and robustness of the model, leading to better performance and more reliable predictions. The choice of these three algorithm has been made according to the research papers in this area.

The next step is to run the cross-validation score and accuracy test on the model

In [48]:
# with the following code, we have have access to the paramteres of each model in the voting classifier
best_rf = voting_clf.named_estimators.rf
best_mnb = voting_clf.named_estimators.mnb
best_gnb = voting_clf.named_estimators.gnb



In [49]:
#  We can see the cross validation score of the model through:
scores = cross_val_score(voting_clf, X_train_tfidf.todense(), y_train, cv=5)


print('Cross validation scores:', scores)





Cross validation scores: [0.92064195 0.91698949 0.91853901 0.91743221 0.91875138]






Cross validation scores: [0.91228556 0.90896514 0.90702822 0.91256226 0.91033624]


The score of a voting classifier represents its overall accuracy in predicting the correct class labels for a given set of data. This score is typically measured using a metric such as accuracy, precision, recall, or F1 score, depending on the specific requirements of the problem at hand. The score of a voting classifier can be used to evaluate its performance and compare it against other classifiers or models.

In [26]:
y_pred = voting_clf.predict(tfidf.transform(X_test).todense())

# Print the classification report and accuracy score
print('Classification report:\n', classification_report(y_test, y_pred))
print('Accuracy:', accuracy_score(y_test, y_pred))



Classification report:
               precision    recall  f1-score   support

conservative       0.91      0.90      0.91      2593
     liberal       0.91      0.98      0.94      4764
         ndp       0.97      0.77      0.86      1678

    accuracy                           0.92      9035
   macro avg       0.93      0.88      0.90      9035
weighted avg       0.92      0.92      0.92      9035

Accuracy: 0.9182069728832318


The classification report shows the precision, recall, and F1-score of a classifier for each class. In this case, we have a three-class problem with precision scores of 0.91, 0.91, and 0.97 for the first, second, and third classes, respectively. This indicates that the classifier is quite accurate in predicting the correct class for each instance, particularly for the third class.

The recall scores are 0.90, 0.98, and 0.77 for the first, second, and third classes, respectively. This indicates that the classifier is able to correctly identify most instances of the second class, but is less effective at identifying instances of the first and third classes.

The F1-scores are 0.91, 0.94, and 0.86 for the first, second, and third classes, respectively. The F1-score is the harmonic mean of precision and recall, and provides a measure of the overall accuracy of the classifier for each class. The highest F1-score is obtained for the second class, indicating that the classifier is most effective at predicting this class.

Overall, the classification report suggests that the classifier is quite accurate in predicting the correct class labels for the second class, but is less effective for the first and third classes. The accuracy score of 0 indicates that the classifier did not correctly predict any instances, but this is likely due to the fact that the accuracy score was calculated on a specific subset of the data and may not be representative of the overall performance of the classifier. Further analysis and evaluation would be needed to fully assess the performance of the classifier for this three-class problem.

On the other hand, the best practice is to get the confusion matrix for a balanced amount of data through the classes.

In [40]:
from sklearn.metrics import classification_report, confusion_matrix


conf_matrix = confusion_matrix(y_test, y_pred)

print('Confusion Matrix:')
print(conf_matrix)

Confusion Matrix:
[[2328  247   18]
 [  68 4674   22]
 [ 152  232 1294]]


In order to make it easy to run the model at anytime and saving time, we use pickle library to get a export and backup of te trained model and its parameter and future use.

In [6]:
import pickle

pickle.dump(voting_clf, open("Model/model_clf_ensemble", "wb"))


In [7]:
pickle.dump(tfidf, open("Model/tfidf", "wb"))

As the model provide with good score and accuracy, we pass in the unlabeled tweets of the training set to categorize the political party that their are talking about.
Once this step is done, we will have train dataset (which is 80% of the total data that we initially collect) and we can move them for the sentimental analysis.

In [84]:
X_predict = df_train_labeled[(df_train_labeled['Full Text2'].notna()) & (df_train_labeled['party'] == 'none')]['Full Text2']

X_predict_tfidf=tfidf.transform(X_predict)

df_party_predict=voting_clf.predict(X_predict_tfidf.todense())






In [129]:
# temporarily splitting the the training data into labeled and not labeled
df_known_label = df_train_labeled[(df_train_labeled['Full Text2'].notna()) & (df_train_labeled['party'] != 'none')]
df_unknown_label = df_train_labeled[(df_train_labeled['Full Text2'].notna()) & (df_train_labeled['party'] == 'none')]
                                  

In [136]:
# adding the predicted y (political party label in the above cell df_party_predict) to the unlabeled data
counter = 0
for i in test.index:
    df_unknown_label.loc[i,'party'] = df_party_predict[counter]
    
    counter = counter + 1


In [139]:
# merging the the splitted dataset together to get the full training dataset again
df_train_for_transformer = df_known_label.append(df_unknown_label)

  df_train_for_transformer = df_known_label.append(df_unknown_label)


The last step is to save the data into a csv file to pass it to the Sentimental analysis notebook:

In [172]:
df_train_for_transformer = df_train_for_transformer.sort_index()
df_train_for_transformer.to_csv('Data/df_train_for_transformer.csv')

# Test Dataset prediction

In order to see of the model can provide a good estimate on the overal election results outcome, we feed the non-seen test set that we initially split from the raw dataset. All the visualizations are provided after the sentimental analysis in a different notebook. Here we only run the trained model on the test set.

In [152]:
df_test.head()

Unnamed: 0,Tweet ID,Tweet Date,Full Text,Likes_count,Retweet_count,Author name,Author ID,Author Follower,Author Friends,Retweet_status
0,656289970479611904,2015-10-20 02:04:56+00:00,RT @HuffPostCanada: #Elxn42\nEarly numbers in ...,0,21,RTCP24 NEWS,204195016,1230,540,False
1,631864164777070592,2015-08-13 16:25:31+00:00,RT @jcallaghan2: #Pierrefonds removing concret...,0,2,Jim Crowell,76333343,270,467,False
2,656312883849113600,2015-10-20 03:35:59+00:00,RT @JJVenky: After being called a ‘radical ide...,0,1,Freshwater fELA,632592632,2662,2302,False
3,645292291473567744,2015-09-19 17:44:05+00:00,Government should support private-sector inves...,0,0,The Sun,185269740,2251,380,False
4,656209902499725312,2015-10-19 20:46:47+00:00,I just voted..... did you? #canadavotes #cdnpo...,0,0,🦋 Jimmy Dean Levi 💎💎 💎,1087785944,1815,1576,False


We use the same TF-IDF vectorizer and model to label the test set and save it for the sentimental analysis

In [153]:
X_test = df_test['Full Text']

X_predict_test_tfidf=tfidf.transform(X_test)

df_test_party_predict=voting_clf.predict(X_predict_test_tfidf.todense())



In [160]:
df_test['party'] = df_test_party_predict

In [169]:
df_test.to_csv('Data/df_test_for_transformer.csv')

# Label prediction of the 43rd federal election dataset

It is desired to see if the whole process can predict the result of another Federal of Canada. Thus, the 43rd Federal election tweets that were extracted earlier from the twitter were also fed into the model and labeled using the same model.
 All the visualization are provided in another notebook after the sentimental analysis.

In [20]:
elxn43 = pd.read_csv('Data/test_elnx43.csv')

elxn43 = elxn43[elxn43['Full Text'] != '0']
elxn43.reset_index(drop = True, inplace = True)
elxn43.drop('Unnamed: 0', axis = 1, inplace=True)


In [8]:
# loading the model
tfidf = pickle.load(open("Model/tfidf", 'rb'))
voting_clf = pickle.load(open("Model/model_clf_ensemble", 'rb'))

In [22]:
# Predict the labels after vectorizing
X_test_elxn43 = elxn43['Full Text']

X_predict_test_tfidf_elxn43=tfidf.transform(X_test_elxn43)

df_test_party_predict_elxn43=voting_clf.predict(X_predict_test_tfidf_elxn43.todense())



In [23]:
elxn43['party'] = df_test_party_predict_elxn43

In [25]:
`elxn43.to_csv('Data/elxn43_labeled_for_transformer.csv')