## Basic Classification Pipeline (without nested cross-validation)

In [138]:
import numpy as np
import pandas as pd #to work with csv files

from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

#matplotlib imports are used to plot confusion matrices for the classifiers
import matplotlib as mpl 
import matplotlib.cm as cm 
import matplotlib.pyplot as plt 

#import feature extraction methods from sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import stop_words

#pre-processing of text
import string
import re

#import classifiers from sklearn
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

#import different metrics to evaluate the classifiers
from sklearn.metrics import accuracy_score
#from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix 
from sklearn import metrics

#import time function from time module to track the training duration
from time import time

In [139]:
tweets = pd.read_csv('COVID19_Dataset-text_labels_only.csv')

In [140]:
def clean_text(str_list, lemmatize = True):
    clean_list = []
    
    for text in str_list:
        # to drop pound sign from hash tags
        text = re.sub(r'#', '', text)
        words = word_tokenize(text)
        clean_words = []
        
        for word in words:            
            # drop words with fewer than 2 characters; drop any punctuation "words"
            if (len(word) > 1) and (re.match(r'^\w+$', word)):

                if lemmatize:
                    lemmatizer = WordNetLemmatizer()
            
                clean_words.append(word)
        clean_text = ' '.join(clean_words)
        clean_list.append(clean_text)
    
    return clean_list

In [141]:
# run on all tweets
tweets['clean_tweet'] = clean_text(tweets['Tweet'])

In [142]:
import sklearn
#from sklearn.cross_validation import train_test_split
from sklearn.model_selection import train_test_split

#Step 1: train-test split
X = tweets['clean_tweet'] #the column text contains textual data to extract features from
y = tweets['Is_Unreliable'] #this is the column we are learning to predict. 

print(X.shape, y.shape)
# split X and y into training and testing sets. By default, it splits 75% training and 25% test
#random_state=1 for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(560,) (560,)
(420,) (420,)
(140,) (140,)


In [144]:
# Vectorize train and test data
vect = CountVectorizer(lowercase = True,
                        stop_words = 'english',
                        ngram_range = (1,1)) #instantiate a vectoriezer
X_train_dtm = vect.fit_transform(X_train)#use it to extract features from training data
#transform testing data (using training data's features)

# During transforming, the default behavior of CountVectorizer is to ignore words that 
# were not observed during fitting. 

X_test_dtm = vect.transform(X_test)
print(X_train_dtm.shape, X_test_dtm.shape)
#i.e., the dimension of our feature vector is 1994

(420, 1975) (140, 1975)


In [145]:
# Train the classifier and predict for test data
nb = MultinomialNB() #instantiate a Multinomial Naive Bayes model
%time nb.fit(X_train_dtm, y_train)#train the model(timing it with an IPython "magic command")
#calculate evaluation measures:
preds = nb.predict(X_test_dtm)
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_test, preds))

Wall time: 2 ms
              precision    recall  f1-score   support

           0       0.78      0.77      0.77        69
           1       0.78      0.79      0.78        71

    accuracy                           0.78       140
   macro avg       0.78      0.78      0.78       140
weighted avg       0.78      0.78      0.78       140



In [146]:
from sklearn.linear_model import LogisticRegression #import

logreg = LogisticRegression(class_weight="balanced") #instantiate a logistic regression model
logreg.fit(X_train_dtm, y_train) #fit the model with training data

#calculate evaluation measures:
preds = logreg.predict(X_test_dtm) #Make predictions on test data
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.80      0.74      0.77        69
           1       0.76      0.82      0.79        71

    accuracy                           0.78       140
   macro avg       0.78      0.78      0.78       140
weighted avg       0.78      0.78      0.78       140



In [147]:
from sklearn.svm import LinearSVC

classifier = LinearSVC(class_weight='balanced') #instantiate a logistic regression model
classifier.fit(X_train_dtm, y_train) #fit the model with training data

#calculate evaluation measures:
preds = logreg.predict(X_test_dtm) #Make predictions on test data
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.80      0.74      0.77        69
           1       0.76      0.82      0.79        71

    accuracy                           0.78       140
   macro avg       0.78      0.78      0.78       140
weighted avg       0.78      0.78      0.78       140

