In [1]:
import re
import string
import pandas as pd
import nltk
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
nltk.download('stopwords')
nltk.download('words')
nltk.download('wordnet')

# Removing @Mentions, URL links, Punctuations and numbers
id_remove = lambda x: re.sub(r'@\w+', '', x.lower())
url_remove = lambda x: re.sub('https?://[A-Za-z0-9./]+','',x)
alphanumeric = lambda x: re.sub(r"""\w*\d\w*""", ' ', x)
punc = lambda x: x.translate(str.maketrans('', '', string.punctuation))

# Removing stop words and stem the words
def normalize_corpus(corpus,text_stemming=False, stopword_removal=True, text_lemmatize=True, dict_check=True):
    from nltk.tokenize.toktok import ToktokTokenizer
    from nltk.stem import WordNetLemmatizer
    stopwords = nltk.corpus.stopwords.words('english')
    tokenizer = ToktokTokenizer()
    lemmatizer = WordNetLemmatizer()
    words = set(nltk.corpus.words.words())
    normalized_corpus = []
    # normalize each document in the corpus
    for doc in corpus:
        # remove extra newlines
        doc = doc.translate(doc.maketrans("\n\t\r", "   "))
        
        # stem words
        if text_stemming:
            ps = nltk.stem.snowball.SnowballStemmer('english')
            doc = ' '.join([ps.stem(word) for word in doc.split()])
         
        # lemmatize words
        if text_lemmatize:
            doc=' '.join([lemmatizer.lemmatize(word) for word in doc.split()])

        # check if the words exist in english dictionary
        if dict_check:
            doc=" ".join(w for w in nltk.wordpunct_tokenize(doc) \
                if w.lower() in words or not w.isalpha())
            
        # remove stop words
        if stopword_removal:
            tokens = tokenizer.tokenize(doc)
            tokens = [token.strip() for token in tokens]
            filtered_tokens = [token for token in tokens if token not in stopwords]
            doc = ' '.join(filtered_tokens) 
            
        normalized_corpus.append(doc)
   
    return normalized_corpus

# This function deletes empty documents after cleaning
def delete_zero(df):
    zero=[]
    for i in df.index:
        if len(df.text[i])==0:
            zero.append(i)
    df=df.drop(zero)
    return df

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [35]:
from sklearn.datasets import fetch_20newsgroups
# Preparing train & test data for 20 news group 
newsgroups_train = fetch_20newsgroups(subset='train', remove = ('headers', 'footers', 'quotes'))
newsgroups_train = pd.DataFrame({'text': newsgroups_train.data, 'target': newsgroups_train.target})
newsgroups_test = fetch_20newsgroups(subset='test', remove = ('headers', 'footers', 'quotes'))
newsgroups_test = pd.DataFrame({'text': newsgroups_test.data, 'target': newsgroups_test.target})

# Removing numbers, upper case letters and punctuations
newsgroups_train['text'] = newsgroups_train.text.map(id_remove).map(url_remove).map(alphanumeric).map(punc)
newsgroups_test['text'] = newsgroups_test.text.map(id_remove).map(url_remove).map(alphanumeric).map(punc)

# Cleaning texts
newsgroups_train['text']=normalize_corpus(newsgroups_train.text)
newsgroups_test['text']=normalize_corpus(newsgroups_test.text)

newsgroups_train=delete_zero(newsgroups_train)
newsgroups_test=delete_zero(newsgroups_test)

In [20]:
# Preparing train & test data for Sentiment140
sentiment_train = pd.read_csv("training.1600000.processed.noemoticon.csv",
                names=['polarity', 'id', 'date', 'query', 'user', 'text'],
                encoding='latin-1')
sentiment_train = sentiment_train.drop(columns=['id', 'date', 'query', 'user'])
sentiment_train.polarity = sentiment_train.polarity.replace({0: 0, 4: 1})

sentiment_test = pd.read_csv("new_testdata.manual.2009.06.14.csv",
                names=['polarity', 'id', 'date', 'query', 'user', 'text'],
                encoding='latin-1')
sentiment_test = sentiment_test.drop(columns=['id', 'date', 'query', 'user'])
sentiment_test.polarity = sentiment_test.polarity.replace({0: 0, 4: 1})

# sampling
sentiment_train = sentiment_train.sample(n=50000)

# Cleaning Data
sentiment_train['text'] = sentiment_train['text'].astype('str')
sentiment_test['text'] = sentiment_test['text'].astype('str')
sentiment_train['text'] = sentiment_train.text.map(id_remove).map(url_remove).map(alphanumeric).map(punc)
sentiment_test['text'] = sentiment_test.text.map(id_remove).map(url_remove).map(alphanumeric).map(punc)
sentiment_train['text']=normalize_corpus(sentiment_train.text)
sentiment_test['text']=normalize_corpus(sentiment_test.text)
sentiment_train=delete_zero(sentiment_train)
sentiment_test=delete_zero(sentiment_test)

In [3]:
class GaussianNaiveBayes:
    
    def __init__(self):
        return
    
    def fit(self, x, y):
        N, D = x.shape
        C = np.max(y) + 1
        mu,mu2, sigma = np.zeros((C,D)), np.zeros((C,D)), np.zeros((C,D))
        Nc = np.zeros(C) 
        for c in range(C):
            x_c = x[y == c]                           
            Nc[c] = x_c.shape[0]                      
            x_c_squared = x_c.copy()
            x_c_squared.data **= 2
            for ic in range(D):               
                mu[c,ic] = (x_c[:,ic].sum()+1) / (Nc[c]+C)
                mu2[c,ic]=((x_c_squared[:,ic]).sum()+1) / (Nc[c]+C)
                sigma[c,ic]=np.sqrt(mu2[c,ic]-mu[c,ic]**2)
        self.mu = mu                                  
        self.sigma = sigma                         
        self.pi =(Nc+1)/(N+C)
        return self


def logsumexp(Z):                                                
    Zmax = np.max(Z,axis=0)[None,:]                              
    log_sum_exp = Zmax + np.log(np.sum(np.exp(Z - Zmax), axis=0))
    return log_sum_exp

def predict(self, xt):
    Nt, D = xt.shape
    log_prior = np.log(self.pi)[:, None] 
    C,Dt=self.mu.shape
    log_likelihood=np.zeros((C,Nt))
    for i in range(Nt):
          xt_i=xt.tocsr()[i,:].todense()
          for j in range(C):
              log_liklihood_i=-.5 * np.log(2*np.pi) - np.log(self.sigma[j,:]) -.5 * (np.power((xt_i - self.mu[j,:])/self.sigma[j,:],2))
              log_likelihood[j,i] = np.sum(log_liklihood_i)
    log_posterior = log_prior + log_likelihood
    posterior = np.exp(log_posterior - logsumexp(log_posterior))
    y_pred = np.argmax(posterior.T, 1)
    return posterior.T,y_pred
    
GaussianNaiveBayes.predict = predict

In [4]:
class MultinomialNaiveBayes:
    def __init__(self, alpha=1):
        self.alpha=alpha
        return
    
    def fit(self, x, y):
        N, D = x.shape
        C = np.max(y) + 1
        self.theta = np.zeros((C,D))
        self.prior = np.zeros(C)
        for c in range(C):
            x_c = x[y == c]
            self.theta[c,:] = (np.sum(x_c, 0) + self.alpha) / (np.sum(x_c) + self.alpha * D)
            self.prior[c] = x_c.shape[0] / len(y)
        return self
    
    def predict(self, x_test): 
        N = x_test.shape[0]
        log_prior = np.log(self.prior)[:, None]
        C,Dt=self.theta.shape
        log_likelihood=np.zeros((C,N))
        log_theta=np.log(self.theta)
        for i in range(N):
             xti=x_test.tocsr()[i,:].todense()
             for j in range(C):
                 log_likelihood[j,i] = np.dot(xti,log_theta[j,:])
        log_posterior = log_prior + log_likelihood
        posterior = log_posterior
        y_pred = np.argmax(posterior.T, 1)
        return posterior.T, y_pred 

In [5]:
def evaluate_acc( x_test, y_test, y_pred):
  accuracy = np.sum(y_pred == y_test)/y_test.shape[0]
  TP1=np.where(y_test==1 )
  TP2=np.where(y_pred == y_test)
  TP=np.intersect1d(TP1,TP2)
  FN1=np.where(y_test==1 )
  FN2=np.where(y_pred==0 )
  FN=np.intersect1d(FN1,FN2)
  FP1=np.where(y_test==0 )
  FP2=np.where(y_pred==1 )
  FP=np.intersect1d(FP1,FP2)
  TN1=np.where(y_test==0 )
  TN2=np.where(y_pred==0 )
  TN=np.intersect1d(TN1,TN2)
  recall = TP.shape[0]/(FN.shape[0]+TP.shape[0]+0.001)
  precision=TP.shape[0]/(FP.shape[0]+TP.shape[0]+0.001)
  Selectivity=TN.shape[0]/(TN.shape[0]+FP.shape[0]+0.001)
  N_predictive_v=TN.shape[0]/(TN.shape[0]+FN.shape[0]+0.001)
  return accuracy,recall,precision,Selectivity,N_predictive_v

In [18]:
sentiment_train1=sentiment_train.sample(n=1000)

vectorizer = TfidfVectorizer(max_df=0.95, min_df=5)
vectors_train = vectorizer.fit_transform(sentiment_train1.text)
vectors_test = vectorizer.transform(sentiment_test.text)

x_train, y_train = vectors_train, sentiment_train1.polarity.to_numpy()
x_test, y_test= vectors_test, sentiment_test.polarity.to_numpy()
model = GaussianNaiveBayes()
model.fit(x_train, y_train)
y_prob, y_pred = model.predict(x_test)
accuracy,_,_,_,_=evaluate_acc(x_test,y_test,y_pred)
print(f'test accuracy: {accuracy}')

confusion_word_inex=[]
Nt,D=np.shape(vectors_train)
for i in range(D):
    x_train, y_train = vectors_train.tocsr()[:,i], sentiment_train1.polarity.to_numpy()
    x_test, y_test= vectors_test.tocsr()[:,i], sentiment_test.polarity.to_numpy()
    model = GaussianNaiveBayes()
    model.fit(x_train, y_train)
    y_prob, y_pred = model.predict(x_test)
    accuracy,_,_,_,_ = evaluate_acc(x_test,y_test,y_pred)
    if accuracy<0.5:
        confusion_word_inex.append(i)
        #print('yes')
feature_word=vectorizer.get_feature_names()    
Confusion_word=[]
Dc=len(confusion_word_inex)
for i in range(Dc):
    kkk=confusion_word_inex[i]
    Confusion_word.append(feature_word[confusion_word_inex[i]])
def clean_confusion(corpus,Confusion_word):
    clean_corpus = []
    # normalize each document in the corpus
    for doc in corpus:
        doc1=''
        for word in doc.split():
            if word not in Confusion_word:
                doc1=doc1+' '+word
        clean_corpus.append(doc1)
    return clean_corpus
            
print(len(Confusion_word))

sentiment_train['text']=clean_confusion(sentiment_train.text,Confusion_word)
sentiment_test['text']=clean_confusion(sentiment_test.text,Confusion_word)

def delete_zero(df):
    zero=[]
    for i in df.index:
        if len(df.text[i])<2:
            zero.append(i)
    df=df.drop(zero)
    return df

sentiment_test=delete_zero(sentiment_test)
sentiment_train=delete_zero(sentiment_train)


vectorizer = TfidfVectorizer(max_df=0.95, min_df=5)
vectors_train = vectorizer.fit_transform(sentiment_train.text)
vectors_test = vectorizer.transform(sentiment_test.text)


    
x_train, y_train = vectors_train, sentiment_train.polarity.to_numpy()
x_test, y_test= vectors_test, sentiment_test.polarity.to_numpy()
model = GaussianNaiveBayes()
model.fit(x_train, y_train)
y_prob, y_pred = model.predict(x_test)

accuracy,_,_,_,_ = evaluate_acc(x_test,y_test,y_pred)
print(f'test accuracy: {accuracy}')


test accuracy: 0.6836158192090396




122
test accuracy: 0.7082152974504249


In [21]:
sentiment_train1=sentiment_train.sample(n=1000)

vectorizer = CountVectorizer(max_df=0.95, min_df=5)
vectors_train = vectorizer.fit_transform(sentiment_train1.text)
vectors_test = vectorizer.transform(sentiment_test.text)

x_train, y_train = vectors_train, sentiment_train1.polarity.to_numpy()
x_test, y_test= vectors_test, sentiment_test.polarity.to_numpy()
model = MultinomialNaiveBayes()
model.fit(x_train, y_train)
y_prob, y_pred = model.predict(x_test)
accuracy,_,_,_,_=evaluate_acc(x_test,y_test,y_pred)
print(f'test accuracy: {accuracy}')

confusion_word_inex=[]
Nt,D=np.shape(vectors_train)
for i in range(D):
    x_train, y_train = vectors_train.tocsr()[:,i], sentiment_train1.polarity.to_numpy()
    x_test, y_test= vectors_test.tocsr()[:,i], sentiment_test.polarity.to_numpy()
    model = GaussianNaiveBayes()
    model.fit(x_train, y_train)
    y_prob, y_pred = model.predict(x_test)
    accuracy,_,_,_,_ = evaluate_acc(x_test,y_test,y_pred)
    if accuracy<0.5:
        confusion_word_inex.append(i)
        #print('yes')
feature_word=vectorizer.get_feature_names()    
Confusion_word=[]
Dc=len(confusion_word_inex)
for i in range(Dc):
    kkk=confusion_word_inex[i]
    Confusion_word.append(feature_word[confusion_word_inex[i]])
def clean_confusion(corpus,Confusion_word):
    clean_corpus = []
    # normalize each document in the corpus
    for doc in corpus:
        doc1=''
        for word in doc.split():
            if word not in Confusion_word:
                doc1=doc1+' '+word
        clean_corpus.append(doc1)
    return clean_corpus
            
print(len(Confusion_word))

sentiment_train['text']=clean_confusion(sentiment_train.text,Confusion_word)
sentiment_test['text']=clean_confusion(sentiment_test.text,Confusion_word)

def delete_zero(df):
    zero=[]
    for i in df.index:
        if len(df.text[i])<2:
            zero.append(i)
    df=df.drop(zero)
    return df

sentiment_test=delete_zero(sentiment_test)
sentiment_train=delete_zero(sentiment_train)


vectorizer = CountVectorizer(max_df=0.95, min_df=5)
vectors_train = vectorizer.fit_transform(sentiment_train.text)
vectors_test = vectorizer.transform(sentiment_test.text)


    
x_train, y_train = vectors_train, sentiment_train.polarity.to_numpy()
x_test, y_test= vectors_test, sentiment_test.polarity.to_numpy()
model = MultinomialNaiveBayes()
model.fit(x_train, y_train)
y_prob, y_pred = model.predict(x_test)

accuracy,_,_,_,_ = evaluate_acc(x_test,y_test,y_pred)
print(f'test accuracy: {accuracy}')

test accuracy: 0.6751412429378532




113
test accuracy: 0.7840909090909091


In [39]:

vectorizer = CountVectorizer(max_df=900,min_df=50)
newsgroups_train1=newsgroups_train.sample(n=1500)
newsgroups_test1=newsgroups_test.sample(n=1000)

vectors_train = vectorizer.fit_transform(newsgroups_train1.text)
vectors_test = vectorizer.transform(newsgroups_test1.text)

x_train, y_train = vectors_train, newsgroups_train1.target.to_numpy()
x_test, y_test= vectors_test, newsgroups_test1.target.to_numpy()
model_MNB = MultinomialNaiveBayes()
model_MNB.fit(x_train, y_train)
y_prob_MNB, y_pred_MNB = model_MNB.predict(x_test)
accuracy_MNB,recall_MNB,precision_MNB,Selectivity_MNB,N_predictive_v_MNB=evaluate_acc(x_test, y_test, y_pred_MNB)
print(f'test accuracy: {accuracy_MNB}') 

confusion_word_inex=[]
Nt,D=np.shape(vectors_train)
for i in range(D):
    x_train, y_train = vectors_train.tocsr()[:,i], newsgroups_train1.target.to_numpy()
    x_test, y_test= vectors_test.tocsr()[:,i], newsgroups_test1.target.to_numpy()
    model = MultinomialNaiveBayes()
    model.fit(x_train, y_train)
    y_prob, y_pred = model.predict(x_test)
    #y_pred = np.argmax(y_prob, 1)
    accuracy = np.sum(y_pred == y_test)/y_pred.shape[0]
    if accuracy< 0.1:
        confusion_word_inex.append(i)
        #print('yes')
        
feature_word=vectorizer.get_feature_names()    
Confusion_word=['dd']
Dc=len(confusion_word_inex)
for i in range(Dc):
    kkk=confusion_word_inex[i]
    Confusion_word.append(feature_word[confusion_word_inex[i]])
def clean_confusion(corpus,Confusion_word):
    clean_corpus = []
    # normalize each document in the corpus
    for doc in corpus:
        doc1=''
        for word in doc.split():
            if word not in Confusion_word:
                doc1=doc1+' '+word
        clean_corpus.append(doc1)
    return clean_corpus
            
#%%
newsgroups_train['text']=clean_confusion(newsgroups_train.text,Confusion_word)
newsgroups_test['text']=clean_confusion(newsgroups_test.text,Confusion_word)


def delete_zero(df):
    zero=[]
    for i in df.index:
        if len(df.text[i])<2:
            zero.append(i)
    df=df.drop(zero)
    return df

newsgroups_test2=delete_zero(newsgroups_test)
newsgroups_train=delete_zero(newsgroups_train)




from sklearn.feature_extraction.text import CountVectorizer
count_vectorizer = CountVectorizer(max_df=900, min_df=5)
count_vectors_train = count_vectorizer.fit_transform(newsgroups_train.text)
count_vectors_test = count_vectorizer.transform(newsgroups_test.text)
    
x_train, y_train = count_vectors_train, newsgroups_train.target.to_numpy()
x_test, y_test= count_vectors_test, newsgroups_test.target.to_numpy()
model_MNB = MultinomialNaiveBayes()
model_MNB.fit(x_train, y_train)
y_prob_MNB, y_pred_MNB = model_MNB.predict(x_test)
accuracy_MNB,recall_MNB,precision_MNB,Selectivity_MNB,N_predictive_v_MNB=evaluate_acc(x_test, y_test, y_pred_MNB)
print(f'test accuracy: {accuracy_MNB}') 


test accuracy: 0.247




test accuracy: 0.6150570133260064
