<a href="https://colab.research.google.com/github/ssvadla/Research_01/blob/main/EM_NB_02.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:


  
# The code in this module is adapted from scikit-learn's Naive Bayes classifier.
# The original code has been altered to implement the semi-supervised version of
# Naive Bayes described in Section 5.3.1 of the following paper:

# K. Nigam, A.K. McCallum, S. Thrun, T. Mitchell (2000). Text classification
# from labeled and unlabeled documents using EM. Machine Learning 39(2-3),
# pp. 103-134.

#
# Original copyright notice below:
# Author: Vincent Michel <vincent.michel@inria.fr>
#         Minor fixes by Fabian Pedregosa
#         Amit Aides <amitibo@tx.technion.ac.il>
#         Yehuda Finkelstein <yehudaf@tx.technion.ac.il>
#         Lars Buitinck
#         Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
#         (parts based on earlier work by Mathieu Blondel)
#
# License: BSD 3 clause

import numpy as np
from scipy.sparse import issparse

from sklearn.naive_bayes import _BaseDiscreteNB
from sklearn.preprocessing import LabelBinarizer
from sklearn.utils import check_X_y, check_array
from sklearn.utils.extmath import safe_sparse_dot
from sklearn.utils.validation import check_is_fitted

class MultinomialNBSS(_BaseDiscreteNB):
    """
    Semi-supervised Naive Bayes classifier for multinomial models.  Unlabeled
    data must be marked with -1.  In comparison to the standard scikit-learn
    MultinomialNB classifier, the main differences are in the _count and fit
    methods.
    Parameters
    ----------
    alpha : float, optional (default=1.0)
        Additive (Laplace/Lidstone) smoothing parameter
        (0 for no smoothing).
    beta : float, optional (default=1.0)
        Weight applied to the contribution of the unlabeled data
        (0 for no contribution).
    fit_prior : boolean, optional (default=True)
        Whether to learn class prior probabilities or not.
        If false, a uniform prior will be used.
    class_prior : array-like, size (n_classes,), optional (default=None)
        Prior probabilities of the classes. If specified the priors are not
        adjusted according to the data.
    tol : float, optional (default=1e-3)
        Tolerance for convergence of EM algorithm.
    max_iter : int, optional (default=20)
        Maximum number of iterations for EM algorithm.
    verbose : boolean, optional (default=True)
        Whether to output updates during the running of the EM algorithm.
    Attributes
    ----------
    class_log_prior_ : array, shape (n_classes, )
        Smoothed empirical log probability for each class.
    intercept_ : array, shape (n_classes, )
        Mirrors ``class_log_prior_`` for interpreting MultinomialNBSS
        as a linear model.
    feature_log_prob_ : array, shape (n_classes, n_features)
        Empirical log probability of features
        given a class, ``P(x_i|y)``.
    coef_ : array, shape (n_classes, n_features)
        Mirrors ``feature_log_prob_`` for interpreting MultinomialNBSS
        as a linear model.
    class_count_ : array, shape (n_classes,)
        Number of samples encountered for each class during fitting. This
        value is weighted by the sample weight when provided.
    feature_count_ : array, shape (n_classes, n_features)
        Number of samples encountered for each (class, feature)
        during fitting. This value is weighted by the sample weight when
        provided.
    Examples
    --------
    >>> import numpy as np
    >>> X = np.random.randint(5, size=(6, 100))
    >>> y = np.array([1, 2, 3, 4, 5, 6])
    >>> from semi_supervised_naive_bayes import MultinomialNBSS
    >>> clf = MultinomialNBSS()
    >>> clf.fit(X, y)
    MultinomialNBSS(alpha=1.0, class_prior=None, fit_prior=True)
    >>> print(clf.predict(X[2:3]))
    [3]
    Notes
    -----
    For the rationale behind the names `coef_` and `intercept_`, i.e.
    naive Bayes as a linear classifier, see J. Rennie et al. (2003),
    Tackling the poor assumptions of naive Bayes text classifiers, ICML.
    References
    ----------
    C.D. Manning, P. Raghavan and H. Schuetze (2008). Introduction to
    Information Retrieval. Cambridge University Press, pp. 234-265.
    https://nlp.stanford.edu/IR-book/html/htmledition/naive-bayes-text-classification-1.html
    K. Nigam, A.K. McCallum, S. Thrun, T. Mitchell (2000). Text classification
    from labeled and unlabeled documents using EM. Machine Learning 39(2-3),
    pp. 103-134.
    """

    def __init__(self, alpha=1.0, beta=1.0, fit_prior=True, class_prior=None,
                 tol=1e-3, max_iter=20, verbose=True):
        self.alpha = alpha
        self.beta = beta
        self.fit_prior = fit_prior
        self.class_prior = class_prior
        self.tol = tol
        self.max_iter = max_iter
        self.verbose = verbose

    def _count(self, X, Y, U_X=np.array([]), U_prob=np.array([])):
        """Count and smooth feature occurrences."""
        if np.any((X.data if issparse(X) else X) < 0):
            raise ValueError("Input X must be non-negative")

        self.feature_count_ = safe_sparse_dot(Y.T, X)
        self.class_count_ = Y.sum(axis=0)

        if U_X.shape[0] > 0:
            self.feature_count_ += self.beta*safe_sparse_dot(U_prob.T, U_X)
            self.class_count_ += self.beta*U_prob.sum(axis=0)
        else:
            self.feature_count_ = safe_sparse_dot(Y.T, X)
            self.class_count_ = Y.sum(axis=0)

    def _update_feature_log_prob(self, alpha):
        """Apply smoothing to raw counts and recompute log probabilities"""
        smoothed_fc = self.feature_count_ + alpha
        smoothed_cc = smoothed_fc.sum(axis=1)

        self.feature_log_prob_ = (np.log(smoothed_fc) -
                                  np.log(smoothed_cc.reshape(-1, 1)))

    def _joint_log_likelihood(self, X):
        """Calculate the posterior log probability of the samples X"""
        check_is_fitted(self, "classes_")

        X = check_array(X, accept_sparse='csr')
        return (safe_sparse_dot(X, self.feature_log_prob_.T) +
                self.class_log_prior_)

    def partial_fit(self, X, y, classes=None, sample_weight=None):
        """A semi-supervised version of this method has not been implemented.
        """

    def fit(self, X, y, sample_weight=None):
        """Fit semi-supervised Naive Bayes classifier according to X, y
        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples and
            n_features is the number of features.
        y : array-like, shape = [n_samples]
            Target values.  Unlabeled data must be marked with -1.
        sample_weight : array-like, shape = [n_samples], (default=None)
            Weights applied to individual samples (1. for unweighted).
        Returns
        -------
        self : object
        """
        X, y = check_X_y(X, y, 'csr')
        _, n_features = X.shape
        # Unlabeled data are marked with -1
        unlabeled = np.flatnonzero(y == -1)
        labeled = np.setdiff1d(np.arange(len(y)), unlabeled)

        labelbin = LabelBinarizer()
        Y = labelbin.fit_transform(y[labeled])
        self.classes_ = labelbin.classes_
        if Y.shape[1] == 1:
            Y = np.concatenate((1 - Y, Y), axis=1)

        # LabelBinarizer().fit_transform() returns arrays with dtype=np.int64.
        # We convert it to np.float64 to support sample_weight consistently;
        # this means we also don't have to cast X to floating point
        Y = Y.astype(np.float64, copy=False)
        if sample_weight is not None:
            sample_weight = np.atleast_2d(sample_weight)
            Y *= check_array(sample_weight).T

        class_prior = self.class_prior

        # Count raw events from data before updating the class log prior
        # and feature log probas
        n_effective_classes = Y.shape[1]

        alpha = self._check_alpha()
        self._count(X[labeled], Y)


        self._update_feature_log_prob(alpha)
        self._update_class_log_prior(class_prior=class_prior)
        jll = self._joint_log_likelihood(X)
        sum_jll = jll.sum()

        # Run EM algorithm
        if len(unlabeled) > 0:
            self.num_iter = 0
            pred = self.predict(X)
            while self.num_iter < self.max_iter:
                self.num_iter += 1
                prev_sum_jll = sum_jll

                # First, the E-step:
                prob = self.predict_proba(X[unlabeled])

                # Then, the M-step:
                self._count(X[labeled], Y, X[unlabeled], prob)
                self._update_feature_log_prob(self.beta)
                self._update_class_log_prior(class_prior=class_prior)

                jll = self._joint_log_likelihood(X)
                sum_jll = jll.sum()
                if self.verbose:
                    print(
                        'Step {}: jll = {:f}'.format(self.num_iter, sum_jll)
                    )

                if self.num_iter > 1 and prev_sum_jll - sum_jll < self.tol:
                    break

            if self.verbose:
                end_text = 's.' if self.num_iter > 1 else '.'
                print(
                    'Optimization converged after {} '
                    'iteration'.format(self.num_iter)
                    + end_text
                )

        return self


In [3]:
import pandas as pd
from sklearn.model_selection import cross_val_score
from google.colab import drive

drive.mount('/content/drive')

train1 = pd.read_csv('/content/drive/My Drive/Research/train_data1.csv')
train2 = pd.read_csv('/content/drive/My Drive/Research/train_data2.csv')
train3 = pd.read_csv('/content/drive/My Drive/Research/train_data3.csv')
train4 = pd.read_csv('/content/drive/My Drive/Research/train_data4.csv')
train5 = pd.read_csv('/content/drive/My Drive/Research/train_data5.csv')
train6 = pd.read_csv('/content/drive/My Drive/Research/train_data6.csv')
train7 = pd.read_csv('/content/drive/My Drive/Research/train_data7.csv')
train8 = pd.read_csv('/content/drive/My Drive/Research/train_data8.csv')
train9 = pd.read_csv('/content/drive/My Drive/Research/train_data9.csv')
train10 = pd.read_csv('/content/drive/My Drive/Research/train_data10.csv')
train_highKappa = pd.read_csv('/content/drive/My Drive/Research/train_data_highkappa.csv')
train1.head()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0.1,Unnamed: 0,Sentence,Target
0,659,Appellant had stated to the officers that she ...,Invalid
1,3456,We shall discuss the facts more fully in conne...,Others
2,2043,"â€œPerjury is a false statement, either writte...",Invalid
3,3344,The offense is felony theft by false pretext; ...,Issue
4,3231,Numerous contentions urging the commission of ...,Issue


In [4]:
train = train1
train_list = [train2,train3,train4,train5,train6,train7,train8,train9,train10,train_highKappa]
for i in train_list:
  #print(i)
  train = train.append(i)



In [5]:
train.sort_values("Sentence", inplace = True)
print(len(train))


37711


In [6]:
 new_train = train.drop_duplicates(subset ="Sentence")


In [7]:
train = new_train

In [8]:
train['Target'].unique()

array(['Invalid', 'Rule/Law/Holding', 'Facts', 'Analysis', 'Others',
       'Conclusion', 'Issue'], dtype=object)

In [9]:
train['Target']=train['Target'].replace(['Others'],'Invalid')
train['Target'].unique()


array(['Invalid', 'Rule/Law/Holding', 'Facts', 'Analysis', 'Conclusion',
       'Issue'], dtype=object)

In [10]:
#cleaning
import nltk
import re
import string
nltk.download('stopwords')
nltk.download('wordnet')
stopword=nltk.corpus.stopwords.words('english')
from nltk.stem import WordNetLemmatizer
wl= WordNetLemmatizer()

def clean_text(text):
  text="".join([word.lower() for word in text if word not in string.punctuation])
  tokens = re.split('\W+',text)
  text = [wl.lemmatize(word) for word in tokens if word not in stopword]
  return text

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vect = TfidfVectorizer(analyzer = clean_text)
X_tfidf = tfidf_vect.fit_transform(train['Sentence'])
print(X_tfidf.shape)

(4416, 7374)


In [12]:
test = pd.read_csv(r'/content/drive/My Drive/Research/test_data.csv')

test['Target']=test['Target'].replace(['Others'],'Invalid')
test['Sentence'] = test['Sentence'].apply(lambda x: " ".join(x.lower() for x in str(x).split()))
test['Sentence'] = test['Sentence'].str.replace('[^\w\s]','')
from nltk.corpus import stopwords
words = stopwords.words('english')
test['Sentence'] = test['Sentence'].apply(lambda x: " ".join(x for x in x.split() if x not in words))
t_p = tfidf_vect.transform(test['Sentence'])

In [13]:

unlabel = pd.read_csv(r'/content/drive/My Drive/Research/Unlabeled_data.csv')
#unlabel.head()

del unlabel['Complete']
del unlabel['Unnamed: 0']

unlabel['text'] = unlabel['text'].apply(lambda x: " ".join(x.lower() for x in str(x).split()))
unlabel['text'] = unlabel['text'].str.replace('[^\w\s]','')
from nltk.corpus import stopwords
words = stopwords.words('english')
unlabel['text'] = unlabel['text'].apply(lambda x: " ".join(x for x in x.split() if x not in words))


from textblob import TextBlob
from textblob import Word
nltk.download('wordnet')
nltk.download('punkt')
unlabel['text'] = unlabel['text'].apply(lambda x: TextBlob(x).words)
unlabel['text'] = unlabel['text'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x]))

unlabel_1 = unlabel.loc[:500]


def index_reset(unlabel_2):
  unlabel_2.reset_index(inplace=True)
  del unlabel_2['index']
  #print(unlabel_2.head())
  return unlabel_2

unlabel_1 = index_reset(unlabel_1)
unlabel_1_copy = unlabel_1



x_un1 = tfidf_vect.transform(unlabel_1['text'])






[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [14]:
classifier = MultinomialNBSS()

In [15]:
classifier.fit(X_tfidf,train['Target'])

MultinomialNBSS(alpha=1.0, beta=1.0, class_prior=None, fit_prior=True,
                max_iter=20, tol=0.001, verbose=True)

In [17]:
print(classifier.predict(t_p))

['Facts' 'Facts' 'Facts' 'Facts' 'Facts' 'Facts' 'Facts' 'Facts' 'Invalid'
 'Facts' 'Facts' 'Facts' 'Facts' 'Facts' 'Facts' 'Rule/Law/Holding'
 'Facts' 'Facts' 'Facts' 'Facts' 'Facts' 'Facts' 'Invalid' 'Facts' 'Facts'
 'Facts' 'Facts' 'Facts' 'Facts' 'Facts' 'Analysis' 'Conclusion' 'Facts'
 'Facts' 'Facts' 'Facts' 'Facts' 'Facts' 'Facts' 'Facts' 'Facts' 'Facts'
 'Facts' 'Invalid' 'Facts' 'Facts' 'Facts' 'Facts' 'Facts' 'Analysis'
 'Facts' 'Facts' 'Facts' 'Facts' 'Facts' 'Facts' 'Facts' 'Facts' 'Facts'
 'Facts' 'Facts' 'Facts' 'Facts' 'Facts' 'Facts' 'Facts' 'Facts'
 'Analysis' 'Facts' 'Facts' 'Facts' 'Facts' 'Invalid' 'Facts' 'Facts'
 'Facts' 'Facts' 'Facts' 'Facts' 'Facts' 'Invalid' 'Facts' 'Facts' 'Facts'
 'Facts' 'Facts' 'Facts' 'Facts' 'Facts' 'Facts' 'Facts' 'Facts' 'Facts'
 'Facts' 'Facts' 'Facts' 'Facts' 'Facts' 'Invalid' 'Facts' 'Facts' 'Facts'
 'Facts' 'Facts' 'Facts' 'Facts' 'Facts' 'Facts' 'Facts' 'Facts' 'Facts'
 'Facts' 'Facts' 'Facts' 'Facts' 'Facts' 'Facts' 'Facts' 'Fact

In [18]:
unlabel_1['Target']=-1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [19]:
import numpy as np
np.unique(unlabel_1['Target'])

array([-1])

In [20]:
train = train.rename(columns={'Sentence':'text'})

In [21]:
 from sklearn.preprocessing import LabelEncoder
 train['Target']= LabelEncoder().fit_transform(train['Target'])

In [22]:
train_and_unlabel =  pd.concat([train,unlabel_1])
print(len(train))
print(len(unlabel_1))
print(len(train_and_unlabel))

4416
501
4917


In [23]:
train_and_unlabel.head()

Unnamed: 0.1,Unnamed: 0,text,Target
1364,239.0,"""(I)n the First Amendment area 'government may...",3
1833,185.0,"""... that nowhere in the statute was it stated...",3
2431,415.0,"""Although a statute may be neither vague, over...",5
2245,416.0,"""For even when pursuing a legitimate interest,...",5
1561,25.0,"""If an indictment has been found or accusation...",2


In [24]:

train_and_unlabel.reset_index(inplace=True)

In [25]:
train_and_unlabel.columns

Index(['index', 'Unnamed: 0', 'text', 'Target'], dtype='object')

In [26]:
del train_and_unlabel['index']
del train_and_unlabel['Unnamed: 0']

In [27]:
train_and_unlabel.head()

Unnamed: 0,text,Target
0,"""(I)n the First Amendment area 'government may...",3
1,"""... that nowhere in the statute was it stated...",3
2,"""Although a statute may be neither vague, over...",5
3,"""For even when pursuing a legitimate interest,...",5
4,"""If an indictment has been found or accusation...",2


In [28]:
train_and_unlabel.tail()

Unnamed: 0,text,Target
4912,time continuance partnership mr winn jack stal...,-1
4913,correspondence appellant went coleman county s...,-1
4914,appellant contention stock sold winn named pri...,-1
4915,mr winns contention stock delivered appellant ...,-1
4916,title stock passed appellant would guilty embe...,-1


In [29]:
train_and_unlabel_vect = tfidf_vect.transform(train_and_unlabel['text'])

In [30]:
train_and_unlabel_vect_df=pd.DataFrame(train_and_unlabel_vect.toarray())


In [31]:
train_and_unlabel_vect_df.shape

(4917, 7374)

In [32]:
classifier = MultinomialNBSS()

In [33]:
classifier.fit(train_and_unlabel_vect_df, train_and_unlabel['Target'])

Step 1: jll = -863064.748069
Step 2: jll = -863710.698838
Step 3: jll = -863876.562964
Step 4: jll = -863927.697659
Step 5: jll = -863945.467258
Step 6: jll = -863952.127077
Step 7: jll = -863954.756179
Step 8: jll = -863955.835135
Step 9: jll = -863956.291815
Step 10: jll = -863956.490150
Step 11: jll = -863956.578213
Step 12: jll = -863956.618076
Step 13: jll = -863956.636429
Step 14: jll = -863956.645005
Step 15: jll = -863956.649063
Step 16: jll = -863956.651005
Step 17: jll = -863956.651943
Optimization converged after 17 iterations.


MultinomialNBSS(alpha=1.0, beta=1.0, class_prior=None, fit_prior=True,
                max_iter=20, tol=0.001, verbose=True)

In [35]:
test_pred = classifier.predict(t_p)

In [None]:
#test_pred = gnb_classifier.predict(t_p.toarray())

In [36]:
from sklearn.metrics import classification_report
test['Target']= LabelEncoder().fit_transform(test['Target'])
classification_report_test = classification_report(test['Target'],test_pred,digits=4)
print(classification_report_test)

              precision    recall  f1-score   support

           0     0.4000    0.0779    0.1304        77
           1     1.0000    0.1154    0.2069        26
           2     0.5485    0.9963    0.7074       267
           3     0.8125    0.1566    0.2626        83
           4     0.0000    0.0000    0.0000        34
           5     0.5000    0.0294    0.0556        34

    accuracy                         0.5547       521
   macro avg     0.5435    0.2293    0.2272       521
weighted avg     0.5522    0.5547    0.4376       521



  _warn_prf(average, modifier, msg_start, len(result))
