<a href="https://colab.research.google.com/github/ssvadla/Demonstration/blob/main/PU_EM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [26]:
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression

class PULearning:
  """Implementation of PULearning (two-step strategy & cost-sensitive strategy)"""
  def __init__(self, P, U, clf_one, clf_two, Cplus, Cminus=1, sample_ratio=0.15, 
                theta='auto', random_state=2018):
      """
      :param P: Observed positive samples.
      :param U: Unlabeled datasets.
      :param clf_one: A Classifer used to determine reliable negative samples must be able to predict probability.
      :param clf_two: A Classifer fit positive samples and reliable negative samples, and predict the unlabeled.
      :param Cplus: the cost of not identifying positive samples, cost(FN) 
      :param Cminus: the cost of not identifying negative samples, cost(FP) 
      :param sample_ratio: the proportion of spy samples
      :param theta: the probability threshold of judging an unlabeled sample as a reliable negative sample
      """
      self.P = P  
      self.U = U   
      assert clf_one.predict_proba, 'need predict_proba method to return probability estimates'
      self.clf_one = clf_one  
      self.clf_two = clf_two 
      self.Cplus = Cplus
      self.Cminus = Cminus
      self.theta = theta
      self.sample_ratio = 0.15 if sample_ratio=='auto' else sample_ratio
      self.random_state = random_state
      
    
  # Two-Stage Strategy: Select Reliable Negative Instances
  def select_reliable_negative(self): 
      pos_num = len(self.P)
      print("@@@",pos_num)
      spy_num = int(pos_num * self.sample_ratio)
      pos_random_indices = np.random.RandomState(self.random_state).permutation(pos_num)
      spy_indices, unspy_indices = pos_random_indices[:spy_num], pos_random_indices[spy_num:]
      spy_set, unspy_set = self.P[spy_indices, :], self.P[unspy_indices, :]
      
      negative_set = np.r_[self.U, spy_set]
      positive_set = unspy_set 
      negative_label = np.zeros(len(negative_set)).astype(int)   
      positive_label = np.ones(len(positive_set)).astype(int)
    
      X_train_one = np.r_[negative_set, positive_set]
      y_train_one = np.r_[negative_label, positive_label].astype(int)    
      clf_one = self.clf_one.fit(X_train_one, y_train_one)
      
      y_prob_U = clf_one.predict_proba(self.U)[:, 1]
      y_prob_spy = clf_one.predict_proba(spy_set)[:, 1]
      
      theta = np.min(y_prob_spy) if self.theta == 'auto' else self.theta
      assertion = 'theta must not be greater than the minimum value of spy_prob so that \
          all spy are predicted to be positive samples'
      assert theta <= np.min(y_prob_spy), assertion
      
      # rn: reliable_negative
      rn = self.U[y_prob_U <= theta, :] 
      return rn
  
  def predict(self):
      # 对可靠负样本集的赋予标签0
      rn = self.select_reliable_negative()    
      X_train_two = np.r_[self.P, rn]
      y_train_two = np.r_[np.ones(len(self.P)), np.zeros(len(rn))].astype(int)
      weights = np.array([self.Cplus if i else self.Cminus for i in y_train_two])
      
      clf_two = self.clf_two
      clf_two.fit(X_train_two, y_train_two, sample_weight=weights)
      y_pred = clf_two.predict(self.U)
      
      if clf_two.predict_proba:
          y_prob = clf_two.predict_proba(self.U)[:, -1]
      return y_pred, y_prob


In [3]:
import pandas as pd
from google.colab import drive

drive.mount('/content/drive')

train1 = pd.read_csv('/content/drive/My Drive/Research/train_data1.csv')
train2 = pd.read_csv('/content/drive/My Drive/Research/train_data2.csv')
train3 = pd.read_csv('/content/drive/My Drive/Research/train_data3.csv')
train4 = pd.read_csv('/content/drive/My Drive/Research/train_data4.csv')
train5 = pd.read_csv('/content/drive/My Drive/Research/train_data5.csv')
train6 = pd.read_csv('/content/drive/My Drive/Research/train_data6.csv')
train7 = pd.read_csv('/content/drive/My Drive/Research/train_data7.csv')
train8 = pd.read_csv('/content/drive/My Drive/Research/train_data8.csv')
train9 = pd.read_csv('/content/drive/My Drive/Research/train_data9.csv')
train10 = pd.read_csv('/content/drive/My Drive/Research/train_data10.csv')
train_highKappa = pd.read_csv('/content/drive/My Drive/Research/train_data_highkappa.csv')
train1.head()

Mounted at /content/drive


Unnamed: 0.1,Unnamed: 0,Sentence,Target
0,659,Appellant had stated to the officers that she ...,Invalid
1,3456,We shall discuss the facts more fully in conne...,Others
2,2043,"â€œPerjury is a false statement, either writte...",Invalid
3,3344,The offense is felony theft by false pretext; ...,Issue
4,3231,Numerous contentions urging the commission of ...,Issue


In [4]:
len(train1)


3476

In [5]:
len(train2)

3476

In [6]:
train = train1
train_list = [train2,train3,train4,train5,train6,train7,train8,train9,train10,train_highKappa]
for i in train_list:

  train = train.append(i)



In [7]:
print(len(train))

37711


In [8]:
train.sort_values("Sentence", inplace = True)
print(len(train))


37711


In [9]:
train.head(10)

Unnamed: 0.1,Unnamed: 0,Sentence,Target
1364,239,"""(I)n the First Amendment area 'government may...",Invalid
1364,239,"""(I)n the First Amendment area 'government may...",Invalid
3342,264,"""(I)n the First Amendment area 'government may...",Invalid
1364,239,"""(I)n the First Amendment area 'government may...",Invalid
1364,239,"""(I)n the First Amendment area 'government may...",Invalid
3083,264,"""(I)n the First Amendment area 'government may...",Invalid
1364,239,"""(I)n the First Amendment area 'government may...",Invalid
1364,239,"""(I)n the First Amendment area 'government may...",Invalid
1364,239,"""(I)n the First Amendment area 'government may...",Invalid
1364,239,"""(I)n the First Amendment area 'government may...",Invalid


In [10]:
 new_train = train.drop_duplicates(subset ="Sentence")


In [11]:
len(train)

37711

In [12]:
len(new_train)

4416

In [13]:
new_train.head(10)

Unnamed: 0.1,Unnamed: 0,Sentence,Target
1364,239,"""(I)n the First Amendment area 'government may...",Invalid
1833,185,"""... that nowhere in the statute was it stated...",Invalid
2431,415,"""Although a statute may be neither vague, over...",Rule/Law/Holding
2245,416,"""For even when pursuing a legitimate interest,...",Rule/Law/Holding
1561,25,"""If an indictment has been found or accusation...",Facts
1531,464,"""It is generally believed that one of the reas...",Analysis
3141,0,"""Many decisions have recognized that these ter...",Analysis
774,398,"""Mr. Haut [counsel for plaintiffs]: In exchang...",Facts
811,418,"""So long as the statute remains available to t...",Rule/Law/Holding
3300,1,"""That there may be marginal cases in which it ...",Rule/Law/Holding


In [14]:
train = new_train

In [15]:
train['Target'].unique()

array(['Invalid', 'Rule/Law/Holding', 'Facts', 'Analysis', 'Others',
       'Conclusion', 'Issue'], dtype=object)

In [16]:
train['Target']=train['Target'].replace(['Others'],'Invalid')
train['Target'].unique()


array(['Invalid', 'Rule/Law/Holding', 'Facts', 'Analysis', 'Conclusion',
       'Issue'], dtype=object)

In [17]:
#cleaning
import nltk
import re
import string
nltk.download('stopwords')
nltk.download('wordnet')
stopword=nltk.corpus.stopwords.words('english')
from nltk.stem import WordNetLemmatizer
wl= WordNetLemmatizer()

def clean_text(text):
  text="".join([word.lower() for word in text if word not in string.punctuation])
  tokens = re.split('\W+',text)
  text = [wl.lemmatize(word) for word in tokens if word not in stopword]
  return text

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vect = TfidfVectorizer(analyzer = clean_text)
X_tfidf = tfidf_vect.fit_transform(train['Sentence'])
print(X_tfidf.shape)

(4416, 7374)


In [29]:
train.head()

Unnamed: 0.1,Unnamed: 0,Sentence,Target
1364,239,"""(I)n the First Amendment area 'government may...",Invalid
1833,185,"""... that nowhere in the statute was it stated...",Invalid
2431,415,"""Although a statute may be neither vague, over...",Rule/Law/Holding
2245,416,"""For even when pursuing a legitimate interest,...",Rule/Law/Holding
1561,25,"""If an indictment has been found or accusation...",Facts


In [19]:
test = pd.read_csv(r'/content/drive/My Drive/Research/test_data.csv')

test['Target']=test['Target'].replace(['Others'],'Invalid')
test['Sentence'] = test['Sentence'].apply(lambda x: " ".join(x.lower() for x in str(x).split()))
test['Sentence'] = test['Sentence'].str.replace('[^\w\s]','')
from nltk.corpus import stopwords
words = stopwords.words('english')
test['Sentence'] = test['Sentence'].apply(lambda x: " ".join(x for x in x.split() if x not in words))
t_p = tfidf_vect.transform(test['Sentence'])

In [20]:
import lightgbm as lgb
classifier = lgb.LGBMClassifier()

In [21]:
unlabel = pd.read_csv(r'/content/drive/My Drive/Research/Unlabeled_data.csv')
#unlabel.head()

del unlabel['Complete']
del unlabel['Unnamed: 0']

unlabel['text'] = unlabel['text'].apply(lambda x: " ".join(x.lower() for x in str(x).split()))
unlabel['text'] = unlabel['text'].str.replace('[^\w\s]','')
from nltk.corpus import stopwords
words = stopwords.words('english')
unlabel['text'] = unlabel['text'].apply(lambda x: " ".join(x for x in x.split() if x not in words))


from textblob import TextBlob
from textblob import Word
nltk.download('wordnet')
nltk.download('punkt')
unlabel['text'] = unlabel['text'].apply(lambda x: TextBlob(x).words)
unlabel['text'] = unlabel['text'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x]))

unlabel_1 = unlabel.loc[:100]

def index_reset(unlabel_2):
  unlabel_2.reset_index(inplace=True)
  del unlabel_2['index']
  #print(unlabel_2.head())
  return unlabel_2

unlabel_1 = index_reset(unlabel_1)

x_un1 = tfidf_vect.transform(unlabel_1['text'])

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [22]:
p

NameError: ignored

In [36]:
# Author：马肖
# E-mail：maxiaoscut@aliyun.com
# Github：https://github.com/Albertsr

from sklearn.svm import SVC
import lightgbm as lgb

#svc = SVC(C=1.0, kernel='rbf', gamma='auto', probability=True, random_state=2018)
svc = lgb.LGBMClassifier()
def biased_svm(cost_fp, cost_fn, P, U, svm=svc, return_proba=True):
    assert cost_fn > cost_fp > 0, '对FN应赋予更高的代价' 
    
    X_train = np.r_[P, U]
    y_train = np.r_[np.ones(len(P)), np.zeros(len(U))]
    weight = [cost_fn if i else cost_fp for i in y_train]
    svm.fit(X_train, y_train, sample_weight=weight) 
    y_pred = svm.predict(U)
    if return_proba:
        y_prob = svm.predict_proba(U)[:, -1]
        return y_pred, y_prob
    else:
        return y_pred

In [40]:
train.shape

(4416, 2)

In [41]:
unlabel_1.shape

(101, 1)

In [50]:
# Author：马肖
# E-mail：maxiaoscut@aliyun.com
# Github：https://github.com/Albertsr

from sklearn.svm import SVC
import lightgbm as lgb

#svc = SVC(C=1.0, kernel='rbf', gamma='auto', probability=True, random_state=2018)
svc = lgb.LGBMClassifier()
def biased_svm_modi(cost_fp, cost_fn, P, U, train_total, svm=svc, return_proba=True):
    assert cost_fn > cost_fp > 0, '对FN应赋予更高的代价' 
    
    #X_train = np.r_[P, U]
    X_train = train_total
    y_train = np.r_[np.ones(len(P)), np.zeros(len(U))]
    weight = [cost_fn if i else cost_fp for i in y_train]
    svm.fit(X_train, y_train, sample_weight=weight) 
    y_pred = svm.predict(U)
    if return_proba:
        y_prob = svm.predict_proba(U)[:, -1]
        return y_pred, y_prob
    else:
        return y_pred

In [42]:
total_train = train + unlabel_1

In [46]:
total_train = pd.concat([train, unlabel_1], axis=0)

In [52]:
train_df = pd.DataFrame(X_tfidf.toarray())

In [None]:
train_df = pd.DataFrame(X_tfidf.toarray())

In [47]:
total_train.shape

(4517, 3)

In [49]:
total_train.tail()

Unnamed: 0,Sentence,Target,text
96,,,witness question hostile state mannerism attit...
97,,,court volition called witness stand tendered m...
98,,,neither side permitted lead
99,,,state defendant examined developed testimony f...
100,,,thus examined m testimony record


In [34]:
model_1 = PULearning(train,unlabel_1, classifier, classifier,1 )


In [51]:
model_svm = biased_svm_modi(1,5,train, unlabel_1,total_train)

ValueError: ignored

In [32]:
train.columns

Index(['Sentence', 'Target'], dtype='object')

In [31]:
del train['Unnamed: 0']

In [33]:
train.head()

Unnamed: 0,Sentence,Target
1364,"""(I)n the First Amendment area 'government may...",Invalid
1833,"""... that nowhere in the statute was it stated...",Invalid
2431,"""Although a statute may be neither vague, over...",Rule/Law/Holding
2245,"""For even when pursuing a legitimate interest,...",Rule/Law/Holding
1561,"""If an indictment has been found or accusation...",Facts


In [35]:
model_1.predict()

@@@ 4416


TypeError: ignored

In [None]:
model_1.select_reliable_negative()

In [None]:
model_2 = PULearning(X_tfidf,x_un1, classifier, classifier,1 )

In [None]:
model_2.select_reliable_negative()