# Co-training

**Autori:** Peter Macinec, Lukas Janik, Vajk Pomichal, Frantisek Sefcik

## Zakladne nastavenia a import kniznic

In [1]:
import pandas as pd
import numpy as np


# plots
import matplotlib.pyplot as plt
import seaborn as sns

import json

import re

import nltk
from nltk.stem.porter import PorterStemmer
import string
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn import model_selection as ms

from sklearn.metrics import confusion_matrix

pd.options.mode.chained_assignment = None  # default='warn'

### Nacitanie datasetu

Nase data su dostupne v dvoch suboroch, *train.tsv* a *test.tsv*. Nacitame ich oba a vykoname na nich zakladnu analyzu. Zdroj: https://www.kaggle.com/c/stumbleupon

In [2]:
# training data: tab-separated file from the Kaggle StumbleUpon competition
df = pd.read_csv('data/train.tsv', sep='\t')

In [3]:
# test data (tab-separated), loaded the same way as the training file
df_t = pd.read_csv('data/test.tsv', sep='\t')

## Textove atributy

Najskor predspracujeme text. Ziskame ho z atributu boilerplate:

In [4]:
# Pull the article text out of the JSON-encoded `boilerplate` column.
# `.get('body') or ''` maps a missing or null "body" to an empty string, so the
# later `str(x)` calls cannot turn None into the literal word "None".
df['body_content'] = df['boilerplate'].apply(lambda x: json.loads(x).get('body') or '')

Teraz odstranime vsetky znaky, ktore nie su znaky slov. Pouzijeme na to regularne vyrazy:

In [5]:
# Remove every run of non-ASCII characters. The result is assigned back to the
# column: calling `replace(..., inplace=True)` on a `df[...]` selection is chained
# assignment and is not guaranteed to modify `df` itself.
df['body_content'] = df['body_content'].replace({r'[^\x00-\x7F]+':''}, regex=True)

In [6]:
# Replace every character that is not an ASCII letter with a single space.
df['body_content'] = df['body_content'].map(
    lambda text: re.sub('[^a-zA-Z]', ' ', str(text))
)

Este ako malu upravu, aby nas slovnik obsahoval kazde slovo len raz, prevedieme vsetky slova na male pismena a rozdelime texty na slova, aby sme ich nasledne mohli spracovat:

In [7]:
# Lower-case each document and split it on whitespace into a list of tokens.
df['body_content'] = df['body_content'].map(
    lambda text: str(text).lower().split()
)

Teraz potrebujeme este odstranit slova, ktore nenesu vyznam (stop slova). Zaroven prevedieme slova na ich korenovy zaklad pouzitim stemmingu:

In [8]:
porter_stemmer = PorterStemmer()

# Import the corpus under an alias inside this cell: the variable `stopwords` below
# replaces the module of the same name, so without the alias a second run of this
# cell would call `.words` on a set and fail.
from nltk.corpus import stopwords as nltk_stopwords

# English stop words as a set for O(1) membership tests in the filtering cells.
stopwords = set(nltk_stopwords.words('english'))

In [9]:
# Drop stop words, then reduce each remaining token to its Porter stem.
df['body_content'] = df['body_content'].map(
    lambda tokens: [
        porter_stemmer.stem(token)
        for token in tokens
        if token not in stopwords
    ]
)

Teraz uz mame vsetky slova pripravene, uz ich len naspat spojime do jednej suvislej vety, aby s nimi vedeli lahsie pracovat algoritmy spracovania textu:

In [10]:
# Re-join the stemmed tokens into one space-separated string per document.
df['body_content_final'] = df['body_content'].map(' '.join)

### TF-IDF

In [11]:
# TF-IDF over the cleaned body text, capped at 1000 vocabulary terms.
# NOTE(review): the vectorizer is fitted on all rows before the train/test split
# further down, so hold-out documents influence the vocabulary and IDF weights.
tv = TfidfVectorizer(max_features=1000)
body_tfidf_sparse = tv.fit_transform(df['body_content_final'])
tf_idf = body_tfidf_sparse.toarray()

### Atribut URL

In [12]:
# Keep only ASCII letters from the URL, lower-case it and split it into tokens.
df['url_new'] = df['url'].map(
    lambda raw_url: re.sub('[^a-zA-Z]', ' ', str(raw_url)).lower().split()
)

In [13]:
# Same stop-word removal and stemming as for the body text, applied to URL tokens.
df['url_new'] = df['url_new'].map(
    lambda tokens: [
        porter_stemmer.stem(token)
        for token in tokens
        if token not in stopwords
    ]
)

In [14]:
# Re-join the URL tokens into one space-separated string per row.
df['url_final'] = df['url_new'].map(' '.join)

In [15]:
# Separate TF-IDF model for the URL tokens, again capped at 1000 terms.
tv_url = TfidfVectorizer(max_features=1000)
url_tfidf_sparse = tv_url.fit_transform(df['url_final'])
tf_idf_url = url_tfidf_sparse.toarray()

## Numericke atributy

In [16]:
# Names of the numeric dataset columns that are used as features.
num_feature_set = [
    'avglinksize',
    'commonlinkratio_1',
    'commonlinkratio_2',
    'commonlinkratio_3',
    'commonlinkratio_4',
    'hasDomainLink',
    'lengthyLinkDomain',
    'linkwordscore',
    'numberOfLinks',
    'numwords_in_url',
    'parametrizedLinkRatio',
]

## Benchmark model

Natrenujeme benchmarkovy model, ktory bude natrenovany na vsetkych atributoch. Jeho vysledky sa nasledne budeme snazit dosiahnut s minimom oznacenych dat s co-trainingom.

In [17]:
# Target column of the training frame.
y = df['label']

In [18]:
# Numeric feature columns only.
df1 = df[num_feature_set]

In [19]:
# Body-text TF-IDF matrix wrapped in a DataFrame (default integer row/column labels).
df2 = pd.DataFrame(tf_idf)

In [20]:
# Body and URL TF-IDF features side by side. `join_axes` no longer exists in
# pd.concat; concatenating and then reindexing to df2's index is the equivalent.
X_temp = pd.concat([df2, pd.DataFrame(tf_idf_url)], axis=1).reindex(df2.index)
# Relabel the columns 0..N-1 so the two TF-IDF blocks do not share column names.
# The width is taken from the frame instead of a hard-coded 2000.
X_temp.columns = list(range(X_temp.shape[1]))

In [21]:
# Full feature matrix: numeric attributes followed by all TF-IDF columns.
# `join_axes` no longer exists in pd.concat; reindexing to df1's index is the equivalent.
X = pd.concat([df1, X_temp], axis=1).reindex(df1.index)

In [22]:
# Preview of the combined feature matrix (numeric columns first, then TF-IDF columns).
X.head()

Unnamed: 0,avglinksize,commonlinkratio_1,commonlinkratio_2,commonlinkratio_3,commonlinkratio_4,hasDomainLink,lengthyLinkDomain,linkwordscore,numberOfLinks,numwords_in_url,...,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999
0,2.055556,0.676471,0.205882,0.047059,0.023529,0,1,24,170,8,...,0.0,0.0,0.0,0.154042,0.0,0.0,0.0,0.0,0.0,0.0
1,3.677966,0.508021,0.28877,0.213904,0.144385,0,1,40,187,9,...,0.0,0.0,0.0,0.101398,0.0,0.0,0.0,0.0,0.0,0.0
2,2.382883,0.562016,0.321705,0.120155,0.042636,0,1,55,258,11,...,0.0,0.0,0.0,0.062961,0.0,0.0,0.0,0.0,0.0,0.0
3,1.543103,0.4,0.1,0.016667,0.0,0,0,24,120,5,...,0.0,0.0,0.0,0.130125,0.0,0.0,0.0,0.0,0.0,0.0
4,2.676471,0.5,0.222222,0.123457,0.04321,0,1,14,162,10,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
# View 1 for co-training: numeric attributes plus the body TF-IDF columns (labels 0-999).
col_names1 = num_feature_set + list(range(1000))
# View 2 for co-training: the URL TF-IDF columns (labels 1000-1999).
col_names2 = list(range(1000, 2000))

In [24]:
# X = pd.concat([df1, pd.DataFrame(tf_idf)], axis=1, join_axes=[df1.index])
# X = pd.concat([X, pd.DataFrame(tf_idf_url)], axis=1, join_axes=[X.index])

In [25]:
# 80/20 hold-out split with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = ms.train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [26]:
# Benchmark: random forest trained on all features with every training label available.
benchmark_clf = RandomForestClassifier(
    n_estimators=500,
    max_depth=20,
    random_state=10,
)

In [27]:
# Train the benchmark and predict the hold-out set (fit returns the estimator itself).
y_pred = benchmark_clf.fit(X_train, y_train).predict(X_test)

In [28]:
# Confusion matrix of the benchmark on the hold-out set (rows: true class, columns: predicted class).
confusion_matrix(y_test, y_pred)

array([[655,  75],
       [218, 531]], dtype=int64)

In [29]:
# Hold-out accuracy of the benchmark model.
accuracy_score(y_test, y_pred)

0.801893171061528

### Already implemented co-training
https://github.com/jjrob13/sklearn_cotraining

In [30]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

# import classifiers
# from classifiers import CoTrainingClassifier

In [61]:
import numpy as np
import random
import copy
class CoTrainingClassifier(object):
    """Binary co-training classifier built from two base classifiers, one per feature view.

    Labels are expected to be 1 (positive), 0 (negative) and -1 (unlabeled).

    Parameters
    ----------
    clf : object with ``fit``/``predict`` (optionally ``predict_proba``)
        Classifier trained on the first view.
    clf2 : object, optional
        Classifier trained on the second view. When omitted, a deep copy of ``clf`` is used.
    p, n : int
        Maximum number of positive / negative samples moved from the unlabeled pool into the
        labeled set per iteration. Leave both at -1 to derive them from the class ratio in ``fit``.
    k : int
        Maximum number of co-training iterations.
    u : int
        Size of the pool of unlabeled samples (U' in the paper) examined in each iteration.
    random_state : int or None
        Seed of the private random generator used for shuffling the unlabeled samples and for
        tie-breaking guesses. ``None`` seeds from system entropy.

    Raises
    ------
    ValueError
        If exactly one of ``p`` and ``n`` is specified.
    """

    def __init__(self, clf, clf2=None, p=-1, n=-1, k=30, u = 75, random_state=None):
        self.clf1_ = clf
        # Deep copy so the second view never shares nested state with the first classifier.
        self.clf2_ = copy.deepcopy(clf) if clf2 is None else clf2

        # p and n must be given together or not at all
        if (p == -1) != (n == -1):
            raise ValueError('Current implementation supports either both p and n being specified, or neither')

        self.p_ = p
        self.n_ = n
        self.k_ = k
        self.u_ = u

        # Private generator: leaves the global `random` state (and any seed the user set) alone.
        self._rng = random.Random(random_state)


    def fit(self, X1, X2, y):
        """Run co-training and fit both classifiers on the enlarged labeled set.

        Parameters
        ----------
        X1, X2 : array-like (ndarray or DataFrame), one row per sample
            The two feature views. Rows are addressed by position, never by index label.
        y : array-like of length n_samples
            Labels: 1, 0, or -1 for unlabeled samples. The caller's object is not modified.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If the inputs have inconsistent shapes, if there is no labeled sample, or if p and n
            must be inferred but one of the classes has no labeled sample.
        """
        # Plain arrays make every row selection below positional, whatever the input type.
        X1 = np.asarray(X1)
        X2 = np.asarray(X2)

        # np.array copies, so the pseudo-labels written below never leak into the caller's y
        y = np.array(y)
        if y.ndim == 2 and y.shape[1] == 1:
            y = y[:, 0]
        if y.ndim != 1:
            raise ValueError('y must be one-dimensional, got shape %s' % (y.shape,))
        if X1.shape[0] != y.shape[0] or X2.shape[0] != y.shape[0]:
            raise ValueError('X1, X2 and y must have the same number of rows')

        #set the n and p parameters if we need to
        if self.p_ == -1 and self.n_ == -1:
            num_pos = int(np.sum(y == 1))
            num_neg = int(np.sum(y == 0))
            if num_pos == 0 or num_neg == 0:
                raise ValueError('y must contain at least one labeled sample of each class (0 and 1) to infer p and n')

            n_p_ratio = num_neg / float(num_pos)

            if n_p_ratio > 1:
                self.p_ = 1
                self.n_ = int(round(self.p_ * n_p_ratio))
            else:
                self.n_ = 1
                self.p_ = int(round(self.n_ / n_p_ratio))

        assert(self.p_ > 0 and self.n_ > 0 and self.k_ > 0 and self.u_ > 0)

        #the samples that are initially labeled
        L = np.flatnonzero(y != -1).tolist()
        if not L:
            raise ValueError('y contains no labeled samples (every label is -1)')

        #the set of unlabeled samples
        U = np.flatnonzero(y == -1).tolist()

        #we randomize here, and then just take from the back so we don't have to sample every time
        self._rng.shuffle(U)

        #this is U' in paper; explicit bounds avoid the `U[-0:]` slicing pitfall when U is empty
        pool_size = min(len(U), self.u_)
        U_ = U[len(U) - pool_size:]
        U = U[:len(U) - pool_size]

        it = 0 #number of cotraining iterations we've done so far

        #loop until the reserve of unlabeled samples is used up or we hit the iteration limit
        while it != self.k_ and len(U) > 0:
            it += 1

            self.clf1_.fit(X1[L], y[L])
            self.clf2_.fit(X2[L], y[L])

            y1 = self.clf1_.predict(X1[U_])
            y2 = self.clf2_.predict(X2[U_])

            #positions (within U_) of the samples both classifiers agree on
            n, p = [], []

            for i, (y1_i, y2_i) in enumerate(zip(y1, y2)):
                #we collected at most p positives and n negatives for this iteration, so stop scanning
                if len(p) >= self.p_ and len(n) >= self.n_:
                    break

                if y1_i == y2_i == 1 and len(p) < self.p_:
                    p.append(i)
                elif y1_i == y2_i == 0 and len(n) < self.n_:
                    n.append(i)

            new_pos = [U_[x] for x in p]
            new_neg = [U_[x] for x in n]

            #label the agreed samples and move them to the labeled set
            y[np.asarray(new_pos, dtype=int)] = 1
            y[np.asarray(new_neg, dtype=int)] = 0
            L.extend(new_pos)
            L.extend(new_neg)

            #remove them from U_ in one pass; popping one position at a time would shift the
            #remaining positions and drop the wrong samples
            taken = set(p) | set(n)
            U_ = [sample for pos, sample in enumerate(U_) if pos not in taken]

            #refill U_ from U with as many samples as were just removed
            for _ in range(len(taken)):
                if not U:
                    break
                U_.append(U.pop())

        #let's fit our final model
        self.clf1_.fit(X1[L], y[L])
        self.clf2_.fit(X2[L], y[L])

        return self


    #TODO: Move this outside of the class into a util file.
    def supports_proba(self, clf, x):
        """Checks if a given classifier supports the 'predict_proba' method, given a single vector x"""
        if not hasattr(clf, 'predict_proba'):
            return False
        try:
            clf.predict_proba([x])
            return True
        except Exception:
            #the method exists but is unusable for this classifier/configuration
            return False

    def predict(self, X1, X2):
        """Predict a 0/1 label for every row.

        When both classifiers agree, their common label is returned. When they disagree, the
        class with the larger summed probability wins if both classifiers provide
        ``predict_proba``; otherwise the label is guessed at random.

        Parameters
        ----------
        X1, X2 : array-like (ndarray or DataFrame)
            The two feature views of the samples to classify.

        Returns
        -------
        numpy.ndarray of int, shape (n_samples,)
        """
        X1 = np.asarray(X1)
        X2 = np.asarray(X2)

        if X1.shape[0] == 0:
            return np.asarray([], dtype=int)

        y1 = np.asarray(self.clf1_.predict(X1))
        y2 = np.asarray(self.clf2_.predict(X2))

        #fill y_pred with -1 so we can identify the samples in which the classifiers failed to agree
        y_pred = np.full(X1.shape[0], -1, dtype=int)

        agree = y1 == y2
        y_pred[agree] = y1[agree]

        disagree = np.flatnonzero(~agree)
        if disagree.size > 0:
            first = disagree[0]
            proba_supported = (self.supports_proba(self.clf1_, X1[first])
                               and self.supports_proba(self.clf2_, X2[first]))

            if proba_supported:
                #one batched call per classifier instead of one call per row
                summed = (np.asarray(self.clf1_.predict_proba(X1[disagree]))
                          + np.asarray(self.clf2_.predict_proba(X2[disagree])))
                #the column position is used as the label, i.e. classes are assumed to be 0 and 1
                y_pred[disagree] = np.argmax(summed, axis=1)
            else:
                #the classifiers disagree and don't support probability, so we guess
                y_pred[disagree] = [self._rng.randint(0, 1) for _ in disagree]

        #check that we did everything right
        assert not (-1 in y_pred)

        return y_pred


    def predict_proba(self, X1, X2):
        """Return the per-class probabilities averaged over both classifiers.

        Parameters
        ----------
        X1, X2 : array-like (ndarray or DataFrame)
            The two feature views of the samples.

        Returns
        -------
        numpy.ndarray of float, shape (n_samples, 2)
            Column 0 / column 1 hold the mean of the two classifiers' first / second
            probability column.
        """
        X1 = np.asarray(X1)
        X2 = np.asarray(X2)

        y1_proba = np.asarray(self.clf1_.predict_proba(X1), dtype=float)
        y2_proba = np.asarray(self.clf2_.predict_proba(X2), dtype=float)

        #float arithmetic throughout; an integer result buffer would truncate the averages
        y_proba = (y1_proba[:, :2] + y2_proba[:, :2]) / 2.0

        _epsilon = 0.0001
        assert np.all(np.abs(y_proba.sum(axis=1) - 1) <= _epsilon)
        return y_proba


In [62]:
# df_x0 = X_train[y_train == 0].sample(5)
# df_x1 = X_train[y_train == 1].sample(5)

In [63]:
# train = pd.concat([df_x0,df_x1])
# X_train.drop(train.index, inplace=True)
# train['label'] = y_train.loc[train.index]

In [64]:
# Keep the true label for 5 random samples per class; the draws are seeded so the
# labeled subset is the same on every run.
tmp1 = set(y_train[y_train == 0].sample(5, random_state=42).index) | set(y_train[y_train == 1].sample(5, random_state=42).index)
# Every other training sample is marked as unlabeled (-1). The indexer is a sorted
# list because pandas does not accept a set as a `.loc` indexer.
tmp2 = sorted(set(y_train.index) - tmp1)
y_train.loc[tmp2] = -1

In [65]:
# Preview of the training features; the row labels are the original (shuffled) dataset indices.
X_train.head()

Unnamed: 0,avglinksize,commonlinkratio_1,commonlinkratio_2,commonlinkratio_3,commonlinkratio_4,hasDomainLink,lengthyLinkDomain,linkwordscore,numberOfLinks,numwords_in_url,...,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999
4189,6.396552,0.378788,0.333333,0.19697,0.090909,0,1,22,66,3,...,0.0,0.0,0.0,0.097095,0.0,0.0,0.0,0.0,0.0,0.0
5144,2.293814,0.434959,0.272358,0.186992,0.138211,0,1,33,246,5,...,0.0,0.0,0.0,0.140325,0.0,0.0,0.0,0.0,0.0,0.0
3914,0.253731,0.268657,0.0,0.0,0.0,0,0,57,67,0,...,0.0,0.0,0.0,0.69235,0.0,0.0,0.0,0.0,0.0,0.0
3918,2.67907,0.657658,0.387387,0.148649,0.067568,0,1,54,444,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2318,0.268145,0.047203,0.012238,0.006993,0.003497,0,0,6,572,8,...,0.0,0.0,0.0,0.0,0.0,0.315064,0.0,0.0,0.0,0.0


In [66]:
# Index labels of the training samples that kept their true label (everything not set to -1).
y_train[y_train != -1].index

Int64Index([7227, 180, 1377, 1836, 2066, 4897, 5355, 938, 1059, 1585], dtype='int64')

In [68]:
lg_co_clf = CoTrainingClassifier(LogisticRegression(), p=2, n=2)
# drop=True discards the old index. Without it, reset_index() adds the old index as
# an extra 'index' column: a spurious feature in both views, and a two-column frame
# in place of the one-dimensional label vector.
lg_co_clf.fit(
    X_train[col_names1].reset_index(drop=True),
    X_train[col_names2].reset_index(drop=True),
    y_train.reset_index(drop=True),
)
y_pred = lg_co_clf.predict(X_test[col_names1], X_test[col_names2])
print(classification_report(y_test, y_pred))