In [46]:
import pandas as pd
import numpy as np
import os
# plots
import matplotlib.pyplot as plt
import seaborn as sns

import json

import re

from sklearn.preprocessing import StandardScaler
from sklearn import model_selection as ms
from sklearn.linear_model import LogisticRegression

import nltk
from nltk.stem.porter import PorterStemmer
import string
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn import preprocessing
from sklearn.metrics import accuracy_score

pd.options.display.max_columns = None

# Load dataset

In [47]:
# load course fulltext
path = 'course-cotrain-data/fulltext/course'
course_fulltext = []
for file in os.listdir(path):
    with open(path + "/" + file, encoding="utf8", errors='ignore') as f:
        course_fulltext.append(f.read())

In [48]:
# load non-course fulltext
path = 'course-cotrain-data/fulltext/non-course'
non_course_fulltext = []
for file in os.listdir(path):
    with open(path + "/" + file, encoding="utf8", errors='ignore') as f:
        non_course_fulltext.append(f.read())

In [49]:
# load course links
path = 'course-cotrain-data/inlinks/course'
course_inlinks = []
for file in os.listdir(path):
    with open(path + "/" + file, encoding="utf8", errors='ignore') as f:
        course_inlinks.append(f.read())

In [50]:
# load non-course links
path = 'course-cotrain-data/inlinks/non-course'
non_course_inlinks = []
for file in os.listdir(path):
    with open(path + "/" + file, encoding="utf8", errors='ignore') as f:
        non_course_inlinks.append(f.read())

# Concatenate datasets

In [51]:
course_data = pd.DataFrame(
    {'fulltext': course_fulltext,
     'inlinks': course_inlinks,
     'label': [1] * len(course_fulltext)
    })

course_data.head()

Unnamed: 0,fulltext,inlinks,label
0,<HTML><HEAD>\n<TITLE>301 Moved Permanently</TI...,Computer Graphics Seminar \n,1
1,<html>\n<head>\n<title>CS 537 - Introduction t...,CS 537 Section 1 (Marvin Solomon) \n,1
2,<HTML>\n<HEAD>\n<TITLE>CS 378 Course Descripti...,Object-Oriented Design and Programming \n,1
3,<html>\n<head>\n\n<title>CSE 590S (Systems Sem...,590 S \n590 S \n590 S \n590 S \nCSE 590S \n\nC...,1
4,<HTML>\n<head>\n<title>CSE 370 Home Page (Autu...,CSE 370 \nhere\n,1


In [52]:
non_course_data = pd.DataFrame(
    {'fulltext': non_course_fulltext,
     'inlinks': non_course_inlinks,
     'label': [0] * len(non_course_fulltext)
    })

non_course_data.head()

Unnamed: 0,fulltext,inlinks,label
0,"<title>Roy M. Jenevein, Jr.</title>\n\n<img sr...","Roy M. Jenevein, Jr. \n",0
1,<html>\n<head>\n<title>Stefan Savage</title>\n...,Stefan Savage\nStefan Savage\nStefan Savage\nS...,0
2,<TITLE>Jeremy Stenglein's Home Page </TITLE>\n...,"Stenglein, Jeremy \nJeremy Stenglein\nJeremy S...",0
3,<html>\n\n<head>\n<title>Kari Pulli's Home Pag...,Kari Pulli \nKari Pulli \nKari Pulli \nPulli \n,0
4,<html>\n<!-- index.html file -->\n\n<title>Nik...,Nikos P. Pitsianis \nNikos Pitsianis\n,0


In [53]:
data = pd.concat([course_data, non_course_data])

In [54]:
data.label.value_counts()

0    821
1    230
Name: label, dtype: int64

# Bag of words

In [55]:
stopwords = set(stopwords.words('english'))

In [56]:
# remove html tags
from html.parser import HTMLParser

class MLStripper(HTMLParser):
    def __init__(self):
        self.reset()
        self.strict = False
        self.convert_charrefs= True
        self.fed = []
    def handle_data(self, d):
        self.fed.append(d)
    def get_data(self):
        return ''.join(self.fed)

def strip_tags(html):
    s = MLStripper()
    s.feed(html)
    return s.get_data()

In [57]:
for row in range(0,len(data)):
    data.iloc[row,0] = strip_tags(data.iloc[row,0])

In [58]:
def text_stemming(data,col_name,stemmer):
    df = data.copy()
#     df[col_name] = df[col_name].apply(lambda x: re.sub('[^a-zA-Z]', ' ', str(x)))
    df[col_name] = df[col_name].apply(lambda x: str(x).lower().split())
    df[col_name] = df[col_name].apply(lambda x: [stemmer.stem(word) for word in x if word not in stopwords])
    df[col_name + "_final"] = df[col_name].apply(lambda x: ' '.join(x))
    return df

In [59]:
porter_stemmer = PorterStemmer()

data = text_stemming(data, 'fulltext', porter_stemmer)
data = text_stemming(data, 'inlinks', porter_stemmer)

In [60]:
cv = CountVectorizer()

bow_full = cv.fit_transform(data['fulltext_final']).toarray()
bow_link = cv.fit_transform(data['inlinks_final']).toarray()

## create dataframe with bag of words

In [61]:
y = data.label.reset_index(drop=True)
X1 = bow_full
X2 = bow_link

df1 = pd.DataFrame(X1)
df2 = pd.DataFrame(X2)

In [62]:
df = pd.concat([df1,df2], axis=1).reset_index(drop=True)

In [63]:
df['label'] = y

In [64]:
df.columns = list(range(0,19746)) + ['label']

In [65]:
df.shape

(1051, 19747)

In [66]:
col_names1=list(range(0,17733)) #fulltext
col_names2=list(range(17733,19746)) #links

## Classifier with full text

In [67]:
gnb = MultinomialNB()

In [68]:
df.loc[:,col_names1].shape

(1051, 17733)

In [69]:
y = df.label
X = df.loc[:,col_names1]

X_train, X_test, y_train, y_test = ms.train_test_split(X, y, test_size=0.25, random_state=42)

In [70]:
gnb = gnb.fit(X_train,y_train)
pred = gnb.predict(X_test)

In [71]:
accuracy_score(y_test, pred)

0.9467680608365019

## Classifier with full text (12 pages: 3 course, 9 non-course)

In [72]:
gnb = MultinomialNB()

In [73]:
y = df.label
X = df.loc[:,col_names1]

X_train, X_test, y_train, y_test = ms.train_test_split(X, y, test_size=0.25, random_state=42)

X_train['label'] = y_train

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [74]:
df0 = X_train[X_train['label']==0].sample(9, random_state=42)
df1 = X_train[X_train.label==1].sample(3, random_state=42)

train = pd.concat([df0,df1])

train_x = train.loc[:,train.columns != 'label']
train_y = train.loc[:,train.columns == 'label']

In [75]:
gnb = gnb.fit(train_x,train_y.label)
pred = gnb.predict(X_test)

In [76]:
accuracy_score(y_test, pred)

0.8859315589353612

## Classifier links

In [77]:
gnb = MultinomialNB()

In [78]:
df.loc[:,col_names2].shape

(1051, 2013)

In [79]:
y = df.label
X = df.loc[:,col_names2]

X_train, X_test, y_train, y_test = ms.train_test_split(X, y, test_size=0.25, random_state=42)

In [80]:
gnb = gnb.fit(X_train,y_train)
pred = gnb.predict(X_test)

In [81]:
accuracy_score(y_test, pred)

0.5247148288973384

## Classifier with links (12 pages: 3 course, 9 non-course)

In [82]:
gnb = MultinomialNB()

In [83]:
y = df.label
X = df.loc[:,col_names2]

X_train, X_test, y_train, y_test = ms.train_test_split(X, y, test_size=0.25, random_state=42)

X_train['label'] = y_train

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [84]:
df0 = X_train[X_train['label']==0].sample(9, random_state=42)
df1 = X_train[X_train.label==1].sample(3, random_state=42)

train = pd.concat([df0,df1])

train_x = train.loc[:,train.columns != 'label']
train_y = train.loc[:,train.columns == 'label']

In [85]:
gnb = gnb.fit(train_x,train_y.label)
pred = gnb.predict(X_test)

In [86]:
accuracy_score(y_test, pred)

0.7870722433460076

# Co-training

With U0

In [87]:
def co_training1(L, U, col_names1, col_names2, clf1, clf2,u=75, k=30,p=1,n=3, label='label', random_state=None):
    '''
    L - labeled data
    U - unlabeled data
    col_names1 - columns for clf1
    col_names2 - columns for clf2
    clf1 - classificator1
    clf2 - classificator2
    u - number of unlabeled data for training
    k - number of iteration
    p - number of positive examles to add label per iteration
    n - number of negative examles to add label per iteration
    label - name of column in L whit label
    '''
    
    U0 = U.sample(u, random_state= random_state)
    U.drop(U0.index, inplace=True)
    while k > 0:
        print("iteration: ", 30 - k)
        # step 1: Use L to train a classifier h1 that considers only the x1 portion of x
        clf1 = clf1.fit(L[col_names1],  L[label])
        
        # step 2: Use L, to train a classifier h2 that considers only the x2 portion of x
        clf2 = clf2.fit(L[col_names2], L[label])
        
        # step 3: Allow hl to label p positive and n negative examples from U’
        predicted_prob1 = clf1.predict_proba(U0[col_names1])
        top_positive1 = predicted_prob1[:,1].argsort()[-p:]
        top_negative1 = predicted_prob1[:,0].argsort()[-n:]
        
        # step 4: Allow hl to label p positive and n negative examples from U’
        predicted_prob2 = clf2.predict_proba(U0[col_names2])
        top_positive2 = predicted_prob2[:,1].argsort()[-p:]
        top_negative2 = predicted_prob2[:,0].argsort()[-n:]    
        
        # step 5: Add these self-labeled examples to L
        positive_ind = U0.iloc[np.unique(np.concatenate((top_positive1,top_positive2))),:].index
        negative_ind = U0.iloc[np.unique(np.concatenate((top_negative1,top_negative2))),:].index
        self_labeled = U0.loc[np.unique(np.concatenate((positive_ind,negative_ind))),:]
        U0.drop(self_labeled.index, inplace=True)
        self_labeled.loc[positive_ind, label] = 1
        self_labeled.loc[negative_ind, label] = 0
        L = pd.concat([L, self_labeled])
    
        # step 6: Randomly choose 2p + 2n examples from U to replenish U_
        new_unlabeled = U.sample(2 * n + 2 * p, random_state=random_state)
        U.drop(new_unlabeled.index, inplace=True)
        U0 = U0.append(new_unlabeled)

        k -= 1
    
    return [clf1,clf2]

Without U0

In [163]:
def co_training(L, U, col_names1, col_names2, clf1, clf2,u=75, k=30,p=1,n=3, label='label', random_state=None):
    '''
    L - labeled data
    U - unlabeled data
    col_names1 - columns for clf1
    col_names2 - columns for clf2
    clf1 - classificator1
    clf2 - classificator2
    u - number of unlabeled data for training
    k - number of iteration
    p - number of positive examles to add label per iteration
    n - number of negative examles to add label per iteration
    label - name of column in L whit label
    '''
    
#     U0 = U.sample(u, random_state= random_state)
#     U.drop(U0.index, inplace=True)
    k0 = k
    while k > 0:
        print("iteration: ", k0 - k)
        # step 1: Use L to train a classifier h1 that considers only the x1 portion of x
        clf1 = clf1.fit(L[col_names1],  L[label])
        
        # step 2: Use L, to train a classifier h2 that considers only the x2 portion of x
        clf2 = clf2.fit(L[col_names2], L[label])
        
        # step 3: Allow hl to label p positive and n negative examples from U
        predicted_prob1 = clf1.predict_proba(U[col_names1])
        top_positive1 = predicted_prob1[:,1].argsort()[-p:]
        top_negative1 = predicted_prob1[:,0].argsort()[-n:]
        
        # step 4: Allow hl to label p positive and n negative examples from U
        predicted_prob2 = clf2.predict_proba(U[col_names2])
        top_positive2 = predicted_prob2[:,1].argsort()[-p:]
        top_negative2 = predicted_prob2[:,0].argsort()[-n:]    
        
        # step 5: Add these self-labeled examples to L
        positive_ind = U.iloc[np.unique(np.concatenate((top_positive1,top_positive2))),:].index
        negative_ind = U.iloc[np.unique(np.concatenate((top_negative1,top_negative2))),:].index
        self_labeled = U.loc[np.unique(np.concatenate((positive_ind,negative_ind))),:]
        U.drop(self_labeled.index, inplace=True)
        self_labeled.loc[positive_ind, label] = 1
        self_labeled.loc[negative_ind, label] = 0
        L = pd.concat([L, self_labeled])
    
        # step 6: Randomly choose 2p + 2n examples from U to replenish U_
#         new_unlabeled = U.sample(2 * n + 2 * p, random_state=random_state)
#         U.drop(new_unlabeled.index, inplace=True)
#         U0 = U0.append(new_unlabeled)

        k -= 1
    
    return [clf1,clf2]

Train until there are unlabeled data

In [357]:
def co_training2(L, U, col_names1, col_names2, clf1, clf2, label='label', random_state=None, treshhold=0.8):
    '''
    L - labeled data
    U - unlabeled data
    col_names1 - columns for clf1
    col_names2 - columns for clf2
    clf1 - classificator1
    clf2 - classificator2
    u - number of unlabeled data for training
    k - number of iteration
    p - number of positive examles to add label per iteration
    n - number of negative examles to add label per iteration
    label - name of column in L whit label
    '''
    
    while len(U) > 0:
        
        # step 1: Use L to train a classifier h1 that considers only the x1 portion of x
        clf1 = clf1.fit(L[col_names1],  L[label])
        
        # step 2: Use L, to train a classifier h2 that considers only the x2 portion of x
        clf2 = clf2.fit(L[col_names2], L[label])
        
        # step 3: Allow hl to label p positive and n negative examples from U
        predicted_prob1 = clf1.predict_proba(U[col_names1])
        top_positive1 = predicted_prob1[:,1].argsort()[-1:] if predicted_prob1[:,1].argsort()[-1:] > treshhold else [] 
        top_negative1 = predicted_prob1[:,0].argsort()[-1:] if predicted_prob1[:,0].argsort()[-1:] > treshhold else []
        
        # step 4: Allow hl to label p positive and n negative examples from U
        predicted_prob2 = clf2.predict_proba(U[col_names2]) 
        top_positive2 = predicted_prob2[:,1].argsort()[-1:] if predicted_prob2[:,1].argsort()[-1:] > treshhold else []
        top_negative2 = predicted_prob2[:,0].argsort()[-1:] if predicted_prob2[:,0].argsort()[-1:] > treshhold else []
        
        # step 5: Add these self-labeled examples to L
        if(np.unique(np.concatenate((top_positive1,top_positive2, top_negative1, top_negative2))).size == 0):
            print("ehhhh")
            return [clf1,clf2]
        positive_ind = U.iloc[np.unique(np.concatenate((top_positive1,top_positive2))),:].index
        negative_ind = U.iloc[np.unique(np.concatenate((top_negative1,top_negative2))),:].index
        self_labeled = U.loc[np.unique(np.concatenate((positive_ind,negative_ind))),:]
        U.drop(self_labeled.index, inplace=True)
        self_labeled.loc[positive_ind, label] = 1
        self_labeled.loc[negative_ind, label] = 0
        L = pd.concat([L, self_labeled])
    
        # step 6: Randomly choose 2p + 2n examples from U to replenish U_
#         new_unlabeled = U.sample(2 * n + 2 * p, random_state=random_state)
#         U.drop(new_unlabeled.index, inplace=True)
#         U0 = U0.append(new_unlabeled)

    
    return [clf1,clf2]

Combined classification

In [159]:
class CombinedClassifier:
    
    def __init__(self,clf1, clf2, cols1, cols2):
        self.clf1 = clf1
        self.clf2 = clf2
        self.cols1 = cols1
        self.cols2 = cols2
        
    def pr(self, df):
        result = []
        for ind, row in df.iterrows():
            prob0 = self.clf1.predict_proba([df.loc[ind,self.cols1]])[0][0] * self.clf2.predict_proba([df.loc[ind,self.cols2]])[0][0]  
            prob1 = self.clf1.predict_proba([df.loc[ind,self.cols1]])[0][1] * self.clf2.predict_proba([df.loc[ind,self.cols2]])[0][1]
#             print(self.clf1.predict_proba([df.loc[ind,self.cols1]]), " ", self.clf1.predict_proba([df.loc[ind,self.cols1]])[0][0], " ", self.clf2.predict_proba([df.loc[ind,self.cols2]]))
#             print(prob0, " ", prob1)
            res = 0 if prob0 > prob1 else 1
            result.append(res)
        return result

## Test

test train split

In [209]:
y = df.label
X = df[df.columns.drop('label')]

X_train, X_test, y_train, y_test = ms.train_test_split(X, y, test_size=0.25, random_state=42)

X_train['label'] = y_train

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


labeled data, 12 pages (3 course, 9 non course)

In [210]:
train0 = X_train[X_train.label==0].sample(9)#, random_state= 42)

train1 = X_train[X_train.label==1].sample(3)#, random_state=42)

train = pd.concat([train0,train1])

In [193]:
train.label

405    0
685    0
634    0
953    0
724    0
879    0
360    0
379    0
752    0
225    1
171    1
147    1
Name: label, dtype: int64

In [194]:
gnb1 = MultinomialNB()
gnb2 = MultinomialNB()

In [195]:
from sklearn.ensemble import RandomForestClassifier

clf1 = RandomForestClassifier(n_estimators = 100, max_depth = 8, random_state=42)
clf2 = RandomForestClassifier(n_estimators = 100, max_depth = 8, random_state=42)

### Multinomial bayes co-training

In [196]:
new_1, new_2 = co_training(L=train,U=X_train.loc[X_train.index.drop(train.index),list(range(0,19746))], 
                           col_names1=list(range(0,17733)),
                           col_names2=list(range(17733,19746)), clf1=gnb1, clf2=gnb2, k=30)

iteration:  0
iteration:  1
iteration:  2
iteration:  3
iteration:  4
iteration:  5
iteration:  6
iteration:  7
iteration:  8
iteration:  9
iteration:  10
iteration:  11
iteration:  12
iteration:  13
iteration:  14
iteration:  15
iteration:  16
iteration:  17
iteration:  18
iteration:  19
iteration:  20
iteration:  21
iteration:  22
iteration:  23
iteration:  24
iteration:  25
iteration:  26
iteration:  27
iteration:  28
iteration:  29


In [197]:
pred = new_1.predict(X_test[col_names1])

In [198]:
print("text calssifier accuracy:" ,accuracy_score(y_test, pred))

text calssifier accuracy: 0.8973384030418251


In [199]:
pred = new_2.predict(X_test[col_names2])
print("link calssifier accuracy:" ,accuracy_score(y_test, pred))

link calssifier accuracy: 0.3726235741444867


In [200]:
cl = CombinedClassifier(new_1,new_2,col_names1, col_names2)

In [201]:
pred = cl.pr(X_test)

In [202]:
print("combined calssifier accuracy:" ,accuracy_score(y_test, pred))

combined calssifier accuracy: 0.8631178707224335


## Random forest

In [211]:
new_1, new_2 = co_training(L=train,U=X_train.loc[X_train.index.drop(train.index),list(range(0,19746))], 
                           col_names1=list(range(0,17733)),
                           col_names2=list(range(17733,19746)), clf1=clf1, clf2=clf2, k=30)

iteration:  0
iteration:  1
iteration:  2
iteration:  3
iteration:  4
iteration:  5
iteration:  6
iteration:  7
iteration:  8
iteration:  9
iteration:  10
iteration:  11
iteration:  12
iteration:  13
iteration:  14
iteration:  15
iteration:  16
iteration:  17
iteration:  18
iteration:  19
iteration:  20
iteration:  21
iteration:  22
iteration:  23
iteration:  24
iteration:  25
iteration:  26
iteration:  27
iteration:  28
iteration:  29


In [212]:
pred = new_1.predict(X_test[col_names1])
print("fulltext calssifier accuracy:" ,accuracy_score(y_test, pred))

fulltext calssifier accuracy: 0.8365019011406845


In [213]:
pred = new_2.predict(X_test[col_names2])
print("link calssifier accuracy:" ,accuracy_score(y_test, pred))

link calssifier accuracy: 0.779467680608365


In [214]:
cl = CombinedClassifier(new_1,new_2,col_names1, col_names2)

In [215]:
pred = cl.pr(X_test)

In [216]:
print("combined calssifier accuracy:" ,accuracy_score(y_test, pred))

combined calssifier accuracy: 0.8022813688212928
