In [None]:
import numpy as np
import pandas as pd
import time, os
import string
import nltk
from nltk.corpus import stopwords
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import WordNetLemmatizer
from sklearn.utils import shuffle
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn import metrics
from numpy import quantile, where, random
from baggingPU import BaggingClassifierPU
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import nltk
nltk.download('stopwords')

def text_preprocess(mess):
    """Turn a raw message into a list of lowercase, alphabetic, non-stop-word tokens.

    Processing order:
      1. remove every character that appears in ``string.punctuation``;
      2. lowercase what is left;
      3. split on whitespace;
      4. keep a token only if it is not in NLTK's English stop-word list and
         ``str.isalpha()`` is true for it.

    Parameters
    ----------
    mess : str
        Raw message text (iterated character by character).

    Returns
    -------
    list of str
        The surviving tokens, in their original order.
    """
    # Build the stop-word lookup once per call. Previously the
    # ``stopwords.words('english')`` call sat inside the comprehension
    # condition, so it was re-evaluated (and scanned) for every single token.
    english_stopwords = set(stopwords.words('english'))

    nopunc = ''.join(char for char in mess if char not in string.punctuation)
    nopunc = nopunc.lower()

    return [
        word for word in nopunc.split()
        if word.lower() not in english_stopwords and word.isalpha()
    ]

# Load the labelled corpus (decoded as latin-1) and name its columns
# positionally: first column -> "label", second column -> "text".
# Earlier variants of this step, kept for reference:
#   df_raw = pd.read_excel('punny.xlsx')
#   df_raw = df_raw.drop(labels = ["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis = 1)
df_raw = pd.read_csv("super23.csv", encoding='latin-1')
df_raw.columns = ["label", "text"]

# Training partition. ``.loc`` slices by index label and includes BOTH
# endpoints, so this selects labels 0 through TRAIN_LAST_LABEL.
TRAIN_LAST_LABEL = 52588
messages_train = df_raw.loc[0:TRAIN_LAST_LABEL]

def print_cm(cm, labels, hide_zeroes=False, hide_diagonal=False, hide_threshold=None):
    """Pretty-print a confusion matrix to stdout as an aligned text table.

    The header row shows each label prefixed with ``pred_`` and every data
    row starts with the label prefixed with ``true_``. All columns share one
    width, so header cells and numeric cells line up.

    Parameters
    ----------
    cm : 2-D array supporting ``cm[i, j]`` indexing
        Matrix of counts; row ``i`` / column ``j`` correspond to ``labels[i]``
        / ``labels[j]``.
    labels : sequence of str
        Class names, in the same order as the matrix axes.
    hide_zeroes : bool, default False
        Blank out cells whose value equals 0.
    hide_diagonal : bool, default False
        Blank out the cells where ``i == j``.
    hide_threshold : number or None, default None
        When not ``None``, blank out every cell whose value is not strictly
        greater than this threshold. ``0`` is a valid threshold.

    Returns
    -------
    None
        The table is written with ``print``; nothing is returned.
    """
    # 'pred_' and 'true_' have the same length, so one width fits both.
    prefix_len = len('pred_')
    # default=0 keeps an empty ``labels`` from raising inside max().
    columnwidth = max((len(x) for x in labels), default=0) + prefix_len
    empty_cell = " " * columnwidth

    # Header row: blank corner, then one right-aligned cell per predicted label.
    print("    " + empty_cell, end=" ")
    for label in labels:
        # Parentheses matter: ``%`` binds tighter than ``+``, so without them
        # only the prefix was padded and the label was appended afterwards,
        # making header cells wider than the numeric cells below them.
        print("%{0}s".format(columnwidth) % ('pred_' + label), end=" ")
    print()

    # One row per true label.
    for i, label1 in enumerate(labels):
        print("    %{0}s".format(columnwidth) % ('true_' + label1), end=" ")
        for j in range(len(labels)):
            value = cm[i, j]
            cell = "%{0}.1f".format(columnwidth) % value
            if hide_zeroes and float(value) == 0:
                cell = empty_cell
            if hide_diagonal and i == j:
                cell = empty_cell
            # Explicit None check so that a threshold of 0 is honoured.
            if hide_threshold is not None and not (value > hide_threshold):
                cell = empty_cell
            print(cell, end=" ")
        print()

def random_undersampling(tmp_df, TARGET_LABEL, random_state=None):
    """Balance a binary-labelled frame by randomly downsampling the larger class.

    Rows whose ``TARGET_LABEL`` value is 0 and rows whose value is 1 are
    separated; the larger group is sampled without replacement down to the
    size of the smaller one. When both groups have the same size the class-0
    rows are still passed through the sampler (i.e. reordered), matching the
    previous behaviour. Rows with any other label value are dropped.

    Parameters
    ----------
    tmp_df : pandas.DataFrame
        Input frame containing the column ``TARGET_LABEL``.
    TARGET_LABEL : str
        Name of the binary (0/1) label column.
    random_state : int, numpy RandomState or None, default None
        Forwarded to ``DataFrame.sample``. ``None`` (the default) keeps the
        historical non-deterministic behaviour; pass an int for a
        reproducible sample.

    Returns
    -------
    pandas.DataFrame
        Class-0 rows followed by class-1 rows, with equal counts, keeping the
        original index labels.
    """
    df_class0 = tmp_df[tmp_df[TARGET_LABEL] == 0]
    df_class1 = tmp_df[tmp_df[TARGET_LABEL] == 1]

    # Downsample whichever class is larger. Previously class 0 was assumed to
    # be the majority, which raised when class 1 actually had more rows.
    if len(df_class0) >= len(df_class1):
        df_class0 = df_class0.sample(n=len(df_class1),
                                     replace=False,  # sample without replacement
                                     random_state=random_state)
    else:
        df_class1 = df_class1.sample(n=len(df_class0),
                                     replace=False,
                                     random_state=random_state)

    # Same ordering as before: class-0 block first, class-1 block second.
    df_downsampled = pd.concat([df_class0, df_class1])

    print("Undersampling complete!")
    print(df_downsampled[TARGET_LABEL].value_counts())
    return df_downsampled

# Balance the training partition, then shuffle the rows and renumber them.
df_downsampled = (
    random_undersampling(messages_train, 'label')
    .sample(frac=1)           # frac=1 -> every row, in random order (unseeded)
    .reset_index(drop=True)   # fresh 0..n-1 index; the old index is discarded
)

df_downsampled.head()

# Let's make some negatives out of the positives by unlabeling a certain
# number of data points (done in a later cell).

# Work on a copy: df_downsampled is needed again later with its labels intact.
df = df_downsampled.copy()

# Separate the non-label column(s) from the training label.
# NOTE: assigning df[NON_LBL] to a single column relies on there being exactly
# one non-label column (here: "text").
NON_LBL = [c for c in df.columns if c != 'label']
df["text"] = df[NON_LBL]
# print (X)
YY = df["text"]  # the text column as it is before cleaning
print (YY)

# Clean each message and re-join its tokens into one space-separated string.
# This is an element-wise map, so use apply(); the previous Series.agg(lambda)
# call is an aggregation API that merely happened to behave element-wise.
df["text"] = df["text"].apply(lambda raw: ' '.join(text_preprocess(raw)))

# Independent copy of the labels: a later cell overwrites entries of `y` in
# place, and that must not write through to df["label"].
y = df['label'].copy()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF features over bigrams only (ngram_range=(2, 2)), with accents
# stripped; `encoding` applies when the input is given as bytes.
vec = TfidfVectorizer(
    encoding="latin-1",
    strip_accents="unicode",
    ngram_range=(2, 2),
)

# fit_transform() fits the vectorizer on the training text AND vectorises it.
# It IS DIFFERENT from transform(), which only applies an already-fitted
# vectorizer and is what unseen/test text must go through.
X_messages_tfidf = vec.fit_transform(df["text"])
print("Train vector shape: ",X_messages_tfidf.shape)

# Save the original labels and indices.
# The ground truth is taken from df_downsampled (which is never modified)
# rather than from `y`: `y` is overwritten below, so snapshotting it here made
# every re-run of this cell hide ANOTHER `hidden_size` positives on top of the
# ones already hidden.
y_orig = df_downsampled['label'].copy()
original_idx = np.where(df_downsampled.label == 1)

# Number of true positives to relabel as 0, i.e. to treat as "unlabeled".
hidden_size = 300

# Fixed seed so the same positives are hidden on every run.
HIDE_SEED = 2019
rng = np.random.RandomState(HIDE_SEED)

# Always start from the untouched labels, then hide `hidden_size` positives.
y = y_orig.copy()
hidden_idx = rng.choice(
    y.index[y == 1],
    replace = False,
    size = hidden_size
)
y.loc[hidden_idx] = 0

# After this point the 0 class is unreliable: it contains the true negatives
# plus the `hidden_size` positives that were just relabelled.
print('- %d positive out of %d total before hiding labels' % (y_orig.sum(), len(y_orig)))
print('- %d positive out of %d total after hiding labels' % (y.sum(), len(y)))

print('Training bagging classifier...')
pu_start = time.perf_counter()

# PU bagging around a small random forest (fixed seed for the forest itself).
base_forest = RandomForestClassifier(n_estimators=20, random_state=2019)
bc = BaggingClassifierPU(
    base_forest,
    n_estimators=50,
    n_jobs=-1,
    # One draw per remaining labelled positive; author's intent: each
    # training sample will be balanced.
    max_samples=sum(y),
)

# Alternative base learner (bagging with SVM), kept for reference:
# svc = SVC(C=10, kernel='rbf', gamma=0.4, probability=True)
# bc = BaggingClassifierPU(
#     base_estimator=svc, n_estimators=15)

bc.fit(X_messages_tfidf, y)

pu_end = time.perf_counter()
print('Done!')
print('Time:', pu_end - pu_start)

Train vector shape:  (29264, 191884)
- 14632 positive out of 29264 total before hiding labels
- 14032 positive out of 29264 total after hiding labels
Training bagging classifier...


In [None]:
# Held-out partition: index labels 52589 through 65736 (``.loc`` includes both
# endpoints). Take an explicit copy: the column assignments below would
# otherwise write into a slice of df_raw (chained-assignment warning, and an
# ambiguous view-vs-copy result).
messages_test = df_raw.loc[52589:65736, :].copy()

# Separate the non-label column(s) from the test labels.
# NOTE: as in training, this relies on exactly one non-label column ("text").
NON_LBL_TEST = [c for c in messages_test.columns if c != 'label']
messages_test["text"] = messages_test[NON_LBL_TEST]
print (messages_test["text"])

# Same cleaning as the training text: tokenise, then re-join with spaces.
# apply() is the element-wise API; Series.agg(lambda) only happened to act
# element-wise.
messages_test["text"] = messages_test["text"].apply(
    lambda raw: ' '.join(text_preprocess(raw))
)
y_test = messages_test['label']

# transform() only: reuse the vocabulary/weights fitted on the training text.
X_Test_messages_tfidf = vec.transform(messages_test["text"])
print("Test vector shape: ", X_Test_messages_tfidf.shape)

print('---- {} ----'.format('PU Bagging'))
test_pred = bc.predict(X_Test_messages_tfidf)

# print_cm() prints the table itself and returns None, so it must not be
# wrapped in print() (that emitted a stray "None" line).
# labels=[0, 1] pins the matrix to 2x2 even if a class is absent from
# y_test / test_pred, so it always matches the two display names.
print_cm(
    metrics.confusion_matrix(y_test, test_pred, labels=[0, 1]),
    labels=['negative', 'positive'],
)

print('')

# print(metrics.classification_report(y_test, test_pred))
print("precision: ", metrics.precision_score(y_test, test_pred))
print("recall: ", metrics.recall_score(y_test, test_pred))
print('Accuracy: ', accuracy_score(y_test, test_pred))