<a href="https://colab.research.google.com/github/pretty-neat-org/siap/blob/main/GloVe_good_data_split.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## GloVe embeddings + classifier

In [1]:
from google.colab import drive
drive.mount('/content/gdrive')
!ln -s /content/gdrive/My\ Drive/ /mydrive
!ls /mydrive/SIAP

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
ln: failed to create symbolic link '/mydrive/My Drive': File exists
config.json	   file.csv			in_domain_train.tsv
data.csv	   finalized_model_10k_SVE.sav	pytorch_model.bin
data_mini_10k.csv  finalized_model.sav		serialized
file5m.zip	   glove.6B.300d.txt


In [2]:
import string

import numpy as np
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import Normalizer
from sklearn.svm import SVC
from sklearn.utils import resample
import pandas as pd
from collections import Counter

# TODO: Dotrenirati glove

not_found = []


class GloveVectorizer:
    def __init__(self):
        # load in pre-trained word vectors
        print('Loading word vectors...')
        word2vec = {}
        embedding = []
        idx2word = []
        with open('gdrive/MyDrive/SIAP/glove.6B.300d.txt', encoding="utf8") as f:
            # is just a space-separated text file in the format:
            # word vec[0] vec[1] vec[2] ...
            for line in f:
                values = line.split()
                word = values[0]
                vec = np.asarray(values[1:], dtype='float32')
                word2vec[word] = vec
                embedding.append(vec)
                idx2word.append(word)
        print('Found %s word vectors.' % len(word2vec))

        # save for later
        self.word2vec = word2vec
        self.embedding = np.array(embedding)
        self.word2idx = {v: k for k, v in enumerate(idx2word)}
        self.V, self.D = self.embedding.shape

    def fit(self, data):
        pass

    def transform(self, data):
        X = np.zeros((len(data), self.D))
        n = 0
        emptycount = 0
        for sentence in data:
            try:
                tokens = sentence.lower().split()
            except:
                pass
            vecs = []
            for word in tokens:
                if word in self.word2vec:
                    vec = self.word2vec[word]
                    vecs.append(vec)
                else:
                    not_found.append(word)

            if len(vecs) > 0:
                vecs = np.array(vecs)
                X[n] = vecs.mean(axis=0)
            else:
                emptycount += 1
            n += 1
        print("Number of samples with no words found: %s / %s" % (emptycount, len(data)))
        return X

    def fit_transform(self, data):
        self.fit(data)
        return self.transform(data)


df = pd.read_csv('gdrive/MyDrive/SIAP/file.csv')
print(df.columns)

df['index'] = df.index

print(df['controversiality'].value_counts())

df_majority = df[df.controversiality == 0]
df_minority = df[df.controversiality == 1]

df_majority_downsampled = resample(df_majority,
                                   replace=False,
                                   # sample without replacement # stavio sam tretno na true, da bi nastavio dalje
                                   n_samples=10744,  # to match minority class
                                   random_state=123)

# Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_minority, df_majority_downsampled])

# Display new class counts
print(df_upsampled.controversiality.value_counts())

df = df_upsampled

# ukloni linkove punctiation i kineski
df['body'] = df['body'].str.replace(r'http\S+', '')
df['body'] = df['body'].str.replace(r'[^\w\s]', '')
df['body'] = df['body'].str.replace(r"([^\x00-\x7F])+", '')
# # remove numbers
# df['body'] = df['body'].str.replace('\d+', '')
# df['body'] = df['body'].str.replace('_', '')
# df['body'] = df['body'].str.replace('\s+', '')


sentences = df.body.values
labels = df.controversiality.values

train_inputs, train_val_inputs, train_labels, train_val_labels = train_test_split(sentences, labels,
                                                                                    random_state=2018, test_size=0.3)
test_inputs, validation_inputs, test_labels, validation_labels = train_test_split(train_val_inputs, train_val_labels,
                                                                                    random_state=2018, test_size=0.5)

vectorizer = GloveVectorizer()

Xtrain = vectorizer.fit_transform(train_inputs)
Ytrain = train_labels

Xtest = vectorizer.transform(validation_inputs)
Ytest = validation_labels

transformer = Normalizer().fit(Xtrain)
Xtrain = transformer.transform(Xtrain)
Xtest = transformer.transform(Xtest)

print(Counter(not_found).most_common(50))

clfs = {
    #'mnb': MultinomialNB(),
    'gnb': GaussianNB(),
    'svm1': SVC(kernel='linear'),
    'svm2': SVC(kernel='rbf'),
    'svm3': SVC(kernel='sigmoid'),
    'mlp1': MLPClassifier(),
    'mlp2': MLPClassifier(hidden_layer_sizes=[100, 100]),
    'ada': AdaBoostClassifier(),
    'dtc': DecisionTreeClassifier(),
    'rfc': RandomForestClassifier(),
    'gbc': GradientBoostingClassifier(),
    'lr': LogisticRegression()
}

f1_scores = dict()
for clf_name in clfs:
    print(clf_name)
    clf = clfs[clf_name]
    clf.fit(Xtrain, Ytrain)
    y_pred = clf.predict(Xtest)
    f1_scores[clf_name] = f1_score(y_pred, Ytest)


Index(['created_utc', 'ups', 'subreddit_id', 'link_id', 'name', 'score_hidden',
       'author_flair_css_class', 'author_flair_text', 'subreddit', 'id',
       'removal_reason', 'gilded', 'downs', 'archived', 'author', 'score',
       'retrieved_on', 'body', 'distinguished', 'edited', 'controversiality',
       'parent_id'],
      dtype='object')
0    489256
1     10744
Name: controversiality, dtype: int64
1    10744
0    10744
Name: controversiality, dtype: int64
Loading word vectors...
Found 400000 word vectors.
Number of samples with no words found: 285 / 15041
Number of samples with no words found: 49 / 3224
[('youve', 126), ('shouldnt', 111), ('downvoted', 79), ('subreddit', 77), ('werent', 71), ('theyve', 62), ('downvote', 59), ('theyll', 54), ('downvotes', 51), ('itll', 46), ('lmao', 43), ('upvote', 39), ('wouldve', 37), ('idk', 32), ('downvoting', 31), ('sjws', 28), ('theyd', 28), ('upvoted', 27), ('botrautomoderatorcommentsq11puwhat_is_automoderator', 25), ('gtthe', 24), ('hah



mlp2




ada
dtc
rfc
gbc
lr


In [3]:
print(f1_scores)

{'gnb': 0.616871245756072, 'svm1': 0.6326224091318715, 'svm2': 0.6346389228886169, 'svm3': 0.5202520252025202, 'mlp1': 0.5677335919818947, 'mlp2': 0.5816112608565439, 'ada': 0.6054216867469879, 'dtc': 0.5551819015591563, 'rfc': 0.5978987583572111, 'gbc': 0.6268115942028986, 'lr': 0.6365539934190846}


Grid search:

In [4]:
# defining parameter range
param_grid = {'C': [0.1, 1, 10, 100], 
              'gamma': [1, 0.1],
              'kernel': ['rbf']} 
  
grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3)
  
# fitting the model for grid search
grid.fit(Xtrain, Ytrain)
# print best parameter after tuning
print(grid.best_params_)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV] C=0.1, gamma=1, kernel=rbf ......................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.630, total= 1.3min
[CV] C=0.1, gamma=1, kernel=rbf ......................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.3min remaining:    0.0s


[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.631, total= 1.3min
[CV] C=0.1, gamma=1, kernel=rbf ......................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  2.6min remaining:    0.0s


[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.619, total= 1.3min
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.636, total= 1.3min
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.617, total= 1.3min
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ........ C=0.1, gamma=0.1, kernel=rbf, score=0.539, total= 1.4min
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ........ C=0.1, gamma=0.1, kernel=rbf, score=0.553, total= 1.3min
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ........ C=0.1, gamma=0.1, kernel=rbf, score=0.546, total= 1.3min
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ........ C=0.1, gamma=0.1, kernel=rbf, score=0.533, total= 1.3min
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] .

[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed: 61.7min finished


{'C': 1, 'gamma': 1, 'kernel': 'rbf'}


In [5]:
from sklearn.metrics import classification_report
# print how our model looks after hyper-parameter tuning
print(grid.best_estimator_)
grid_predictions = grid.predict(Xtest)
  
# print classification report
print(classification_report(Ytest, grid_predictions))

SVC(C=1, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=1, kernel='rbf', max_iter=-1,
    probability=False, random_state=None, shrinking=True, tol=0.001,
    verbose=False)
              precision    recall  f1-score   support

           0       0.63      0.62      0.63      1601
           1       0.63      0.64      0.64      1623

    accuracy                           0.63      3224
   macro avg       0.63      0.63      0.63      3224
weighted avg       0.63      0.63      0.63      3224



In [6]:
Xtest = vectorizer.transform(test_inputs)
Ytest = test_labels

Xtest = transformer.transform(Xtest)
grid_predictions = grid.predict(Xtest)
  
# print classification report
print(classification_report(Ytest, grid_predictions))

Number of samples with no words found: 48 / 3223
              precision    recall  f1-score   support

           0       0.66      0.63      0.65      1670
           1       0.62      0.65      0.63      1553

    accuracy                           0.64      3223
   macro avg       0.64      0.64      0.64      3223
weighted avg       0.64      0.64      0.64      3223

