In [1]:
import pandas as pd
import nltk 
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
data = pd.read_csv("/content/drive/MyDrive/GL/NLP/blogtext.csv")

In [3]:
data.shape

(681284, 7)

In [6]:
data = data.head(5000)


In [7]:
data.head()

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,..."
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...


In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      5000 non-null   int64 
 1   gender  5000 non-null   object
 2   age     5000 non-null   int64 
 3   topic   5000 non-null   object
 4   sign    5000 non-null   object
 5   date    5000 non-null   object
 6   text    5000 non-null   object
dtypes: int64(2), object(5)
memory usage: 273.6+ KB


Data preprocessing.

In [9]:
data.isna().sum()

id        0
gender    0
age       0
topic     0
sign      0
date      0
text      0
dtype: int64

In [10]:
data.text = data.text.apply(lambda x: x.lower())

In [11]:
data.text = data.text.apply(lambda x: x.strip())

In [12]:
data.text

0       info has been found (+/- 100 pages, and 4.5 mb...
1       these are the team members:   drewes van der l...
2       in het kader van kernfusie op aarde:  maak je ...
3                                  testing!!!  testing!!!
4       thanks to yahoo!'s toolbar i can now 'capture'...
                              ...                        
4995    so... i had another one of those dreams last n...
4996    mmm... strawberry tea for breakfast. tomorrow ...
4997    yay for a new layout!!  yeah, i know, i need t...
4998    ok, so i lied... fed up isn't playing friday n...
4999    well, today i went to church and talked to the...
Name: text, Length: 5000, dtype: object

In [13]:
stpWords  = set(stopwords.words('english'))

data.text = data.text.apply(lambda x: ' '.join([word for word in x.split() if word not in stpWords]))


In [14]:
import re

data.text = data.text.apply(lambda x: re.sub('[^A-Za-z]+', ' ', x))

In [15]:
data.text[0]


'info found pages mb pdf files wait untill team leader processed learns html '

Target/label merger and transformation

In [16]:
data['labels'] = data.apply(lambda row: [row['gender'], str(row['age']), row['topic'], row['sign']], axis=1)


In [17]:
data = data[['text','labels']]


In [18]:
data.head()

Unnamed: 0,text,labels
0,info found pages mb pdf files wait untill team...,"[male, 15, Student, Leo]"
1,team members drewes van der laag urllink mail ...,"[male, 15, Student, Leo]"
2,het kader van kernfusie op aarde maak je eigen...,"[male, 15, Student, Leo]"
3,testing testing,"[male, 15, Student, Leo]"
4,thanks yahoo s toolbar capture urls popups whi...,"[male, 33, InvestmentBanking, Aquarius]"


In [19]:
X = data['text']
y = data['labels']

In [20]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=64)


Vectorisation

In [21]:
from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer(binary=True, ngram_range=(1, 2))
X_train_vect = vect.fit_transform(X_train)
X_test_vect = vect.transform(X_test)

In [22]:
vect.get_feature_names()[:5]




['aaa', 'aaa take', 'aaa travel', 'aaaaack', 'aaaah']

In [23]:
label_counts = dict()

for labels in data.labels.values:
    for label in labels:
        if label in label_counts:
            label_counts[label] += 1
        else:
            label_counts[label] = 1


In [24]:
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer(classes=sorted(label_counts.keys()))
y_train = mlb.fit_transform(y_train)
y_test = mlb.transform(y_test)

In [25]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(solver='lbfgs')
clf = OneVsRestClassifier(clf)

In [26]:
clf.fit(X_train_vect, y_train)


OneVsRestClassifier(estimator=LogisticRegression())

In [27]:
predicted_labels = clf.predict(X_test_vect)
predicted_scores = clf.decision_function(X_test_vect)


In [28]:
pred_inversed = mlb.inverse_transform(predicted_labels)
y_test_inversed = mlb.inverse_transform(y_test)

In [31]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score

def print_evaluation_scores(y_val, predicted):
    print('Accuracy score: ', accuracy_score(y_val, predicted))
    print('F1 score: ', f1_score(y_val, predicted, average='micro'))
    print('Average precision score: ', average_precision_score(y_val, predicted, average='micro'))
    print('Average recall score: ', recall_score(y_val, predicted, average='micro'))

In [32]:
print('Bag-of-words')
print_evaluation_scores(y_test, predicted_labels)

Bag-of-words
Accuracy score:  0.5273333333333333
F1 score:  0.7329625789424076
Average precision score:  0.5727054474739439
Average recall score:  0.648
