In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn import metrics
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score
%matplotlib inline
import nltk
from nltk.corpus import stopwords 
from collections import Counter

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('blogtext.csv',error_bad_lines=False, engine="python")

Skipping line 16844: NULL byte detected. This byte cannot be processed in Python's native csv library at the moment, so please pass in engine='c' instead
Skipping line 19370: NULL byte detected. This byte cannot be processed in Python's native csv library at the moment, so please pass in engine='c' instead
Skipping line 31753: field larger than field limit (131072)
Skipping line 33676: NULL byte detected. This byte cannot be processed in Python's native csv library at the moment, so please pass in engine='c' instead
Skipping line 65976: NULL byte detected. This byte cannot be processed in Python's native csv library at the moment, so please pass in engine='c' instead
Skipping line 105502: unexpected end of data


In [3]:
df.head()

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,..."
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...


In [4]:
df.columns

Index(['id', 'gender', 'age', 'topic', 'sign', 'date', 'text'], dtype='object')

In [5]:
df.shape

(105495, 7)

In [6]:
df.size

738465

In [7]:
df.isnull().sum()

id        0
gender    0
age       0
topic     0
sign      0
date      0
text      0
dtype: int64

In [8]:
df= df.head(3000)

# Preprocess text
#Preprocess values of text column

. Remove unwanted characters
. Convert text to lowercase
. Remove unwanted spaces
. Remove stopwords

In [9]:
# df.text = df.text.apply(lambda x: re.sub('[^A-Za-z]+','',x))
# Convert text to lowercase
df.text = df.text.apply(lambda x: x.lower())
# Strip unwanted spaces
df.text = df.text.apply(lambda x: x.strip())
# Remove stopwords
from nltk.corpus import stopwords
stopwords = set(stopwords.words('english'))
df.text = df.text.apply(lambda x: ''.join([word for word in x.split() if word not in stopwords]))

In [10]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\venkata
[nltk_data]     subbareddy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [11]:
df.text[6]

"somehowcoca-colawaysummingthingswell.early1970sflagshipjingle'i'dlikebuyworldcoke'(totune'i'dliketeachworldsing')prettymuchsummedpost-woodstockerawell.addmuchsales,catchytune.koreacoke'sthemeurllinkstopthinking.feelit.prettymuchsumslotkoreakoreans.(lookrelaxedcoupleis,stoppedthinkingstartedfeeling.)coursehighregardeducationmathlogicsuch,deepthinkmanykoreansreallylikeworkemotionanythingelse.westernersseemsublimatemoreso,leastdisplaydifferentway.maybescratchthat...westernerskoreansprobablyprettysimilar,contextdifferent.anyways,thinklosingkorearepeat'stopthinking,feelit.stopthinking,feelit.stopthinking,feelit.'everythingalright."

# Merge the label coulmns

Merge all the label columns together, so that we have all the tags together for a particular sentence

In [12]:
df['labels'] = df.apply(lambda row: [row['gender'], str(row['age']), row['topic'],row['sign']], axis =1)

In [13]:
df = df[['text','labels']]

In [14]:
df.head()

Unnamed: 0,text,labels
0,"infofound(+/-100pages,4.5mb.pdffiles)waituntil...","[male, 15, Student, Leo]"
1,teammembers:drewesvanderlaagurllinkmailruiyuxi...,"[male, 15, Student, Leo]"
2,hetkadervankernfusieopaarde:maakjeeigenwaterst...,"[male, 15, Student, Leo]"
3,testing!!!testing!!!,"[male, 15, Student, Leo]"
4,thanksyahoo!'stoolbar'capture'urlspopups...whi...,"[male, 33, InvestmentBanking, Aquarius]"


# Create training and testing data

In [15]:
from sklearn.model_selection import train_test_split
X_train, X_test,y_train,y_test = train_test_split(df.text.values, df.labels.values, test_size=0.20, random_state = 42)

# Vectorize the data

# Create Bag of Words¶

. Use CountVectorizer
. Transform the traing and testing data

In [16]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(binary=True, ngram_range=(1, 2))
X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)

In [17]:
vectorizer.get_feature_names()[:5]

['00', '00 10', '00 11', '00 30', '00 31pm']

In [18]:
X_train_bow.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

# Create a dictionary to get label counts

In [19]:
label_counts = dict()

for labels in df.labels.values:
    for label in labels:
        if label in label_counts:
            label_counts[label] += 1
        else:
            label_counts[label] = 1

In [20]:
label_counts

{'male': 2272,
 '15': 299,
 'Student': 403,
 'Leo': 55,
 '33': 94,
 'InvestmentBanking': 70,
 'Aquarius': 286,
 'female': 728,
 '14': 74,
 'indUnk': 452,
 'Aries': 1699,
 '25': 110,
 'Capricorn': 77,
 '17': 147,
 'Gemini': 21,
 '23': 93,
 'Non-Profit': 46,
 'Cancer': 76,
 'Banking': 16,
 '37': 19,
 'Sagittarius': 113,
 '26': 43,
 '24': 334,
 'Scorpio': 243,
 '27': 86,
 'Education': 118,
 '45': 14,
 'Engineering': 119,
 'Libra': 313,
 'Science': 33,
 '34': 6,
 '41': 14,
 'Communications-Media': 14,
 'BusinessServices': 21,
 'Sports-Recreation': 75,
 'Virgo': 39,
 'Taurus': 76,
 'Arts': 2,
 'Pisces': 2,
 '44': 3,
 '16': 25,
 'Internet': 20,
 'Museums-Libraries': 2,
 'Accounting': 2,
 '39': 32,
 '35': 1607,
 'Technology': 1607}

# Multi label binarizer
Load a multilabel binarizer and fit it on the labels.

In [21]:
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer(classes=sorted(label_counts.keys()))
y_train = mlb.fit_transform(y_train)
y_test = mlb.transform(y_test)

In [22]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(solver='lbfgs')
clf = OneVsRestClassifier(clf)

In [23]:
clf.fit(X_train_bow, y_train)

OneVsRestClassifier(estimator=LogisticRegression())

# Make predictions
Get predicted labels and scores

In [24]:
predicted_labels = clf.predict(X_test_bow)
predicted_scores = clf.decision_function(X_test_bow)

In [25]:
pred_inversed = mlb.inverse_transform(predicted_labels)
y_test_inversed = mlb.inverse_transform(y_test)

In [26]:
for i in range(5):
    print('Title:\t{}\nTrue labels:\t{}\nPredicted labels:\t{}\n\n'.format(
        X_test[i],
        ','.join(y_test_inversed[i]),
        ','.join(pred_inversed[i])
    ))

Title:	pink'salreadydone,surephoenix,tho'.
True labels:	35,Aries,Technology,male
Predicted labels:	35,Aries,Technology,male


Title:	woohoo-she'lltomorrow!-probablymeansneedcleanplacebit-uhoh:(sincejen'sgoinggetcarwe'llableleastgrabfurnitureshe'llsomewheresit-timedperfectlypoolopeningweekend.i'mslavingawaywork,liepoolfightcicadas:)gothdtvcablehookup-unfortunatelyon-demandfunctionseemsbrokenmeantime,getearlytodaysomeonecomelookit.brightersidethingstechiemanagedblagtwo1udualpiii866machinesfree,i'llswapmobo'sreasonablecasehookhome-ideai'lluseyeti'mbettingthey'llmunchsetiunitsmeantimefigureout.....onelastthing-hellaltgrbuttonuskeyboards?-annoystypealt+0128euro(€)symbolremembercodeaccentedeetc.knowcouldchangekeymapukwhatever-altgroptionsomething?*endrant*
True labels:	25,Aries,Internet,male
Predicted labels:	35,Aries,Technology,male


Title:	actually,johnathan,me.calledlatelastnightsoundinggroggythankingsomething.couldbarelymakeinnanebabble,wonderingsayinghopedi'dguessname...
True labels:	3

# Calculate accuracy
. Accuracy
. F1-score
. Precision
. Recall

In [27]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score

def print_evaluation_scores(y_val, predicted):
    print('Accuracy score: ', accuracy_score(y_val, predicted))
    print('F1 score: ', f1_score(y_val, predicted, average='micro'))
    print('Average precision score: ', average_precision_score(y_val, predicted, average='micro'))
    print('Average recall score: ', recall_score(y_val, predicted, average='micro'))

In [28]:
print('Bag-of-words')
print_evaluation_scores(y_test, predicted_labels)

Bag-of-words
Accuracy score:  0.49166666666666664
F1 score:  0.6270417422867514
Average precision score:  0.43241486182927863
Average recall score:  0.5758333333333333
