## Importing Libraries

In [30]:
# !pip install nltk
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.model_selection import train_test_split
from nltk.stem import PorterStemmer
import string
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.preprocessing import MultiLabelBinarizer

from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn import metrics


import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Loading the Dataset

In [2]:
# Read only the first 10,000 rows of the comma-separated blog corpus.
df = pd.read_csv(
    'blogtext.csv',
    sep=',',
    nrows=10000,
)

In [3]:
# Preview the first five rows of the freshly loaded frame.
df.head()

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,..."
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...


# Pre-processing the text


In [4]:
# removing punctuation
def remove_punctuation(text):
    """Return ``text`` without any character listed in ``string.punctuation``.

    All other characters (letters, digits, whitespace) are kept in their
    original order.
    """
    kept_chars = []
    for char in text:
        if char in string.punctuation:
            continue
        kept_chars.append(char)
    return "".join(kept_chars)
# storing the punctuation-free text back into the 'text' column
df['text'] = df['text'].apply(remove_punctuation)
df.head()

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004",Info has been found 100 pages and ...
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members Drewes...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...
3,2059027,male,15,Student,Leo,"12,May,2004",testing testing
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoos Toolbar I can no...


In [5]:
# converting text to lowercase
# Uses the vectorized string accessor rather than a per-row lambda; missing
# values are passed through as NaN instead of raising AttributeError.
df['text'] = df['text'].str.lower()
df.head()

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004",info has been found 100 pages and ...
1,2059027,male,15,Student,Leo,"13,May,2004",these are the team members drewes...
2,2059027,male,15,Student,Leo,"12,May,2004",in het kader van kernfusie op aarde...
3,2059027,male,15,Student,Leo,"12,May,2004",testing testing
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",thanks to yahoos toolbar i can no...


In [6]:
# trimming leading and trailing whitespace from every post
# Uses the vectorized string accessor rather than a per-row lambda; missing
# values are passed through as NaN instead of raising AttributeError.
df['text'] = df['text'].str.strip()
df.head()

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004",info has been found 100 pages and 45 mb of pd...
1,2059027,male,15,Student,Leo,"13,May,2004",these are the team members drewes van der la...
2,2059027,male,15,Student,Leo,"12,May,2004",in het kader van kernfusie op aarde maak je e...
3,2059027,male,15,Student,Leo,"12,May,2004",testing testing
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",thanks to yahoos toolbar i can now capture the...


In [7]:
# stemming
porter = PorterStemmer()
def stemSentence(sentence):
    """Tokenize ``sentence`` and return its Porter-stemmed tokens as one string.

    Parameters
    ----------
    sentence : str
        Text to tokenize with ``word_tokenize``.

    Returns
    -------
    str
        The stemmed tokens separated by single spaces. An input that yields
        no tokens gives the empty string.
    """
    # Joining with " " places separators only *between* tokens, so the result
    # no longer ends with a stray trailing space after the last token.
    return " ".join(porter.stem(token) for token in word_tokenize(sentence))

# replacing every post with its stemmed version
df['text'] = df['text'].apply(stemSentence)
df.head()

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004",info ha been found 100 page and 45 mb of pdf f...
1,2059027,male,15,Student,Leo,"13,May,2004",these are the team member drew van der laag ur...
2,2059027,male,15,Student,Leo,"12,May,2004",in het kader van kernfusi op aard maak je eige...
3,2059027,male,15,Student,Leo,"12,May,2004",test test
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",thank to yahoo toolbar i can now captur the ur...


# Merging all the label columns together

In [8]:
# age is treated as a categorical label, so store it as text like the other label columns
df['age'] = df['age'].astype(str)

In [9]:
columns = ['gender', 'age', 'topic', 'sign']
def merge_labels(df, columns):
    """Combine several label columns into one list of labels per row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds the label columns.
    columns : list of str
        Names of the columns to merge; the order given here is the order of
        the labels inside each row's list.

    Returns
    -------
    pandas.Series
        One list per row of ``df``. The Series carries ``df``'s own index so
        that assigning it back as a column aligns row by row.
    """
    # Pull the rows out positionally. Looking values up by the labels
    # 0..n-1 raises KeyError (or picks the wrong rows) as soon as the frame
    # has been filtered, shuffled or otherwise lacks a default RangeIndex.
    # dtype=object keeps each cell's value as-is instead of upcasting
    # mixed numeric columns to a common type.
    rows = df[list(columns)].to_numpy(dtype=object).tolist()
    return pd.Series(rows, index=df.index, dtype=object)

# Store the merged label lists in a new 'multilabel' column and preview it.
df['multilabel']  = merge_labels(df, columns)
df['multilabel'].head()


0                   [male, 15, Student, Leo]
1                   [male, 15, Student, Leo]
2                   [male, 15, Student, Leo]
3                   [male, 15, Student, Leo]
4    [male, 33, InvestmentBanking, Aquarius]
Name: multilabel, dtype: object

# Train Test Split

In [10]:
# features are the cleaned posts; targets are the per-row label lists
X, y = df['text'], df['multilabel']
print(X.shape, y.shape, sep='\n')

(10000,)
(10000,)


In [11]:
from sklearn.model_selection import train_test_split

# Split features and labels together; the fixed random_state keeps the split reproducible.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, random_state=7)

# Vectorize the features

In [12]:
# instantiating: unigrams and bigrams, English stop words removed, and terms
# dropped when they occur in more than 75% of documents or in fewer than 3
toNumeric = CountVectorizer(ngram_range=(1, 2),
                            max_df=0.75,
                            min_df=3,
                            stop_words='english')

# fitting on the training split only
toNumeric.fit(X_train)
# Size of the learned vocabulary. `get_feature_names()` is deprecated since
# scikit-learn 1.0 and removed in 1.2; the fitted `vocabulary_` mapping holds
# one entry per feature and is available on every version.
len(toNumeric.vocabulary_)



36159

In [13]:
# building the document-term matrices for the train and test splits
# with the vectorizer fitted above
X_train_dtm, X_test_dtm = (
    toNumeric.transform(split) for split in (X_train, X_test)
)

In [14]:
# both matrices must share the same number of columns (the vocabulary size)
print(X_train_dtm.shape, X_test_dtm.shape, sep='\n')

(7500, 36159)
(2500, 36159)


# Create a dictionary to get the count of every label

In [15]:
# flattening the per-row label lists into one long Series of labels
flatenned_labels = pd.Series(
    np.concatenate(y.tolist())
)

In [16]:
# number of occurrences of each distinct label, as a Series indexed by label
counted_series = flatenned_labels.value_counts()
# turning the (label, count) pairs into a plain dictionary
labels_count_dict = {
    label: count
    for label, count in zip(counted_series.index, counted_series.values)
}
labels_count_dict

{'13': 42,
 '14': 212,
 '15': 602,
 '16': 440,
 '17': 1185,
 '23': 253,
 '24': 655,
 '25': 386,
 '26': 234,
 '27': 1054,
 '33': 136,
 '34': 553,
 '35': 2315,
 '36': 1708,
 '37': 33,
 '38': 46,
 '39': 79,
 '40': 1,
 '41': 20,
 '42': 14,
 '43': 6,
 '44': 3,
 '45': 16,
 '46': 7,
 'Accounting': 4,
 'Aquarius': 571,
 'Aries': 4198,
 'Arts': 45,
 'Automotive': 14,
 'Banking': 16,
 'BusinessServices': 91,
 'Cancer': 504,
 'Capricorn': 215,
 'Communications-Media': 99,
 'Consulting': 21,
 'Education': 270,
 'Engineering': 127,
 'Fashion': 1622,
 'Gemini': 150,
 'HumanResources': 2,
 'Internet': 118,
 'InvestmentBanking': 70,
 'Law': 11,
 'LawEnforcement-Security': 10,
 'Leo': 301,
 'Libra': 491,
 'Marketing': 156,
 'Museums-Libraries': 17,
 'Non-Profit': 71,
 'Pisces': 454,
 'Publishing': 4,
 'Religion': 9,
 'Sagittarius': 1097,
 'Science': 63,
 'Scorpio': 971,
 'Sports-Recreation': 80,
 'Student': 1137,
 'Taurus': 812,
 'Technology': 2654,
 'Telecommunications': 2,
 'Virgo': 236,
 'female': 4

# Transform the labels (Multilabel Binarizer)

In [18]:
# One indicator column per label; the column order is fixed up front from the
# sorted set of every label seen in the data.
mlb = MultiLabelBinarizer(classes=sorted(labels_count_dict.keys()))
# Fit on the training labels only ...
y_train = mlb.fit_transform(y_train)
# ... and reuse the fitted binarizer for the test labels. Re-fitting on the
# test split is the wrong pattern: it only produced matching columns because
# `classes` is given explicitly, and would silently misalign them otherwise.
y_test = mlb.transform(y_test)

# Classification Models

In [36]:
# logistic regression wrapped in a one-vs-rest strategy for the multilabel target
clf = OneVsRestClassifier(
    LogisticRegression(solver="lbfgs", max_iter=170)
)

In [37]:
# Train the one-vs-rest model on the training document-term matrix and the binarized training labels.
clf.fit(X_train_dtm, y_train)

  "Label %s is present in all training examples." % str(classes[c])


OneVsRestClassifier(estimator=LogisticRegression(max_iter=170))

In [38]:
# Predict the label indicator matrix for the held-out test documents.
y_pred_class = clf.predict(X_test_dtm)

In [43]:
# accuracy_score compares whole label rows; the remaining scores are micro-averaged
print('Accuracy: ', metrics.accuracy_score(y_test, y_pred_class))
for metric_label, metric_fn in (
    ('F1 Score: ', metrics.f1_score),
    ('Precision: ', metrics.precision_score),
    ('Recall: ', metrics.recall_score),
):
    print(metric_label, metric_fn(y_test, y_pred_class, average='micro'))

Accuracy:  0.3256
F1 Score:  0.6470049249799564
Precision:  0.7570356472795498
Recall:  0.5649


## Actual vs Predicted Classes

In [56]:
# actuals: rows 250-254 of the binarized test labels, one column per class in mlb.classes_
pd.DataFrame(y_test[250:255], columns = list(mlb.classes_) )

Unnamed: 0,13,14,15,16,17,23,24,25,26,27,...,Scorpio,Sports-Recreation,Student,Taurus,Technology,Telecommunications,Virgo,female,indUnk,male
0,0,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [57]:
# predicted: the same rows 250-254 taken from the model's predictions, one column per class in mlb.classes_
pd.DataFrame(y_pred_class[250:255], columns = list(mlb.classes_) )

Unnamed: 0,13,14,15,16,17,23,24,25,26,27,...,Scorpio,Sports-Recreation,Student,Taurus,Technology,Telecommunications,Virgo,female,indUnk,male
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,1,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
