In [193]:
import zipfile
import numpy as np
import pandas as pd
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns',40)
pd.set_option('display.max_rows',150)

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

### 1. Load the dataset. As the dataset is large, use fewer rows.

In [194]:
zip_file_path = 'C:/Users/HP\OneDrive/Desktop/Data Science/NLP/blogtext.zip'
csv_file_name = 'blogtext.csv'

with zipfile.ZipFile(zip_file_path, 'r') as zip_file:
    with zip_file.open(csv_file_name) as csv_file:
        df1 = pd.read_csv(csv_file, delimiter=',')

In [195]:
df2 = df1.copy()

In [196]:
df2.head()

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,..."
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...


In [197]:
df2.shape

(681284, 7)

In [198]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 681284 entries, 0 to 681283
Data columns (total 7 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   id      681284 non-null  int64 
 1   gender  681284 non-null  object
 2   age     681284 non-null  int64 
 3   topic   681284 non-null  object
 4   sign    681284 non-null  object
 5   date    681284 non-null  object
 6   text    681284 non-null  object
dtypes: int64(2), object(5)
memory usage: 36.4+ MB


In [199]:
df = df2.head(3000)

---

### 2. Preprocess rows of the “text” column
#### a. Remove unwanted characters
#### b. Convert text to lowercase
#### c. Remove unwanted spaces
#### d. Remove stopwords

In [200]:
# Example of removing non-alphabetic characters and numbers from a string
import re
s = 'abc123!#@'
output = re.sub(r'[^A-Za-z]+', '', s)
print(output)

abc


In [201]:
# Stopwords list 
from nltk.corpus import stopwords
stopwords.words('english')[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [202]:
df['text'] = df['text'].apply(lambda x: re.sub(r'[^A-Za-z]+', ' ', str(x)))

In [203]:
df['text'] = df['text'].apply(lambda x: x.lower())

In [204]:
df['text'] = df['text'].apply(lambda x: x.strip())

In [205]:
stopwords_1 = set(stopwords.words('english'))
df['text'] = df['text'].apply(lambda x: ' '.join([i for i in x.split() if i not in stopwords_1]))

In [206]:
# Looking at one record
df['text'][2005]

'ben new radio show occasional tv show yet come name suggestions welcomed please watch us cable channel six december th listen radio show may starting early dec nd find us station tune www richmondfreeradio org woo hoo'

---

### 3. As we want to make this into a multi-label classification problem, you are required to merge all the label columns together, so that we have all the labels together for a particular sentence
#### a. Label columns to merge: “gender”, “age”, “topic”, “sign”
#### b. After completing the previous step, there should be only two columns in your data frame i.e. “text” and “labels” 

In [207]:
df.head(2)

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004",info found pages mb pdf files wait untill team...
1,2059027,male,15,Student,Leo,"13,May,2004",team members drewes van der laag urllink mail ...


In [208]:
df['labels'] = df.apply(lambda row: [row['gender'], str(row['age']), row['topic'], row['sign']], axis=1)

In [209]:
df = df[['text','labels']]

In [210]:
df.head()

Unnamed: 0,text,labels
0,info found pages mb pdf files wait untill team...,"[male, 15, Student, Leo]"
1,team members drewes van der laag urllink mail ...,"[male, 15, Student, Leo]"
2,het kader van kernfusie op aarde maak je eigen...,"[male, 15, Student, Leo]"
3,testing testing,"[male, 15, Student, Leo]"
4,thanks yahoo toolbar capture urls popups means...,"[male, 33, InvestmentBanking, Aquarius]"


---

### 4. Separate features and labels, and split the data into training and testing

In [211]:
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['labels'], test_size=0.2, random_state=42)

---

### 5. Vectorize the features (5 points)
#### a. Create a Bag of Words using count vectorizer
* #####  i. Use ngram_range=(1, 2)
* #####  ii. Vectorize training and testing features
#### b. Print the term-document matrix 

In [212]:
model = CountVectorizer(binary=True, ngram_range=(1,2))
model.fit(df['text'])

In [213]:
model_train_dtm = model.transform(X_train)
model_test_dtm = model.transform(X_test)

In [214]:
# Document-Term Matrix for Training set
pd.DataFrame(model_train_dtm.toarray(), columns=model.get_feature_names_out())

Unnamed: 0,aa,aa anger,aa compared,aa nice,aaa,aaa take,aaa travel,aaaaaah,aaaaack,aaaah,aaaah wisdom,aaaahh,aaagh,aaagh pero,aaah,aaah eat,aaah hafta,aaahhh,aaahhh cryptic,aaarrrggghhhhhhhhgggghhhhhh,...,zorro,zorro traffic,zovakware,zovakware lord,zua,zua watching,zun,zun charles,zun personally,zuo,zuo le,zza,zza dong,zzz,zzzexy,zzzexy pathetic,zzzzz,zzzzz drop,zzzzzzz,zzzzzzzzzzzzz
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2395,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2396,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2397,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2398,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [215]:
# Document-Term Matrix for Test set
pd.DataFrame(model_test_dtm.toarray(), columns=model.get_feature_names_out())

Unnamed: 0,aa,aa anger,aa compared,aa nice,aaa,aaa take,aaa travel,aaaaaah,aaaaack,aaaah,aaaah wisdom,aaaahh,aaagh,aaagh pero,aaah,aaah eat,aaah hafta,aaahhh,aaahhh cryptic,aaarrrggghhhhhhhhgggghhhhhh,...,zorro,zorro traffic,zovakware,zovakware lord,zua,zua watching,zun,zun charles,zun personally,zuo,zuo le,zza,zza dong,zzz,zzzexy,zzzexy pathetic,zzzzz,zzzzz drop,zzzzzzz,zzzzzzzzzzzzz
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
596,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
597,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
598,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


---

### 6. Create a dictionary to get the count of every label i.e. the key will be label name and value will be the total count of the label.

In [216]:
df['labels'].values

array([list(['male', '15', 'Student', 'Leo']),
       list(['male', '15', 'Student', 'Leo']),
       list(['male', '15', 'Student', 'Leo']), ...,
       list(['male', '35', 'Technology', 'Aries']),
       list(['male', '35', 'Technology', 'Aries']),
       list(['male', '35', 'Technology', 'Aries'])], dtype=object)

In [217]:
a = dict()
for i in df['labels'].values:
    for j in i:
        if j in a:
            a[j] += 1
        else:
            a[j] = 1

In [218]:
print(a)

{'male': 2272, '15': 299, 'Student': 403, 'Leo': 55, '33': 94, 'InvestmentBanking': 70, 'Aquarius': 286, 'female': 728, '14': 74, 'indUnk': 452, 'Aries': 1699, '25': 110, 'Capricorn': 77, '17': 147, 'Gemini': 21, '23': 93, 'Non-Profit': 46, 'Cancer': 76, 'Banking': 16, '37': 19, 'Sagittarius': 113, '26': 43, '24': 334, 'Scorpio': 243, '27': 86, 'Education': 118, '45': 14, 'Engineering': 119, 'Libra': 313, 'Science': 33, '34': 6, '41': 14, 'Communications-Media': 14, 'BusinessServices': 21, 'Sports-Recreation': 75, 'Virgo': 39, 'Taurus': 76, 'Arts': 2, 'Pisces': 2, '44': 3, '16': 25, 'Internet': 20, 'Museums-Libraries': 2, 'Accounting': 2, '39': 32, '35': 1607, 'Technology': 1607}


---

### 7. Transform the labels. Convert your train and test labels using MultiLabelBinarizer

In [219]:
mlb = MultiLabelBinarizer()
mlb.fit(df['labels'])

In [220]:
train_binary_matrix = mlb.transform(y_train)
test_binary_matrix = mlb.transform(y_test)

In [221]:
# Training set Binary Labels
pd.DataFrame(train_binary_matrix, columns=mlb.classes_)

Unnamed: 0,14,15,16,17,23,24,25,26,27,33,34,35,37,39,41,44,45,Accounting,Aquarius,Aries,...,Engineering,Gemini,Internet,InvestmentBanking,Leo,Libra,Museums-Libraries,Non-Profit,Pisces,Sagittarius,Science,Scorpio,Sports-Recreation,Student,Taurus,Technology,Virgo,female,indUnk,male
0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1
2,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1
4,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2395,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1
2396,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0
2397,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0
2398,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1


In [222]:
# Test set Binary Labels
pd.DataFrame(test_binary_matrix, columns=mlb.classes_)

Unnamed: 0,14,15,16,17,23,24,25,26,27,33,34,35,37,39,41,44,45,Accounting,Aquarius,Aries,...,Engineering,Gemini,Internet,InvestmentBanking,Leo,Libra,Museums-Libraries,Non-Profit,Pisces,Sagittarius,Science,Scorpio,Sports-Recreation,Student,Taurus,Technology,Virgo,female,indUnk,male
0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1
1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1
3,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
4,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0
596,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1
597,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1
598,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1


---

### 8. Choose a classifier. Use a linear classifier of your choice, wrap it up in OneVsRestClassifier to train it on every label 

##### Limited-memory Broyden–Fletcher–Goldfarb–Shanno (lbfgs): This solver is suitable for optimizing smooth functions. It is based on the BFGS quasi-Newton method and is particularly well-suited for high-dimensional optimization problems. When using logistic regression for multi-class classification, the lbfgs solver can be a good choice.

In [223]:
classifier = OneVsRestClassifier(LogisticRegression(solver='lbfgs'))
classifier.fit(model_train_dtm, train_binary_matrix)

---

### 10. Print true label and predicted label for any five examples

In [224]:
y_pred = classifier.predict(model_test_dtm)
scores_pred = classifier.decision_function(model_test_dtm)

In [225]:
# Predictions of Binary Matrix made by classifier.
pd.DataFrame(y_pred, columns=mlb.classes_)

Unnamed: 0,14,15,16,17,23,24,25,26,27,33,34,35,37,39,41,44,45,Accounting,Aquarius,Aries,...,Engineering,Gemini,Internet,InvestmentBanking,Leo,Libra,Museums-Libraries,Non-Profit,Pisces,Sagittarius,Science,Scorpio,Sports-Recreation,Student,Taurus,Technology,Virgo,female,indUnk,male
0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
596,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1
597,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1
598,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1


In [226]:
# These scores represent the signed distance of the samples to the hyperplane and can be used for tasks like setting custom decision thresholds.
pd.DataFrame(scores_pred, columns=mlb.classes_)

Unnamed: 0,14,15,16,17,23,24,25,26,27,33,34,35,37,39,41,44,45,Accounting,Aquarius,Aries,...,Engineering,Gemini,Internet,InvestmentBanking,Leo,Libra,Museums-Libraries,Non-Profit,Pisces,Sagittarius,Science,Scorpio,Sports-Recreation,Student,Taurus,Technology,Virgo,female,indUnk,male
0,-6.938464,-4.488257,-5.996774,-6.246637,-5.295571,-4.355986,-5.759553,-7.079428,-6.025243,-6.784285,-7.927040,2.727975,-7.202020,-6.767622,-7.918543,-10.222161,-8.448814,-10.652349,-4.518251,2.821318,...,-6.366451,-7.395497,-8.094435,-7.625964,-6.341085,-5.152367,-11.017349,-6.074989,-11.432335,-5.131364,-5.886954,-4.722273,-7.467757,-4.165655,-5.403698,2.727975,-6.801501,-3.287566,-3.810959,3.287566
1,-7.104797,-4.448398,-6.983674,-4.550293,-6.719967,-6.014873,-2.007809,-6.122364,-7.832819,-4.859449,-7.676580,-4.260374,-8.468597,-7.533787,-8.715983,-9.956817,-7.676975,-10.105096,-4.907403,-1.757883,...,-5.966789,-5.652551,-4.532109,-5.919306,-5.417323,-4.444031,-10.840113,-5.733326,-11.288503,-5.411152,-7.610929,-6.268449,-6.052055,-2.720769,-6.068228,-4.260374,-7.435898,-3.208624,-4.095976,3.208624
2,-7.033414,-4.690245,-5.998854,-5.421558,-6.134701,-4.755550,-5.359340,-5.966915,-7.487885,-6.863481,-7.772065,1.385552,-7.381645,-6.781895,-7.787931,-10.283487,-8.083550,-10.531335,-5.417877,1.597099,...,-5.175987,-7.744311,-7.886584,-7.387905,-6.005219,-4.132215,-10.876108,-6.032805,-11.436200,-5.141315,-5.708246,-6.222179,-6.191239,-4.825258,-5.382321,1.385552,-6.756831,-4.954344,-4.647995,4.954344
3,-5.491313,-4.154698,-7.816013,-5.138604,-7.894239,-2.742113,-5.760341,-4.862461,-5.519333,-6.674704,-7.196736,-6.650473,-8.387108,-8.798591,-7.955631,-9.982560,-7.967961,-10.034129,-4.249448,-4.461387,...,-4.379992,-8.436748,-7.576461,-7.369010,-4.654958,-3.014198,-10.510123,-6.546686,-11.242230,-6.047795,-7.030187,-3.202012,-6.290214,-4.109270,-7.323491,-6.650473,-8.872560,-0.528413,-1.097491,0.528413
4,-6.840902,-4.003818,-5.866246,-6.523896,-5.422292,-4.563298,-5.409802,-6.983063,-6.393577,-6.913778,-7.906884,3.254405,-7.386432,-6.695749,-7.818071,-10.296094,-8.358157,-10.623659,-4.550147,3.208781,...,-6.464602,-7.364207,-7.924477,-7.519702,-6.562814,-5.021197,-11.001737,-6.025055,-11.482601,-5.170908,-5.682560,-4.844880,-7.482646,-4.029594,-5.364463,3.254405,-6.725509,-3.734260,-4.140800,3.734260
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595,-6.360662,-4.117309,-5.789368,-4.672899,-6.121131,-3.521432,-5.375322,-6.572227,-6.843469,-6.323146,-7.360770,-5.028916,-7.559835,-7.887388,-8.072238,-9.927120,-7.148016,-10.028638,-5.226132,-3.166525,...,-5.702449,-5.154948,-6.797950,-7.103275,-6.076624,-3.412386,-10.407163,-6.607968,-11.364229,-5.743017,-8.228097,-3.676600,-7.654280,-1.541325,-5.981026,-5.028916,-7.470816,3.131638,-0.236089,-3.131638
596,-6.873467,-5.154014,-6.176018,-7.470231,-5.570023,-4.821128,-6.158583,-7.211191,-7.264368,-7.314064,-8.030531,4.670690,-7.341171,-6.192143,-8.010083,-10.169260,-8.360525,-10.521166,-5.803915,4.375664,...,-6.547131,-7.545394,-8.148392,-7.572716,-6.843032,-5.559054,-11.006824,-6.221273,-11.453692,-5.878333,-6.673976,-4.963751,-8.000294,-5.258346,-6.005594,4.670690,-6.199642,-4.741580,-4.165278,4.741580
597,-5.641978,-6.417390,-7.062441,-5.865324,-7.985322,-3.443526,-4.950483,-6.538937,-6.359798,-6.483762,-8.273407,0.372404,-7.507756,-7.151514,-8.542089,-10.180733,-8.384274,-10.248646,-4.898637,0.574487,...,-4.781807,-7.813594,-8.149164,-6.607438,-6.013790,-4.525916,-10.783914,-5.284949,-11.435928,-8.166543,-6.710746,-4.450439,-6.545425,-5.602165,-6.504138,0.372404,-6.669593,-4.852993,-5.246519,4.852993
598,-6.958587,-3.399160,-5.738243,-6.678495,-5.558719,-4.160710,-5.326965,-7.023919,-6.415540,-7.099910,-7.966170,2.559316,-7.511291,-6.808088,-7.874105,-10.235399,-8.272893,-10.635209,-4.879165,2.969906,...,-6.174189,-7.401816,-7.949990,-7.686191,-6.537750,-4.610523,-11.008210,-6.127734,-11.456974,-5.302834,-5.677227,-4.723334,-7.641317,-3.605049,-5.055711,2.559316,-6.806015,-2.952476,-3.839543,2.952476


#### Getting Inverse Transform for Test labels and Predicted labels

In [227]:
y_test_inverse = mlb.inverse_transform(test_binary_matrix)   # This is actually original y_test only.
y_pred_inverse = mlb.inverse_transform(y_pred)

In [228]:
y_test_inverse

[('35', 'Aries', 'Technology', 'male'),
 ('25', 'Aries', 'Internet', 'male'),
 ('35', 'Aries', 'Technology', 'male'),
 ('26', 'Leo', 'indUnk', 'male'),
 ('35', 'Aries', 'Technology', 'male'),
 ('15', 'Libra', 'Student', 'female'),
 ('35', 'Aries', 'Technology', 'male'),
 ('35', 'Aries', 'Technology', 'male'),
 ('24', 'Scorpio', 'female', 'indUnk'),
 ('35', 'Aries', 'Technology', 'male'),
 ('35', 'Aries', 'Technology', 'male'),
 ('33', 'Aquarius', 'InvestmentBanking', 'male'),
 ('24', 'Engineering', 'Libra', 'male'),
 ('35', 'Aries', 'Technology', 'male'),
 ('35', 'Aries', 'Technology', 'male'),
 ('17', 'Capricorn', 'Sports-Recreation', 'male'),
 ('15', 'Libra', 'Student', 'female'),
 ('24', 'Scorpio', 'female', 'indUnk'),
 ('35', 'Aries', 'Technology', 'male'),
 ('35', 'Aries', 'Technology', 'male'),
 ('33', 'Aquarius', 'InvestmentBanking', 'male'),
 ('24', 'Scorpio', 'female', 'indUnk'),
 ('17', 'Capricorn', 'Sports-Recreation', 'male'),
 ('35', 'Aries', 'Technology', 'male'),
 ('25',

In [229]:
y_pred_inverse

[('35', 'Aries', 'Technology', 'male'),
 ('male',),
 ('35', 'Aries', 'Technology', 'male'),
 ('male',),
 ('35', 'Aries', 'Technology', 'male'),
 ('15', 'Libra', 'Student', 'female'),
 ('35', 'Aries', 'Technology', 'male'),
 ('35', 'Aries', 'Technology', 'male'),
 ('female',),
 ('35', 'Aries', 'Technology', 'male'),
 ('male',),
 ('Aquarius', 'male'),
 ('male',),
 ('35', 'Aries', 'Technology', 'male'),
 ('35', 'Aries', 'Technology', 'male'),
 ('17', 'Student', 'female'),
 ('15', 'Libra', 'Student', 'female'),
 ('15', 'Aquarius', 'Student', 'female'),
 ('35', 'Aries', 'Technology', 'male'),
 ('35', 'Aries', 'Technology', 'male'),
 ('Aquarius', 'male'),
 ('male',),
 ('17', 'Capricorn', 'Sports-Recreation', 'female', 'indUnk'),
 ('Aries', 'male'),
 ('35', 'Aries', 'Technology', 'male'),
 ('35', 'Aries', 'Technology', 'male'),
 ('35', 'Aries', 'Technology', 'male'),
 ('15', 'Libra', 'Student', 'female'),
 ('female',),
 ('15', 'Aquarius', 'Student', 'female'),
 ('male',),
 ('35', 'Aries', 'Te

In [230]:
X_test

1801                   pink already done sure phoenix tho
1190    woohoo tomorrow probably means need clean plac...
1817    actually johnathan called late last night soun...
251     yesterday driving somewhere mongolia crossed m...
2505    inhale ohjesushchristadogwithacrayonsignaround...
                              ...                        
104     actualy im gona carry writin cos feel even shi...
2087    know chris ben said personal experience type t...
599     much epiphany remmberance epiphanies past like...
1756                 ooooh questionnaires fun wanna wanna
1323    okay coming lunch time havent eaten breakfast ...
Name: text, Length: 600, dtype: object

In [231]:
indices = list(range(len(X_test)))
for i in range(5):
    idx = np.random.choice(indices, 1)[0]
    print('Text',i+1,':', X_test.iloc[idx]) 
    print(f"Actual Label: {','.join(map(str, y_test.iloc[idx]))}") 
    print(f"Predicted Label: {','.join(map(str, y_pred_inverse[idx]))}")
    print('-----------------------------------------------------------------------------------------------------------------')

Text 1 : damn angie rock world limber looks mmmhhhmmm
Actual Label: male,35,Technology,Aries
Predicted Label: 35,Aries,Technology,male
-----------------------------------------------------------------------------------------------------------------
Text 2 : okay may noticed quit smoking today another day friends starting moment stop smoking tobacco yeah
Actual Label: male,35,Technology,Aries
Predicted Label: 35,Aries,Technology,male
-----------------------------------------------------------------------------------------------------------------
Text 3 : urllink perfect neighborhood whole deep blue sea bikini bottom rocks nbsp urllink
Actual Label: female,15,Student,Aquarius
Predicted Label: male
-----------------------------------------------------------------------------------------------------------------
Text 4 : tag sit straight look right look section ta daaaaah located tag board congratulations fixing sorta table links tidying look side boxes
Actual Label: female,24,indUnk,Scorpi

---

### 9. Get
* #### i. Accuracy score
* #### ii. F1 score
* #### iii. Average precision score
* #### iv. Average recall score

In [232]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score

print('Accuracy score: ', accuracy_score(test_binary_matrix, y_pred))
print('F1 score: ', f1_score(test_binary_matrix, y_pred, average='micro'))
print('Average precision score: ', average_precision_score(test_binary_matrix, y_pred, average='micro'))
print('Average recall score: ', recall_score(test_binary_matrix, y_pred, average='micro'))

Accuracy score:  0.5233333333333333
F1 score:  0.7215575885526625
Average precision score:  0.5596074546125939
Average recall score:  0.6408333333333334


#### Overall, the classifier is not bad.