In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from warnings import filterwarnings
filterwarnings('ignore')

In [2]:
df=pd.read_csv('blogtext.csv')
df.head()

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,..."
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...


In [3]:
# shape
df.shape

(681284, 7)

In [4]:
# dtypes
df.dtypes

id         int64
gender    object
age        int64
topic     object
sign      object
date      object
text      object
dtype: object

In [5]:
# null values
df.isnull().sum()

id        0
gender    0
age       0
topic     0
sign      0
date      0
text      0
dtype: int64

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 681284 entries, 0 to 681283
Data columns (total 7 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   id      681284 non-null  int64 
 1   gender  681284 non-null  object
 2   age     681284 non-null  int64 
 3   topic   681284 non-null  object
 4   sign    681284 non-null  object
 5   date    681284 non-null  object
 6   text    681284 non-null  object
dtypes: int64(2), object(5)
memory usage: 36.4+ MB


In [7]:
df.describe()

Unnamed: 0,id,age
count,681284.0,681284.0
mean,2397802.0,23.932326
std,1247723.0,7.786009
min,5114.0,13.0
25%,1239610.0,17.0
50%,2607577.0,24.0
75%,3525660.0,26.0
max,4337650.0,48.0


In [8]:
df[df.duplicated()]

Unnamed: 0,id,gender,age,topic,sign,date,text
2310,589736,male,35,Technology,Aries,"05,August,2004",hey guys - i had the flu today - th...
3469,589736,male,35,Technology,Aries,"05,August,2004",
3578,589736,male,35,Technology,Aries,"05,August,2004",
3626,589736,male,35,Technology,Aries,"05,August,2004",
3627,589736,male,35,Technology,Aries,"05,August,2004",
...,...,...,...,...,...,...,...
679550,3446325,male,24,Advertising,Gemini,"25,May,2004","Ok, this is very cool. AISO GrepLaw, De..."
679551,3446325,male,24,Advertising,Gemini,"25,May,2004","Ok, this is very cool. AISO GrepLaw, De..."
679552,3446325,male,24,Advertising,Gemini,"25,May,2004","Ok, this is very cool. AISO GrepLaw, De..."
679553,3446325,male,24,Advertising,Gemini,"25,May,2004","Ok, this is very cool. AISO GrepLaw, De..."


In [9]:
# removing the duplicates
df.drop_duplicates(inplace=True)

In [10]:
df.shape

(676598, 7)

### 1. Load the dataset (5 points)
##### a. Tip: As the dataset is large, use fewer rows. Check what is working well on your machine and decide accordingly.


In [11]:
df1=df.sample(frac=0.20,random_state=42)
df1.shape

(135320, 7)

In [12]:
pd.set_option('display.max_rows',None)
df1.head()

Unnamed: 0,id,gender,age,topic,sign,date,text
255678,1772041,male,14,Student,Sagittarius,"04,August,2004",ahhh hayo.. i dunt believe we are...
606383,3353160,female,14,Arts,Leo,"26,June,2004",For the first time since the end of sch...
40902,3798944,male,23,Banking,Capricorn,"01,July,2004",Wow... today has been HECTIC to say...
626648,1944324,male,16,Student,Taurus,"08,November,2003",Celebrating 10 posts of Jonah! (Hosted ...
71623,479019,male,24,Student,Gemini,"02,July,2004",Man... I've been having some majo...


### 2. Preprocess rows of the “text” column (7.5 points)

### a. Remove unwanted characters

In [13]:
df1['new_text']=df1.text.str.replace('[^a-zA-Z0-9]+',' ')

### b. Convert text to lowercase

In [14]:
df1['new_text']=df1.new_text.str.lower()

### c. Remove unwanted spaces

In [15]:
df1['new_text']=list(map(lambda x:x.strip(),df1['new_text']))

In [16]:
df1.head()

Unnamed: 0,id,gender,age,topic,sign,date,text,new_text
255678,1772041,male,14,Student,Sagittarius,"04,August,2004",ahhh hayo.. i dunt believe we are...,ahhh hayo i dunt believe we are in yr 11 caraz...
606383,3353160,female,14,Arts,Leo,"26,June,2004",For the first time since the end of sch...,for the first time since the end of school i m...
40902,3798944,male,23,Banking,Capricorn,"01,July,2004",Wow... today has been HECTIC to say...,wow today has been hectic to say the least we ...
626648,1944324,male,16,Student,Taurus,"08,November,2003",Celebrating 10 posts of Jonah! (Hosted ...,celebrating 10 posts of jonah hosted by lauren...
71623,479019,male,24,Student,Gemini,"02,July,2004",Man... I've been having some majo...,man i ve been having some majorly trippy dream...


### d. Remove stopwords

In [17]:
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords
stop_words=set(stopwords.words("english"))
df1['new_text']=df1['new_text'].apply(lambda x: ' '.join([i for i in x.split() if i not in stop_words]))

[nltk_data] Downloading package stopwords to C:\Users\akash
[nltk_data]     inc\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [18]:
print(df1.text.iloc[1],df1.new_text.iloc[1],sep='\n')

       For the first time since the end of school I met up with Brandon and Sean to PK at Vincent Elementary, and DAMN WAS IT AWESOME!!  I got there about 20 minutes before everyone else and pk'd some, but really not a lot seeing as I feel tired.  AANYWAY.  I drilled monkey vaults and attempted some kongs, which I still freak out on and dont put my legs through my arms.  Oh well.  Um, we talked a lot and ate food some.  Then when Brandon got there we started going around the playground a little bit.  Much to my surprise, I found that rolling on woodchips is not comfortable after a 10 foot drop, they feel like little rocks digging into your spine.  So I went back to vaulting this little tube thing, that was fun. Then Sean came and we practiced wall-runs, stretched, and did some flow activities.  I need to relax for a little bit, but not moving for a day is rough.  I'm addicted!!!    
first time since end school met brandon sean pk vincent elementary damn awesome got 20 minutes everyone 

## 3. As we want to make this into a multi-label classification problem, you are required to merge all the label columns together, so that we have all the labels together for a particular sentence 



### a. Label columns to merge: “gender”, “age”, “topic”, “sign”

In [19]:
df1['Label']=df1.apply(lambda x:[x['gender'],str(x['age']),x['topic'],x['sign']],axis=1)
df1.head()

Unnamed: 0,id,gender,age,topic,sign,date,text,new_text,Label
255678,1772041,male,14,Student,Sagittarius,"04,August,2004",ahhh hayo.. i dunt believe we are...,ahhh hayo dunt believe yr 11 carazy yes well w...,"[male, 14, Student, Sagittarius]"
606383,3353160,female,14,Arts,Leo,"26,June,2004",For the first time since the end of sch...,first time since end school met brandon sean p...,"[female, 14, Arts, Leo]"
40902,3798944,male,23,Banking,Capricorn,"01,July,2004",Wow... today has been HECTIC to say...,wow today hectic say least busy year hope sign...,"[male, 23, Banking, Capricorn]"
626648,1944324,male,16,Student,Taurus,"08,November,2003",Celebrating 10 posts of Jonah! (Hosted ...,celebrating 10 posts jonah hosted laurence fis...,"[male, 16, Student, Taurus]"
71623,479019,male,24,Student,Gemini,"02,July,2004",Man... I've been having some majo...,man majorly trippy dreams past week two last n...,"[male, 24, Student, Gemini]"


### b. After completing the previous step, there should be only two columns in your data frame i.e. “text” and “labels” as shown in the below image

In [20]:
df2=df1[['new_text','Label']]
df2.head()

Unnamed: 0,new_text,Label
255678,ahhh hayo dunt believe yr 11 carazy yes well w...,"[male, 14, Student, Sagittarius]"
606383,first time since end school met brandon sean p...,"[female, 14, Arts, Leo]"
40902,wow today hectic say least busy year hope sign...,"[male, 23, Banking, Capricorn]"
626648,celebrating 10 posts jonah hosted laurence fis...,"[male, 16, Student, Taurus]"
71623,man majorly trippy dreams past week two last n...,"[male, 24, Student, Gemini]"


## 4. Separate features and labels, and split the data into training and testing (5 points)


In [21]:
from sklearn.model_selection import train_test_split
x=df2['new_text']
y=df2['Label']
xtrain,xtest,ytrain,ytest=train_test_split(x,y,test_size=0.30,random_state=0)

## 5. Vectorize the features (5 points)


### a. Create a Bag of Words using count vectorizer

### i. Use ngram_range=(1, 2)

In [22]:
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(ngram_range=(1,2),binary=True)

### ii. Vectorize training and testing features


In [23]:
xtrain=cv.fit_transform(xtrain)
xtest=cv.transform(xtest)

### b. Print the term-document matrix

In [24]:
cv.get_feature_names()

['00',
 '00 00',
 '00 000',
 '00 00pm',
 '00 01',
 '00 02',
 '00 03',
 '00 04',
 '00 05',
 '00 06',
 '00 08',
 '00 09',
 '00 10',
 '00 10pm',
 '00 11',
 '00 12',
 '00 13',
 '00 14',
 '00 15',
 '00 16',
 '00 17',
 '00 18th',
 '00 19',
 '00 1900',
 '00 20',
 '00 2004',
 '00 2010',
 '00 21',
 '00 22',
 '00 23',
 '00 24',
 '00 25',
 '00 26',
 '00 27',
 '00 28',
 '00 29',
 '00 2nd',
 '00 30',
 '00 300m',
 '00 30am',
 '00 30ish',
 '00 31',
 '00 31st',
 '00 32',
 '00 33',
 '00 34',
 '00 35',
 '00 36',
 '00 40',
 '00 400',
 '00 41',
 '00 42',
 '00 44',
 '00 445',
 '00 45',
 '00 45pm',
 '00 46',
 '00 47',
 '00 48',
 '00 49',
 '00 50',
 '00 52',
 '00 53',
 '00 54',
 '00 55',
 '00 56',
 '00 57',
 '00 59',
 '00 60',
 '00 6pm',
 '00 7lb',
 '00 aaaaaaaaaggghhhhhh',
 '00 able',
 '00 abolishing',
 '00 accident',
 '00 accompanied',
 '00 according',
 '00 account',
 '00 ache',
 '00 ack',
 '00 activation',
 '00 actual',
 '00 actually',
 '00 administrator',
 '00 admission',
 '00 advance',
 '00 afternoon',


### 6. Create a dictionary to get the count of every label i.e. the key will be label name and value will be the total count of the label. Check below image for reference (5 points)


In [25]:
# word=cv.get_feature_names()
labels={}
for i in y.values:
    for j in i:
        if j in labels:
            labels[j]+=1
        else:
            labels[j]=1
print(labels)

{'male': 68339, '14': 5470, 'Student': 30859, 'Sagittarius': 9892, 'female': 66981, 'Arts': 6470, 'Leo': 10663, '23': 14462, 'Banking': 839, 'Capricorn': 9790, '16': 14662, 'Taurus': 12249, '24': 15967, 'Gemini': 10071, '40': 971, 'Accounting': 743, 'Scorpio': 11423, '17': 15940, '27': 9252, 'Law': 1828, 'Aries': 13029, '34': 4277, 'Religion': 1038, 'indUnk': 49607, 'Communications-Media': 4074, '33': 3486, 'Libra': 12302, '15': 8329, 'Pisces': 10665, 'Technology': 8228, 'Cancer': 12936, '13': 2446, '25': 13345, '26': 10997, 'Virgo': 12243, '47': 423, 'Internet': 3166, 'Marketing': 938, 'Education': 5973, 'Museums-Libraries': 627, 'Engineering': 2236, '35': 3384, 'Telecommunications': 760, 'Architecture': 299, 'Non-Profit': 2964, 'Aquarius': 10057, '38': 1454, 'HumanResources': 604, 'Publishing': 1530, 'Chemicals': 757, 'Agriculture': 260, 'Science': 1458, '39': 1074, '46': 503, 'Biotech': 429, '37': 1895, '45': 911, 'Government': 1377, 'BusinessServices': 877, 'Automotive': 248, '41':

## 7. Transform the labels - (7.5 points)
### As we have noticed before, in this task each example can have multiple tags. To deal with such kind of prediction, we need to transform labels in a binary form and the prediction will be a mask of 0s and 1s. For this purpose, it is convenient to use MultiLabelBinarizer from sklearn

### a. Convert your train and test labels using MultiLabelBinarizer


In [26]:
ytrain

197792                           [female, 23, Arts, Cancer]
130032                      [male, 16, indUnk, Sagittarius]
547252                          [female, 37, indUnk, Libra]
513708                    [female, 25, indUnk, Sagittarius]
524120                          [male, 25, Student, Cancer]
116145                                [male, 35, Arts, Leo]
210120                          [male, 16, Student, Pisces]
196019                       [male, 17, Education, Scorpio]
238985                            [male, 34, indUnk, Aries]
296784                          [male, 23, indUnk, Scorpio]
139334                      [female, 15, Arts, Sagittarius]
27425                          [female, 23, Science, Libra]
26082                          [female, 23, Arts, Aquarius]
654871                          [male, 24, indUnk, Scorpio]
184971                [male, 25, Telecommunications, Libra]
195779                       [female, 15, indUnk, Aquarius]
527062                    [male, 35, Tec

In [27]:
ytest

529046                         [female, 16, Student, Aries]
556315                            [male, 15, indUnk, Virgo]
375360               [male, 34, Telecommunications, Taurus]
374872                         [female, 24, Student, Aries]
489972                         [female, 16, indUnk, Cancer]
157380                       [male, 17, Education, Scorpio]
667885                        [female, 16, Student, Gemini]
506693                            [male, 26, indUnk, Aries]
633116                      [female, 25, Non-Profit, Virgo]
196884                         [female, 16, indUnk, Gemini]
599026                           [female, 14, Student, Leo]
377900                        [female, 17, Student, Pisces]
293647                     [male, 13, Student, Sagittarius]
562328                         [female, 17, Student, Libra]
445855                         [female, 23, indUnk, Gemini]
201847                        [female, 26, indUnk, Scorpio]
476573                       [male, 16, 

In [28]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb=MultiLabelBinarizer()
ytrain=mlb.fit_transform(ytrain.values)
ytest=mlb.transform(ytest.values)

In [29]:
ytrain

array([[0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 1, 1],
       [0, 0, 0, ..., 1, 1, 0],
       ...,
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 1, 1, 0]])

In [30]:
ytest

array([[0, 0, 0, ..., 1, 0, 0],
       [0, 0, 1, ..., 0, 1, 1],
       [0, 0, 0, ..., 0, 0, 1],
       ...,
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 1]])

## 8. Choose a classifier - (5 points)
### In this task, we suggest using the One-vs-Rest approach, which is implemented in OneVsRestClassifier class. In this approach k classifiers (= number of tags) are trained. As a basic classifier, use  LogisticRegression. It is one of the simplest methods, but often it performs good enough in text  classification tasks. It might take some time because the number of classifiers to train is large.


### a. Use a linear classifier of your choice, wrap it up in OneVsRestClassifier to train it on every label
### b. As One-vs-Rest approach might not have been discussed in the sessions, we are providing you with the code for that

In [31]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
clf=LogisticRegression(solver='lbfgs')
clf=OneVsRestClassifier(clf)
clf.fit(xtrain,ytrain)

OneVsRestClassifier(estimator=LogisticRegression())

## 9. Fit the classifier, make predictions and get the accuracy (5 points)


### a. Print the following
### i. Accuracy score
### ii. F1 score


In [32]:
from sklearn.metrics import accuracy_score,f1_score,average_precision_score,recall_score,classification_report
ypred=clf.predict(xtest)
accuracy_score(ytest,ypred)

0.013769829539856143

In [34]:
from sklearn.metrics import accuracy_score,f1_score,average_precision_score,recall_score,classification_report
y_pred=clf.predict(xtest)
print('Accuracy score: ', accuracy_score(ytest, y_pred))
print('F1 score: ', f1_score(ytest, y_pred,average='micro'))
print('Average precision score: ', average_precision_score(ytest, y_pred,average='micro'))
print('Average recall score: ', recall_score(ytest, y_pred,average='micro'))

Accuracy score:  0.013769829539856143
F1 score:  0.341281116807709
Average precision score:  0.1814301035915732
Average recall score:  0.23817001675041877


In [37]:
print(classification_report(ytest,y_pred))

              precision    recall  f1-score   support

           0       0.37      0.02      0.05       683
           1       0.56      0.10      0.17      1676
           2       0.42      0.07      0.12      2516
           3       0.53      0.14      0.22      4372
           4       0.52      0.14      0.22      4706
           5       0.40      0.05      0.09      4359
           6       0.43      0.06      0.10      4754
           7       0.40      0.05      0.09      4016
           8       0.43      0.04      0.07      3312
           9       0.37      0.04      0.07      2773
          10       0.47      0.02      0.03      1060
          11       0.74      0.10      0.17      1274
          12       0.39      0.03      0.06      1033
          13       0.63      0.05      0.09       857
          14       0.73      0.05      0.10       599
          15       0.56      0.01      0.02       467
          16       0.50      0.01      0.02       360
          17       0.79    

## 10. Print true label and predicted label for any five examples (7.5 points)


In [39]:
y_test_inversed = mlb.inverse_transform(ytest)
y_pred_inversed = mlb.inverse_transform(y_pred)

In [40]:
for i in range(0,5):
    print('Text:\t{}\nActual Labels:\t{}\nPredicted labels:\t{}\n\n'.format(
        xtest[i],
        ','.join(y_test_inversed[i]),
        ','.join(y_pred_inversed[i])
    ))

Text:	  (0, 1094457)	1
  (0, 2307355)	1
Actual Labels:	16,Aries,Student,female
Predicted labels:	male


Text:	  (0, 23620)	1
  (0, 48715)	1
  (0, 71767)	1
  (0, 290007)	1
  (0, 290736)	1
  (0, 295042)	1
  (0, 295281)	1
  (0, 326218)	1
  (0, 399072)	1
  (0, 400709)	1
  (0, 410144)	1
  (0, 410892)	1
  (0, 534505)	1
  (0, 883070)	1
  (0, 883198)	1
  (0, 898825)	1
  (0, 930350)	1
  (0, 931110)	1
  (0, 931227)	1
  (0, 995301)	1
  (0, 997923)	1
  (0, 1141361)	1
  (0, 1311583)	1
  (0, 1312845)	1
  (0, 1809049)	1
  :	:
  (0, 2287410)	1
  (0, 2303800)	1
  (0, 2303802)	1
  (0, 2350417)	1
  (0, 2481848)	1
  (0, 2482062)	1
  (0, 2688261)	1
  (0, 2710143)	1
  (0, 2713905)	1
  (0, 2736829)	1
  (0, 2737495)	1
  (0, 2856999)	1
  (0, 2857008)	1
  (0, 2857168)	1
  (0, 3489095)	1
  (0, 3489317)	1
  (0, 3507055)	1
  (0, 3835204)	1
  (0, 3891443)	1
  (0, 3895543)	1
  (0, 3939054)	1
  (0, 3939247)	1
  (0, 4734231)	1
  (0, 4734281)	1
  (0, 4761935)	1
Actual Labels:	15,Virgo,indUnk,male
Predicted labels:	male