In [1]:
import pandas as pd
import numpy as np
import spacy
import nltk
import re

from nltk.tokenize.toktok import ToktokTokenizer

import warnings
warnings.filterwarnings('ignore')

In [2]:
df=pd.read_csv('blogtext.csv')


In [3]:
df.shape

(681284, 7)

In [4]:
df.head()

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,..."
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...


In [5]:
dfs=df.sample(n=int(len(df)/200),random_state=10)

In [6]:
dfs.shape

(3406, 7)

In [7]:
#pip install spacy && python -m spacy download en

In [8]:
nlp = spacy.load('en_core_web_sm')

In [9]:
#Defining a function to remove special characters.

In [10]:
def remove_special_characters(text, remove_digits=False):
    pattern = '!"#$%&\'()*+,-./:;<=>?@[\]^_`{|}~' if not remove_digits else r'[^a-zA-z\s]'
    text = re.sub(pattern, '', text)
    return text

remove_special_characters("Important text !! 123#@!", remove_digits=True)

'Important text  '

In [11]:
#Downloading list of updated stopwords in english

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Shakti\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [12]:
#Defining a function to remove stopwords
tokenizer = ToktokTokenizer()
stopwords = nltk.corpus.stopwords.words('english')
stopwords.remove('no')
stopwords.remove('not')



In [13]:
def remove_stopwords(text, is_lower_case=False):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopwords]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopwords]
    filtered_text = ' '.join(filtered_tokens)    
    return filtered_text

remove_stopwords("Am i not allowed in ?")

'not allowed ?'

In [14]:
#Defining a function for lemmatisation

def lemmatize_text(text):
    text = nlp(text)
    text = ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in text])
    return text

In [15]:
#Defining a whole function to perform all above steps

def normalize_corpus(corpus,  text_lower_case=True, 
                     text_lemmatization=True, special_char_removal=True, 
                     stopword_removal=True, remove_digits=True):
    
    normalized_corpus = []
    
    for doc in corpus:
        # strip HTML
       
        # Bringing text to lower case   
        if text_lower_case:
            doc = doc.lower()
        # removing extra newlines
        doc = re.sub(r'[\r|\n|\r\n]+', ' ',doc)
        # lemmatize text
        if text_lemmatization:
            doc = lemmatize_text(doc)
        # remove special characters and\or digits    
        if special_char_removal:
            # insert spaces between special characters to isolate them    
            special_char_pattern = re.compile(r'([{.(-)!}])')
            doc = special_char_pattern.sub(" \\1 ", doc)
            doc = remove_special_characters(doc, remove_digits=remove_digits)  
        # remove extra whitespace
        doc = re.sub(' +', ' ', doc)
        
        # remove stopwords
        if stopword_removal:
            doc = remove_stopwords(doc, is_lower_case=text_lower_case)
            
        normalized_corpus.append(doc)
        
    return normalized_corpus

In [16]:
dfs.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3406 entries, 262626 to 590789
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      3406 non-null   int64 
 1   gender  3406 non-null   object
 2   age     3406 non-null   int64 
 3   topic   3406 non-null   object
 4   sign    3406 non-null   object
 5   date    3406 non-null   object
 6   text    3406 non-null   object
dtypes: int64(2), object(5)
memory usage: 212.9+ KB


In [17]:
dfs['labels'] = dfs[dfs.columns[1:5]].apply(lambda x: ','.join(x.dropna().astype(str)),axis=1)

In [18]:
dfs.head()

Unnamed: 0,id,gender,age,topic,sign,date,text,labels
262626,3637189,female,27,Consulting,Aquarius,"14,June,2004",All of this rehashing of the ex-b...,"female,27,Consulting,Aquarius"
673780,3389862,female,36,indUnk,Pisces,"01,June,2004",angelemma2000@hotmail.com (Emma's...,"female,36,indUnk,Pisces"
322119,3295631,female,15,Education,Aquarius,"10,May,2004",On hiatus. Some far off state. A...,"female,15,Education,Aquarius"
52408,2153234,male,27,Religion,Pisces,"23,December,2003",i have been away from internet access for a...,"male,27,Religion,Pisces"
313202,2575612,male,17,Student,Gemini,"03,June,2004",why is my title wayword words? (d...,"male,17,Student,Gemini"


In [19]:
dfs['labels']=dfs.labels.apply(lambda x:x.split(','))

In [20]:
dfs.labels

262626         [female, 27, Consulting, Aquarius]
673780               [female, 36, indUnk, Pisces]
322119          [female, 15, Education, Aquarius]
52408                [male, 27, Religion, Pisces]
313202                [male, 17, Student, Gemini]
                           ...                   
592093               [female, 26, indUnk, Gemini]
19928     [female, 17, BusinessServices, Scorpio]
436066              [female, 16, Student, Cancer]
225558                [female, 17, indUnk, Virgo]
590789                 [female, 25, Arts, Pisces]
Name: labels, Length: 3406, dtype: object

In [21]:
dfs.drop(dfs.columns[1:5],axis=1,inplace=True)

dfs.head()

Unnamed: 0,id,date,text,labels
262626,3637189,"14,June,2004",All of this rehashing of the ex-b...,"[female, 27, Consulting, Aquarius]"
673780,3389862,"01,June,2004",angelemma2000@hotmail.com (Emma's...,"[female, 36, indUnk, Pisces]"
322119,3295631,"10,May,2004",On hiatus. Some far off state. A...,"[female, 15, Education, Aquarius]"
52408,2153234,"23,December,2003",i have been away from internet access for a...,"[male, 27, Religion, Pisces]"
313202,2575612,"03,June,2004",why is my title wayword words? (d...,"[male, 17, Student, Gemini]"


In [22]:
dfs.drop(dfs.columns[:2],1,inplace=True)

dfs.reset_index(drop=True,inplace=True)

dfs.head()

Unnamed: 0,text,labels
0,All of this rehashing of the ex-b...,"[female, 27, Consulting, Aquarius]"
1,angelemma2000@hotmail.com (Emma's...,"[female, 36, indUnk, Pisces]"
2,On hiatus. Some far off state. A...,"[female, 15, Education, Aquarius]"
3,i have been away from internet access for a...,"[male, 27, Religion, Pisces]"
4,why is my title wayword words? (d...,"[male, 17, Student, Gemini]"


In [23]:
dfs['cleaned_text']=normalize_corpus(dfs['text'])

dfs.head()

Unnamed: 0,text,labels,cleaned_text
0,All of this rehashing of the ex-b...,"[female, 27, Consulting, Aquarius]",rehashing ex boyfriend year open psychological...
1,angelemma2000@hotmail.com (Emma's...,"[female, 36, indUnk, Pisces]",angelemmahotmail com emma e mail address ill_s...
2,On hiatus. Some far off state. A...,"[female, 15, Education, Aquarius]",hiatus far state also know illinois family reu...
3,i have been away from internet access for a...,"[male, 27, Religion, Pisces]",I away internet access judge comment either no...
4,why is my title wayword words? (d...,"[male, 17, Student, Gemini]",title wayword word deep thought day well tonig...


In [24]:
dfs.drop('text',1,inplace=True)


In [25]:
dfs=dfs[['cleaned_text','labels']]
dfs.head()

Unnamed: 0,cleaned_text,labels
0,rehashing ex boyfriend year open psychological...,"[female, 27, Consulting, Aquarius]"
1,angelemmahotmail com emma e mail address ill_s...,"[female, 36, indUnk, Pisces]"
2,hiatus far state also know illinois family reu...,"[female, 15, Education, Aquarius]"
3,I away internet access judge comment either no...,"[male, 27, Religion, Pisces]"
4,title wayword word deep thought day well tonig...,"[male, 17, Student, Gemini]"


In [26]:
dfs.isnull().sum()

cleaned_text    0
labels          0
dtype: int64

In [26]:
X=dfs['cleaned_text']
y=dfs['labels']

In [27]:
from sklearn.model_selection import train_test_split

xtrain,xtest,ytrain,ytest=train_test_split(X,y,random_state=10,test_size=0.3)

In [55]:
xtrain.isnull().sum()

0

In [28]:
from sklearn.feature_extraction.text import CountVectorizer

CV=CountVectorizer(ngram_range=(1,2),min_df=2)

In [29]:
CV.fit(xtrain)

CountVectorizer(min_df=2, ngram_range=(1, 2))

In [30]:
data=CV.transform(xtrain)

In [32]:
data.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [31]:
DTM_train=pd.DataFrame(data.toarray(),columns=CV.get_feature_names())
DTM_train.head()

#Document term matrix

Unnamed: 0,__,aa,aa meeting,aahh,aahh sigh,aargh,aaron,aaron call,ab,abandon,...,zillion,zion,zip,zit,zodiac,zoe,zombie,zone,zoo,zoom
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [32]:
data=CV.transform(xtest)
DTM_test=pd.DataFrame(data.toarray(),columns=CV.get_feature_names())
DTM_test.head()

Unnamed: 0,__,aa,aa meeting,aahh,aahh sigh,aargh,aaron,aaron call,ab,abandon,...,zillion,zion,zip,zit,zodiac,zoe,zombie,zone,zoo,zoom
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [62]:
y.head()

0    [female, 27, Consulting, Aquarius]
1          [female, 36, indUnk, Pisces]
2     [female, 15, Education, Aquarius]
3          [male, 27, Religion, Pisces]
4           [male, 17, Student, Gemini]
Name: labels, dtype: object

In [33]:
ed=dict()
for i in y:
    
    for k in i:
        if k not in ed.keys():
            ed[k]=1
        else:
            ed[k]=ed[k]+1


ed

{'female': 1666,
 '27': 238,
 'Consulting': 39,
 'Aquarius': 265,
 '36': 68,
 'indUnk': 1276,
 'Pisces': 251,
 '15': 208,
 'Education': 145,
 'male': 1740,
 'Religion': 29,
 '17': 389,
 'Student': 762,
 'Gemini': 249,
 'Aries': 327,
 '24': 421,
 'Scorpio': 260,
 'Virgo': 279,
 '41': 17,
 'Leo': 317,
 '14': 139,
 'Cancer': 367,
 '33': 91,
 'Capricorn': 250,
 'Arts': 157,
 'Fashion': 26,
 '16': 351,
 'Technology': 209,
 'Libra': 288,
 '34': 107,
 '25': 314,
 'Communications-Media': 119,
 'Biotech': 13,
 'Taurus': 309,
 '38': 37,
 'Law': 33,
 'Sagittarius': 244,
 'Maritime': 3,
 '23': 322,
 '26': 302,
 '44': 20,
 'LawEnforcement-Security': 13,
 '13': 82,
 '43': 23,
 'Non-Profit': 66,
 'Internet': 69,
 'Military': 16,
 'Accounting': 27,
 'Engineering': 52,
 '47': 13,
 'BusinessServices': 30,
 '35': 79,
 'Telecommunications': 22,
 'Chemicals': 24,
 '39': 27,
 '46': 26,
 'Transportation': 15,
 'Publishing': 44,
 'Government': 30,
 '45': 15,
 'Banking': 17,
 'Environment': 5,
 'InvestmentBank

In [64]:
len(ed.keys())  #Total subcategories present in the labels.

80

In [65]:
y.head()

0    [female, 27, Consulting, Aquarius]
1          [female, 36, indUnk, Pisces]
2     [female, 15, Education, Aquarius]
3          [male, 27, Religion, Pisces]
4           [male, 17, Student, Gemini]
Name: labels, dtype: object

In [34]:
from sklearn.preprocessing import MultiLabelBinarizer

MB=MultiLabelBinarizer()

In [35]:
MB.fit(ytrain)
ytrain_en=MB.transform(ytrain)

ytest_en=MB.transform(ytest)

In [36]:
ytrain_en=pd.DataFrame(ytrain_en,columns=MB.classes_)
ytest_en=pd.DataFrame(ytest_en,columns=MB.classes_)

In [69]:
ytrain_en.head()

Unnamed: 0,13,14,15,16,17,23,24,25,26,27,...,Student,Taurus,Technology,Telecommunications,Tourism,Transportation,Virgo,female,indUnk,male
0,0,0,0,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
2,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0


In [37]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

clf=LogisticRegression(solver='lbfgs')
clf=OneVsRestClassifier(clf)

In [38]:
clf.fit(DTM_train,ytrain_en)

OneVsRestClassifier(estimator=LogisticRegression())

In [39]:
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix,f1_score,recall_score,average_precision_score

In [40]:
y_pred=clf.predict(DTM_test)

In [93]:
#y_pred=pd.DataFrame(y_pred,columns=MB.classes_)
#y_pred.head()

Unnamed: 0,13,14,15,16,17,23,24,25,26,27,...,Student,Taurus,Technology,Telecommunications,Tourism,Transportation,Virgo,female,indUnk,male
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
1,0,0,0,0,0,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [104]:
#y_pred_or=MB.inverse_transform(y_pred)
#y_pred_or

In [41]:
report=classification_report(ytest_en,y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        26
           1       0.00      0.00      0.00        46
           2       0.00      0.00      0.00        75
           3       0.17      0.04      0.07        98
           4       0.19      0.07      0.11       107
           5       0.07      0.01      0.02        93
           6       0.13      0.03      0.05       116
           7       0.08      0.01      0.02       101
           8       0.06      0.01      0.02        87
           9       0.00      0.00      0.00        86
          10       0.00      0.00      0.00        35
          11       0.00      0.00      0.00        30
          12       0.00      0.00      0.00        24
          13       0.00      0.00      0.00        22
          14       0.00      0.00      0.00        13
          15       0.00      0.00      0.00        15
          16       0.00      0.00      0.00         5
          17       0.00    

In [42]:
#Defining a function to print micro measures

def micro(Ytest, Ypred):
    print('Accuracy: ', accuracy_score(Ytest, Ypred))
    print('F1 score: Micro', f1_score(Ytest, Ypred, average='micro'))
    print('Average precision score: Micro', average_precision_score(Ytest, Ypred, average='micro'))
    print('Average recall score: Micro', recall_score(Ytest, Ypred, average='micro'))

In [43]:
#Defining a function to print macro measures

def macro(Ytest, Ypred):
    print('Accuracy: ', accuracy_score(Ytest, Ypred))
    print('F1 score: Macro', f1_score(Ytest, Ypred, average='macro'))
    print('Average recall score: Macro', recall_score(Ytest, Ypred, average='macro'))
    

In [44]:
# Definign a function for weigted measures

def weighted(Ytest, Ypred):
    print('Accuracy: ', accuracy_score(Ytest, Ypred))
    print('F1 score: weighted', f1_score(Ytest, Ypred, average='weighted'))
    print('Average precision score: weighted', average_precision_score(Ytest, Ypred, average='weighted'))
    print('Average recall score: weighted', recall_score(Ytest, Ypred, average='weighted'))

In [45]:
#Printing measures

micro(ytest_en,y_pred)

Accuracy:  0.0
F1 score: Micro 0.2553629655054059
Average precision score: Micro 0.11876396111313045
Average recall score: Micro 0.18199608610567514


In [46]:
macro(ytest_en,y_pred)

Accuracy:  0.0
F1 score: Macro 0.02669380987631249
Average recall score: Macro 0.023024592221536477


In [47]:
weighted(ytest_en,y_pred)

Accuracy:  0.0
F1 score: weighted 0.19356212509865667
Average precision score: weighted 0.22906192136432413
Average recall score: weighted 0.18199608610567514


In [48]:


import random 

def print5(y_pred,ytest_en):
    ran= []
    for i in range(5):
        ran.append(random.randint(0, len(ytest_en)))
    print('Random numbers selected are given below')
    print(ran)
    print()
                 
    for k in ran:
        print(MB.inverse_transform(y_pred)[k])
        print(MB.inverse_transform(np.array(ytest_en))[k])
        print()
        print()

In [125]:
ytest_en.head()

Unnamed: 0,13,14,15,16,17,23,24,25,26,27,...,Student,Taurus,Technology,Telecommunications,Tourism,Transportation,Virgo,female,indUnk,male
0,0,0,0,0,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
3,0,0,1,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0
4,1,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,1,1,0,0


In [49]:
print5(y_pred,ytest_en)

Random numbers selected are given below
[737, 890, 836, 898, 782]

('24', 'indUnk', 'male')
('27', 'Cancer', 'indUnk', 'male')


('Student', 'Virgo', 'female')
('27', 'Aries', 'female', 'indUnk')


('Cancer', 'Student', 'male')
('25', 'Gemini', 'indUnk', 'male')


('Student', 'female')
('16', 'Technology', 'Virgo', 'male')


('indUnk', 'male')
('25', 'Sagittarius', 'female', 'indUnk')


