In [1]:
# Import files
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.cluster import KMeans
from scipy.stats import zscore
from sklearn import metrics
from sklearn import preprocessing
from sklearn.cluster import AgglomerativeClustering 
from scipy.cluster.hierarchy import cophenet, dendrogram, linkage
from scipy.spatial.distance import pdist
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [2]:
# run this cell to to mount the google drive if you are using google colab
from google.colab import drive
drive.mount('/content/drive')
project_path = '/content/drive/My Drive/assignments/'

Mounted at /content/drive


### Load Dataset

In [3]:
#Loading data
blogs = pd.read_csv(project_path + "blogtext.csv")

In [4]:
blogs.head()

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,..."
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...


In [5]:
blogs.shape

(681284, 7)

There are 681284 rows and 7 columns - id, gender, age, topic, sign, date, text

In [6]:
# Taking 5000 records as more records are causing issues in analysis and the page crashes
blogs = blogs.iloc[:5000]

###  Preprocess rows of the “text” column

In [7]:
# Import the nltk library
import nltk
nltk.download()

NLTK Downloader
---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> d

Download which package (l=list; x=cancel)?
  Identifier> all
    Downloading collection 'all'
       | 
       | Downloading package abc to /root/nltk_data...
       |   Unzipping corpora/abc.zip.
       | Downloading package alpino to /root/nltk_data...
       |   Unzipping corpora/alpino.zip.
       | Downloading package biocreative_ppi to /root/nltk_data...
       |   Unzipping corpora/biocreative_ppi.zip.
       | Downloading package brown to /root/nltk_data...
       |   Unzipping corpora/brown.zip.
       | Downloading package brown_tei to /root/nltk_data...
       |   Unzipping corpora/brown_tei.zip.
       | Downloading package cess_cat to /root/nltk_data...
       |   Unzipping corpora/cess_cat.zip.
       | Downloading package

True

In [8]:
import re # Import regex
from nltk.corpus import stopwords # Import stopwords
from nltk.tokenize import word_tokenize  # Import wordtokenizer

In [9]:
# English stopwords only
stop_words = set(stopwords.words('english'))

In [10]:
# Function to clean the text
def clean_str(text): 
    # 1. Using regex remove unwanted characters
    pattern = r'[^a-zA-z\s]'
    text = re.sub(pattern, '', str(text))
    # 2. Convert to lower text 
    text = text.lower()
    # 3. Tokenize words and then remove stop words
    word_tokens = word_tokenize(text)
    filtered_sentence = [w for w in word_tokens if not w in stop_words] 
    # join the tokens with space as joining point to form the sentence
    finalText = ' '.join(filtered_sentence)
    # return the final text
    return finalText

In [11]:
# Apply the clean text function to the text column and store the same in the clean_text column
blogs['clean_text'] = blogs['text'].apply(clean_str)

In [12]:
# Inspect the formatted dataframe.
blogs.head()

Unnamed: 0,id,gender,age,topic,sign,date,text,clean_text
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,...",info found pages mb pdf files wait untill team...
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...,team members drewes van der laag urllink mail ...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...,het kader van kernfusie op aarde maak je eigen...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!,testing testing
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...,thanks yahoos toolbar capture urls popupswhich...


We can see that the clean_test column contains the cleaned up data

In [13]:
# drop the text column
blogs.drop('text', axis=1, inplace=True)

In [15]:
# Final blogs
blogs.head()

Unnamed: 0,id,gender,age,topic,sign,date,clean_text
0,2059027,male,15,Student,Leo,"14,May,2004",info found pages mb pdf files wait untill team...
1,2059027,male,15,Student,Leo,"13,May,2004",team members drewes van der laag urllink mail ...
2,2059027,male,15,Student,Leo,"12,May,2004",het kader van kernfusie op aarde maak je eigen...
3,2059027,male,15,Student,Leo,"12,May,2004",testing testing
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",thanks yahoos toolbar capture urls popupswhich...


This is the cleaned up data set

### Merge all the label columns together, so that we have all the labels together for a particular sentence

In [67]:
# Loop through all the rows and create the data frame out of the table.
data = []
for index in range(blogs.shape[0]):
  # Build the label for each row comprising of gender, age, topic and sign
  labels_val = [blogs['gender'][index], str(blogs['age'][index]), blogs['topic'][index], blogs['sign'][index]]
  text_val = blogs['clean_text'][index]
  data.append({'labels': labels_val, 'text': text_val})

In [68]:
# Each text abd labels
data

[{'labels': ['male', '15', 'Student', 'Leo'],
  'text': 'info found pages mb pdf files wait untill team leader processed learns html'},
 {'labels': ['male', '15', 'Student', 'Leo'],
  'text': 'team members drewes van der laag urllink mail ruiyu xie urllink mail bryan aaldering urllink mail'},
 {'labels': ['male', '15', 'Student', 'Leo'],
  'text': 'het kader van kernfusie op aarde maak je eigen waterstofbom build hbomb ascotttartarusuwaeduau andrew scott newsgroups rechumor subject build hbomb humorous date feb gmt organization university western australia original file dated th november seemed transcript seven days article poorly formatted corrupted added text examine microscope malleable like gold missing anyone full text please distribute responsible accuracy information converted html dionisioinfinetcom little spellchecking minor edits stolen urllink httpmyohiovoyagernetdionisiofunmownhbombhtml reformatted html validates xhtml strict build hbomb making owning hbomb kind challenge r

In [17]:
# Build the final blogs
final_blogs = pd.DataFrame(data)

In [18]:
# Check the final blogs dataframe
final_blogs.head()

Unnamed: 0,labels,text
0,"[male, 15, Student, Leo]",info found pages mb pdf files wait untill team...
1,"[male, 15, Student, Leo]",team members drewes van der laag urllink mail ...
2,"[male, 15, Student, Leo]",het kader van kernfusie op aarde maak je eigen...
3,"[male, 15, Student, Leo]",testing testing
4,"[male, 33, InvestmentBanking, Aquarius]",thanks yahoos toolbar capture urls popupswhich...


We now have 2 columns lables and the text (feedback). 

### Separate features and labels, and split the data into training and testing

In [19]:
trainX, testX, trainY, testY = train_test_split(final_blogs['text'], final_blogs['labels'], random_state=2)

### Vectorize the features

In [20]:
# import and instantiate CountVectorizer (with the default parameters)
from sklearn.feature_extraction.text import CountVectorizer

Build Document-term Matrix (DTM)

In [21]:
cvect = CountVectorizer(ngram_range=(1,2))

In [22]:
cvect.fit(trainX)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 2), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [23]:
trainX_cv = cvect.transform(trainX)
testX_cv = cvect.transform(testX)

 Print document term matrix

In [73]:
print(trainX_cv)

  (0, 7745)	1
  (0, 7748)	1
  (0, 10134)	1
  (0, 10183)	1
  (0, 10217)	1
  (0, 10221)	1
  (0, 12354)	1
  (0, 12357)	1
  (0, 20891)	1
  (0, 20969)	1
  (0, 29795)	2
  (0, 29832)	1
  (0, 29982)	1
  (0, 40763)	2
  (0, 40769)	1
  (0, 40770)	1
  (0, 41828)	1
  (0, 41862)	1
  (0, 47177)	1
  (0, 55228)	1
  (0, 55229)	1
  (0, 59089)	1
  (0, 59175)	1
  (0, 68285)	2
  (0, 68423)	1
  :	:
  (3749, 99014)	1
  (3749, 99019)	1
  (3749, 99768)	2
  (3749, 99778)	1
  (3749, 99791)	1
  (3749, 107920)	1
  (3749, 108214)	1
  (3749, 119047)	1
  (3749, 119062)	1
  (3749, 131519)	1
  (3749, 131561)	1
  (3749, 136609)	1
  (3749, 136630)	1
  (3749, 188332)	1
  (3749, 188528)	1
  (3749, 213440)	1
  (3749, 213606)	1
  (3749, 217976)	1
  (3749, 218051)	1
  (3749, 222466)	1
  (3749, 222468)	1
  (3749, 224000)	1
  (3749, 224043)	1
  (3749, 225248)	1
  (3749, 225327)	1


In [25]:
#Size of Document Term Matrix
trainX_cv.shape

(3750, 261279)

In [26]:
testX_cv.shape

(1250, 261279)

 ### Create a dictionary to get the count of every label 

In [27]:
dictionary = {}
dictToMatch = {}
for label in final_blogs.labels:
  if str(label[0]) + str(label[1]) + str(label[2]) + str(label[3]) in dictToMatch.keys():
    pass
  else:
    for key in label:
      if key in dictionary.keys():
          dictionary[key] = dictionary[key] + 1
      else: 
          dictionary[key] = 1
    dictToMatch[str(label[0]) + str(label[1]) + str(label[2]) + str(label[3])] = "Y"

In [28]:
len(dictionary.keys())

54

In [29]:
dictionary

{'14': 9,
 '15': 12,
 '16': 6,
 '17': 9,
 '23': 10,
 '24': 7,
 '25': 14,
 '26': 6,
 '27': 2,
 '33': 6,
 '34': 3,
 '35': 2,
 '36': 2,
 '37': 1,
 '39': 2,
 '41': 1,
 '42': 1,
 '44': 1,
 '45': 1,
 '46': 1,
 'Accounting': 1,
 'Aquarius': 11,
 'Aries': 10,
 'Arts': 3,
 'Automotive': 1,
 'Banking': 2,
 'BusinessServices': 5,
 'Cancer': 5,
 'Capricorn': 5,
 'Communications-Media': 2,
 'Consulting': 2,
 'Education': 3,
 'Engineering': 1,
 'Gemini': 8,
 'Internet': 1,
 'InvestmentBanking': 1,
 'Law': 1,
 'Leo': 13,
 'Libra': 10,
 'Museums-Libraries': 1,
 'Non-Profit': 2,
 'Pisces': 3,
 'Religion': 1,
 'Sagittarius': 14,
 'Science': 2,
 'Scorpio': 6,
 'Sports-Recreation': 1,
 'Student': 24,
 'Taurus': 7,
 'Technology': 3,
 'Virgo': 4,
 'female': 57,
 'indUnk': 39,
 'male': 39}

### Transform the labels

In [30]:
from sklearn.preprocessing import MultiLabelBinarizer

In [31]:
mlb = MultiLabelBinarizer()

In [32]:
train_labels_transformed = mlb.fit_transform(trainY)

In [74]:
train_labels_transformed

array([[0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 1, 1, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1]])

As can be seen from above the train labels has been transformed into array of binary matrix.

###  Choose a classifier

In [33]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

In [34]:
clf = LogisticRegression(solver='lbfgs', max_iter=1000)
clf = OneVsRestClassifier(clf)

In [35]:
clf.fit(trainX_cv, train_labels_transformed)

OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=1000,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)

In [36]:
test_labels_transformed = mlb.transform(testY)

In [37]:
test_labels_transformed.shape

(1250, 54)

### Fit the classifier, make predictions and get the accuracy

In [38]:
textX_cv_arr = testX_cv.toarray()

In [39]:
predictions = clf.predict(textX_cv_arr)

In [40]:
predictions

array([[0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 1, 1, 0],
       [0, 0, 0, ..., 0, 0, 1],
       ...,
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 1, 1, 0]])

In [41]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score

Print the accuracy, f1-score, average precisionand recall score

In [91]:
print("Accuracy score", accuracy_score(test_labels_transformed, predictions))

Accuracy score 0.5296


In [90]:
print("f1 score \n", f1_score(test_labels_transformed, predictions, average="micro"))

f1 score 
 0.739183628072517


In [89]:
print("Average precision score \n", average_precision_score(test_labels_transformed, predictions, average="micro"))

Average precision score 
 0.578205957849606


In [88]:
print("Recall score \n", recall_score(test_labels_transformed, predictions, average="micro"))

Recall score 
 0.6646


###  Print true label and predicted label for any five examples

In [57]:
# Getting the random items in  the test data set
import random
randomlist = []
for i in range(0,5):
  n = random.randint(1,testX.size)
  randomlist.append(n)
print(randomlist)

[1097, 410, 797, 208, 530]


In [59]:
# Function to get the Predicted classes for the index from the predictions made
def getPredictedClasses(index):
  label = []
  i = 0
  for val in predictions[index]:
    if val == 1:
      label.append(mlb.classes_[i])
    i = i + 1
  return label

#### Print the actual and predicted labels for the 5 randomly selected values

In [62]:
print("Actual label  ----> ", testY.values[randomlist[0]]) 
print("Predicted label  ----> ", getPredictedClases(randomlist[0]))

Actual label  ---->  ['female', '34', 'indUnk', 'Sagittarius']
Predicted label  ---->  ['34', 'Sagittarius', 'female', 'indUnk']


In [63]:
print("Actual label  ----> ", testY.values[randomlist[1]]) 
print("Predicted label  ----> ", getPredictedClases(randomlist[1]))

Actual label  ---->  ['female', '37', 'indUnk', 'Aquarius']
Predicted label  ---->  ['Aquarius', 'indUnk', 'male']


In [64]:
print("Actual label  ----> ", testY.values[randomlist[2]]) 
print("Predicted label  ----> ", getPredictedClases(randomlist[2]))

Actual label  ---->  ['female', '34', 'indUnk', 'Sagittarius']
Predicted label  ---->  ['34', 'Sagittarius', 'female', 'indUnk']


In [65]:
print("Actual label  ----> ", testY.values[randomlist[3]]) 
print("Predicted label  ----> ", getPredictedClases(randomlist[3]))

Actual label  ---->  ['female', '27', 'Education', 'Aquarius']
Predicted label  ---->  ['27', 'Aquarius', 'Education', 'Student', 'female']


In [66]:
print("Actual label  ----> ", testY.values[randomlist[4]]) 
print("Predicted label  ----> ", getPredictedClases(randomlist[4]))

Actual label  ---->  ['male', '35', 'Technology', 'Aries']
Predicted label  ---->  ['35', 'Aries', 'Technology', 'male']
