# **Submitted by - Shekar Roy**

# **Project 1/2**

- **DOMAIN**:  Digital content management
- **CONTEXT**: Classification  is  probably  the  most  popular  task  that  you  would  deal  with  in  real  life.  Text  in  the  form  of  blogs,  posts,  articles, etc.  is  written  every  second.  It  is  a  challenge  to  predict  the  information  about  the  writer  without  knowing  about  him/her.  We  are  going  to create a classifier that predicts multiple features of the author of a given text. We have designed it as a Multi label classification problem

In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.utils import shuffle
from sklearn.pipeline import Pipeline


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
project_path  = '/content/drive/MyDrive/NLP Project'

## Import and analyse the data set.

In [4]:
from zipfile import ZipFile
with ZipFile(project_path+'/blogtext.csv.zip', 'r') as z:
  z.extractall()

In [5]:
data = pd.read_csv('blogtext.csv').dropna()

In [6]:
data.head(5)

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,..."
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...


In [7]:
nRow,nCol = data.shape
print('There are ', nRow ,'rows', 'and',nCol ,'columns.'  )

There are  681284 rows and 7 columns.


In [8]:
# Google Colab is crashing so we have to reduce the size of data
data1 = data[0:1000].copy()

In [9]:
data1.drop(['id','date'],axis= 1,inplace = True)

In [10]:
data1.columns

Index(['gender', 'age', 'topic', 'sign', 'text'], dtype='object')

In [11]:
data1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   gender  1000 non-null   object
 1   age     1000 non-null   int64 
 2   topic   1000 non-null   object
 3   sign    1000 non-null   object
 4   text    1000 non-null   object
dtypes: int64(1), object(4)
memory usage: 46.9+ KB


In [12]:
data1.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,1000.0,23.559,6.479545,14.0,17.0,24.0,26.0,45.0


In [13]:
data1['sign'].value_counts()

Scorpio        243
Aquarius       222
Libra          179
Sagittarius     98
Capricorn       77
Cancer          70
Aries           46
Leo             39
Gemini          15
Taurus           7
Virgo            2
Pisces           2
Name: sign, dtype: int64

In [14]:
data1['topic'].value_counts()

indUnk                  370
Student                 156
Engineering             119
Education                85
Sports-Recreation        75
InvestmentBanking        70
Non-Profit               46
Science                  29
BusinessServices         21
Communications-Media     14
Banking                  13
Arts                      2
Name: topic, dtype: int64

In [15]:
data1['text'][44]

"             Koreans have an interesting saying when referring to guys like me: 기러기아빠 ( urlLink gi-reo-gi  a-ppa, wild goose dad).  Whenever I told Koreans that I was here and my wife and kids are in Vancouver they would call me this, and now I know why.  My boss explained it to me quite well (although he called the bird a  urlLink seagull , as many Koreans do).  It goes like this: a goose flies very high in the sky (so does a vulture, but I guess that's not as poetic, nor as flattering) and has very good eyesight.  Therefore, it can see things waaaay in the distance--which is good, because that's where a 기러기아빠's family is...across the ocean.  It is used for Koreans who go abroad to work, as many do in the Middle East on the many Korean-lead construction projects there, for the many families that have the wife and kids in a foreign place (especially Canada, especially Vancouver...there are so many there) and the dad stays in Seoul, and for guys like me who come to work but whose wife 

## 2. Preprocess rows of the “text” column

##### a. Remove unwanted characters

In [16]:
import re
#data1['text'] = data1['text'].replace(r'[^A-Za-z0-9 ]+', '', regex=True)
def Remove_char(text):
  text = re.sub('[^\w\s]+', " ", text)
  text = re.sub('[^A-Za-z0-9 ]+', " ", text)
  return text

data1['text'] = data1['text'].apply(Remove_char)


In [17]:
#Lets check those symbols in the row which we saw earlier
data1['text'][44]

'             Koreans have an interesting saying when referring to guys like me      urlLink gi reo gi  a ppa  wild goose dad   Whenever I told Koreans that I was here and my wife and kids are in Vancouver they would call me this  and now I know why   My boss explained it to me quite well  although he called the bird a  urlLink seagull   as many Koreans do   It goes like this  a goose flies very high in the sky  so does a vulture  but I guess that s not as poetic  nor as flattering  and has very good eyesight   Therefore  it can see things waaaay in the distance which is good  because that s where a   s family is across the ocean   It is used for Koreans who go abroad to work  as many do in the Middle East on the many Korean lead construction projects there  for the many families that have the wife and kids in a foreign place  especially Canada  especially Vancouver there are so many there  and the dad stays in Seoul  and for guys like me who come to work but whose wife likes Vancouver

##### b.Convert text to lowercase

In [18]:
data1['text'] = data1['text'].str.lower()

In [19]:
data1['text'][44]

'             koreans have an interesting saying when referring to guys like me      urllink gi reo gi  a ppa  wild goose dad   whenever i told koreans that i was here and my wife and kids are in vancouver they would call me this  and now i know why   my boss explained it to me quite well  although he called the bird a  urllink seagull   as many koreans do   it goes like this  a goose flies very high in the sky  so does a vulture  but i guess that s not as poetic  nor as flattering  and has very good eyesight   therefore  it can see things waaaay in the distance which is good  because that s where a   s family is across the ocean   it is used for koreans who go abroad to work  as many do in the middle east on the many korean lead construction projects there  for the many families that have the wife and kids in a foreign place  especially canada  especially vancouver there are so many there  and the dad stays in seoul  and for guys like me who come to work but whose wife likes vancouver

##### c. Remove unwanted spaces

In [20]:
# replace more than 1 space with 1 space
data1['text']= data1['text'].str.replace(r"\s\s+",' ') 
# delete beginning and trailing spaces
data1['text']= data1['text'].str.strip() 


In [21]:
data1['text'][44]

'koreans have an interesting saying when referring to guys like me urllink gi reo gi a ppa wild goose dad whenever i told koreans that i was here and my wife and kids are in vancouver they would call me this and now i know why my boss explained it to me quite well although he called the bird a urllink seagull as many koreans do it goes like this a goose flies very high in the sky so does a vulture but i guess that s not as poetic nor as flattering and has very good eyesight therefore it can see things waaaay in the distance which is good because that s where a s family is across the ocean it is used for koreans who go abroad to work as many do in the middle east on the many korean lead construction projects there for the many families that have the wife and kids in a foreign place especially canada especially vancouver there are so many there and the dad stays in seoul and for guys like me who come to work but whose wife likes vancouver soooo much she can t bear to live in seoul and pu

In [22]:
#text = data1['text'].to_string()
text = data1['text']
data1['text'][0]

'info has been found 100 pages and 4 5 mb of pdf files now i have to wait untill our team leader has processed it and learns html'

In [23]:
# Python program to Remove all  
# digits from a list of string 
import re   
def remove(text): 
    pattern = '[0-9]'
    list = [re.sub(pattern, '', i) for i in text] 
    return list  
  # Driver code 
text = remove(text)


In [24]:
data1['text'] = text
data1['text'][0]

'info has been found  pages and   mb of pdf files now i have to wait untill our team leader has processed it and learns html'

In [25]:
# replace more than 1 space with 1 space
data1['text']= data1['text'].str.replace(r"\s\s+",' ') 
data1['text'][0]

'info has been found pages and mb of pdf files now i have to wait untill our team leader has processed it and learns html'

In [26]:
# extracting the stopwords from nltk library
sw = stopwords.words('english')
# displaying the stopwords
np.array(sw)

array(['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you',
       "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself',
       'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her',
       'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them',
       'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom',
       'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are',
       'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had',
       'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and',
       'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at',
       'by', 'for', 'with', 'about', 'against', 'between', 'into',
       'through', 'during', 'before', 'after', 'above', 'below', 'to',
       'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under',
       'again', 'further', 'then', 'once', 'here', 'there', 'when',
       'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'm

In [27]:
print("Number of stopwords: ", len(sw))

Number of stopwords:  179


In [28]:
def stopwords(text):
    '''a function for removing the stopword'''
    # removing the stop words and lowercasing the selected words
    text = [word.lower() for word in text.split() if word.lower() not in sw]
    # joining the list of words with space separator
    return " ".join(text)


In [29]:
#Apply function to each example
data1['text'] = data1['text'].apply(stopwords)

In [30]:
data1['text'][44]

'koreans interesting saying referring guys like urllink gi reo gi ppa wild goose dad whenever told koreans wife kids vancouver would call know boss explained quite well although called bird urllink seagull many koreans goes like goose flies high sky vulture guess poetic flattering good eyesight therefore see things waaaay distance good family across ocean used koreans go abroad work many middle east many korean lead construction projects many families wife kids foreign place especially canada especially vancouver many dad stays seoul guys like come work whose wife likes vancouver soooo much bear live seoul put children torture living another part korea things make lot sense first seem clearer get whole story urllink majestic beautiful canada goose'

In [31]:
# Before combining check for any null values in the dataframe. Though we had dropped nan earlier.
data1[data1.isnull() == True].count()

gender    0
age       0
topic     0
sign      0
text      0
dtype: int64

In [32]:
data1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   gender  1000 non-null   object
 1   age     1000 non-null   int64 
 2   topic   1000 non-null   object
 3   sign    1000 non-null   object
 4   text    1000 non-null   object
dtypes: int64(1), object(4)
memory usage: 86.9+ KB


In [33]:
#Change the datatype from object to int
data1['age'] = data1['age'].astype(str)

In [34]:
data1['labels'] = data1['gender']+','+ data1['age'] +','+ data1['topic'] +','+ data1['sign']

In [35]:
data1.drop(columns =['gender','age','topic','sign'],axis=1,inplace = True)

In [36]:
data1.head(5)

Unnamed: 0,text,labels
0,info found pages mb pdf files wait untill team...,"male,15,Student,Leo"
1,team members drewes van der laag urllink mail ...,"male,15,Student,Leo"
2,het kader van kernfusie op aarde maak je eigen...,"male,15,Student,Leo"
3,testing testing,"male,15,Student,Leo"
4,thanks yahoo toolbar capture urls popups means...,"male,33,InvestmentBanking,Aquarius"


In [37]:
# number of classes in our data set 

data1.labels.nunique()

32

In [38]:
lemmatizer = WordNetLemmatizer()
def lemmafun(text):
  text = [lemmatizer.lemmatize(word)for word in text.split()]
  return " ".join(text)


In [39]:
data1['text'] = data1['text'].apply(lemmafun)
data1.head(5)

Unnamed: 0,text,labels
0,info found page mb pdf file wait untill team l...,"male,15,Student,Leo"
1,team member drewes van der laag urllink mail r...,"male,15,Student,Leo"
2,het kader van kernfusie op aarde maak je eigen...,"male,15,Student,Leo"
3,testing testing,"male,15,Student,Leo"
4,thanks yahoo toolbar capture url popups mean s...,"male,33,InvestmentBanking,Aquarius"


## Resizing dataframe

In [40]:
# Setting it to max
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', -1)

### Stemming operations

In [41]:
stemmer = SnowballStemmer("english")

def stemming(text):    
    '''a function which stems each word in the given text'''
    text = [stemmer.stem(word) for word in text.split()]
    return " ".join(text)


In [42]:
#Apply the function to each examples
data1['text'] = data1['text'].apply(stemming)
data1.head(10)

Unnamed: 0,text,labels
0,info found page mb pdf file wait until team leader process learn html,"male,15,Student,Leo"
1,team member drew van der laag urllink mail ruiyu xie urllink mail bryan aalder urllink mail,"male,15,Student,Leo"
2,het kader van kernfusi op aard maak je eigen waterstofbom build h bomb ascott tartarus uwa edu au andrew scott newsgroup rec humor subject build h bomb humor date feb gmt organ univers western australia origin file date th novemb seem transcript seven day articl poor format corrupt ad text examin microscop malleabl like gold miss anyon full text pleas distribut respons accuraci inform convert html dionisio infinet com littl spell check minor edit stolen urllink http ohio voyag net dionisio fun h bomb html reformat html valid xhtml strict build h bomb make own h bomb kind challeng real american seek want passiv victim nuclear war littl effort activ particip bomb shelter loser want huddl togeth underground eat can spam winner want push button make h bomb big step nuclear assert train call take charg sure enjoy risk headi thrill play nuclear chicken introduct fed clamp progress magazin attempt publish articl manufactur hydrogen bomb piqu curios realli true atom hydrogen bomb technolog simpl could build h bomb kitchen seven day decid find food editor barbara ehrenreich investig report peter biskind photograph jane melnick nuclear scientist michio kaku given three day cook workabl h bomb decid share culinari secret seven day support nuclear terror would prefer die slowli familiar poison like low level radiat microwav ddt dbcp aflatoxin pbbs pbcs food dye rather unexpect say hostag latvian nationalist brandish homemad bomb view real terrorist govern american soviet french chines british hoard h bomb use wors still govern u french german eager peddl advanc nuclear technolog countri like south africa brazil argentina make bomb bomb use world big time nuclear peddler along corpor supplier like general electr westinghous gulf oil thank gag progress nation secur backyard bomb shelter like news heart success h bomb success bomb got bomb made rest frost cake set deton start hydrogen fusion reaction part make bomb step get ingredi uranium basic ingredi bomb uranium atom nucleus split apart releas tremend amount energi size emit neutron go split nearbi uranium nucleus releas energi call chain reaction atom split matter convert energi accord einstein equat e mc better way mark birthday atom firework two kind isotop uranium rare u use bomb common heavier useless u natur uranium contain le percent u order usabl bomb enrich percent u percent u plutonium also use bomb substitut u ten pound u slight le plutonium necessari bomb le ten pound give critic mass purifi enrich natur occur uranium like first big hurdl infinit easi steal readi use enrich uranium plutonium enrich steal uranium hard sound least three sourc enrich uranium plutonium enrich uranium manufactur gaseous diffus plant portsmouth ohio ship liter bottl airplan truck convers plant turn uranium oxid uranium metal liter bottl contain kilogram u bottl typic shipment convers facil exist hematit missouri apollo pennsylvania erwin tennesse kerr mcgee plant crescent oklahoma karen silkwood work convers plant lost lb plutonium enrich uranium stolen plant fuel fabric plant like new san diego lynchburg virginia former kerr mcgee supervisor jame v smith ask silkwood trial secur precaut plant prevent theft testifi none kind guard fenc noth plutonium obtain place like unit nuclear pawl new york nuclear fuel servic erwin tennesse general electr pleasanton california westinghous cheswick pennsylvania nuclear materi equip corpor numec leechburg pennsylvania plant hanfford washington morri illinoi accord roll stone magazin isra involv theft plutonium numec final steal enrich uranium plutonium en rout convers plant fuel fabric plant usual transport air truck form uranium oxid brownish powder resembl instant coffe metal come small chunk call broken button form ship small can stack inch cylind brace weld strut center ordinari gallon steel drum drum weigh pound clear mark fissibl materi danger plutonium typic shipment might go enrich plant portsmouth ohio convers plant hematit missouri kansa citi truck would flown los angel truck general atom plant san diego plan general atom plant file nuclear regulatori commiss read room h street nw washington xerox machin provid conveni public get hold enrich uranium settl commerci grade percent u stolen univers reactor type call triga mark ii secur even casual commerci plant steal uranium seem tacki buy unenrich uranium avail chemic suppli hous pound commerci grade percent enrich avail pound gulf atom enrich quit frank someth pain as need start littl pound commerci grade uranium percent u best need pound u littl kitchen tabl chemistri abl convert solid uranium oxid purchas liquid form done abl separ u need u first pour gallon concentr hydrofluor acid uranium oxid convert uranium tetrafluorid safeti note concentr hydrofluor acid corros eat way glass store plastic use gallon plastic milk contain convert uranium tetrafluorid uranium hexafluorid gaseous form uranium conveni separ isotop u u get hexafluorid form bubbl fluorin gas contain uranium tetrafluorid fluorin avail pressur tank chemic suppli firm care use though fluorin sever time dead chlorin classic world war poison gas chemist recommend carri step stove hood kind use remov unpleas cook odor done chemistri right generous suppli uranium hexafluorid readi enrich old hors buggi day bomb manufactur enrich carri pass uranium hexafluorid hundr mile pipe tube membran u eventu separ u gaseous diffus process call difficult time consum expens gaseous diffus plant cover hundr acr cost neighborhood billion forget easier cheaper way enrich uranium first transform gas liquid subject pressur use bicycl pump make simpl home centrifug fill standard size bucket one quarter full liquid uranium hexafluorid attach six foot rope bucket handl swing rope attach bucket around head fast possibl keep minut slow gradual gentl put bucket floor u lighter risen top skim like cream repeat step requir pound uranium safeti note put enrich uranium hexafluorid one bucket use least two three bucket keep separ corner room prevent prematur build critic mass time convert enrich uranium back metal form easili enough accomplish spoon sever ladl calcium avail tablet form drugstor bucket uranium calcium react uranium hexafluorid produc calcium fluorid colorless salt easili separ pure enrich uranium metal precaut uranium danger radioact amount handl plan make one bomb might wise wear glove lead apron kind buy dental suppli store plutonium one toxic substanc known inhal thousandth gram caus massiv fibrosi lung pain way go even millionth gram lung caus cancer eaten plutonium metabol like calcium go straight bone give alpha particl prevent bone marrow manufactur red blood cell best way avoid inhal plutonium hold breath handl difficult wear mask avoid ingest plutonium oral follow simpl rule never make bomb empti stomach find doze work begin glow dark might wise take blood count prick finger steril pin place drop blood microscop slide cover cover slip examin microscop best result obtain earli morn get leukemia immatur cell releas bloodstream usual number white cell increas though increas might take almost week red blood cell look kind like donut without hole slight smaller white cell nucleus immatur red cell look similar white cell e slight larger nucleus white cell includ immatur one red cell start worri depend upon plan eventu use bomb short life expect might problem step assembl bomb acquir enrich uranium left assembl bomb go find coupl stainless steel salad bowl also want separ pound u two hunk keep apart idea push half uranium insid bowl take one hunk uranium beat insid first bowl uranium malleabl like gold troubl hammer bowl get good fit take anoth five pound hunk uranium fit second stainless steel bowl two bowl u subcrit mass brought togeth forc provid critic mass make bomb go keep respect distanc apart work want go critic least yet hollow bodi old vacuum cleaner place two hemispher bowl insid open end face le seven inch apart use mask tape set posit reason steel bowl vacuum cleaner case wonder help reflect neutron back uranium effici explos loos neutron useless neutron bomb pioneer use say far bomb go almost done final problem figur get two u hemispher smash suffici forc set truli effect fission reaction almost type explos use drive togeth gunpowd exampl easili made home potassium nitrat sulfur carbon get blast cap tnt buy steal construct site best c plastic explos mold around bowl fair safe work might wise shape around extra salad bowl anoth room fit uranium pack bowl particular true winter stray static electr charg might induc ignit c respons bomb maker consid impolit accident destroy neighborhood absolut necessari explos place need hook simpl deton devic batteri switch wire rememb though essenti two charg one side case go simultan put whole thing case old hoover vacuum cleaner finish part process rest easi step make bomb follow direct word wise wast bomb complet pile moder fatal radioact wast like u danger get rid flush leftov toilet worri pollut ocean alreadi much radioact wast bucket make wave whatsoev fastidi type kind never leaf gum seat movi seal nasti stuff coffe can buri backyard like uncl sam neighbor kid habit trampl lawn tell play wast soon find spend time bed go first class like u feel econom pinch want make bomb inexpens possibl conson cours reason yield recip given budget pleas h bomb frill flourish simpl megaton bomb capabl wipe new york metropolitan area san francisco bay area boston forget h bomb good bomb want spend littl money punch bomb consider instead centrifug uranium hand buy commerci centrifug fisher scientif sell one also might want fussier design hiroshima bomb relat crude one fission percent uranium yield kiloton order fission uranium forc explos trigger need even diffus around sphere pressur exert everi point sphere simultan techniqu produc sort simultan deton fashion explos len govern accus julius ethel rosenberg tri steal part put h bomb togeth heart h bomb fusion process sever bomb deton way creat extrem high temperatur million degre c necessari fuse lithium deuterid lid helium lithium nucleus slam deuterium nucleus two helium nucleus creat happen enough deuterium nucleus rapid enough result enorm amount energi energi h bomb worri steal lithium deuterid purchas chemic suppli hous cost pound budget allow substitut lithium hydrid pound need least pound corros toxic powder care place lithium deuterid hydrid glass jar surround four bomb case attach deton go simultan contain whole thing problem place anywher insid old stereo consol discard refriger etc deton set four bomb eight hemispher fission materi slam time creat four critic mass four deton rais temperatur lithium deuterid million degre c fast enough billionth second lithium blown neighborhood nucleus time fuse result least time punch puni bomb level hiroshima million ton tnt v thousand ton part bomb fulli assembl h bomb hous attract consol choic may wonder everi famili answer question accord tast prefer may want explor possibl success pioneer american govern sell bomb make pile money day rise inflat increas unemploy uncertain econom outlook busi make much sens weapon product career forecast cloudi bomb sale may sure way avoid humili receiv welfar unemploy regardless present incom level home h bomb busi invalu incom supplement certain profit altern sell tupperwar pirat girl scout cooki unfortun famili bomb busi big govern alreadi corner larg part world market mean shortag potenti custom raid entebe waterloo hijack mani nationalist group alert new mean get messag across jump chanc get hold h bomb emerg nation ant enough rice sugar buy reactor g e westinghous also shop around may wonder ethic sell nation group whose goal may disapprov take tip govern forget ideolog cash count rememb h bomb sale way escal almost like chain reaction suppos make sale south yemen believ soviet puppet well within day discret inquiri north yemen possibl saudi egyptian ethiopian well expect similar sale ira generat sale ulster govern sale tanzanian bring ugandan run forth matter side mani side forget possibl repeat sale custom experi u u r shown individu nation potenti infinit need h bomb custom matter small ever mani use bomb home mani famili attract h bomb simpli deterr discret sticker door live room window say home protect h bomb discourag ir investig census taker jehovah wit surpris fast crime rate go properti valu go news get home h bomb owner find unexpect leverag neighborhood disput everyth park place stereo nois level school tax rate relax enjoy pride excit home h bomb ownership let honest h bomb everyon frank peopl handl break hive mention mega death fallout radiat sick follow quiz help find whether take home h bomb ownership answer yes six question emot elig join nuclear club convent weapon may cup tea tri botul toxin laser ray nerv gas ignor demand other subscrib one follow soldier fortun hustler popular mechan self though mani interest acquaint best friend know say say hello seldom interest pursu convers seen movi deer hunter know everyon winner want resent whiner one follow handgun video game trash compactor snowmobil convinc leukemia psychosomat awar vegetarian sexual impot read evid solar energi communist conspiraci myth nuclear war ever sinc first mushroom cloud hiroshima usher atom age small group nay sayer doom monger lobbi campaign demonstr convinc american h bomb ownership along nuclear power danger unhealthi use virtual stranglehold medium peopl tri discredit everyth nuclear energi war vast overr risk nuclear bomb left mani american feel demor indecis sure truth lie well myth fact myth nuclear exchang earth longer suitabl human habit fact complet fals accord one scientist quot john mcpee curv bind energi largest bomb ever explod anywher megaton one thousandth forc earthquak one thousandth forc hurrican live earthquak hurrican long time anoth scientist add often assum full blown nuclear war would end life earth far truth end life earth would take least thousand time total yield nuclear explos exist world probabl lot even human succumb mani form life would surviv nuclear free cockroach certain form bacteria lichen instanc myth radiat bad fact everyth bad much eat mani banana get stomach ach get much sun get sunburn even skin cancer thing radiat much may make feel weather nuclear industri offici insist evid low level radiat realli serious advers effect high level radiat may bring unexpect benefit speed evolut weed unwant genet type creat new one rememb old say two head better one nearer home plain radiat get rid peski crab grass weed teenag find brief exposur nuclear burst vapor acn skin blemish mani survivor hiroshima bomb found free skin attend problem forev hope clear misconcept may enjoy h bomb,"male,15,Student,Leo"
3,test test,"male,15,Student,Leo"
4,thank yahoo toolbar captur url popup mean show cool link korean pop k pop audio video without need relat instruct like go site click pop audio button choos without ado link hour k pop urllink audio urllink video stream enjoy,"male,33,InvestmentBanking,Aquarius"
5,interest convers dad morn talk korean put money invari lot real estat cash cash would includ short term invest one year well save account reason real estat make money lot money seen survey seoul real estat rise per year long stretch even take account crisi refer imf crisi although imf bail korea compar korean corpor bond fell modest recov local stock market repres kospi version dow jone index gone appreci high point point see urllink link see real estat make sens back convers note real big elit real estat investor billion usd see urllink convert properti dad seem littl flabbergast heck need million dollar need much retir mayb lot risk take real estat south korean asset exampl north toot horn louder make move countri usd worth cent also denomin imf crisi drop vi vi usd also make bad invest fall victim scam latest urllink good morn citi project toast saw ladi tv lost everyth comment tear know like go rich person beggar one day one saber rattl north korea weak exchang rate littl nest egg could almost wipe govern almost zero help unemploy disabl otherwis disenfranchis worker role famili import money help famili go first thus idea korean go thing invest differ one apart urllink jeons system support well see usd apart rent two system use korea neither western one except rare circumst renter sign year contract deposit half market valu usd owner month rent paid owner invest korean treasuri bill per year month rent return end term usd return renter renter sign year year contract deposit market valu properti usd plus month rent month case valu properti increas decreas jeons need top partial refund cours use usd save key help foreign refer better thus buy place turn around rent get like buy anoth place whatev sinc mortgag korea kind cash societi although home equiti line credit system bit differ key cours real estat price keep go,"male,33,InvestmentBanking,Aquarius"
6,somehow coca cola way sum thing well earli flagship jingl like buy world coke tune like teach world sing pretti much sum post woodstock era well add much sale catchi tune korea coke theme urllink stop think feel pretti much sum lot korea korean look relax coupl stop think start feel cours high regard educ math logic deep think mani korean realli like work emot anyth els western seem sublim moreso least display differ way mayb scratch western korean probabl pretti similar context differ anyway think lose korea repeat stop think feel stop think feel stop think feel everyth alright,"male,33,InvestmentBanking,Aquarius"
7,anyth korea countri extrem everyth seem fad base think may come korea histori invad report time year time got independ imagin move quick get next level next war occup late well realli late japanes occup end korean war occur turmoil park chung hee took dictat presid elect everyon encourag vote still dictat assassin next leader basic ilk presid park amaz thing time howev took incred backward countri set road industri japan strip korea resourc peopl even languag cultur mani build palac raze japanes offici languag presid park determin chang orchestr han river miracl han river hangang main river seoul korea korea made terrif stride expens civil liberti fastforward present point see korea world wire nation canada finland way beyond u craze pc pc bang room everywher countri well instead playstat like game player go comput one two peopl korean gamer alway communal type play onlin game hundr thousand other typic korean fashion gamer left seat second could paus game fact may elimin pcbang owner sold drink ramen nooodl junk clientel matter time someon die relat urllink articl yes someon die pc peopl thought sleep imagin long realli check ala first known casulti net korea mayb world korea built extrem good bad,"male,33,InvestmentBanking,Aquarius"
8,take read news articl urllink joongang ilbo north korea opinion scale troop seoul korea specif troop head iraq end current total drop north blast u troop cut north korea blast unit state yesterday call increas u defens capabl plan reloc u troop south korea iraq pyeongyang state run televis network said plan prepar second korean war even though part u ground forc transfer iraq instead caus capabl vacuum u presid bush augment war polici north broadcast complain u secur commit south korea commit preemptiv war u current affair commentari said ok let get straight move troop south korea area closest north korea dmz demilitar zone somehow increas u defens capabl south korea well read lot north korean propaganda urllink dprk com check dprk news section one pretti unreal like remind everyon dprk tell mean democrat peopl republ korea north korea probabl good peopl leadership seem make wave could wrong go go find korean war brutal event sever mani famili lucki enough see age relat rememb year special reunion project albeit hour everyon whisk respect countri mani howev pas without ever see sister brother child parent boy girlfriend west especi canada never war soil except war could never fulli comprehend thing realli make wonder far one would go support ideolog guess idea,"male,33,InvestmentBanking,Aquarius"
9,surf english news site lot look tidbit korea foreign like view hermit kingdom also way keep fast move place sometim though one need check verac figur put paper especi local one two exampl english version korea time joongang ilbo daili first pretti straightforward urllink korea time said peopl arrest forg korean passport urllink joongang ilbo say accus huh anoth one urllink joongang ilbo said p posit korean bank good thing urllink korea time said p tad worri bad loan bank extend small medium size firm idea simpl fact seem present differ simpli translat,"male,33,InvestmentBanking,Aquarius"


## Design, train, tune and test the best text classifier

In [43]:
X = data1['text']
y = data1['labels']

In [44]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [45]:
X_train.shape

(700,)

In [46]:
X_test.shape

(300,)

In [47]:
type(X_train)

pandas.core.series.Series

#### Vectorization

In [48]:
# Create our vectorizer
vectorizer = CountVectorizer(ngram_range=(1,2),stop_words='english')

# Create the vectorizer
X_train_vector = vectorizer.fit_transform(X_train)

In [49]:
# Let's look at the vocabulary:
print('Vocabulary: ')
print(vectorizer.vocabulary_)


Vocabulary: 
{'happi': 29421, 'birthday': 6199, 'say': 59173, 'know': 35852, 'titl': 71138, 'post': 52737, 'mean': 42846, 'dear': 16215, 'colleg': 12245, 'friend': 25138, 'live': 39716, 'year': 79588, 'crazi': 14665, 'parti': 49472, 'chill': 10910, 'met': 43549, 'think': 69710, 'like': 38496, 'honest': 31312, 'abrupt': 123, 'time': 70648, 'roommat': 58061, 'went': 77172, 'home': 31130, 'extend': 21839, 'ill': 32630, 'becam': 5183, 'good': 27067, 'saw': 59079, 'kinder': 35604, 'gentler': 26326, 'end': 20040, 'stay': 65531, 'bed': 5271, 'quit': 54673, 'freak': 24928, 'natur': 45554, 'need': 46024, 'away': 4008, 'hall': 29083, 'glad': 26659, 'life': 38209, 'kind': 35470, 'matter': 42556, 'help': 30349, 'new': 46319, 'apart': 2669, 'celebr': 10072, 'graduat': 27780, 'judg': 34876, 'confront': 13297, 'peopl': 50118, 'strang': 66172, 'make': 41547, 'world': 78803, 'round': 58122, 'use': 74349, 'rag': 54929, 'word': 78411, 'mommi': 44455, 'preced': 53100, 'everi': 21000, 'piec': 51227, 'factu

In [50]:
## summarize encoded vector
print(X_train_vector.shape)
print(type(X_train_vector ))

(700, 80322)
<class 'scipy.sparse.csr.csr_matrix'>


In [51]:
X_train_vector

<700x80322 sparse matrix of type '<class 'numpy.int64'>'
	with 137802 stored elements in Compressed Sparse Row format>

In [52]:
X_train_array = X_train_vector.toarray()

In [53]:
X_train_array

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [54]:
X_test_vector  = vectorizer.transform(X_test)

In [55]:
print(X_test_vector.shape)
print(type(X_test_vector ))

(300, 80322)
<class 'scipy.sparse.csr.csr_matrix'>


In [56]:
X_test_vector

<300x80322 sparse matrix of type '<class 'numpy.int64'>'
	with 31168 stored elements in Compressed Sparse Row format>

In [57]:
X_test_array = X_test_vector.toarray()

In [58]:
X_test_array .shape

(300, 80322)

In [59]:
# create the transform
tfidf = TfidfVectorizer(stop_words= 'english')
# tokenize and build vocab
tfidf.fit(X_train)
# summarize
print(tfidf.vocabulary_)
print(tfidf.idf_)

{'happi': 4027, 'birthday': 866, 'say': 7792, 'know': 4990, 'titl': 9244, 'post': 6881, 'mean': 5591, 'dear': 2226, 'colleg': 1729, 'friend': 3524, 'live': 5267, 'year': 10251, 'crazi': 2019, 'parti': 6488, 'chill': 1513, 'met': 5668, 'think': 9153, 'like': 5219, 'honest': 4255, 'abrupt': 29, 'time': 9222, 'roommat': 7626, 'went': 9988, 'home': 4241, 'extend': 3086, 'ill': 4419, 'becam': 744, 'good': 3777, 'saw': 7790, 'kinder': 4951, 'gentler': 3664, 'end': 2880, 'stay': 8599, 'bed': 752, 'quit': 7184, 'freak': 3501, 'natur': 5976, 'need': 6003, 'away': 585, 'hall': 3993, 'glad': 3721, 'life': 5205, 'kind': 4949, 'matter': 5559, 'help': 4141, 'new': 6040, 'apart': 366, 'celebr': 1393, 'graduat': 3811, 'judg': 4841, 'confront': 1846, 'peopl': 6597, 'strang': 8678, 'make': 5449, 'world': 10153, 'round': 7643, 'use': 9683, 'rag': 7216, 'word': 10143, 'mommi': 5804, 'preced': 6929, 'everi': 3007, 'piec': 6705, 'factual': 3118, 'advic': 121, 'laden': 5043, 'talk': 8966, 'great': 3850, 'job

In [60]:
# encode document
X_train_tfidf = tfidf.transform(X_train)
# summarize encoded vector
print(X_train_tfidf.shape)

(700, 10350)


In [61]:
x3 = X_train_tfidf
x3

<700x10350 sparse matrix of type '<class 'numpy.float64'>'
	with 59547 stored elements in Compressed Sparse Row format>

In [62]:
X_test_tfidf  = tfidf.transform(X_test)
print(X_test_tfidf.shape)

(300, 10350)


In [63]:
x4 = X_test_tfidf
x4

<300x10350 sparse matrix of type '<class 'numpy.float64'>'
	with 25340 stored elements in Compressed Sparse Row format>

##### Create a dictionary to get the count of every label

In [64]:
data1['labels'].head(5)

0    male,15,Student,Leo               
1    male,15,Student,Leo               
2    male,15,Student,Leo               
3    male,15,Student,Leo               
4    male,33,InvestmentBanking,Aquarius
Name: labels, dtype: object

In [65]:
top_N = 100 # to avoid session crash
a = data1['labels'].str.cat(sep=',')
a = a. replace(',',' ')

In [66]:
word = nltk.tokenize.word_tokenize(a)
word_dist = nltk.FreqDist(word)
print (word_dist)
rslt = pd.DataFrame(word_dist.most_common(top_N),columns=['Word', 'Frequency'])
rslt.head(5)


<FreqDist with 40 samples and 4000 outcomes>


Unnamed: 0,Word,Frequency
0,male,575
1,female,425
2,indUnk,370
3,24,331
4,Scorpio,243


In [67]:
type(word_dist)

nltk.probability.FreqDist

In [68]:
dict_rslt = pd.Series(rslt.Frequency.values,index= rslt.Word).to_dict()
dict_rslt

{'14': 70,
 '15': 74,
 '17': 142,
 '23': 69,
 '24': 331,
 '25': 57,
 '26': 35,
 '27': 85,
 '33': 83,
 '34': 6,
 '37': 19,
 '41': 14,
 '44': 1,
 '45': 14,
 'Aquarius': 222,
 'Aries': 46,
 'Arts': 2,
 'Banking': 13,
 'BusinessServices': 21,
 'Cancer': 70,
 'Capricorn': 77,
 'Communications-Media': 14,
 'Education': 85,
 'Engineering': 119,
 'Gemini': 15,
 'InvestmentBanking': 70,
 'Leo': 39,
 'Libra': 179,
 'Non-Profit': 46,
 'Pisces': 2,
 'Sagittarius': 98,
 'Science': 29,
 'Scorpio': 243,
 'Sports-Recreation': 75,
 'Student': 156,
 'Taurus': 7,
 'Virgo': 2,
 'female': 425,
 'indUnk': 370,
 'male': 575}

In [69]:
d_Train = pd.DataFrame(data = y_train)
d_Train.head(2)

Unnamed: 0,labels
541,"female,27,Education,Aquarius"
440,"female,24,indUnk,Scorpio"


In [70]:
mlb = MultiLabelBinarizer()

In [71]:
y_train_transformed = pd.DataFrame(mlb.fit_transform(d_Train.labels.str.split(',')),columns=mlb.classes_,index=d_Train.labels).reset_index()

In [72]:
y_train_transformed.head(2)

Unnamed: 0,labels,14,15,17,23,24,25,26,27,33,34,37,41,44,45,Aquarius,Aries,Arts,Banking,BusinessServices,Cancer,Capricorn,Communications-Media,Education,Engineering,Gemini,InvestmentBanking,Leo,Libra,Non-Profit,Pisces,Sagittarius,Science,Scorpio,Sports-Recreation,Student,Taurus,Virgo,female,indUnk,male
0,"female,27,Education,Aquarius",0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1,"female,24,indUnk,Scorpio",0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0


In [73]:
y_train_transformed.drop(['labels'],inplace = True,axis = 1)

In [74]:
y_train_transformed = y_train_transformed.to_numpy()

In [75]:
y_train_transformed

array([[0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 1, 1, 0],
       [0, 0, 0, ..., 1, 0, 0],
       ...,
       [0, 0, 1, ..., 0, 0, 1],
       [0, 0, 0, ..., 1, 1, 0],
       [0, 0, 1, ..., 1, 0, 0]])

In [76]:
y_train_transformed.shape

(700, 40)

In [77]:
d_Test = pd.DataFrame(data = y_test)
d_Test.head(2)

Unnamed: 0,labels
521,"female,27,Education,Aquarius"
737,"male,41,Communications-Media,Libra"


In [78]:
y_test_transformed = pd.DataFrame(mlb.transform(d_Test.labels.str.split(',')),columns=mlb.classes_,index=d_Test.labels).reset_index()

In [79]:
y_test_transformed.head(2)

Unnamed: 0,labels,14,15,17,23,24,25,26,27,33,34,37,41,44,45,Aquarius,Aries,Arts,Banking,BusinessServices,Cancer,Capricorn,Communications-Media,Education,Engineering,Gemini,InvestmentBanking,Leo,Libra,Non-Profit,Pisces,Sagittarius,Science,Scorpio,Sports-Recreation,Student,Taurus,Virgo,female,indUnk,male
0,"female,27,Education,Aquarius",0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1,"male,41,Communications-Media,Libra",0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1


In [80]:
y_test_transformed .drop(['labels'],inplace = True,axis = 1)

In [81]:
y_test_transformed  = y_test_transformed.to_numpy()

In [82]:
y_test_transformed.shape

(300, 40)

In [83]:
y_test_transformed 

array([[0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1],
       ...,
       [0, 0, 0, ..., 1, 1, 0],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1]])

In [84]:
type(X_train_array)

numpy.ndarray

In [85]:
type(y_train_transformed)

numpy.ndarray

In [86]:
clf = LogisticRegression(solver='lbfgs')
clf = OneVsRestClassifier(clf)

In [87]:
clf.fit(X_train_array, y_train_transformed)

OneVsRestClassifier(estimator=LogisticRegression())

In [88]:
prediction = clf.predict(X_test_array)

In [89]:
y_test_transformed.shape

(300, 40)

In [90]:
prediction.shape

(300, 40)

In [91]:
prediction

array([[0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1],
       ...,
       [0, 0, 0, ..., 1, 1, 0],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 1, 0, 0]])

### Fit the classifier, make predictions and get the accuracy

In [92]:
print(accuracy_score(y_test_transformed,prediction))

0.29333333333333333


In [93]:
print(classification_report(y_test_transformed,prediction))

              precision    recall  f1-score   support

           0       1.00      0.20      0.33        20
           1       0.80      0.18      0.30        22
           2       0.93      0.58      0.71        43
           3       1.00      0.30      0.46        20
           4       0.80      0.64      0.71        99
           5       0.00      0.00      0.00        15
           6       0.50      0.08      0.13        13
           7       0.57      0.14      0.22        29
           8       0.95      0.75      0.84        24
           9       0.00      0.00      0.00         1
          10       0.00      0.00      0.00         8
          11       0.00      0.00      0.00         4
          12       0.00      0.00      0.00         0
          13       0.00      0.00      0.00         2
          14       0.81      0.41      0.54        71
          15       1.00      0.15      0.27        13
          16       0.00      0.00      0.00         1
          17       0.00    



Tip: Make sure you are familiar with all of them. How would you expect the things to work for the multi-label scenario? Read about micro/macro/weighted averaging

In multiclass and multilabel classification task, the notions of precision, recall, and F-measures can be applied to each label independently.

The classification report displays the precision, recall, F1, and support scores for the model.

Precision: Precision is the ability of a classiifer not to label an instance positive that is actually negative. For each class it is defined as as the ratio of true positives to the sum of true and false positives. Said another way, “for all instances classified positive, what percent was correct?”

Recall : Recall is the ability of a classifier to find all positive instances. For each class it is defined as the ratio of true positives to the sum of true positives and false negatives. Said another way, “for all instances that were actually positive, what percent was classified correctly?”

F1-Score:The F1 score is a weighted harmonic mean of precision and recall such that the best score is 1.0 and the worst is 0.0. Generally speaking, F1 scores are lower than accuracy measures as they embed precision and recall into their computation. As a rule of thumb, the weighted average of F1 should be used to compare classifier models, not global accuracy.Similar to arithmetic mean, the F1-score will always be somewhere in between precision and mean. But it behaves differently: the F1-score gives a larger weight to lower numbers. For example, when Precision is 100% and Recall is 0%, the F1-score will be 0%, not 50%. Or for example, say that Classifier A has precision=recall=80%, and Classifier B has precision=60%, recall=100%. Arithmetically, the mean of the precision and recall is the same for both models. But when we use F1’s harmonic mean formula, the score for Classifier A will be 80%, and for Classifier B it will be only 75%. Model B’s low precision score pulled down its F1-score.

Support : Support is the number of actual occurrences of the class in the specified dataset. Imbalanced support in the training data may indicate structural weaknesses in the reported scores of the classifier and could indicate the need for stratified sampling or rebalancing. Support doesn’t change between models but instead diagnoses the evaluation process.

Macro-averaged : Combining the per-class F1-scores into a single number, the classifier’s overall F1-score. There are a few ways of doing that. Let’s begin with the simplest one: an arithmetic mean of the per-class F1-scores. This is called the macro-averaged F1-score, or the macro-F1 for short, and is computed as a simple arithmetic mean of our per-class F1-scores: Macro-F1 = (42.1% + 30.8% + 66.7%) / 3 = 46.5% In a similar way, we can also compute the macro-averaged precision and the macro-averaged recall: Macro-precision = (31% + 67% + 67%) / 3 = 54.7% Macro-recall = (67% + 20% + 67%) / 3 = 51.1%

Weighted Ang: When averaging the macro-F1, we gave equal weights to each class. We don’t have to do that: in weighted-average F1-score, or weighted-F1, we weight the F1-score of each class by the number of samples from that class. In our case, we have a total of 25 samples: 6 Cat, 10 Fish, and 9 Hen. The weighted-F1 score is thus computed as follows: Weighted-F1 = (6 × 42.1% + 10 × 30.8% + 9 × 66.7%) / 25 = 46.4% Similarly, we can compute weighted precision and weighted recall: Weighted-precision=(6 × 30.8% + 10 × 66.7% + 9 × 66.7%)/25 = 58.1% Weighted-recall = (6 × 66.7% + 10 × 20.0% + 9 × 66.7%) / 25 = 48.0%

Micro Average: The last variant is the micro-averaged F1-score, or the micro-F1. To calculate the micro-F1, we first compute micro-averaged precision and micro-averaged recall over all the samples , and then combine the two. How do we “micro-average”? We simply look at all the samples together. Remember that precision is the proportion of True Positives out of the Predicted Positives (TP/(TP+FP)). In the multi-class case, we consider all the correctly predicted samples to be True Positives


In [94]:
df = shuffle(data)
check = df[2000:2006]
check.drop(['id','date'],axis = 1,inplace = True)
check['age'] = check['age'].astype(str)
check['labels'] = check['gender']+','+ check['age'] +','+ check['topic'] +','+ check['sign']
check.drop(columns =['gender','age','topic','sign'],axis=1,inplace = True)

check1 = check['text']

In [95]:
# Creating a pipeline to find the prediction of new records.

from sklearn.pipeline import Pipeline
pipe = Pipeline([('vectorizer', vectorizer),('classifier',clf)])
pred = pipe.predict(check1)
#pred.shape

# finding the true labels
b= mlb.inverse_transform(pred)

#COnverting true labels in a list
a = check['labels'].tolist()
a

['female,17,Student,Capricorn',
 'male,26,Technology,Leo',
 'female,23,Fashion,Libra',
 'male,24,Student,Cancer',
 'male,17,Education,Virgo',
 'female,27,indUnk,Pisces']

In [96]:
clf1 = LogisticRegression(solver='lbfgs')
clf1 = OneVsRestClassifier(clf1)

clf1.fit(x3, y_train_transformed)
prediction1 = clf1.predict(x4)

In [97]:
print(accuracy_score(y_test_transformed,prediction1))

0.023333333333333334


In [98]:
print(classification_report(y_test_transformed,prediction1))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        20
           1       0.00      0.00      0.00        22
           2       1.00      0.14      0.24        43
           3       0.00      0.00      0.00        20
           4       0.92      0.22      0.36        99
           5       0.00      0.00      0.00        15
           6       0.00      0.00      0.00        13
           7       0.00      0.00      0.00        29
           8       1.00      0.25      0.40        24
           9       0.00      0.00      0.00         1
          10       0.00      0.00      0.00         8
          11       0.00      0.00      0.00         4
          12       0.00      0.00      0.00         0
          13       0.00      0.00      0.00         2
          14       1.00      0.15      0.27        71
          15       0.00      0.00      0.00        13
          16       0.00      0.00      0.00         1
          17       0.00    

In [99]:
print('-' * 150) 

------------------------------------------------------------------------------------------------------------------------------------------------------


# **Project 2/2**

  - **DOMAIN**:  Customer support
  - **CONTEXT**: Great  Learning  has  a  an  academic  support  department  which  receives  numerous  support  requests  every  day  throughout  the year.  Teams  are  spread  across  geographies  and  try  to  provide  support  round  the  year.  Sometimes  there  are  circumstances  where  due  to heavy  workload  certain  request  resolutions  are  delayed,  impacting  company’s  business.  Some  of  the  requests  are  very  generic  where  a proper resolution procedure delivered to the user can solve the problem. Company is looking forward to design an automation which can interact with the user, understand the problem and display the resolution procedure [ if found as a generic request ] or redirect the request to an actual human support executive if the request is complex or not in it’s database

  - *Please note, i added a few more terms in few json blocks, so if you are trying to replicate the behavior, you might not see the same outputs.*

In [100]:
import json
import string
import random 
import nltk
import numpy as np
from nltk.stem import WordNetLemmatizer 
import tensorflow as tf 
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
nltk.download("wordnet")
nltk.download("punkt")

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [101]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [102]:
# Opening JSON file
with open('/content/drive/MyDrive/NLP Project/GL Bot.json') as json_file:
    data = json.load(json_file)

In [103]:
words = []
classes = []
doc_x = []
doc_y = []

import nltk
from nltk.stem import WordNetLemmatizer
import string
nltk.download('punkt')
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

for intent in data['intents']:
  for pattern in intent['patterns']:
    tokens = nltk.word_tokenize(pattern)
    words.extend(tokens)
    doc_x.append(pattern)
    doc_y.append(intent['tag'])

    if intent['tag'] not in classes:
      classes.append(intent['tag'])

words = [lemmatizer.lemmatize(word.lower()) for word in words if word not in string.punctuation]

print(words)
print(classes)

words = sorted(set(words))
classes = sorted(set(classes))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
['hi', 'how', 'are', 'you', 'is', 'anyone', 'there', 'hello', 'whats', 'up', 'hey', 'yo', 'listen', 'help', 'please', 'help', 'me', 'i', 'am', 'learner', 'from', 'i', 'belong', 'to', 'aiml', 'batch', 'aifl', 'batch', 'i', 'am', 'from', 'my', 'pm', 'is', 'blended', 'online', 'i', 'am', 'from', 'hey', 'ya', 'talking', 'to', 'you', 'for', 'first', 'time', 'thank', 'you', 'thanks', 'cya', 'bye', 'adios', 'gracias', 'see', 'you', 'later', 'see', 'you', 'later', 'goodbye', 'i', 'am', 'leaving', 'have', 'a', 'good', 'day', 'you', 'helped', 'me', 'thanks', 'a', 'lot', 'thanks', 'a', 'ton', 'you', 'are', 'the', 'best', 'great', 'help', 'too', 'good', 'you', 'are', 'a', 'good', 'learning', 'buddy', 'olympus', 'explain', 'me', 'how', 'olympus', 'work', 'i', 'am', 'not', 'abl

In [104]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [105]:
print(doc_x)
print(doc_y)

['hi', 'how are you', 'is anyone there', 'hello', 'whats up', 'hey', 'yo', 'listen', 'help', 'please help me', 'i am learner from', 'i belong to', 'aiml batch', 'aifl batch', 'i am from', 'my pm is', 'blended', 'online', 'i am from', 'hey ya', 'talking to you for first time', 'thank you', 'thanks', 'cya', 'bye', 'adios', 'gracias', 'see you', 'later', 'see you later', 'goodbye', 'i am leaving', 'have a Good day', 'you helped me', 'thanks a lot', 'thanks a ton', 'you are the best', 'great help', 'too good', 'you are a good learning buddy', 'olympus', 'explain me how olympus works', 'I am not able to understand olympus', 'olympus window not working', 'no access to olympus', 'unable to see link in olympus', 'no link visible on olympus', 'whom to contact for olympus', 'lot of problem with olympus', 'olypus is not a good tool', 'how to use olympus', 'teach me olympus', 'i am not able to understand svm', 'explain me how machine learning works', 'i am not able to understand naive bayes', 'i a

In [106]:
out_empty = [0] * len(classes)
training = []

import numpy as np

for idx, doc in enumerate(doc_x):
  bow = []
  text = lemmatizer.lemmatize(doc.lower())
  for word in words:
    bow.append(1) if word in text else bow.append(0)
  output_row = list(out_empty)
  output_row[classes.index(doc_y[idx])] = 1

  training.append([bow, output_row])

import random

random.shuffle(training)
training = np.array(training, dtype = object)

train_x = np.array(list(training[:,0]))
train_y = np.array(list(training[:,1]))

Model Building

In [107]:
input_shape = (len(train_x[0]),)
output_shape = len(train_y[0])
epochs = 200

model = Sequential()
model.add(Dense(128, input_shape = input_shape, activation = 'relu'))
model.add(Dropout(0.5))
model.add(Dense(64, activation = 'relu'))
model.add(Dropout(0.3))
model.add(Dense(output_shape, activation = 'softmax'))

adam = tf.keras.optimizers.Adam(learning_rate = 0.01, decay = 1e-6)

model.compile(loss = 'categorical_crossentropy', optimizer = adam, metrics = ['accuracy'])
print(model.summary())
model.fit(train_x, train_y, epochs = 200, verbose = 1)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 128)               21248     
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense_1 (Dense)             (None, 64)                8256      
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                                 
 dense_2 (Dense)             (None, 8)                 520       
                                                                 
Total params: 30,024
Trainable params: 30,024
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200

<keras.callbacks.History at 0x7f23a7a3ac90>

In [108]:
def clean_text(text):
  tokens = nltk.word_tokenize(text)
  tokens = [lemmatizer.lemmatize(word) for word in tokens]
  return tokens

def bag_of_words(text, vocab):
  tokens = clean_text(text)
  bow = [0] * len(vocab)
  for w in tokens:
    for idx, word in enumerate(vocab):
      if word == w:
        bow[idx] = 1
  return np.array(bow)

def pred_class(text, vocab, labels):
  bow = bag_of_words(text, vocab)
  result = model.predict(np.array([bow]))[0]
  thresh = 0.2
  y_pred = [[idx, res] for idx, res in enumerate(result) if res > thresh]

  y_pred.sort(key = lambda x: x[1], reverse = True)
  return_list = []
  for r in y_pred:
    return_list.append(labels[r[0]])
  return return_list

def get_response(intents_list, intents_json):
  tag = intents_list[0]
  list_of_intents = intents_json['intents']
  for i in list_of_intents:
    if i['tag'] == tag:
      result = random.choice(i['responses'])
      break
  return result


In [109]:
while True:
  message = input('')
  intents = pred_class(message, words, classes)
  result = get_response(intents, data)
  print('Bot -> '+ result)

hi
Bot -> Hello i am your GL virtual assistant! how can i help you ?
tensorflow
Bot -> Link: Neural Nets wiki
kera
Bot -> Link: Neural Nets wiki
keras
Bot -> Link: Neural Nets wiki
bangalore
Bot -> Hello i am your GL virtual assistant! how can i help you ?
Project manager
Bot -> Hello i am your GL virtual assistant! how can i help you ?
useless
Bot -> Transferring the request to your PM, hold on
nonsense
Bot -> Please abstain from using profanity!
cya
Bot -> I hope I was able to assist you, Good Bye


KeyboardInterrupt: ignored