In [1]:
import nltk
import math, numpy as np

In [2]:
nltk.download('abc')

[nltk_data] Downloading package abc to /home/hackerearth/nltk_data...
[nltk_data]   Unzipping corpora/abc.zip.


True

In [3]:
from nltk.corpus import abc,stopwords
from nltk.stem import PorterStemmer 
from string import punctuation as p
ps=PorterStemmer()

In [4]:
abc.fileids()

['rural.txt', 'science.txt']

In [5]:
rural=abc.words('rural.txt')
science=abc.words('science.txt')

In [6]:
rural

['PM', 'denies', 'knowledge', 'of', 'AWB', 'kickbacks', ...]

In [7]:
science

['Cystic', 'fibrosis', 'affects', '30', ',', '000', ...]

In [17]:
print(len(rural))
print(len(science))

345580
421231


In [8]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/hackerearth/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Removing Stop words 

In [9]:
stop_words=set(stopwords.words("english"))

In [10]:
rural=[word for word in rural if word not in stop_words]

In [11]:
science=[word for word in science if word not in stop_words]

In [12]:
print(len(rural))
print(len(science))

225374
275580


## Removing Punctuation

In [13]:
def punctuation(data):
    punct_num = '''!()-[]{};:\n,<>./?@#$%^&\*_~0123456789=|'"'''
    for i in range(len(data)):
        no_punct = ""
        for char in data[i]:
            if char not in punct_num :
                no_punct = no_punct + char  
        if len(no_punct)==0:
            no_punct=' '
        data[i]=no_punct
    return(data)

In [14]:
rural=punctuation(rural)
science=punctuation(science)

## Removing Whitespaces

In [15]:
rural=list(filter(lambda a:a!=' ',rural))
science=list(filter(lambda a:a!=' ',science))

## Lower Case 

In [16]:
rural=[ w.lower() for w in rural]
science=[w.lower() for w in science]

## Stemming Word 

In [17]:
rural=[ps.stem(w) for w in rural]
science=[ps.stem(w) for w in science]

In [19]:
np.array(rural).T

array(['pm', 'deni', 'knowledg', ..., 'minut', 'anoth', 'said'],
      dtype='<U16')

## Using Library

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer as tfidf

In [21]:
vector=tfidf()
response=vector.fit_transform(rural,science)

In [22]:
print(response)

  (0, 6513)	1.0
  (1, 2329)	1.0
  (2, 4740)	1.0
  (3, 607)	1.0
  (4, 4696)	1.0
  (5, 8580)	1.0
  (6, 6671)	1.0
  (7, 5513)	1.0
  (8, 2329)	1.0
  (9, 4732)	1.0
  (10, 607)	1.0
  (11, 6285)	1.0
  (12, 4696)	1.0
  (13, 4459)	1.0
  (14, 2374)	1.0
  (15, 9644)	1.0
  (16, 9451)	1.0
  (17, 3054)	1.0
  (18, 505)	1.0
  (19, 4680)	1.0
  (20, 3475)	1.0
  (21, 4343)	1.0
  (22, 4459)	1.0
  (23, 9451)	1.0
  (24, 7433)	1.0
  :	:
  (184058, 7973)	1.0
  (184059, 2105)	1.0
  (184060, 9688)	1.0
  (184061, 5298)	1.0
  (184063, 2731)	1.0
  (184064, 1610)	1.0
  (184065, 5857)	1.0
  (184066, 2731)	1.0
  (184067, 1610)	1.0
  (184068, 6036)	1.0
  (184069, 3793)	1.0
  (184070, 7422)	1.0
  (184071, 3838)	1.0
  (184072, 916)	1.0
  (184073, 1610)	1.0
  (184074, 7730)	1.0
  (184075, 8673)	1.0
  (184076, 3677)	1.0
  (184077, 3596)	1.0
  (184078, 2574)	1.0
  (184079, 4396)	1.0
  (184080, 4838)	1.0
  (184081, 5523)	1.0
  (184082, 366)	1.0
  (184083, 7422)	1.0


In [24]:
np.array(vector.get_feature_names()).T

array(['aac', 'aami', 'aaron', ..., 'zone', 'zoo', 'zulu'], dtype='<U16')

## Building Vocabulary

In [25]:
from nltk.probability import FreqDist

In [26]:
def buildVocabulary(data):
    all_words = []
    for word in data:
        all_words.append(word)
    wordlist = nltk.FreqDist(all_words)
    return wordlist

In [27]:
rural_vocab=buildVocabulary(rural)
science_vocab=buildVocabulary(science)

In [28]:
rural_vocab

FreqDist({'say': 3419, 'the': 2519, 'said': 2088, 'australia': 1586, 'year': 1342, 'farmer': 1270, 'new': 1160, 'industri': 979, 'price': 975, 'water': 960, ...})

## Tf - Idf representation

In [29]:
def tf(vocabulary,data):
    tf={}
    n=len(data)
    for (word,count) in vocabulary.items():
        tf[word]=count/n
    return tf

In [38]:
tf_rural=tf(rural_vocab,rural)
tf_science=tf(science_vocab,science)

In [39]:
tf_rural

{'pm': 0.00010864605288889855,
 'deni': 0.000222724408422242,
 'knowledg': 9.77814476000087e-05,
 'awb': 0.0029660372438669305,
 'kickback': 0.00019013059255557245,
 'the': 0.013683970361356771,
 'prime': 0.0005052041459333782,
 'minist': 0.001722039938289042,
 'knew': 5.432302644444927e-05,
 'pay': 0.0009615175680667521,
 'iraq': 0.0005540948697333826,
 'despit': 0.0007496577649334,
 'write': 5.97553290888942e-05,
 'wheat': 0.003433215271289194,
 'export': 0.0037156950088003305,
 'ask': 0.0006518763173333912,
 'kept': 0.0001195106581777884,
 'fulli': 5.432302644444927e-05,
 'inform': 0.00042915190891114926,
 'sale': 0.001423263292844571,
 'letter': 0.00010864605288889855,
 'john': 0.0008854653310445232,
 'howard': 0.0003205058560222507,
 'deputi': 0.00020642750048890723,
 'mark': 0.000521501053866713,
 'vail': 0.00036939657982225507,
 'releas': 0.0005649594750222725,
 'cole': 0.000820277699311184,
 'inquiri': 0.0012657265161556681,
 'oil': 0.0012657265161556681,
 'food': 0.00158080006

In [40]:
tf_science

{'cystic': 4.0789133774762405e-05,
 'fibrosi': 4.5321259749736004e-05,
 'affect': 0.0007160759040458289,
 'children': 0.0007387365339206969,
 'young': 0.0004396162195724392,
 'adult': 0.0003671022039728616,
 'us': 0.0023068521212615627,
 'alon': 0.00011783527534931361,
 'inhal': 5.8917637674656804e-05,
 'mist': 4.5321259749736004e-05,
 'salt': 8.157826754952481e-05,
 'water': 0.001667822358790285,
 'reduc': 0.0006163691325964096,
 'pu': 9.064251949947201e-06,
 'infect': 0.0007659292897705384,
 'fill': 0.00016768866107402322,
 'airway': 2.7192755849841602e-05,
 'suffer': 0.000249266928623548,
 'although': 0.0005257266130969376,
 'side': 0.00040789133774762405,
 'effect': 0.0015635834613658922,
 'includ': 0.0018219146419393873,
 'nasti': 2.2660629874868002e-05,
 'cough': 2.2660629874868002e-05,
 'fit': 0.00022660629874868003,
 'harsh': 1.3596377924920801e-05,
 'tast': 0.000135963779249208,
 'that': 0.0006480940144212248,
 'conclus': 0.0001722207870489968,
 'two': 0.002474540782335586,
 '

In [41]:
w=rural_vocab
w.update(science_vocab)
words=[rural_vocab,science_vocab]
def idf():
    idf={}
    count=0
    idf=dict.fromkeys(w.keys(),0)
    for i in range(len(words)):
        for (word,count) in words[i].items():
            if(count>0):
                    idf[word]+=1
    for i,j in idf.items():
        idf[i]=math.log(len(words)/j)
    return(idf)

In [42]:
idf_data=idf()

In [43]:
idf_data

{'pm': 0.0,
 'deni': 0.0,
 'knowledg': 0.0,
 'awb': 0.6931471805599453,
 'kickback': 0.6931471805599453,
 'the': 0.0,
 'prime': 0.0,
 'minist': 0.0,
 'knew': 0.0,
 'pay': 0.0,
 'iraq': 0.0,
 'despit': 0.0,
 'write': 0.0,
 'wheat': 0.0,
 'export': 0.0,
 'ask': 0.0,
 'kept': 0.0,
 'fulli': 0.0,
 'inform': 0.0,
 'sale': 0.0,
 'letter': 0.0,
 'john': 0.0,
 'howard': 0.0,
 'deputi': 0.0,
 'mark': 0.0,
 'vail': 0.6931471805599453,
 'releas': 0.0,
 'cole': 0.0,
 'inquiri': 0.0,
 'oil': 0.0,
 'food': 0.0,
 'program': 0.0,
 'in': 0.0,
 'one': 0.0,
 'mr': 0.0,
 'manag': 0.0,
 'director': 0.0,
 'andrew': 0.0,
 'lindberg': 0.6931471805599453,
 'remain': 0.0,
 'close': 0.0,
 'contact': 0.0,
 'govern': 0.0,
 'opposit': 0.0,
 'gavan': 0.6931471805599453,
 'o': 0.0,
 'connor': 0.0,
 'say': 0.0,
 'sent': 0.0,
 'time': 0.0,
 'though': 0.0,
 'jordanian': 0.6931471805599453,
 'truck': 0.0,
 'compani': 0.0,
 'he': 0.0,
 'longer': 0.0,
 'wipe': 0.0,
 'hand': 0.0,
 'illicit': 0.0,
 'payment': 0.0,
 'total': 

In [44]:
def tfidf(tf,idf_data):
    tf_idf={}
    for word,value in tf.items():
        tf_idf[word]=value*idf_data[word]
    return(tf_idf)

In [45]:
tfidf_rural=tfidf(tf_rural,idf_data)
tfidf_science=tfidf(tf_science,idf_data)

In [46]:
tfidf_rural

{'pm': 0.0,
 'deni': 0.0,
 'knowledg': 0.0,
 'awb': 0.002055900353022154,
 'kickback': 0.00013178848416808676,
 'the': 0.0,
 'prime': 0.0,
 'minist': 0.0,
 'knew': 0.0,
 'pay': 0.0,
 'iraq': 0.0,
 'despit': 0.0,
 'write': 0.0,
 'wheat': 0.0,
 'export': 0.0,
 'ask': 0.0,
 'kept': 0.0,
 'fulli': 0.0,
 'inform': 0.0,
 'sale': 0.0,
 'letter': 0.0,
 'john': 0.0,
 'howard': 0.0,
 'deputi': 0.0,
 'mark': 0.0,
 'vail': 0.0002560461978122829,
 'releas': 0.0,
 'cole': 0.0,
 'inquiri': 0.0,
 'oil': 0.0,
 'food': 0.0,
 'program': 0.0,
 'in': 0.0,
 'one': 0.0,
 'mr': 0.0,
 'manag': 0.0,
 'director': 0.0,
 'andrew': 0.0,
 'lindberg': 6.401154945307072e-05,
 'remain': 0.0,
 'close': 0.0,
 'contact': 0.0,
 'govern': 0.0,
 'opposit': 0.0,
 'gavan': 3.388846735750803e-05,
 'o': 0.0,
 'connor': 0.0,
 'say': 0.0,
 'sent': 0.0,
 'time': 0.0,
 'though': 0.0,
 'jordanian': 1.5061541047781346e-05,
 'truck': 0.0,
 'compani': 0.0,
 'he': 0.0,
 'longer': 0.0,
 'wipe': 0.0,
 'hand': 0.0,
 'illicit': 0.0,
 'paymen

In [47]:
tfidf_science

{'cystic': 0.0,
 'fibrosi': 0.0,
 'affect': 0.0,
 'children': 0.0,
 'young': 0.0,
 'adult': 0.0,
 'us': 0.0,
 'alon': 0.0,
 'inhal': 0.0,
 'mist': 0.0,
 'salt': 0.0,
 'water': 0.0,
 'reduc': 0.0,
 'pu': 0.0,
 'infect': 0.0,
 'fill': 0.0,
 'airway': 0.0,
 'suffer': 0.0,
 'although': 0.0,
 'side': 0.0,
 'effect': 0.0,
 'includ': 0.0,
 'nasti': 0.0,
 'cough': 0.0,
 'fit': 0.0,
 'harsh': 0.0,
 'tast': 0.0,
 'that': 0.0,
 'conclus': 0.0,
 'two': 0.0,
 'studi': 0.0,
 'publish': 0.0,
 'week': 0.0,
 'issu': 0.0,
 'the': 0.0,
 'new': 0.0,
 'england': 0.0,
 'journal': 0.0,
 'medicin': 0.0,
 'they': 0.0,
 'found': 0.0,
 'content': 0.0,
 'improv': 0.0,
 'lung': 0.0,
 'function': 0.0,
 'case': 0.0,
 'produc': 0.0,
 'less': 0.0,
 'absente': 0.0,
 'school': 0.0,
 'work': 0.0,
 'progress': 0.0,
 'frequent': 0.0,
 'fatal': 0.0,
 'genet': 0.0,
 'diseas': 0.0,
 'mark': 0.0,
 'thicken': 0.0,
 'mucu': 0.0,
 'make': 0.0,
 'harder': 0.0,
 'clear': 0.0,
 'debri': 0.0,
 'bacteria': 0.0,
 'solut': 0.0,
 'realli