### Installing necessary packages

- pip install -U gensim

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors # load the Stanford GloVe model

from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
stop = stopwords.words('english')

from textblob import TextBlob
from textblob import Word

import pandas as pd
import numpy as np
import re



### Reading the csv file and droping the Nan values and resetting index values

In [2]:
train = pd.read_csv('Final_HIV_without_Nan.csv')
train.head()

Unnamed: 0,Headlines,Descriptions,Authors,Published_Dates,Publication,Articles,category,Keywords,Summaries,Source_URLs
0,Drug â€˜can greatly reduce risk of HIV infecti...,The largest online news service for Afghanistan,Khaama Press,25-11-2010,The Khaama Press News Agency,A drug used to treat HIV-positive patients may...,{},"['drug', 'reduce', 'infection', 'results', 'us...",A drug used to treat HIV-positive patients may...,https://www.khaama.com/drug-can-greatly-reduce...
1,Afghanistan recorded 1367 HIV/AIDS positive ca...,The largest online news service for Afghanistan,Ahmad Shah Ghanizada,06-07-2013,The Khaama Press News Agency,"At least 1,367 positive cases of Human immunod...",{},"['afghanistan', 'paikan', 'positive', '2011', ...","At least 1,367 positive cases of Human immunod...",https://www.khaama.com/afghanistan-recorded-13...
2,Afghanistan sees 38 percent increase in HIV/AI...,The largest online news service for Afghanistan,Ahmad Shah Ghanizada,01-12-2013,The Khaama Press News Agency,Officials in the ministry of public health of ...,{},"['afghanistan', 'positive', 'sees', 'virus', '...",Officials in the ministry of public health of ...,https://www.khaama.com/afghanistan-sees-38-per...
3,"Over 1,200 people live with HIV in Afghanistan...",The largest online news service for Afghanistan,Sayed Jawad,01-12-2012,The Khaama Press News Agency,"Today marks World AIDS Day, a time for the glo...",{},"['immunodeficiency', '1200', 'aids', 'afghanis...","Today marks World AIDS Day, a time for the glo...",https://www.khaama.com/over-1200-people-live-w...
4,Aishwarya Rai Bachchan named UN HIV/AIDS envoy,The largest online news service for Afghanistan,Sajad,25-09-2012,The Khaama Press News Agency,The United Nations Monday announced the additi...,{},"['infections', 'mother', 'aishwarya', 'rai', '...",The United Nations Monday announced the additi...,https://www.khaama.com/aishwarya-rai-bachchan-...


### Basic feature extraction using text data
- Number of words
- Number of characters
- Average word length
- Number of stopwords
- Number of special characters
- Number of numerics
- Number of uppercase words

### Number of words

In [3]:
train['word_count'] = train['Articles'].apply(lambda x: len(str(x).split(" ")))
train[['Articles','word_count']].head()

Unnamed: 0,Articles,word_count
0,A drug used to treat HIV-positive patients may...,621
1,"At least 1,367 positive cases of Human immunod...",272
2,Officials in the ministry of public health of ...,169
3,"Today marks World AIDS Day, a time for the glo...",221
4,The United Nations Monday announced the additi...,231


### Number of characters

In [4]:
train['char_count'] = train['Articles'].str.len() ## this also includes spaces
train[['Articles','char_count']].head()

Unnamed: 0,Articles,char_count
0,A drug used to treat HIV-positive patients may...,3658
1,"At least 1,367 positive cases of Human immunod...",1695
2,Officials in the ministry of public health of ...,1053
3,"Today marks World AIDS Day, a time for the glo...",1341
4,The United Nations Monday announced the additi...,1450


### Average word length

In [5]:
def avg_word(sentence):
    words = sentence.split()
    return (sum(len(word) for word in words)/len(words))

train['avg_word'] = train['Articles'].apply(lambda x: avg_word(x))
train[['Articles','avg_word']].head()

Unnamed: 0,Articles,avg_word
0,A drug used to treat HIV-positive patients may...,4.89211
1,"At least 1,367 positive cases of Human immunod...",5.235294
2,Officials in the ministry of public health of ...,5.236686
3,"Today marks World AIDS Day, a time for the glo...",5.072398
4,The United Nations Monday announced the additi...,5.281385


### Number of stopwords

In [6]:
train['stopwords'] = train['Articles'].apply(lambda x: len([x for x in x.split() if x in stop]))
train[['Articles','stopwords']].head()

Unnamed: 0,Articles,stopwords
0,A drug used to treat HIV-positive patients may...,246
1,"At least 1,367 positive cases of Human immunod...",99
2,Officials in the ministry of public health of ...,58
3,"Today marks World AIDS Day, a time for the glo...",80
4,The United Nations Monday announced the additi...,82


### Number of special characters

In [7]:
train['special_characters'] = train['Articles'].apply(lambda x: len([x for x in x.split() if x.startswith('#')]))
train[['Articles','special_characters']].head()

Unnamed: 0,Articles,special_characters
0,A drug used to treat HIV-positive patients may...,0
1,"At least 1,367 positive cases of Human immunod...",0
2,Officials in the ministry of public health of ...,0
3,"Today marks World AIDS Day, a time for the glo...",0
4,The United Nations Monday announced the additi...,0


### Number of numerics

In [8]:
train['numerics'] = train['Articles'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))
train[['Articles','numerics']].head()

Unnamed: 0,Articles,numerics
0,A drug used to treat HIV-positive patients may...,0
1,"At least 1,367 positive cases of Human immunod...",1
2,Officials in the ministry of public health of ...,4
3,"Today marks World AIDS Day, a time for the glo...",5
4,The United Nations Monday announced the additi...,8


### Number of uppercase words

In [9]:
train['upper'] = train['Articles'].apply(lambda x: len([x for x in x.split() if x.isupper()]))
train[['Articles','upper']].head()

Unnamed: 0,Articles,upper
0,A drug used to treat HIV-positive patients may...,16
1,"At least 1,367 positive cases of Human immunod...",11
2,Officials in the ministry of public health of ...,8
3,"Today marks World AIDS Day, a time for the glo...",15
4,The United Nations Monday announced the additi...,10


### Basic Text Pre-processing of text data
- Lower casing
- Punctuation removal
- Stopwords removal
- Frequent words removal
- Rare words removal
- Spelling correction
- Tokenization
- Stemming
- Lemmatization

### Lower casing

In [10]:
train['Articles'] = train['Articles'].apply(lambda x: " ".join(x.lower() for x in x.split()))
train['Articles'].head()

0    a drug used to treat hiv-positive patients may...
1    at least 1,367 positive cases of human immunod...
2    officials in the ministry of public health of ...
3    today marks world aids day, a time for the glo...
4    the united nations monday announced the additi...
Name: Articles, dtype: object

### Punctuation removal

In [11]:
train['Articles'] = train['Articles'].str.replace('[^\w\s]','')
train['Articles'].head()

0    a drug used to treat hivpositive patients may ...
1    at least 1367 positive cases of human immunode...
2    officials in the ministry of public health of ...
3    today marks world aids day a time for the glob...
4    the united nations monday announced the additi...
Name: Articles, dtype: object

### Stopwords removal

In [12]:
train['Articles'] = train['Articles'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
train['Articles'].head()

0    drug used treat hivpositive patients may offer...
1    least 1367 positive cases human immunodeficien...
2    officials ministry public health afghanistan i...
3    today marks world aids day time global communi...
4    united nations monday announced addition bolly...
Name: Articles, dtype: object

### Common word removal

In [13]:
freq = pd.Series(' '.join(train['Articles']).split()).value_counts()[:10]
freq

hiv          2914
said         2248
hivaids      1845
people       1698
health       1342
treatment     849
aids          803
nam           696
city          689
also          675
dtype: int64

In [14]:
freq = list(freq.index)
train['Articles'] = train['Articles'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
train['Articles'].head()

0    drug used treat hivpositive patients may offer...
1    least 1367 positive cases human immunodeficien...
2    officials ministry public afghanistan informed...
3    today marks world day time global community re...
4    united nations monday announced addition bolly...
Name: Articles, dtype: object

### Rare words removal

In [15]:
freq = pd.Series(' '.join(train['Articles']).split()).value_counts()[-10:]
freq

moveâ         1
honda         1
feedback      1
tâs           1
skull         1
vnä7515       1
systemsâ      1
capacities    1
qarizada      1
soul          1
dtype: int64

In [16]:
freq = list(freq.index)
train['Articles'] = train['Articles'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
train['Articles'].head()

0    drug used treat hivpositive patients may offer...
1    least 1367 positive cases human immunodeficien...
2    officials ministry public afghanistan informed...
3    today marks world day time global community re...
4    united nations monday announced addition bolly...
Name: Articles, dtype: object

### Spelling correction

In [17]:
train['Articles'][:5].apply(lambda x: str(TextBlob(x).correct()))

0    drug used treat hivpositive patients may offer...
1    least 1367 positive cases human immunodeficien...
2    officials ministry public afghanistan informed...
3    today marks world day time global community re...
4    united nations monday announced addition bolly...
Name: Articles, dtype: object

### Tokenization

In [18]:
TextBlob(train['Articles'][1]).words

WordList(['least', '1367', 'positive', 'cases', 'human', 'immunodeficiency', 'virus', 'recorded', 'afghanistan', 'according', 'public', 'ministry', 'afghanistan', 'public', 'officials', 'called', 'latest', 'statistics', 'worrying', 'majority', 'individuals', 'using', 'syringes', 'inject', 'drugs', 'affected', 'virus', 'head', 'human', 'immunodeficiency', 'virus', 'department', 'public', 'ministry', 'afghanistan', 'dr', 'fida', 'mohammad', 'paikan', 'majority', 'positive', 'cases', 'recorded', 'capital', 'kabul', 'lowest', 'number', 'postive', 'cases', 'recorded', 'western', 'badghis', 'province', 'afghanistan', 'dr', 'paikan', 'added', 'public', 'ministry', 'afghanistan', 'cooperation', 'un', 'world', 'organization', 'united', 'nations', 'office', 'drugs', 'crime', 'provide', 'latest', 'statistic', 'positive', 'cases', 'annually', 'least', '1367', 'positive', 'cases', 'recorded', 'end', '2011', 'dr', 'paikan', 'adding', 'number', 'could', 'statistics', 'include', 'individuals', 'volunt

### Stemming

In [19]:
st = PorterStemmer()
train['Articles'][:5].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))

0    drug use treat hivposit patient may offer gay ...
1    least 1367 posit case human immunodefici viru ...
2    offici ministri public afghanistan inform incr...
3    today mark world day time global commun recogn...
4    unit nation monday announc addit bollywood ico...
Name: Articles, dtype: object

### Lemmatization

In [20]:
train['Articles'] = train['Articles'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
train['Articles'].head()

0    drug used treat hivpositive patient may offer ...
1    least 1367 positive case human immunodeficienc...
2    official ministry public afghanistan informed ...
3    today mark world day time global community rec...
4    united nation monday announced addition bollyw...
Name: Articles, dtype: object

### Advance Text Processing
- N-grams
- Term Frequency
- Inverse Document Frequency
- Term Frequency-Inverse Document Frequency (TF-IDF)
- Bag of Words
- Sentiment Analysis
- Word Embedding

### N-grams

In [21]:
TextBlob(train['Articles'][0]).ngrams(2)

[WordList(['drug', 'used']),
 WordList(['used', 'treat']),
 WordList(['treat', 'hivpositive']),
 WordList(['hivpositive', 'patient']),
 WordList(['patient', 'may']),
 WordList(['may', 'offer']),
 WordList(['offer', 'gay']),
 WordList(['gay', 'bisexual']),
 WordList(['bisexual', 'men']),
 WordList(['men', 'protection']),
 WordList(['protection', 'contracting']),
 WordList(['contracting', 'virus']),
 WordList(['virus', 'author']),
 WordList(['author', 'new']),
 WordList(['new', 'study']),
 WordList(['study', 'say']),
 WordList(['say', 'trial']),
 WordList(['trial', 'combination']),
 WordList(['combination', 'drug']),
 WordList(['drug', 'truvada']),
 WordList(['truvada', 'among']),
 WordList(['among', 'nearly']),
 WordList(['nearly', '2500']),
 WordList(['2500', 'men']),
 WordList(['men', 'suggested']),
 WordList(['suggested', 'could']),
 WordList(['could', 'reduce']),
 WordList(['reduce', 'chance']),
 WordList(['chance', 'maletomale']),
 WordList(['maletomale', 'infection']),
 WordList([

### Term Frequency

In [22]:
tf1 = (train['Articles'][1:2]).apply(lambda x: pd.value_counts(x.split(" "))).sum(axis = 0).reset_index()
tf1.columns = ['words','tf']
tf1

Unnamed: 0,words,tf
0,afghanistan,7
1,virus,6
2,case,6
3,public,5
4,drug,5
5,recorded,5
6,positive,5
7,individual,4
8,dr,4
9,paikan,4


### Inverse Document Frequency

In [23]:
for i,word in enumerate(tf1['words']):
    tf1.loc[i, 'idf'] = np.log(train.shape[0]/(len(train[train['Articles'].str.contains(word)])))
tf1

Unnamed: 0,words,tf,idf
0,afghanistan,7,2.995732
1,virus,6,1.251040
2,case,6,0.951382
3,public,5,1.306782
4,drug,5,0.899196
5,recorded,5,2.556366
6,positive,5,0.987750
7,individual,4,2.866521
8,dr,4,0.361613
9,paikan,4,6.363028


### Term Frequency – Inverse Document Frequency (TF-IDF)

In [24]:
tf1['tfidf'] = tf1['tf'] * tf1['idf']
tf1

Unnamed: 0,words,tf,idf,tfidf
0,afghanistan,7,2.995732,20.970126
1,virus,6,1.251040,7.506242
2,case,6,0.951382,5.708292
3,public,5,1.306782,6.533911
4,drug,5,0.899196,4.495981
5,recorded,5,2.556366,12.781828
6,positive,5,0.987750,4.938748
7,individual,4,2.866521,11.466082
8,dr,4,0.361613,1.446453
9,paikan,4,6.363028,25.452112


In [25]:
tfidf = TfidfVectorizer(max_features=1000, lowercase=True, analyzer='word',
 stop_words= 'english',ngram_range=(1,1))
train_vect = tfidf.fit_transform(train['Articles'])

train_vect

<580x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 52291 stored elements in Compressed Sparse Row format>

### Bag of Words

In [26]:
bow = CountVectorizer(max_features=1000, lowercase=True, ngram_range=(1,1),analyzer = "word")
train_bow = bow.fit_transform(train['Articles'])
train_bow

<580x1000 sparse matrix of type '<class 'numpy.int64'>'
	with 57246 stored elements in Compressed Sparse Row format>

### Sentiment Analysis

In [27]:
train['Articles'][:5].apply(lambda x: TextBlob(x).sentiment)

0    (0.16425026123301986, 0.47387870749939726)
1    (0.07407822062994475, 0.38189281982385437)
2     (0.05241341991341991, 0.3787445887445887)
3    (-0.05832696715049657, 0.3053093964858671)
4     (0.1706293706293706, 0.40209790209790214)
Name: Articles, dtype: object

In [28]:
train['sentiment'] = train['Articles'].apply(lambda x: TextBlob(x).sentiment[0] )
train[['Articles','sentiment']].head()

Unnamed: 0,Articles,sentiment
0,drug used treat hivpositive patient may offer ...,0.16425
1,least 1367 positive case human immunodeficienc...,0.074078
2,official ministry public afghanistan informed ...,0.052413
3,today mark world day time global community rec...,-0.058327
4,united nation monday announced addition bollyw...,0.170629


### Word Embeddings

In [29]:
glove_input_file = 'glove.6B.100d.txt'
word2vec_output_file = 'glove.6B.100d.txt.word2vec'
glove2word2vec(glove_input_file, word2vec_output_file)

(400000, 100)

In [30]:
filename = 'glove.6B.100d.txt.word2vec'
model = KeyedVectors.load_word2vec_format(filename, binary=False)

In [31]:
model['go']

array([-0.078894,  0.4616  ,  0.57779 , -0.71637 , -0.13121 ,  0.4186  ,
       -0.29156 ,  0.52006 ,  0.089986, -0.35062 ,  0.51755 ,  0.51998 ,
        0.15218 ,  0.41485 , -0.12377 , -0.37222 ,  0.0273  ,  0.75673 ,
       -0.8739  ,  0.58935 ,  0.46662 ,  0.62918 ,  0.092603, -0.012868,
       -0.015169,  0.25567 , -0.43025 , -0.77668 ,  0.71449 , -0.3834  ,
       -0.69638 ,  0.23522 ,  0.11396 ,  0.02778 ,  0.071357,  0.87409 ,
       -0.1281  ,  0.063576,  0.067867, -0.50181 , -0.28523 , -0.072536,
       -0.50738 , -0.6914  , -0.53579 , -0.11361 , -0.38234 , -0.12414 ,
        0.011214, -1.1622  ,  0.037057, -0.18495 ,  0.01416 ,  0.87193 ,
       -0.097309, -2.3565  , -0.14554 ,  0.28275 ,  2.0053  ,  0.23439 ,
       -0.38298 ,  0.69539 , -0.44916 , -0.094157,  0.90527 ,  0.65764 ,
        0.27628 ,  0.30688 , -0.57781 , -0.22987 , -0.083043, -0.57236 ,
       -0.299   , -0.81112 ,  0.039752, -0.05681 , -0.48879 , -0.18091 ,
       -0.28152 , -0.20559 ,  0.4932  , -0.033999, 

In [32]:
model['away']

array([-0.10379 , -0.014792,  0.59933 , -0.51316 , -0.036463,  0.6588  ,
       -0.57906 ,  0.17819 ,  0.23663 , -0.21384 ,  0.55339 ,  0.53597 ,
        0.041444,  0.16095 ,  0.017093, -0.37242 ,  0.017974,  0.39268 ,
       -0.23265 ,  0.1818  ,  0.66405 ,  0.98163 ,  0.42339 ,  0.030581,
        0.35015 ,  0.25519 , -0.71182 , -0.42184 ,  0.13068 , -0.47452 ,
       -0.08175 ,  0.1574  , -0.13262 ,  0.22679 , -0.16885 , -0.11122 ,
       -0.32272 , -0.020978, -0.43345 ,  0.172   , -0.67366 , -0.79052 ,
        0.10556 , -0.4219  , -0.12385 , -0.063486, -0.17843 ,  0.56359 ,
        0.16986 , -0.17804 ,  0.13956 , -0.20169 ,  0.078985,  1.4497  ,
        0.23556 , -2.6014  , -0.5286  , -0.11636 ,  1.7184  ,  0.33254 ,
        0.12136 ,  1.1602  , -0.2914  ,  0.47125 ,  0.41869 ,  0.35271 ,
        0.47869 , -0.042281, -0.18294 ,  0.1796  , -0.24431 , -0.34042 ,
        0.20337 , -0.93676 ,  0.013077,  0.080339, -0.36604 , -0.44005 ,
       -0.35393 ,  0.15907 ,  0.55807 ,  0.1492  , 

In [33]:
(model['go'] + model['away'])/2

array([-0.091342  ,  0.223404  ,  0.58856   , -0.614765  , -0.0838365 ,
        0.5387    , -0.43531   ,  0.349125  ,  0.163308  , -0.28223   ,
        0.53547   ,  0.52797496,  0.096812  ,  0.2879    , -0.0533385 ,
       -0.37232   ,  0.022637  ,  0.574705  , -0.553275  ,  0.385575  ,
        0.565335  ,  0.805405  ,  0.2579965 ,  0.0088565 ,  0.1674905 ,
        0.25543   , -0.571035  , -0.59926   ,  0.422585  , -0.42896   ,
       -0.389065  ,  0.19631   , -0.00933   ,  0.127285  , -0.0487465 ,
        0.381435  , -0.22540998,  0.021299  , -0.1827915 , -0.16490501,
       -0.47944498, -0.431528  , -0.20091   , -0.55665   , -0.32982   ,
       -0.088548  , -0.28038502,  0.219725  ,  0.090537  , -0.67012   ,
        0.0883085 , -0.19332   ,  0.0465725 ,  1.160815  ,  0.0691255 ,
       -2.47895   , -0.33707   ,  0.083195  ,  1.86185   ,  0.283465  ,
       -0.13081   ,  0.927795  , -0.37028   ,  0.1885465 ,  0.66198   ,
        0.505175  ,  0.37748498,  0.1322995 , -0.380375  , -0.02