### Installing necessary packages

- pip install -U gensim

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors # load the Stanford GloVe model

from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
stop = stopwords.words('english')

from textblob import TextBlob
from textblob import Word

import pandas as pd
import numpy as np
import re



### Reading the csv file and droping the Nan values and resetting index values

In [2]:
train = pd.read_csv('Final_AA_without_Nan.csv')
train.head()

Unnamed: 0,Headlines,Descriptions,Authors,Published_Dates,Publication,Articles,category,Keywords,Summaries,Source_URLs
0,BOOZE RUINS LIVES!,â€œTHE wake-up call for me should have been th...,Jabu Kumalo,2018-08-15T14:30:02.000Z,DailySun,â€œTHE wake-up call for me should have been th...,{},"['aa', 'booze', 'hit', 'lives', 'decided', 'in...",He told Daily Sun he sometimes couldnâ€™t beli...,https://www.dailysun.co.za/News/National/booze...
1,"ALCOHOLICS, IT'S NEVER TOO LATE!",IF YOU have a drinking problem or you know som...,Jabu Kumalo,2017-10-11T15:30:03.000Z,DailySun,IF YOU have a drinking problem or you know som...,{},"['west', 'alcoholics', 'visit', 'thought', 'la...",IF YOU have a drinking problem or you know som...,https://www.dailysun.co.za/News/National/alcoh...
2,NO BOOZE FOR THESE TWO!,SUNDAY was a joyful day as a husband and wife ...,Jabu Kumalo,2018-10-03T18:00:10.000Z,DailySun,SUNDAY was a joyful day as a husband and wife ...,{},"['anonymous', 'times', 'alcoholics', 'booze', ...",SUNDAY was a joyful day as a husband and wife ...,https://www.dailysun.co.za/News/National/no-bo...
3,A SEASON TO SOBER UP!,MZANSI is known as a nation of boozers.,Sifiso Jimta,2018-09-05T11:30:02.000Z,DailySun,MZANSI is known as a nation of boozers.To give...,{},"['season', 'janet', 'soberspringchallenge', 'i...",The organisation has launched the #SoberSpring...,https://www.dailysun.co.za/News/National/a-sea...
4,NAIR MAY GO FOR MENTAL HEALTH CHECK!,HE called President Cyril Ramaphosa the k-word...,Sun Reporter,2018-10-02T09:34:03.000Z,DailySun,HE called President Cyril Ramaphosa the k-word...,{},"['surgeon', 'district', 'nairs', 'fort', 'napi...",Govender argued that Nair should be treated as...,https://www.dailysun.co.za/News/National/nair-...


### Basic feature extraction using text data
- Number of words
- Number of characters
- Average word length
- Number of stopwords
- Number of special characters
- Number of numerics
- Number of uppercase words

### Number of words

In [3]:
train['word_count'] = train['Articles'].apply(lambda x: len(str(x).split(" ")))
train[['Articles','word_count']].head()

Unnamed: 0,Articles,word_count
0,â€œTHE wake-up call for me should have been th...,342
1,IF YOU have a drinking problem or you know som...,208
2,SUNDAY was a joyful day as a husband and wife ...,206
3,MZANSI is known as a nation of boozers.To give...,231
4,HE called President Cyril Ramaphosa the k-word...,398


### Number of characters

In [4]:
train['char_count'] = train['Articles'].str.len() ## this also includes spaces
train[['Articles','char_count']].head()

Unnamed: 0,Articles,char_count
0,â€œTHE wake-up call for me should have been th...,1817
1,IF YOU have a drinking problem or you know som...,1144
2,SUNDAY was a joyful day as a husband and wife ...,1188
3,MZANSI is known as a nation of boozers.To give...,1426
4,HE called President Cyril Ramaphosa the k-word...,2599


### Average word length

In [5]:
def avg_word(sentence):
    words = sentence.split()
    return (sum(len(word) for word in words)/len(words))

train['avg_word'] = train['Articles'].apply(lambda x: avg_word(x))
train[['Articles','avg_word']].head()

Unnamed: 0,Articles,avg_word
0,â€œTHE wake-up call for me should have been th...,4.315789
1,IF YOU have a drinking problem or you know som...,4.504808
2,SUNDAY was a joyful day as a husband and wife ...,4.771845
3,MZANSI is known as a nation of boozers.To give...,5.177489
4,HE called President Cyril Ramaphosa the k-word...,5.532663


### Number of stopwords

In [6]:
train['stopwords'] = train['Articles'].apply(lambda x: len([x for x in x.split() if x in stop]))
train[['Articles','stopwords']].head()

Unnamed: 0,Articles,stopwords
0,â€œTHE wake-up call for me should have been th...,144
1,IF YOU have a drinking problem or you know som...,91
2,SUNDAY was a joyful day as a husband and wife ...,78
3,MZANSI is known as a nation of boozers.To give...,82
4,HE called President Cyril Ramaphosa the k-word...,175


### Number of special characters

In [7]:
train['special_characters'] = train['Articles'].apply(lambda x: len([x for x in x.split() if x.startswith('#')]))
train[['Articles','special_characters']].head()

Unnamed: 0,Articles,special_characters
0,â€œTHE wake-up call for me should have been th...,0
1,IF YOU have a drinking problem or you know som...,0
2,SUNDAY was a joyful day as a husband and wife ...,0
3,MZANSI is known as a nation of boozers.To give...,3
4,HE called President Cyril Ramaphosa the k-word...,0


### Number of numerics

In [8]:
train['numerics'] = train['Articles'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))
train[['Articles','numerics']].head()

Unnamed: 0,Articles,numerics
0,â€œTHE wake-up call for me should have been th...,2
1,IF YOU have a drinking problem or you know som...,4
2,SUNDAY was a joyful day as a husband and wife ...,7
3,MZANSI is known as a nation of boozers.To give...,1
4,HE called President Cyril Ramaphosa the k-word...,2


### Number of uppercase words

In [9]:
train['upper'] = train['Articles'].apply(lambda x: len([x for x in x.split() if x.isupper()]))
train[['Articles','upper']].head()

Unnamed: 0,Articles,upper
0,â€œTHE wake-up call for me should have been th...,12
1,IF YOU have a drinking problem or you know som...,9
2,SUNDAY was a joyful day as a husband and wife ...,1
3,MZANSI is known as a nation of boozers.To give...,6
4,HE called President Cyril Ramaphosa the k-word...,1


### Basic Text Pre-processing of text data
- Lower casing
- Punctuation removal
- Stopwords removal
- Frequent words removal
- Rare words removal
- Spelling correction
- Tokenization
- Stemming
- Lemmatization

### Lower casing

In [10]:
train['Articles'] = train['Articles'].apply(lambda x: " ".join(x.lower() for x in x.split()))
train['Articles'].head()

0    â€œthe wake-up call for me should have been th...
1    if you have a drinking problem or you know som...
2    sunday was a joyful day as a husband and wife ...
3    mzansi is known as a nation of boozers.to give...
4    he called president cyril ramaphosa the k-word...
Name: Articles, dtype: object

### Punctuation removal

In [11]:
train['Articles'] = train['Articles'].str.replace('[^\w\s]','')
train['Articles'].head()

0    âœthe wakeup call for me should have been the ...
1    if you have a drinking problem or you know som...
2    sunday was a joyful day as a husband and wife ...
3    mzansi is known as a nation of boozersto give ...
4    he called president cyril ramaphosa the kword ...
Name: Articles, dtype: object

### Stopwords removal

In [12]:
train['Articles'] = train['Articles'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
train['Articles'].head()

0    âœthe wakeup call day bullet blew hat offâ wor...
1    drinking problem know somebody uncle tomâs hal...
2    sunday joyful day husband wife soweto shared s...
3    mzansi known nation boozersto give boozing day...
4    called president cyril ramaphosa kword widely ...
Name: Articles, dtype: object

### Common word removal

In [13]:
freq = pd.Series(' '.join(train['Articles']).split()).value_counts()[:10]
freq

said          110
alcohol        85
â              82
one            80
aa             69
anonymous      68
alcoholics     66
drinking       62
years          58
would          55
dtype: int64

In [14]:
freq = list(freq.index)
train['Articles'] = train['Articles'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
train['Articles'].head()

0    âœthe wakeup call day bullet blew hat offâ wor...
1    problem know somebody uncle tomâs hall orlando...
2    sunday joyful day husband wife soweto shared s...
3    mzansi known nation boozersto give boozing day...
4    called president cyril ramaphosa kword widely ...
Name: Articles, dtype: object

### Rare words removal

In [15]:
freq = pd.Series(' '.join(train['Articles']).split()).value_counts()[-10:]
freq

stems             1
holierthanthou    1
âcasio            1
azhar             1
simon             1
memories          1
fakes             1
cameron           1
usingâ            1
âkiller           1
dtype: int64

In [16]:
freq = list(freq.index)
train['Articles'] = train['Articles'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
train['Articles'].head()

0    âœthe wakeup call day bullet blew hat offâ wor...
1    problem know somebody uncle tomâs hall orlando...
2    sunday joyful day husband wife soweto shared s...
3    mzansi known nation boozersto give boozing day...
4    called president cyril ramaphosa kword widely ...
Name: Articles, dtype: object

### Spelling correction

In [17]:
train['Articles'][:5].apply(lambda x: str(TextBlob(x).correct()))

0    the wake call day bullet blew hat off words mu...
1    problem know somebody uncle tombs hall orlando...
2    sunday joyful day husband wife sowed shared st...
3    means known nation boozersto give oozing days ...
4    called president cyril ramaphosa word widely c...
Name: Articles, dtype: object

### Tokenization

In [18]:
TextBlob(train['Articles'][1]).words

WordList(['problem', 'know', 'somebody', 'uncle', 'tomâs', 'hall', 'orlando', 'west', 'soweto', 'best', 'place', 'saturdaythe', 'soweto', 'group', 'holding', '35th', 'annual', 'rally', 'members', 'community', 'invited', 'hear', 'pulled', 'pit', 'uncontrollable', 'imbizo', 'starts', '9am', 'aims', 'reach', 'people', 'booze', 'problema', 'member', 'group', 'imbizo', 'highlight', 'year', 'sober', '20', 'yearshe', 'told', 'peopleâs', 'paper', 'âœi', 'never', 'missed', 'imbizo', 'since', 'stopped', 'joined', 'group', 'time', 'thought', 'everything', 'finished', 'thought', 'life', 'reached', 'dead', 'end', 'things', 'bad', 'even', 'attempted', 'suicide', 'three', 'times', 'see', 'successfulâœtoday', 'always', 'enthusiastic', 'ready', 'face', 'lifeâhe', 'never', 'knew', 'problem', 'doctor', 'suggested', 'visit', 'support', 'group', 'emphasised', 'never', 'late', 'stopentrance', 'imbizo', 'freethe', 'group', 'reached', '0861', '435', '722', 'nationally'])

### Stemming

In [19]:
st = PorterStemmer()
train['Articles'][:5].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))

0    âœthe wakeup call day bullet blew hat offâ wor...
1    problem know somebodi uncl tomâ hall orlando w...
2    sunday joy day husband wife soweto share stori...
3    mzansi known nation boozersto give booz day le...
4    call presid cyril ramaphosa kword wide circul ...
Name: Articles, dtype: object

### Lemmatization

In [20]:
train['Articles'] = train['Articles'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
train['Articles'].head()

0    âœthe wakeup call day bullet blew hat offâ wor...
1    problem know somebody uncle tomâs hall orlando...
2    sunday joyful day husband wife soweto shared s...
3    mzansi known nation boozersto give boozing day...
4    called president cyril ramaphosa kword widely ...
Name: Articles, dtype: object

### Advance Text Processing
- N-grams
- Term Frequency
- Inverse Document Frequency
- Term Frequency-Inverse Document Frequency (TF-IDF)
- Bag of Words
- Sentiment Analysis
- Word Embedding

### N-grams

In [21]:
TextBlob(train['Articles'][0]).ngrams(2)

[WordList(['âœthe', 'wakeup']),
 WordList(['wakeup', 'call']),
 WordList(['call', 'day']),
 WordList(['day', 'bullet']),
 WordList(['bullet', 'blew']),
 WordList(['blew', 'hat']),
 WordList(['hat', 'offâ']),
 WordList(['offâ', 'word']),
 WordList(['word', 'mussolini']),
 WordList(['mussolini', 'shumi']),
 WordList(['shumi', 'managed']),
 WordList(['managed', 'stop']),
 WordList(['stop', 'thanks']),
 WordList(['thanks', 'told']),
 WordList(['told', 'daily']),
 WordList(['daily', 'sun']),
 WordList(['sun', 'sometimes']),
 WordList(['sometimes', 'couldnât']),
 WordList(['couldnât', 'believe']),
 WordList(['believe', 'still']),
 WordList(['still', 'alive']),
 WordList(['alive', 'criminal']),
 WordList(['criminal', 'activity']),
 WordList(['activity', 'involved']),
 WordList(['involved', 'speaking']),
 WordList(['speaking', 'day']),
 WordList(['day', 'almost']),
 WordList(['almost', 'lost']),
 WordList(['lost', 'life']),
 WordList(['life', 'mussolini']),
 WordList(['mussolini', 'âœwe']),
 W

### Term Frequency

In [22]:
tf1 = (train['Articles'][1:2]).apply(lambda x: pd.value_counts(x.split(" "))).sum(axis = 0).reset_index()
tf1.columns = ['words','tf']
tf1

Unnamed: 0,words,tf
0,group,5
1,imbizo,4
2,never,3
3,member,2
4,time,2
5,soweto,2
6,problem,2
7,thought,2
8,reached,2
9,place,1


### Inverse Document Frequency

In [23]:
for i,word in enumerate(tf1['words']):
    tf1.loc[i, 'idf'] = np.log(train.shape[0]/(len(train[train['Articles'].str.contains(word)])))
tf1

Unnamed: 0,words,tf,idf
0,group,5,1.044545
1,imbizo,4,2.890372
2,never,3,1.424035
3,member,2,0.897942
4,time,2,0.378066
5,soweto,2,3.295837
6,problem,2,1.044545
7,thought,2,1.791759
8,reached,2,2.379546
9,place,1,1.504077


### Term Frequency – Inverse Document Frequency (TF-IDF)

In [24]:
tf1['tfidf'] = tf1['tf'] * tf1['idf']
tf1

Unnamed: 0,words,tf,idf,tfidf
0,group,5,1.044545,5.222725
1,imbizo,4,2.890372,11.561487
2,never,3,1.424035,4.272104
3,member,2,0.897942,1.795883
4,time,2,0.378066,0.756132
5,soweto,2,3.295837,6.591674
6,problem,2,1.044545,2.089090
7,thought,2,1.791759,3.583519
8,reached,2,2.379546,4.759092
9,place,1,1.504077,1.504077


In [25]:
tfidf = TfidfVectorizer(max_features=1000, lowercase=True, analyzer='word',
 stop_words= 'english',ngram_range=(1,1))
train_vect = tfidf.fit_transform(train['Articles'])

train_vect

<54x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 4980 stored elements in Compressed Sparse Row format>

### Bag of Words

In [26]:
bow = CountVectorizer(max_features=1000, lowercase=True, ngram_range=(1,1),analyzer = "word")
train_bow = bow.fit_transform(train['Articles'])
train_bow

<54x1000 sparse matrix of type '<class 'numpy.int64'>'
	with 5646 stored elements in Compressed Sparse Row format>

### Sentiment Analysis

In [27]:
train['Articles'][:5].apply(lambda x: TextBlob(x).sentiment)

0     (0.06317460317460317, 0.3844444444444445)
1      (0.1642857142857143, 0.5095238095238096)
2      (0.23636363636363633, 0.303030303030303)
3     (0.13974358974358975, 0.5596153846153846)
4    (-0.04444444444444443, 0.3074074074074075)
Name: Articles, dtype: object

In [28]:
train['sentiment'] = train['Articles'].apply(lambda x: TextBlob(x).sentiment[0] )
train[['Articles','sentiment']].head()

Unnamed: 0,Articles,sentiment
0,âœthe wakeup call day bullet blew hat offâ wor...,0.063175
1,problem know somebody uncle tomâs hall orlando...,0.164286
2,sunday joyful day husband wife soweto shared s...,0.236364
3,mzansi known nation boozersto give boozing day...,0.139744
4,called president cyril ramaphosa kword widely ...,-0.044444


### Word Embeddings

In [29]:
glove_input_file = 'glove.6B.100d.txt'
word2vec_output_file = 'glove.6B.100d.txt.word2vec'
glove2word2vec(glove_input_file, word2vec_output_file)

(400000, 100)

In [30]:
filename = 'glove.6B.100d.txt.word2vec'
model = KeyedVectors.load_word2vec_format(filename, binary=False)

In [31]:
model['go']

array([-0.078894,  0.4616  ,  0.57779 , -0.71637 , -0.13121 ,  0.4186  ,
       -0.29156 ,  0.52006 ,  0.089986, -0.35062 ,  0.51755 ,  0.51998 ,
        0.15218 ,  0.41485 , -0.12377 , -0.37222 ,  0.0273  ,  0.75673 ,
       -0.8739  ,  0.58935 ,  0.46662 ,  0.62918 ,  0.092603, -0.012868,
       -0.015169,  0.25567 , -0.43025 , -0.77668 ,  0.71449 , -0.3834  ,
       -0.69638 ,  0.23522 ,  0.11396 ,  0.02778 ,  0.071357,  0.87409 ,
       -0.1281  ,  0.063576,  0.067867, -0.50181 , -0.28523 , -0.072536,
       -0.50738 , -0.6914  , -0.53579 , -0.11361 , -0.38234 , -0.12414 ,
        0.011214, -1.1622  ,  0.037057, -0.18495 ,  0.01416 ,  0.87193 ,
       -0.097309, -2.3565  , -0.14554 ,  0.28275 ,  2.0053  ,  0.23439 ,
       -0.38298 ,  0.69539 , -0.44916 , -0.094157,  0.90527 ,  0.65764 ,
        0.27628 ,  0.30688 , -0.57781 , -0.22987 , -0.083043, -0.57236 ,
       -0.299   , -0.81112 ,  0.039752, -0.05681 , -0.48879 , -0.18091 ,
       -0.28152 , -0.20559 ,  0.4932  , -0.033999, 

In [32]:
model['away']

array([-0.10379 , -0.014792,  0.59933 , -0.51316 , -0.036463,  0.6588  ,
       -0.57906 ,  0.17819 ,  0.23663 , -0.21384 ,  0.55339 ,  0.53597 ,
        0.041444,  0.16095 ,  0.017093, -0.37242 ,  0.017974,  0.39268 ,
       -0.23265 ,  0.1818  ,  0.66405 ,  0.98163 ,  0.42339 ,  0.030581,
        0.35015 ,  0.25519 , -0.71182 , -0.42184 ,  0.13068 , -0.47452 ,
       -0.08175 ,  0.1574  , -0.13262 ,  0.22679 , -0.16885 , -0.11122 ,
       -0.32272 , -0.020978, -0.43345 ,  0.172   , -0.67366 , -0.79052 ,
        0.10556 , -0.4219  , -0.12385 , -0.063486, -0.17843 ,  0.56359 ,
        0.16986 , -0.17804 ,  0.13956 , -0.20169 ,  0.078985,  1.4497  ,
        0.23556 , -2.6014  , -0.5286  , -0.11636 ,  1.7184  ,  0.33254 ,
        0.12136 ,  1.1602  , -0.2914  ,  0.47125 ,  0.41869 ,  0.35271 ,
        0.47869 , -0.042281, -0.18294 ,  0.1796  , -0.24431 , -0.34042 ,
        0.20337 , -0.93676 ,  0.013077,  0.080339, -0.36604 , -0.44005 ,
       -0.35393 ,  0.15907 ,  0.55807 ,  0.1492  , 

In [33]:
(model['go'] + model['away'])/2

array([-0.091342  ,  0.223404  ,  0.58856   , -0.614765  , -0.0838365 ,
        0.5387    , -0.43531   ,  0.349125  ,  0.163308  , -0.28223   ,
        0.53547   ,  0.52797496,  0.096812  ,  0.2879    , -0.0533385 ,
       -0.37232   ,  0.022637  ,  0.574705  , -0.553275  ,  0.385575  ,
        0.565335  ,  0.805405  ,  0.2579965 ,  0.0088565 ,  0.1674905 ,
        0.25543   , -0.571035  , -0.59926   ,  0.422585  , -0.42896   ,
       -0.389065  ,  0.19631   , -0.00933   ,  0.127285  , -0.0487465 ,
        0.381435  , -0.22540998,  0.021299  , -0.1827915 , -0.16490501,
       -0.47944498, -0.431528  , -0.20091   , -0.55665   , -0.32982   ,
       -0.088548  , -0.28038502,  0.219725  ,  0.090537  , -0.67012   ,
        0.0883085 , -0.19332   ,  0.0465725 ,  1.160815  ,  0.0691255 ,
       -2.47895   , -0.33707   ,  0.083195  ,  1.86185   ,  0.283465  ,
       -0.13081   ,  0.927795  , -0.37028   ,  0.1885465 ,  0.66198   ,
        0.505175  ,  0.37748498,  0.1322995 , -0.380375  , -0.02