#   
<span style="font-family:Arial; font-weight:Bold; font-size:2.3em; color:#00b3e5;"> NLP Project: Text Classification (1)

#####      
<span style="font-family:Arial; font-weight:Bold; font-size:1.8em; color:#00b3e5;"> loading libraries  

In [1]:
import numpy  as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

import re, string, unicodedata
from   bs4 import BeautifulSoup

from   textblob  import TextBlob, Word
from   wordcloud import WordCloud, STOPWORDS
import spacy

from sklearn.svm           import SVC
from sklearn.naive_bayes   import MultinomialNB
from sklearn.preprocessing import LabelBinarizer
from sklearn.linear_model  import LogisticRegression, SGDClassifier
from sklearn               import naive_bayes, ensemble, decomposition
from sklearn.metrics       import classification_report, confusion_matrix, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [2]:
import nltk   # pip install --upgrade nltk

nltk.download('punkt') 
nltk.download('wordnet')
nltk.download('stopwords') 

from nltk.corpus          import stopwords
from nltk.stem.porter     import PorterStemmer
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.tokenize        import word_tokenize, sent_tokenize
from nltk.stem            import LancasterStemmer, WordNetLemmatizer

[nltk_data] Downloading package punkt to C:\Users\EZ-
[nltk_data]     Tech\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\EZ-
[nltk_data]     Tech\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\EZ-
[nltk_data]     Tech\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


#####      
<span style="font-family:Arial; font-weight:Bold; font-size:1.8em; color:#00b3e5;"> loading dataset

In [3]:
data = pd.read_csv('bbcText.csv',delimiter=',')
data.sample(3)

Unnamed: 0,category,text
95,sport,dent continues adelaide progress american tayl...
1490,entertainment,farrell due to make us tv debut actor colin fa...
2142,sport,rush future at chester uncertain ian rush s fu...


#####      
<span style="font-family:Arial; font-weight:Bold; font-size:1.8em; color:#00b3e5;"> EDA

In [4]:
data.shape

(2225, 2)

In [5]:
data.describe()

Unnamed: 0,category,text
count,2225,2225
unique,5,2126
top,sport,kennedy questions trust of blair lib dem leade...
freq,511,2


In [6]:
data = data.dropna()
data.isnull().sum()

category    0
text        0
dtype: int64

#####      
<span style="font-family:Arial; font-weight:Bold; font-size:1.8em; color:#00b3e5;"> NLP Preprocessing

In [7]:
# Tokenization of the text
tokenizers = ToktokTokenizer()

In [8]:
# Removing the noisy text
def noiseremoval_text(text):
    soup = BeautifulSoup(text,'html.parser')
    text = soup.get_text()
    text = re.sub( '\[[^]]*\]', '', text )
    return text

In [9]:
# Apply function on review column
data['text'] = data['text'].apply(noiseremoval_text)

In [10]:
data.head()

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


In [11]:
# Stemming
def stemmer(text):
    ps = PorterStemmer()
    text = ' '.join([ps.stem(word) for word in text.split()])
    return text

In [12]:
# Apply function on review column
data['text'] = data['text'].apply(stemmer)

In [13]:
data.head()

Unnamed: 0,category,text
0,tech,tv futur in the hand of viewer with home theat...
1,business,worldcom boss left book alon former worldcom b...
2,sport,tiger wari of farrel gambl leicest say they wi...
3,sport,yead face newcastl in fa cup premiership side ...
4,entertainment,ocean s twelv raid box offic ocean s twelv the...


In [14]:
# Setting English StopWords
stopwords  = set( nltk.corpus.stopwords.words('english') )

In [15]:
# removing the stopwords
def removing_stopwords( text, into_lower_case=False ):
    # tokenization of text
    tokenizers = ToktokTokenizer()
    # setting english stopwords
    tokens = tokenizers.tokenize( text )
    tokens = [i.strip() for i in tokens]
    if into_lower_case:
        filtokens = [i for i in tokens if i.lower() not in stopwords]
    else:
        filtokens = [i for i in tokens if i         not in stopwords]
    filtered_text = ' '.join(filtokens)
    return filtered_text

In [16]:
# Apply function on review column
data['text'] = data['text'].apply(removing_stopwords)

In [17]:
data.sample(5)

Unnamed: 0,category,text
655,entertainment,eminem secret gig venu reveal rapper eminem pl...
1641,sport,tulu appear caledonian run two-tim olymp 10 00...
2128,business,hous price suffer festiv fall uk hous price fe...
1013,sport,sullivan could run world sonia sullivan ha ind...
81,sport,coach ranieri sack valencia claudio ranieri ha...


#####      
<span style="font-family:Arial; font-weight:Bold; font-size:1.8em; color:#00b3e5;"> Train-Valid-Test Split (text)

In [18]:
# Percents
n  = data.shape[0]

Train_text = data.text[          :int(.8*n) ]
Valid_text = data.text[ int(.8*n):int(.9*n) ]
Test_text  = data.text[ int(.9*n):          ]

#####      
<span style="font-family:Arial; font-weight:Bold; font-size:1.8em; color:#00b3e5;"> Label Encoding

In [19]:
# Labeling the sentiment data
label = LabelBinarizer()

In [20]:
# transformed category data
Category_Value = label.fit_transform(data['category'])

In [21]:
print( Category_Value.shape )

(2225, 5)


In [22]:
Category_Value

array([[0, 0, 0, 0, 1],
       [1, 0, 0, 0, 0],
       [0, 0, 0, 1, 0],
       ...,
       [0, 1, 0, 0, 0],
       [0, 0, 1, 0, 0],
       [0, 0, 0, 1, 0]])

#####      
<span style="font-family:Arial; font-weight:Bold; font-size:1.8em; color:#00b3e5;"> Train-Valid-Test Split (Sentiment)

In [23]:
# Percents
n  = data.shape[0]

Train_Category = data.category[          :int(.8*n) ]
Valid_Category = data.category[ int(.8*n):int(.9*n) ]
Test_Category  = data.category[ int(.9*n):          ]

In [24]:
Test_Category.sample(3)

2058    entertainment
2094             tech
2019    entertainment
Name: category, dtype: object

#####      
<span style="font-family:Arial; font-weight:Bold; font-size:1.8em; color:#00b3e5;"> Bag of Words

In [25]:
#Count vectorizer for bag of words
cv=CountVectorizer( min_df=0, max_df=1, binary=False, ngram_range=(1,3) )

In [26]:
# transformed Data
CV_Train = cv.fit_transform( Train_text )
CV_Valid = cv.transform(     Valid_text )
CV_Test  = cv.transform(     Test_text  )

In [27]:
print('BOW CV_Train:', CV_Train.shape)
print('BOW CV_Valid:', CV_Valid.shape)
print('BOW CV_Test: ', CV_Test.shape )

BOW CV_Train: (1780, 570256)
BOW CV_Valid: (222, 570256)
BOW CV_Test:  (223, 570256)


#####      
<span style="font-family:Arial; font-weight:Bold; font-size:1.8em; color:#00b3e5;"> TF-IDF

In [28]:
# TF-IDF vectorizer  
tf = TfidfVectorizer(min_df=0,max_df=1,use_idf=True,ngram_range=(1,3))

In [29]:
# transformed Data
TF_Train = tf.fit_transform( Train_text )
TF_Valid = tf.transform(     Valid_text )
TF_Test  = tf.transform(     Test_text  )

In [30]:
print('BOW TF_Train:', TF_Train.shape)
print('BOW TF_Valid:', TF_Valid.shape)
print('BOW TF_Test: ', TF_Test.shape )

BOW TF_Train: (1780, 570256)
BOW TF_Valid: (222, 570256)
BOW TF_Test:  (223, 570256)


#####      
<span style="font-family:Arial; font-weight:Bold; font-size:1.8em; color:#00b3e5;"> Logistic Regression Model

In [31]:
model = LogisticRegression( penalty='l2', max_iter=500, C=1, random_state=7 )

<span style="font-family:Arial; font-weight:Bold; font-size:1.5em; color:#00b3e5;"> Model with BOW

In [32]:
# Fitting
BOW_Model = model.fit( CV_Train, Train_Category )
print(BOW_Model)

LogisticRegression(C=1, max_iter=500, random_state=7)


In [33]:
# Predicting
BOW_Pred = BOW_Model.predict( CV_Valid )
BOW_Pred

array(['tech', 'tech', 'tech', 'tech', 'tech', 'tech', 'tech', 'tech',
       'tech', 'tech', 'politics', 'tech', 'tech', 'entertainment',
       'tech', 'tech', 'tech', 'tech', 'sport', 'tech', 'tech', 'tech',
       'tech', 'tech', 'tech', 'tech', 'tech', 'tech', 'tech', 'tech',
       'tech', 'tech', 'business', 'tech', 'tech', 'tech', 'tech', 'tech',
       'tech', 'tech', 'tech', 'tech', 'tech', 'tech', 'politics', 'tech',
       'tech', 'tech', 'tech', 'tech', 'tech', 'tech', 'tech', 'tech',
       'tech', 'tech', 'tech', 'politics', 'tech', 'tech', 'tech', 'tech',
       'tech', 'tech', 'tech', 'business', 'tech', 'tech', 'tech', 'tech',
       'tech', 'politics', 'tech', 'entertainment', 'tech', 'tech',
       'tech', 'tech', 'tech', 'tech', 'tech', 'sport', 'tech', 'tech',
       'tech', 'entertainment', 'tech', 'tech', 'tech', 'tech', 'tech',
       'tech', 'tech', 'tech', 'entertainment', 'tech', 'tech', 'tech',
       'tech', 'tech', 'tech', 'tech', 'business', 'tech', 'tec

In [34]:
# Scoring (Accuracy)  
BOW_Score = accuracy_score( BOW_Pred, Valid_Category )
print("BOW_Score :",BOW_Score)

BOW_Score : 0.25675675675675674


<span style="font-family:Arial; font-weight:Bold; font-size:1.5em; color:#00b3e5;"> Model with TF-IDF

In [35]:
# Fitting
TF_Model = model.fit( TF_Train, Train_Category )
print(TF_Model)

LogisticRegression(C=1, max_iter=500, random_state=7)


In [36]:
# Predicting
TF_Pred = TF_Model.predict( TF_Valid )
TF_Pred

array(['sport', 'sport', 'sport', 'sport', 'sport', 'tech', 'business',
       'sport', 'business', 'business', 'politics', 'sport', 'business',
       'entertainment', 'sport', 'sport', 'business', 'business', 'sport',
       'sport', 'tech', 'business', 'sport', 'sport', 'business',
       'business', 'sport', 'sport', 'sport', 'business', 'business',
       'business', 'business', 'business', 'sport', 'business',
       'business', 'business', 'business', 'sport', 'sport', 'sport',
       'sport', 'business', 'politics', 'sport', 'sport', 'business',
       'tech', 'business', 'tech', 'sport', 'business', 'business',
       'business', 'sport', 'sport', 'politics', 'business', 'sport',
       'sport', 'sport', 'business', 'sport', 'business', 'business',
       'tech', 'politics', 'business', 'sport', 'sport', 'politics',
       'sport', 'entertainment', 'business', 'sport', 'sport', 'business',
       'sport', 'sport', 'sport', 'sport', 'sport', 'entertainment',
       'tech', 'ent

In [37]:
# Scoring (Accuracy)  
TF_Score = accuracy_score( TF_Pred, Valid_Category )
print("TF_Score :",TF_Score)

TF_Score : 0.6531531531531531


#####   

#####   