In [11]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer 
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix

In [12]:
class LemmaTokenizer:
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]

In [13]:
df = pd.read_csv("ra_data_classifier.csv", encoding= 'unicode_escape')

In [24]:
df['chunk'] = df['chunk'].apply(lambda x: re.sub(r"[^a-zA-Z0-9%\d\/\$\s]+", "", x))
df['chunk']

0                                Landmark Center 8th Fl
1     Contact The C3 team at MakemeC3cicus  Addition...
2     A powerful tool for developers the MySQL Datab...
3                   Easy access to T Hubway and parking
4                         Check out our Private Offices
                            ...                        
95                      Cambridge Coworking CommunityC3
96    Vibrant neighborhood community  restaurants sh...
97    Rates Geek Premium dedicated workspace for $47...
98    The reshaping of the American workforce initia...
99    Usermin is webmail and much more Usermin provi...
Name: chunk, Length: 100, dtype: object

In [15]:
df['chunk'].isnull().sum()

0

In [16]:
dataX = df['chunk'].values.tolist()

### Convert document into word vector 
* stop words are removed 
* lemmanize the words 
* Get the token
* find TF-IDF 

In [19]:
vectorizer = TfidfVectorizer(tokenizer=LemmaTokenizer(), stop_words='english')

In [22]:
X = vectorizer.fit_transform(dataX)

In [94]:
data = X.toarray()

In [95]:
data.shape
type(data)

numpy.ndarray

In [96]:
df.columns

Index(['hid', 'chunk', 'has_space'], dtype='object')

In [97]:
labels = df['has_space'].values

In [98]:
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size = 0.2)

In [82]:
sum(y_test)

5

In [83]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix

### validating model using cross validation cv=5 


In [110]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate

In [116]:
naive_bayes = MultinomialNB()
scoring = ['precision', 'recall', 'accuracy']

In [117]:
score = cross_validate(naive_bayes, data, labels, cv=5, scoring=scoring)

In [119]:
score

{'fit_time': array([0.00099826, 0.        , 0.        , 0.        , 0.00099802]),
 'score_time': array([0.00099707, 0.00099778, 0.0009973 , 0.00199461, 0.00100255]),
 'test_precision': array([1.  , 1.  , 0.75, 1.  , 1.  ]),
 'test_recall': array([0.71428571, 0.57142857, 0.42857143, 0.71428571, 0.57142857]),
 'test_accuracy': array([0.9 , 0.85, 0.75, 0.9 , 0.85])}

In [121]:
print("Precision", sum(score['test_precision']/5))
print("Recall", sum(score['test_recall'])/5)
print("Accuracy", sum(score['test_accuracy'])/5)

Precision 0.95
Recall 0.6
Accuracy 0.85
