In [20]:
import pandas as pd
import re
import pickle
import joblib

import nltk
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix

In [2]:
data = pd.read_csv('train.csv')

In [3]:
data.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [4]:
data = data[:1000]
data.describe(exclude=None)


Unnamed: 0,id,label
count,1000.0,1000.0
mean,499.5,0.501
std,288.819436,0.500249
min,0.0,0.0
25%,249.75,0.0
50%,499.5,1.0
75%,749.25,1.0
max,999.0,1.0


In [5]:
def preprocess_data(sentence):
    #print(sentence)
    sentence = sentence.lower()
    tokens = sentence.split(' ')
    sentence = list()
    for word in tokens:
        word = re.sub("\W","",word)
        sentence.append(word)
    
    
    sentence = [word for word in sentence if word not in stop_words]
    sentence = [lemmatizer.lemmatize(word) for word in sentence]
    
    
    return ' '.join(sentence)
    

In [6]:
data = data.dropna()

In [7]:
data['preprocessed_text'] = data['text'].apply(preprocess_data)

In [8]:
data.head()

Unnamed: 0,id,title,author,text,label,preprocessed_text
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1,house dem aide didnt even see comeys letter ja...
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0,ever get feeling life circle roundabout rather...
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1,truth might get fired october 29 2016 tension ...
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1,video 15 civilian killed single u airstrike id...
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1,print iranian woman sentenced six year prison ...


In [9]:
cv = CountVectorizer()
X = cv.fit_transform(data['preprocessed_text'])

In [10]:
X_train, X_val, y_train, y_val = train_test_split(X, data['label'], test_size=0.20, random_state=42)

In [11]:
clf = MultinomialNB()
clf.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [12]:
y_pred = clf.predict(X_val)

In [13]:
print(confusion_matrix(y_val, y_pred))

[[93  4]
 [22 55]]


In [14]:
print(accuracy_score(y_val, y_pred))

0.8505747126436781


In [15]:
with open('pickled_model.sav', 'wb') as f:
    pickle.dump(clf, f)

In [16]:
with open('pickled_model.sav', 'rb') as f:
    model = pickle.load(f)

In [21]:
with open('joblib_model.sav', 'wb') as f:
    joblib.dump(clf, f)

In [22]:
with open('joblib_model.sav', 'rb') as f:
    model_joblib = joblib.load(f)

In [23]:
df = pd.read_csv('train.csv')
X_test, y_test = cv.transform([preprocess_data(df['text'][2000])]), df['label'][2000]

In [24]:
y_pred = model_joblib.predict(X_test)

In [25]:
y_pred

array([0])

In [26]:
with open('count_vectorizer.pickle', 'wb') as cv_file:
    pickle.dump(cv, cv_file)

In [27]:
with open('count_vectorizer.pickle', 'rb') as cv_file:
    vectorizer = pickle.load(cv_file)

In [28]:
X_test, y_test = cv.transform([preprocess_data(df['text'][3000])]), df['label'][3000]
X_test

<1x35707 sparse matrix of type '<class 'numpy.int64'>'
	with 118 stored elements in Compressed Sparse Row format>