In [1]:
import csv
from nltk.classify import SklearnClassifier
from random import shuffle
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score
import numpy as np
import nltk
from nltk.tokenize import word_tokenize

In [60]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

In [49]:
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.util import ngrams
import string

In [18]:
def load(name):
    with open(name) as f:
        read = csv.reader(f, delimiter='\t')
        next(read)
        for line in read:
            (ID, Text, Label) = convert1(line)
            rawData.append((ID, Text, Label))
            preprocessedData.append((ID, preProcess(Text), Label))

In [4]:
def splitting(ratio):
    samples=len(rawData)
    half= int(len(rawData)/2)
    trainingSamples = int((ratio*samples)/2)
    for (_, Text, Label) in rawData[:trainingSamples] + rawData[half:half+trainingSamples]:
        trainData.append((toFeatureVector(preProcess(Text)),Label))
    for (_, Text, Label) in rawData[trainingSamples:half] + rawData[half+trainingSamples:]:
        testData.append((toFeatureVector(preProcess(Text)),Label))

In [6]:
def convert1(reviewLine):
    s=""
    if reviewLine[1]=="__label1__":
        s = "fake"
    else: 
        s = "real"
    return (reviewLine[0], reviewLine[8], s)

In [51]:
table=str.maketrans({key: None for key in string.punctuation})
def preProcess(text):
    lemmatizer = WordNetLemmatizer()
    filtered_tokens=[]
    lemmatized_tokens = []
    stop_words = set(stopwords.words('english'))
    text = text.translate(table)
    for w in text.split(" "):
        if w not in stop_words:
            lemmatized_tokens.append(lemmatizer.lemmatize(w.lower()))
        filtered_tokens = [' '.join(l) for l in nltk.bigrams(lemmatized_tokens)] + lemmatized_tokens
    return filtered_tokens

In [77]:
featureDict = {}
def toFeatureVector(tokens):
    localDict = {}
    for token in tokens:
        if token not in featureDict:
            featureDict[token] = 1
        else:
            featureDict[token] = +1
   
        if token not in localDict:
            localDict[token] = 1
        else:
            localDict[token] = +1
    return localDict

In [67]:
def trainClassifier(trainData):
    pipeline =  Pipeline([('svc', LinearSVC(C=0.01))])
    return SklearnClassifier(pipeline).train(trainData)

In [68]:
def crossValidate(dataset, folds):
    shuffle(dataset)
    cv_results = []
    foldSize = int(len(dataset)/folds)
    for i in range(0,len(dataset),foldSize):
        classifier = trainClassifier(dataset[:i]+dataset[foldSize+i:])
        y_pred = predictLabels(dataset[i:i+foldSize],classifier)
        a = accuracy_score(list(map(lambda d : d[1], dataset[i:i+foldSize])), y_pred)
        (p,r,f,_) = precision_recall_fscore_support(list(map(lambda d : d[1], dataset[i:i+foldSize])), y_pred, average ='macro')
        cv_results.append((a,p,r,f))
    cv_results = (np.mean(np.array(cv_results),axis=0))
    return cv_results

In [69]:
def predictLabels(reviewSamples, classifier):
    return classifier.classify_many(map(lambda t: t[0], reviewSamples))

def predictLabel(reviewSample, classifier):
    return classifier.classify(toFeatureVector(preProcess(reviewSample)))

In [70]:
rawData = []          
preprocessedData = [] 
trainData = []      
testData = [] 
fakeLabel='fake'
realLabel='real'

In [90]:
name='amazon_reviews1.txt'

In [91]:
load(name)

In [92]:
splitting(0.8)

In [93]:
from sklearn.svm import LinearSVC

In [94]:
trainData

[({'i received': 1,
   'received battery': 1,
   'battery day': 1,
   'day charged': 1,
   'charged i': 1,
   'i tested': 2,
   'tested flashlight': 1,
   'flashlight hold': 1,
   'hold charge': 1,
   'charge expected': 1,
   'expected i': 1,
   'tested millimeter': 1,
   'millimeter charge': 1,
   'charge quality': 1,
   'quality i': 1,
   'i hoped': 1,
   'hoped i': 1,
   'i glad': 1,
   'glad i': 1,
   'i purchased': 1,
   'purchased battery': 1,
   'battery purchase': 1,
   'purchase soon': 1,
   'i': 6,
   'received': 1,
   'battery': 2,
   'day': 1,
   'charged': 1,
   'tested': 2,
   'flashlight': 1,
   'hold': 1,
   'charge': 2,
   'expected': 1,
   'millimeter': 1,
   'quality': 1,
   'hoped': 1,
   'glad': 1,
   'purchased': 1,
   'purchase': 1,
   'soon': 1},
  'fake'),
 ({'it seems': 1,
   'seems holding': 1,
   'holding really': 1,
   'really well': 1,
   'well husband': 1,
   'husband boot': 1,
   'boot ': 1,
   ' it': 1,
   'it make': 1,
   'make look': 1,
   'look like'

In [95]:
print(crossValidate(trainData,10))

[0.84762434 0.84339546 0.76092619 0.78478932]


In [98]:
print(crossValidate(testData,10))

[0.76135057 0.76096499 0.72104962 0.72982577]


In [99]:
classifier=trainClassifierlassifier(trainData)

In [100]:
classifier

<SklearnClassifier(Pipeline(memory=None,
         steps=[('svc',
                 LinearSVC(C=0.01, class_weight=None, dual=True,
                           fit_intercept=True, intercept_scaling=1,
                           loss='squared_hinge', max_iter=1000,
                           multi_class='ovr', penalty='l2', random_state=None,
                           tol=0.0001, verbose=0))],
         verbose=False))>

In [101]:
import pickle

In [104]:
pickle.dump(classifier,open('classifier.sav','wb'))