In [1]:
import pandas as pd
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import os
from sklearn.externals import joblib 
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, cross_val_predict, GridSearchCV



In [2]:
current_path = os.getcwd()

In [3]:
class sentiment_analysis:
    def data_preproc(self):
        
        #Loading the data
        data = open('/Users/seethu-8363/Downloads/amazon_reviews.csv').read()
        
        labels, texts = [], []
        for i, line in enumerate(data.split("\n")):
            content = line.split()
            labels.append(content[0])
            texts.append(" ".join(content[1:]))
        # create a dataframe using texts and lables
        trainDF = pd.DataFrame()
        trainDF['text'] = texts
        trainDF['label'] = labels
        
        #lower case
        trainDF["text"] = trainDF["text"].apply(lambda x: " ".join(x.lower() for x in x.split()))
        trainDF["text"].head()
        
        #STOP WORDS
        #Removal of stop words
        stop = stopwords.words('english')
        ##Creating a list of custom stopwords
        new_words = ["using", "show", "result", "large", "also", "iv", "one", "two", "new", "previously", "shown","way","ever",
             "us","up","cd","that","all","second"]
        stop.append(new_words)
        trainDF["text"] = trainDF["text"].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
        
        
        #Removal of punctautaion 
        trainDF["text"] = trainDF["text"].str.replace('[^\w\s]','  ')

        
        # split the dataset into training and validation datasets 
        train_x, valid_x, train_y, valid_y = model_selection.train_test_split(trainDF['text'], trainDF['label'],test_size = 0.8)
        
        # label encode the target variable 
        encoder = preprocessing.LabelEncoder()
        train_y = encoder.fit_transform(train_y)
        valid_y = encoder.fit_transform(valid_y)
        
    
        return train_x, train_y, valid_x, valid_y
        
    def train_model_pipeline(self, classifier, train_x, label , valid_x , valid_y):
            # fit the training dataset on the classifier
            
            tfidf_vectorizer = TfidfVectorizer()
            nb = classifier
            #creating pipeline for model mand vectoriser
            tfidf_nv_pipe = Pipeline([('tfidf', tfidf_vectorizer), ('nb', nb)])

            #Fitting the data
            tfidf_nv_pipe.fit(train_x,label)
    
            # predict the labels on validation dataset
            predictions = tfidf_nv_pipe.predict(valid_x)
    
            return metrics.f1_score(predictions, valid_y) , tfidf_nv_pipe
    
    def load_model(self, train_x , train_y , valid_x , valid_y):
        
        # Naive Bayes Bernoulli on Word Level TF IDF Vectors
        accuracy,model = self.train_model_pipeline(naive_bayes.BernoulliNB(), train_x, train_y, valid_x , valid_y)
     
        #pickle model
        joblib.dump(model,current_path+"/model/sentiment_analysis_naive_bayes_model.pkl")
        print("NB, WordLevel TF-IDF ACCURACY: ", accuracy)
        
        return model,accuracy
    
    def load_model1(self, train_x , train_y , valid_x , valid_y):
        
        # Naive Bayes Multinomial on Word Level TF IDF Vectors
        accuracy,model = self.train_model_pipeline(naive_bayes.MultinomialNB(), train_x.toarray(), train_y.toarray(), valid_x.toarray() , valid_y.toarray())
     
        #pickle model
        joblib.dump(model,current_path+"/model/sentiment_analysis_naive_bayes_model.pkl")
        print("NB, WordLevel TF-IDF ACCURACY: ", accuracy)
        
        return model,accuracy
    
    def load_model2(self, train_x , train_y , valid_x , valid_y):
        
        # Naive Bayes Gaussian on Word Level TF IDF Vectors
        accuracy,model = self.train_model_pipeline(naive_bayes.GaussianNB(), train_x, train_y, valid_x , valid_y)
        
        return accuracy
    
    def load_model3(self, train_x , train_y , valid_x , valid_y):
        
        # Linear SVM on Word Level TF IDF Vectors
        accuracy,model = self.train_model_pipeline(svm.LinearSVC(), train_x, train_y, valid_x , valid_y)
        
        return accuracy
    
    def load_model4(self, train_x , train_y , valid_x , valid_y):
        
        # Linear SVM on Word Level TF IDF Vectors
        accuracy,model = self.train_model_pipeline(svm.LinearSVC(), train_x, train_y, valid_x , valid_y)
        
        return accuracy
    
    
    def prediction(self,input_str):
            train_x , train_y , valid_x , valid_y = self.data_preproc()
            model,accuracy = self.load_model(train_x , train_y , valid_x , valid_y)
            print("NB, WordLevel TF-IDF ACCURACY: ", accuracy)
            
            predictions = model.predict([input_str])
            
            print("prediction:",predictions)
            #loaded_model = joblib.load("/Users/seethu-8363/Documents/Test/virenv1/model/sentiment_analysis_naive_bayes_model.pkl")

    def prediction1(self,input_str):
            train_x , train_y , valid_x , valid_y = self.data_preproc()
            accuracy = self.load_model1(train_x , train_y , valid_x , valid_y)
            print("NB, WordLevel TF-IDF ACCURACY: ", accuracy)
            #loaded_modl = joblib.load("/Users/seethu-8363/Documents/Test/virenv1/model/sentiment_analysis_naive_bayes_model.pkl")

            
    def prediction2(self,input_str):
            train_x , train_y , valid_x , valid_y = self.data_preproc()
            accuracy = self.load_model2(train_x , train_y , valid_x , valid_y)
            print("NB, WordLevel TF-IDF ACCURACY: ", accuracy)
            #loaded_model = joblib.load("/Users/seethu-8363/Documents/Test/virenv1/model/sentiment_analysis_naive_bayes_model.pkl")
            
            
            
    def prediction2(self,input_str):
            train_x , train_y , valid_x , valid_y = self.data_preproc()
            accuracy = self.load_model2(train_x , train_y , valid_x , valid_y)
            print("NB, WordLevel TF-IDF ACCURACY: ", accuracy)
            #loaded_model = joblib.load("/Users/seethu-8363/Documents/Test/virenv1/model/sentiment_analysis_naive_bayes_model.pkl")

        
    def prediction3(self,input_str):
            train_x , train_y , valid_x , valid_y = self.data_preproc()
            accuracy = self.load_model3(train_x , train_y , valid_x , valid_y)
            print("NB, WordLevel TF-IDF ACCURACY: ", accuracy)
            #loaded_model = joblib.load("/Users/seethu-8363/Documents/Test/virenv1/model/sentiment_analysis_naive_bayes_model.pkl")



In [21]:
#Bernoulli
obj = sentiment_analysis()
obj.prediction('good')

NB, WordLevel TF-IDF ACCURACY:  0.7881413293623933
NB, WordLevel TF-IDF ACCURACY:  0.7881413293623933
prediction: [0]


In [53]:
#Multinomail
obj = naive_bayes_sentiment()
obj.prediction1('good')

NB, WordLevel TF-IDF ACCURACY:  0.7868162692847125
NB, WordLevel TF-IDF ACCURACY:  (Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('nb',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_

In [46]:
#Gaussian
obj = naive_bayes_sentiment()
obj.prediction2('good')

TypeError: A sparse matrix was passed, but dense data is required. Use X.toarray() to convert to a dense numpy array.

In [7]:
#SVC
obj = naive_bayes_sentiment()
obj.prediction3('good')

NB, WordLevel TF-IDF ACCURACY:  0.8190211205261161
