In [1]:
import sqlite3
import pandas as pd
import spacy
import re
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC,SVC

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score
from keras.callbacks import EarlyStopping, ModelCheckpoint
import datetime as dt
import numpy as np
import keras
from keras.models import Sequential 
from keras.preprocessing import sequence
from keras.initializers import he_normal
from keras.layers import BatchNormalization, Dense, Dropout, Flatten, LSTM
from keras.layers.embeddings import Embedding
from keras.regularizers import L1L2
from sklearn.preprocessing import LabelBinarizer
# Load the small English spaCy pipeline; preprocess() uses it for tokenisation and lemmas.
nlp = spacy.load('en_core_web_sm')

# Domain-specific stop words (airline names and the word "flight") added to the
# default stop-word set that preprocess() filters against.
# NOTE(review): "us airways" contains a space, so it presumably never equals a
# single token and has no effect - confirm.
nlp.Defaults.stop_words.update((
    "virginamerica",
    "united",
    "unite",
    "delta",
    "southwest",
    "american",
    "us airways",
    "indigoairline",
    "indigo",
    "flight",
))
# Matches one or more consecutive characters from the emoji code-point ranges below.
emoji_pattern = re.compile(
    "["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "]+",
    flags=re.UNICODE,
)

# Characters the token filter in preprocess() treats as punctuation ('.' is not listed).
punctuations = '!"#$%&\'()*+,-/:;<=>?@[\\]^_`{|}~'
def preprocess(tweet):
    """Turn one raw tweet into a space-separated string of filtered lemmas.

    Every character that is not an ASCII letter is replaced by a space, the
    text is lower-cased and run through the spaCy pipeline (parser and NER
    disabled). Each token's lemma is lower-cased and stripped; '-PRON-'
    lemmas, entries of ``nlp.Defaults.stop_words`` and anything contained in
    ``punctuations`` are dropped. The survivors are joined with single spaces
    and passed through ``emoji_pattern`` before being returned.
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', tweet).lower()
    doc = nlp(letters_only, disable=['parser', 'ner'])

    kept = []
    for token in doc:
        if token.lemma_ == '-PRON-':
            continue
        lemma = token.lemma_.lower().strip()
        # `in punctuations` is a substring test on a str, so an empty lemma
        # (e.g. from a whitespace token) is rejected here as well.
        if lemma in nlp.Defaults.stop_words or lemma in punctuations:
            continue
        kept.append(lemma)

    return emoji_pattern.sub(r'', ' '.join(kept))
   
    
    
    

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
import pickle
def training(x,y):
    """Fit a TF-IDF + linear-kernel SVC pipeline and pickle it.

    The data is split 80/20 (random_state=35). The confusion matrix,
    classification report and accuracy on the 20% hold-out are printed, and
    the fitted pipeline is written to 'model_svc.pkl'.

    Returns
    -------
    (pipeline, accuracy)
        The fitted Pipeline and its accuracy on the hold-out split.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.2, random_state=35
    )

    steps = [
        ('tfidf', TfidfVectorizer()),
        # probability=True so the fitted pipeline exposes predict_proba.
        ('clf', SVC(kernel='linear', probability=True)),
    ]
    pl = Pipeline(steps)
    pl.fit(x_train, y_train)

    predicts = pl.predict(x_test)
    print(confusion_matrix(y_test, predicts))
    print(classification_report(y_test, predicts))
    accuracy = accuracy_score(y_test, predicts)
    print("accuracy::", accuracy)

    with open('model_svc.pkl', 'wb') as model_file:
        pickle.dump(pl, model_file)
    return pl, accuracy

def data_gen(data):
    """Build the feature/label inputs for training from the tweets frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Must have the columns 'airline_sentiment' and 'text'.

    Returns
    -------
    (x, y, nclasses)
        x is the preprocessed 'text' column, y the integer-encoded
        'airline_sentiment' labels, nclasses the number of distinct labels.

    Side effects
    ------------
    * Writes the fitted LabelEncoder to 'label_svm.pkl'.
    * Overwrites ``data.text`` in place with the preprocessed text.
    * Prints the number of classes.
    """
    le = LabelEncoder()
    y = le.fit_transform(data['airline_sentiment'])

    # Use a context manager so the pickle file is flushed and closed
    # deterministically instead of relying on garbage collection.
    with open('label_svm.pkl', 'wb') as f:
        pickle.dump(le, f)

    data.text = data.text.apply(preprocess)

    nclasses = len(le.classes_)
    print("number of classes::", nclasses)
    x = data.text
    return x, y, nclasses

In [3]:
def predict(x):
    """Classify one raw text with the pickled SVC pipeline.

    The text is run through preprocess(), the pipeline is loaded from
    'model_svc.pkl', and the result of ``model.predict`` on the single cleaned
    document is returned (encoded labels, not label names).
    """
    cleaned = preprocess(x)
    with open('model_svc.pkl', 'rb') as model_file:
        model = pickle.load(model_file)
    return model.predict([cleaned])


In [4]:
def main():
    """Train both classifiers on Tweets.csv and save the more accurate one.

    Reads 'Tweets.csv', builds the inputs with data_gen(), trains the SVC
    pipeline with training() and the LSTM with lstm_training(), then pickles
    whichever reported the higher accuracy (the LSTM wins ties) to
    'best_model/model.pkl' and prints its name ("LSTM" or "SVM").

    NOTE(review): the two accuracies come from different hold-out splits
    (training() uses test_size=0.2, lstm_training() uses 0.1), so the
    comparison is only approximate.
    """
    import os  # local import: only needed to create the output directory

    data = pd.read_csv("Tweets.csv")
    print("done")
    x,y,nclasses=data_gen(data)
    print("done")
    svm_model,accuracy_svm=training(x,y)
    print("done")
    lstm_model,accuracy_lstm=lstm_training(x,data['airline_sentiment'])

    if accuracy_lstm>=accuracy_svm:
        best_model,best_name=lstm_model,"LSTM"
    else:
        best_model,best_name=svm_model,"SVM"

    # open() does not create missing directories; without this a fresh
    # checkout fails with FileNotFoundError after all training has finished.
    os.makedirs('best_model',exist_ok=True)
    with open('best_model/model.pkl','wb') as f:
        pickle.dump(best_model,f)
        print(best_name)
            
        
        
    
    


In [5]:
def find_word_index(row,word_index_dict):
    """Map each whitespace-separated word of `row` to its integer index.

    Parameters
    ----------
    row : str
        Text to encode; split with ``str.split()``.
    word_index_dict : dict
        Word -> index mapping.

    Returns
    -------
    list
        One entry per word, in order; words missing from `word_index_dict`
        are encoded as 0.
    """
    return [
        word_index_dict[word] if word in word_index_dict else 0
        for word in row.split()
    ]
# def lstm_training(x,y):
#     total_words = []

#     for sent in x:
#         words = sent.split()
#         total_words+=words
#     from collections import Counter
#     counter = Counter(total_words)
#     top_words_count = int(len(counter)/0.95)
#     sorted_words = counter.most_common(top_words_count)

#     word_index_dict = dict()
#     i = 1
#     for word,frequency in sorted_words:
#         word_index_dict[word] = i
#         i += 1
#     pickle.dump(word_index_dict,open('word_index_dict.pkl', 'wb'))
#     text=[]
#     for t in x:
#         text.append(find_word_index(t,word_index_dict))
#     label_binarizer = LabelBinarizer()
#     labels = label_binarizer.fit_transform(y)
#     pickle.dump(label_binarizer,open('label_lstm.pkl', 'wb'))
#     n_classes = len(label_binarizer.classes_)
#     x_train,x_test,y_train,y_test = train_test_split(text,labels,test_size=0.1,shuffle=True,random_state=35)
#     m=0
#     for ind in text:
#         i=len(ind)
#         m=max(m,i)
#     max_review_length = m

#     x_train = sequence.pad_sequences(x_train, maxlen=max_review_length)
#     x_test = sequence.pad_sequences(x_test, maxlen=max_review_length)
#     print("nclasses:",n_classes)
#     print("max length:",m)
#     vocab_size = len(counter.most_common()) + 1
#     model = Sequential()

# # Add Embedding Layer
#     model.add(Embedding(vocab_size, 32, input_length=max_review_length))

# # Add batch normalization
#     model.add(BatchNormalization())

# # Add dropout
#     model.add(Dropout(0.20))

# # Add LSTM Layer
#     model.add(LSTM(128,return_sequences=True))

#     model.add(LSTM(64))

# # Add dropout
#     model.add(Dropout(0.20))

# # Add Dense Layer
#     model.add(Dense(3, activation='softmax'))

# # Summary of the model
#     print("Model Summary: \n")
#     print(model.summary())



#     callbacks = [ModelCheckpoint(filepath='best_model.h5', monitor='val_loss', save_best_only=True)]

#     model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
#     early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=5, verbose=1,restore_best_weights=True)
    
#     results = model.fit(x_train, np.array(y_train), batch_size = 32, epochs = 1, verbose=2, validation_data=(x_test, y_test),callbacks=[early_stop])
#     test_scores = model.evaluate(x_test,y_test,verbose=1)
#     accuracy=test_scores[1]
#     predicts = model.predict(x_test)
    
# #     print("accuracy::",accuracy_score(y_test,predicts))
#     with open('model_lstm.pkl','wb') as f:
#         pickle.dump(model,f)
    
#     return model,accuracy
    

In [6]:
def find_word_index(row,word_index_dict):
    """Encode `row` as a list of word indices.

    `row` is split on whitespace; each word is replaced by its value in
    `word_index_dict`, or by 0 when the word is not a key of the mapping.
    """
    return [
        word_index_dict[token] if token in word_index_dict else 0
        for token in row.split()
    ]



def lstm_training(x,y):
    """Train an Embedding + stacked-LSTM classifier on preprocessed text.

    Parameters
    ----------
    x : iterable of str
        Documents to train on; each is tokenised with ``str.split()``.
    y : array-like
        One class label per document; one-hot encoded with LabelBinarizer.

    Returns
    -------
    (model, accuracy)
        The fitted Keras Sequential model and the accuracy reported by
        ``model.evaluate`` on the 10% hold-out split.

    Side effects
    ------------
    Writes 'label_lstm.pkl' (fitted LabelBinarizer), 'word_index_dict.pkl'
    (word -> index mapping) and 'model_lstm.pkl' (pickled model) to the
    working directory, and prints progress information.
    """
    from collections import Counter

    counter = Counter(word for sent in x for word in sent.split())
    # NOTE(review): dividing by 0.95 requests more entries than the counter
    # holds, so every word is kept. If "top 95%" was intended this should be a
    # multiplication; left unchanged so the vocabulary stays the same.
    top_words_count = int(len(counter)/0.95)
    sorted_words = counter.most_common(top_words_count)

    # Ranks start at 1: find_word_index() uses 0 for unknown words and
    # pad_sequences pads with 0.
    word_index_dict = {
        word: rank for rank, (word, _) in enumerate(sorted_words, start=1)
    }
    text = [find_word_index(t, word_index_dict) for t in x]

    label_binarizer = LabelBinarizer()
    labels = label_binarizer.fit_transform(y)
    n_classes = len(label_binarizer.classes_)

    # Context managers close the files deterministically.
    with open('label_lstm.pkl', 'wb') as f:
        pickle.dump(label_binarizer, f)
    with open('word_index_dict.pkl', 'wb') as f:
        pickle.dump(word_index_dict, f)

    x_train,x_test,y_train,y_test = train_test_split(text,labels,test_size=0.1,shuffle=True,random_state=35)

    # Longest encoded document; every sequence is padded to this length.
    max_review_length = max((len(seq) for seq in text), default=0)
    x_train = sequence.pad_sequences(x_train, maxlen=max_review_length)
    x_test = sequence.pad_sequences(x_test, maxlen=max_review_length)
    print("nclasses:",n_classes)
    print("max length:",max_review_length)

    # +1 because index 0 is reserved for padding / unknown words.
    vocab_size = len(counter) + 1

    model = Sequential()
    model.add(Embedding(vocab_size, 32, input_length=max_review_length))
    model.add(BatchNormalization())
    model.add(Dropout(0.20))
    model.add(LSTM(128,return_sequences=True))
    model.add(LSTM(64))
    model.add(Dropout(0.20))
    # One softmax unit per class (3 for negative / neutral / positive).
    model.add(Dense(n_classes, activation='softmax'))

    print("Model Summary: \n")
    # summary() prints by itself and returns None, so it is not wrapped in print().
    model.summary()

    # One-hot multi-class targets with a softmax output need categorical
    # cross-entropy. With 'binary_crossentropy' Keras resolves the 'accuracy'
    # metric to per-output binary accuracy, which overstates the score that is
    # returned and compared against the SVM.
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=5, verbose=1,restore_best_weights=True)

    # NOTE(review): the hold-out split doubles as validation data here, so the
    # reported accuracy is not from a strictly unseen set.
    model.fit(x_train, np.array(y_train), batch_size = 32, epochs = 1, verbose=2, validation_data=(x_test, y_test),callbacks=[early_stop])
    test_scores = model.evaluate(x_test,y_test,verbose=1)
    accuracy=test_scores[1]

    with open('model_lstm.pkl','wb') as f:
        pickle.dump(model,f)

    return model,accuracy

In [35]:
# Spot-check predict() on the single word "good"; the recorded result below is array([2]).
predict("good")

array([2])

In [36]:
# Spot-check predict() on the single word "bad"; the recorded result below is array([0]).
predict("bad")

array([0])

In [37]:
# Spot-check predict() on an informal, misspelled sentence; the recorded result below is array([2]).
predict("hlo man how youre doing")

array([2])

In [7]:
# Run the full training pipeline. The recorded run below printed one "done"
# and was then stopped with a KeyboardInterrupt.
main()

done


KeyboardInterrupt: 

In [47]:
def predict(x):
    """Run both classifiers on `x` and print the label with the higher score.

    predict_svm() and predict_lstm() each return a (label, score) pair. The
    SVM label is printed only when its score is strictly greater; otherwise
    the LSTM label is printed. Nothing is returned.
    """
    svm_label, svm_score = predict_svm(x)
    lstm_label, lstm_score = predict_lstm(x)
    print(svm_label if svm_score > lstm_score else lstm_label)

In [8]:

def predict_svm(x):
    """Classify one raw text with the pickled SVC pipeline.

    The text is run through preprocess(); the pipeline is loaded from
    'model_svc.pkl' and the label encoder from 'label_svm.pkl'. The predicted
    label is decoded with the encoder and printed after "output::".

    Returns
    -------
    (pred, score)
        The decoded label array and the largest value of
        ``model.predict_proba`` for the document.
    """
    cleaned = preprocess(x)
    with open('model_svc.pkl', 'rb') as model_file:
        model = pickle.load(model_file)
    with open('label_svm.pkl', 'rb') as encoder_file:
        encoder = pickle.load(encoder_file)

    batch = [cleaned]
    pred = encoder.inverse_transform(model.predict(batch))
    print("output::", pred)

    score = max(model.predict_proba(batch)[0])
    return pred, score
    

In [55]:
# Spot-check predict_svm() on the single word "good"; the recorded output below is 'positive' with a score of about 0.993.
predict_svm("good")

output:: ['positive']


(array(['positive'], dtype=object), 0.9928379904602651)

In [53]:
# NOTE(review): `w` is not defined in any visible cell - this looks like a stray
# keystroke and would presumably raise NameError on a fresh kernel; confirm.
w

In [54]:
# Spot-check predict_lstm() on the single word "good"; the recorded output below is 'positive' with a score of about 0.622.
predict_lstm("good")

[[0.14753886 0.23056807 0.6218931 ]]
['negative' 'neutral' 'positive']
output:: ['positive']


(array(['positive'], dtype='<U8'), 0.6218931)

In [11]:
from clean import preprocess
import pickle
import sqlite3
import pandas as pd
import spacy
import re
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score
from keras.callbacks import EarlyStopping, ModelCheckpoint
import datetime as dt
import numpy as np
import keras
from keras.models import Sequential 
from keras.preprocessing import sequence
from keras.initializers import he_normal
from keras.layers import BatchNormalization, Dense, Dropout, Flatten, LSTM
from keras.layers.embeddings import Embedding
from keras.regularizers import L1L2
from sklearn.preprocessing import LabelBinarizer
import warnings
def predict(x):
    """Print the label from whichever classifier reports the higher score.

    Calls predict_svm() and predict_lstm(), each returning (label, score).
    The SVM label is printed when its score is strictly greater than the LSTM
    score; in every other case the LSTM label is printed. Returns None.
    """
    svm_label, svm_score = predict_svm(x)
    lstm_label, lstm_score = predict_lstm(x)
    chosen = svm_label if svm_score > lstm_score else lstm_label
    print(chosen)



def predict_svm(x):
    """Predict the sentiment label of one raw text with the saved SVC pipeline.

    Loads the pipeline from 'model_svc.pkl' and the label encoder from
    'label_svm.pkl', classifies the preprocessed text, prints the decoded
    label after "output::" and returns it with the maximum class probability
    from ``predict_proba``.
    """
    cleaned = preprocess(x)

    with open('model_svc.pkl', 'rb') as model_file:
        model = pickle.load(model_file)
    with open('label_svm.pkl', 'rb') as encoder_file:
        encoder = pickle.load(encoder_file)

    documents = [cleaned]
    encoded = model.predict(documents)
    pred = encoder.inverse_transform(encoded)
    print("output::", pred)

    probabilities = model.predict_proba(documents)[0]
    return pred, max(probabilities)




def find_word_index(row,word_index_dict):
    """Translate the words of `row` into their indices.

    The text is split on whitespace. A word that is a key of
    `word_index_dict` becomes its mapped value; any other word becomes 0.
    The indices are returned as a list in word order.
    """
    return [
        word_index_dict[token] if token in word_index_dict else 0
        for token in row.split()
    ]
def predict_lstm(x):
    """Classify one raw text with the pickled LSTM model.

    The text is run through preprocess(), encoded with the word -> index
    mapping from 'word_index_dict.pkl', padded, and fed to the model loaded
    from 'model_lstm.pkl'. The class probabilities and the encoder's classes
    are printed, followed by the decoded label after "output::".

    Returns
    -------
    (pred, score)
        The label array decoded by the LabelBinarizer from 'label_lstm.pkl'
        and the largest predicted class probability.

    NOTE: the pickle files are assumed to be the ones written by this
    notebook; pickle.load must not be used on untrusted files.
    """
    x=preprocess(x)
    with open('model_lstm.pkl', 'rb') as f:
        model = pickle.load(f)
    with open('label_lstm.pkl', 'rb') as f:
        encoder= pickle.load(f)
    with open('word_index_dict.pkl', 'rb') as f:
        word_index_dict= pickle.load(f)

    x= find_word_index(x,word_index_dict)

    # Pad to the sequence length the network was built with instead of a fixed
    # number: the training length depends on the longest document in the data.
    # Fall back to 24 (the previously hard-coded value) when the model does not
    # expose a usable fixed length.
    input_shape = getattr(model, 'input_shape', None)
    if (isinstance(input_shape, tuple) and len(input_shape) > 1
            and isinstance(input_shape[1], int) and input_shape[1] > 0):
        maxlen = input_shape[1]
    else:
        maxlen = 24
    x=sequence.pad_sequences([x], maxlen=maxlen)

    pred_prob=model.predict(x)

    print(pred_prob)
    print(encoder.classes_)

    pred=encoder.inverse_transform(pred_prob)
    print("output::",pred)

    return pred,max(pred_prob[0])

In [13]:
# Spot-check predict_svm() on a full complaint sentence; the recorded output below is 'negative' with a score of about 0.953.
predict_svm("i was very frustrated with the service @indigo")

output:: ['negative']


(array(['negative'], dtype=object), 0.9529391252458301)