In [164]:
import gensim
import keras
from keras import utils
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
from keras.layers import Dense, Dropout, Embedding, LSTM
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
import multiprocessing
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import WordPunctTokenizer
import numpy as np
import os
from operator import getitem
import pandas as pd
import re
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [165]:
# Negation-bearing words (and the truncated stems a punctuation-splitting
# tokenizer leaves behind, e.g. "don" from "don't") that must NOT be treated
# as stop words, because they flip the sentiment of a tweet.
required = {
    "needn", "doesn", "won", "shan't", "don", "ain", "not", "didn", "hadn",
    "haven't", "couldn't", "wasn't", "aren't", "isn", "needn't", "aren",
    "wouldn", "shouldn", "hasn't", "shan", "no", "wasn", "nor", "hasn",
    "mightn", "doesn't", "against", "wouldn't", "couldn", "hadn't", "isn't",
    "mustn", "don't", "weren't", "haven", "mustn't", "shouldn't", "weren",
    "won't", "mightn't",
}

# English stop words from NLTK, minus the negations listed above.
stopwords_set = set(stopwords.words('english'))
stopwords_set.difference_update(required)
print(stopwords_set)

# English word list from NLTK; only referenced by the alternative
# (commented-out) filters in word_clean.
dictionary = set(nltk.corpus.words.words())

{'yours', 'should', 'while', 'such', 'doing', 'yourself', 'can', 'that', 'she', 'was', 'than', 'its', 'during', 'there', 'them', 't', 'ma', 'his', 'a', 'these', 'herself', 'other', 'again', 'about', 'which', 'each', 'once', "you'll", 'is', 'our', 'and', 'under', 'ours', 'most', 'some', 'whom', 'themselves', 'why', 'were', 'it', 'you', 'when', 'out', 'own', 'what', 'to', 'too', 'same', 'those', 'of', 'my', 'few', 'further', 'himself', 'hers', 'having', 'any', 'just', 'do', 'him', 'so', 'over', 'where', 'be', 'both', "that'll", 'above', 'had', "it's", 'did', 'will', 'being', 'before', 'your', 'm', 'y', 'only', 'now', 'off', 'an', 'from', 's', 'has', 'here', 'for', 'with', 'down', 'in', 'more', 'because', 'have', 'if', 'ourselves', 'all', 'the', "you'd", 'after', 'between', 'd', 'll', 'we', 'are', 'but', 'i', 'who', 'o', 'until', 'me', 'theirs', 'through', 'below', 've', 'this', 'myself', 'yourselves', 'how', 'he', 'am', "should've", "you've", "didn't", 'into', 'her', "she's", 'their', 'a

In [166]:
# Load the labelled training tweets. header=None because the CSV has no header
# row, so columns are addressed by position: later cells read column 0 as the
# tweet text and column 1 as the class label.
df = pd.read_csv("FinalTweetList_train.csv", header = None)

In [167]:
#downsample displeasure and miscellaneous to equalise classes

# Split the frame by class label (column 1).
compliment = df[df[1]=='compliment']
displeasure = df[df[1]=='displeasure']
miscellaneous = df[df[1]=='miscellaneous']
# train_test_split is used purely as a seeded random sampler: with an integer
# test_size the "test" part holds exactly len(compliment) rows, and the
# remainder ("train" part) is discarded.
_, displeasure = train_test_split(displeasure, test_size = len(compliment), random_state = 21)
_, miscellaneous = train_test_split(miscellaneous, test_size = len(compliment), random_state = 21)
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# with ignore_index=True gives the same stacked frame with a fresh 0..n-1 index
# (what append + reset_index(drop=True) produced before).
df = pd.concat([compliment, displeasure, miscellaneous], ignore_index = True)
print(df)

#stratified downsample

                                                      0              1
0     @TheOfficialSBI  @Indiapnb @IDBI_Bank @AxisBan...     compliment
1     First hand experience of the #CashRush process...     compliment
2     @ksmkkbookscom @HDFC_Bank yeah @AxisBank @Axis...     compliment
3     @narendramodi much of the issues will be sorte...     compliment
4     @AxisBankSupport Thank u for ur response right...     compliment
...                                                 ...            ...
4135  #Bankdeposits will spike: How Rs 500, Rs 1000 ...  miscellaneous
4136  @ICICIBank_Care got a call from +919821294665 ...  miscellaneous
4137  @keshri_niraj @AxisBankSupport Hi, could you p...  miscellaneous
4138  Nw banker will undrstnd wht a softwaredevolope...  miscellaneous
4139  @TheOfficialSBI @SBICard_Connect I have blocke...  miscellaneous

[4140 rows x 2 columns]


In [168]:
def processTweet(tweet):
    """Normalise the raw text of one tweet.

    Steps, in order: replace links with ``URL``, replace @mentions with
    ``AT_USER``, collapse whitespace runs to a single space, drop the ``#``
    from hashtags, strip leading/trailing quote characters, and lower-case.
    Because lower-casing happens last, the placeholders end up as ``url`` and
    ``at_user`` in the returned string.

    Args:
        tweet: the raw tweet text (str).

    Returns:
        The normalised, lower-cased tweet text.
    """
    # All patterns are raw strings: sequences such as "\." and "\s" inside a
    # normal string literal are invalid escapes and trigger a warning on
    # current Python versions.
    #Convert www.* or https?://* to URL
    tweet = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', 'URL', tweet)
    #Convert @username to AT_USER
    tweet = re.sub(r'@[^\s]+', 'AT_USER', tweet)
    #Remove additional white spaces
    tweet = re.sub(r'[\s]+', ' ', tweet)
    #Replace #word with word
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
    #trim surrounding single/double quotes (whitespace is not stripped here)
    tweet = tweet.strip('\'"')
    #Convert to lower case
    tweet = tweet.lower()
    return tweet

# Normalise every tweet in column 0. The whole column is assigned at once:
# element-wise chained assignment (df[0][i] = ...) triggers
# SettingWithCopyWarning and does not update the frame at all when pandas
# Copy-on-Write is active.
df[0] = df[0].map(processTweet)

print(df)

                                                      0              1
0     at_user at_user at_user at_user at_user at_use...     compliment
1     first hand experience of the cashrush process ...     compliment
2     at_user at_user yeah at_user at_user r the mos...     compliment
3     at_user much of the issues will be sorted if b...     compliment
4     at_user thank u for ur response right now. thi...     compliment
...                                                 ...            ...
4135  bankdeposits will spike: how rs 500, rs 1000 n...  miscellaneous
4136  at_user got a call from +919821294665 some ici...  miscellaneous
4137  at_user at_user hi, could you please elaborate...  miscellaneous
4138  nw banker will undrstnd wht a softwaredevolope...  miscellaneous
4139  at_user at_user i have blocked my debit card f...  miscellaneous

[4140 rows x 2 columns]


In [169]:
# Alias (same object, not a copy) of the balanced, normalised frame; used as
# the training data by the statements below.
train = df

def word_clean(words):
    """Join the tokens worth keeping into one space-separated string.

    A token is kept when it is purely alphabetic and is not in the
    module-level ``stopwords_set``.

    Args:
        words: iterable of string tokens.

    Returns:
        The surviving tokens joined by single spaces ('' if none survive).
    """
    # Other filters that were tried: alphabetic only, and additionally
    # requiring membership in `dictionary` (with or without stop-word removal).
    kept = []
    for token in words:
        if not token.isalpha():
            continue
        if token in stopwords_set:
            continue
        kept.append(token)
    return ' '.join(kept)
    
def clean_tweets(dataframe):
    """Tokenise and filter every tweet in column 0 of ``dataframe``.

    Args:
        dataframe: frame whose column 0 holds the tweet strings.

    Returns:
        A list with one cleaned string per row (see ``word_clean``), in row
        order.
    """
    # One tokenizer for the whole frame instead of a new one per row.
    tokenizer = WordPunctTokenizer()
    # Iterate over the column's values rather than looking up labels
    # 0..len-1, so frames without a default RangeIndex work as well.
    return [word_clean(tokenizer.tokenize(text)) for text in dataframe[0]]

# Labels come from column 1; the model inputs are the cleaned tweet strings.
y_train = train[1]
x_train = clean_tweets(train)
print(len(y_train))

4140


In [170]:
# Turn each cleaned tweet string into a list of tokens for Word2Vec.
x_train = list(map(gensim.utils.simple_preprocess, x_train))

#Continuous bag of words used (no `sg` argument is passed, so gensim's
#default training algorithm applies)
w2v_model = gensim.models.Word2Vec(
    min_count=1,      # keep every word, even if it occurs only once
    window=5,
    size=100,         # embedding dimensionality
    workers=multiprocessing.cpu_count(),
)
w2v_model.build_vocab(x_train)
w2v_model.train(
    x_train,
    total_examples=w2v_model.corpus_count,
    epochs=100,
)
print("Word2Vec trained")

# Back to one space-joined string per tweet, the form extract_feats expects.
x_train = [' '.join(tokens) for tokens in x_train]

Word2Vec trained


In [171]:
def word_feats(words):
    """Build the NLTK feature dict for one tokenised tweet.

    Each word known to ``w2v_model`` contributes one feature whose name is the
    word's embedding converted to a tuple of floats and whose value is True.

    Args:
        words: iterable of string tokens.

    Returns:
        dict mapping embedding tuples to True; empty if no token is in the
        Word2Vec vocabulary.
    """
    #downsample 88.5% train 67.4% test    upsample 80.5% train 74% test
    vectors = w2v_model.wv
    # Skip words the model has never seen: indexing `wv` with an unknown word
    # raises KeyError, which would abort feature extraction for a whole
    # dataset (test tweets routinely contain words absent from training).
    return {tuple(vectors[word].tolist()): True for word in words if word in vectors}
    
def extract_feats(xdataframe, ydataframe):
    """Pair each tweet's feature dict with its label.

    Args:
        xdataframe: sequence of cleaned tweet strings.
        ydataframe: sequence of labels, aligned with ``xdataframe`` by
            position.

    Returns:
        list of ``(feature_dict, label)`` tuples, one per tweet, in order.
    """
    # One tokenizer for the whole call instead of a new one per row.
    tokenizer = WordPunctTokenizer()
    # zip pairs texts and labels by position, so a label Series does not need
    # a 0..n-1 index for the lookup to line up with the list of texts.
    return [(word_feats(tokenizer.tokenize(text)), label) for text, label in zip(xdataframe, ydataframe)]

# (feature dict, label) pairs for every training tweet; this is the input
# handed to the Maxent classifier in the next cell.
train_set = extract_feats(x_train, y_train)

In [None]:
#Maxent train

# Use the first training algorithm NLTK lists for its maximum-entropy classifier.
algorithm = nltk.classify.MaxentClassifier.ALGORITHMS[0]
classifier = nltk.MaxentClassifier.train(
    train_set,
    algorithm=algorithm,
    max_iter=25,  # stop after 25 training iterations
)
classifier.show_most_informative_features(10)

  ==> Training (25 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -1.09861        0.333
             2          -1.09783        0.869
             3          -1.09705        0.869
             4          -1.09626        0.869
             5          -1.09548        0.869
             6          -1.09470        0.869
             7          -1.09392        0.869
             8          -1.09314        0.869
             9          -1.09237        0.869
            10          -1.09159        0.869
            11          -1.09081        0.869
            12          -1.09004        0.869
            13          -1.08926        0.869
            14          -1.08849        0.869
            15          -1.08772        0.869
            16          -1.08695        0.870
            17          -1.08617        0.870
            18          -1.08540        0.870
            19          -1.08463        0.870
  

In [None]:
#Maxent testing on all banks and performance index calculation from classifier predictions

# Weight of each predicted label in the performance index: a file whose tweets
# are all predicted "compliment" scores 100, all "displeasure" scores 0.
values = {
    "compliment": 1,
    "miscellaneous": 0.5,
    "displeasure": 0
}

# Evaluation summary per test file, keyed by the file name without extension.
results = {}

test_dir = './test'
for filename in sorted(os.listdir(test_dir)):
    try:
        test = pd.read_csv(os.path.join(test_dir, filename), header = None)
        # Whole-column assignment: element-wise chained assignment
        # (test[0][i] = ...) is not guaranteed to write through to the frame.
        test[0] = test[0].map(processTweet)

        x_test = clean_tweets(test)
        y_test = test[1]

        test_set = extract_feats(x_test, y_test)
        y_true = y_test
        y_pred = [classifier.classify(feats) for feats, _ in test_set]
        if not y_pred:
            raise ValueError("no tweets to classify")

        # Accumulate under a name that does not shadow the built-in sum().
        score_total = 0
        for label in y_pred:
            score_total += values[label]

        result = {}
        result['performance_index'] = score_total * 100 / len(y_pred)
        result['classification_report'] = classification_report(y_true, y_pred)
        result['tweets'] = len(y_pred)
        # splitext drops the extension whatever its length (slicing off the
        # last four characters only worked for names ending in ".csv").
        results[os.path.splitext(filename)[0]] = result
    except Exception as exc:
        # Stay best-effort (one unreadable file must not abort the run), but
        # report which file was skipped and why instead of hiding the failure.
        print("Skipping {}: {!r}".format(filename, exc))

# Largest files first; ties broken by higher performance index.
for name, result in sorted(results.items(), key = lambda item: (-item[1]['tweets'], -item[1]['performance_index'])):
    print((name + ": ").rjust(40) + ("{:.2f}".format(result['performance_index'])).rjust(6) + "% in " + str(result['tweets']).rjust(4) + " tweets")
    print(result['classification_report'])

In [None]:
# I give up (the Maxent cells above show no execution count, so they apparently never finished running).