In [1]:
import pandas as pd
from nltk.tokenize import TweetTokenizer
import sys, csv, nltk, re, string
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus   import stopwords

from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble
from sklearn.model_selection import cross_val_score

from keras.preprocessing import text, sequence
from keras import layers, models, optimizers

Using TensorFlow backend.


In [2]:
data = pd.read_csv("D:/Msc/Semester 3/ML/DataFilesV1/train.txt", sep="\t")
data_test = pd.read_csv("D:/Msc/Semester 3/ML/DataFilesV1/test.txt", sep=",")

In [36]:
data_test.head()

Unnamed: 0,Index,Text
0,1,@Callisto1947 Can U Help?||More conservatives ...
1,2,"Just walked in to #Starbucks and asked for a ""..."
2,3,#NOT GONNA WIN http://t.co/Mc9ebqjAqj
3,4,@mickymantell He is exactly that sort of perso...
4,5,So much #sarcasm at work mate 10/10 #boring 10...


In [18]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ians\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

In [5]:
data.count()

Tweet index    3817
Label          3817
Tweet text     3817
dtype: int64

In [3]:
sw            = stopwords.words('english')
lemma         = WordNetLemmatizer()
common_words  = []

In [4]:
def clean_and_tokanize(line, lemmatize = False):
    line   = re.sub(r'\$\w*', '', line)  # Remove tickers
    line   = re.sub(r'http?:.*$', '', line)
    line   = re.sub(r'https?:.*$', '', line)
    line   = re.sub(r'pic?.*\/\w*', '', line)
    line   = re.sub(r'[' + string.punctuation + ']+', ' ', line)  # Remove puncutations like 's
    
    tokens = TweetTokenizer(strip_handles=True, reduce_len=True).tokenize(line)
    tokens = [w.lower() for w in tokens if w not in sw and len(w) > 2 and w.isalpha()]
    
    if lemmatize:
        tokens = [lemma.lemmatize(word) for word in tokens]
    return tokens
    

In [5]:
tweets = data['Tweet text']
labels = data['Label']

In [19]:
cleaned_tweets = []
cleaned_tweets2 = []
for tweet in tweets:
    t = clean_and_tokanize(tweet, True)
    cleaned_tweets.append(t)
    cleaned_tweets2.append(" ".join(t))
    #print (t)

In [20]:
cleaned_tweets_test = []
for tweet in data_test['Text']:
    t = clean_and_tokanize(tweet, True)
    cleaned_tweets_test.append(" ".join(t))

In [21]:
corpus = []
for tweet in cleaned_tweets:
    for word in tweet:
        corpus.append(word)

In [22]:
#frequency_dist = nltk.FreqDist(corpus)
#sorted(frequency_dist,key=frequency_dist.__getitem__, reverse=True)[0:50]
#frequency_dist.plot(10,cumulative=False)



In [23]:
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(cleaned_tweets2, data['Label'], test_size=0.1)

print("Train set: ", len(train_x), "   Test set: ", len(valid_x))

Train set:  3435    Test set:  382


In [24]:
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
#count_vect.fit(data['Tweet text'])
count_vect.fit(cleaned_tweets2 + cleaned_tweets_test)

# transform the training and validation data using count vectorizer object
xtrain_count =  count_vect.transform(train_x)
xvalid_count =  count_vect.transform(valid_x)

xtest_count =  count_vect.transform(cleaned_tweets_test)


In [25]:
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(cleaned_tweets2 + cleaned_tweets_test)
xtrain_tfidf =  tfidf_vect.transform(train_x)
xvalid_tfidf =  tfidf_vect.transform(valid_x)

# ngram level tf-idf 
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram.fit(cleaned_tweets2 + cleaned_tweets_test)
xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_x)
xvalid_tfidf_ngram =  tfidf_vect_ngram.transform(valid_x)

# characters level tf-idf
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram_chars.fit(cleaned_tweets2 + cleaned_tweets_test)
xtrain_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(train_x) 
xvalid_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(valid_x)

xvalid_tfidf_ngram_chars_test =  tfidf_vect_ngram_chars.transform(cleaned_tweets_test)

In [61]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

count_vect = CountVectorizer(ngram_range=(1,3), min_df=2)
tfidf_transformer = TfidfTransformer()

X_train_counts = count_vect.fit_transform(cleaned_tweets2)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

X_test_counts = count_vect.transform(cleaned_tweets_test)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

print(X_train_tfidf.shape, X_test_tfidf.shape)


(3817, 4400) (784, 4400)


In [64]:
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(X_train_tfidf, data['Label'], test_size=0.0)

In [37]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid, 
                is_neural_net=False, ep = 10, b_size = 10):
    # fit the training dataset on the classifier
    
    if is_neural_net:
        classifier.fit(feature_vector_train, label, epochs=ep, batch_size=b_size)
    else:
        classifier.fit(feature_vector_train, label)
    
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)  
    
    if is_neural_net:
        predictions = predictions.argmax(axis=-1)
    
    return metrics.accuracy_score(predictions, valid_y), classifier

In [39]:
accuracy, clf = train_model(naive_bayes.MultinomialNB(), train_x, train_y, valid_x)
print ("NB, Count Vectors: ", accuracy)

NB, Count Vectors:  0.7068062827225131


In [40]:
# Naive Bayes on Word Level TF IDF Vectors
accuracy, clf = train_model(naive_bayes.MultinomialNB(), train_x, train_y, valid_x)
print ("NB, WordLevel TF-IDF: ", accuracy)

NB, WordLevel TF-IDF:  0.7068062827225131


In [41]:
# Naive Bayes on Ngram Level TF IDF Vectors
accuracy, clf = train_model(naive_bayes.MultinomialNB(), train_x, train_y, valid_x)
print ("NB, N-Gram Vectors: ", accuracy)

NB, N-Gram Vectors:  0.7068062827225131


In [80]:
# Naive Bayes on Character Level TF IDF Vectors
accuracy, clf = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars)
print ("NB, CharLevel Vectors: ", accuracy)

NB, CharLevel Vectors:  0.7089005235602094


In [45]:
accuracy, clf = train_model(svm.SVC(C=1.0, kernel='linear', gamma= 2.1), train_x, train_y, valid_x)
print ("SVM, CharLevel Vectors: ", accuracy)

SVM, CharLevel Vectors:  0.8089005235602095


In [43]:
# RF on Word Level TF IDF Vectors
accuracy = train_model(ensemble.RandomForestClassifier(), train_x, train_y, valid_x)
print ("RF, WordLevel TF-IDF: ", accuracy)



RF, WordLevel TF-IDF:  (0.7774869109947644, RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))


In [44]:
accuracy = train_model(ensemble.RandomForestClassifier(), train_x, train_y, valid_x)
print ("RF, Count Vectors: ", accuracy)



RF, Count Vectors:  (0.806282722513089, RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))


In [117]:
accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars)
print ("RF, Count Vectors: ", accuracy)

RF, Count Vectors:  (0.7643979057591623, RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))




In [119]:
classifier = naive_bayes.MultinomialNB(alpha=2.0)
#classifier.fit(xtrain_count, train_y)
classifier.fit(xtrain_tfidf_ngram_chars, train_y)

# predict the labels on validation dataset
predictions = classifier.predict(xvalid_tfidf_ngram_chars_test)

In [65]:
classifier = svm.SVC(C=1.0, kernel='linear', gamma= 2.1)
classifier.fit(train_x, train_y)

# predict the labels on validation dataset
predictions = classifier.predict(X_test_tfidf)

In [66]:
df = pd.DataFrame(data_test['Index'], columns=["Index"])
df['Label'] =predictions
df.to_csv('D:/Msc/Semester 3/ML/DataFilesV1/submit.csv', index=False)



RF, Count Vectors:  (0.768586387434555, RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))




RF, WordLevel TF-IDF:  (0.7905759162303665, RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))


In [180]:
def create_model_architecture(input_size):
    # create input layer 
    input_layer = layers.Input((input_size, ), sparse=True)
    
    # create hidden layer
    hidden_layer = layers.Dense(50, activation="relu")(input_layer)
    hidden_layer2 = layers.Dense(100, activation="relu")(input_layer)
    
    # create output layer
    output_layer = layers.Dense(1, activation="softmax")(hidden_layer2)

    classifier = models.Model(inputs = input_layer, outputs = output_layer)
    classifier.compile(optimizer=optimizers.Adam(), loss='binary_crossentropy')
    return classifier

In [181]:
classifier = create_model_architecture(xtrain_tfidf.shape[1])

#classifier.fit(xtrain_tfidf_ngram, train_y, epochs=10, batch_size=32)

accuracy = train_model(classifier, xtrain_tfidf, train_y, xvalid_tfidf, is_neural_net=True, ep=10, b_size=10)
print ("NN, Ngram Level TF IDF Vectors",  accuracy)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
NN, Ngram Level TF IDF Vectors (0.5225130890052356, <keras.engine.training.Model object at 0x000001D672D8DF28>)


In [161]:
print(xtrain_tfidf_ngram.shape)
from keras.models import Sequential

model = Sequential()

from keras.layers import Dense

model.add(Dense(60, activation='relu', input_shape=(2862, 5000)))
model.add(Dense(2, activation='softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='sgd',
              metrics=['accuracy'])

model.fit(xtrain_tfidf_ngram, train_y, ep=5, b_size=32)
loss_and_metrics = model.evaluate(xvalid_tfidf_ngram, valid_y, batch_size=128)

(2862, 5000)


TypeError: Unrecognized keyword arguments: {'ep': 5, 'b_size': 32}