### Bag of words model

In [48]:
# load all necessary libraries
import gensim
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
from sklearn.neighbors import KNeighborsClassifier
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
import pandas, xgboost, numpy, textblob, string
#from keras.preprocessing import text, sequence
#from keras import layers, models, optimizers


#### Let's build a basic bag of words model on three sample documents

In [2]:
documents = ["Gangs of Wasseypur is a great movie.", "The success of a movie depends on the performance of the actors.", "There are no new movies releasing this week."]
#documents = pd.read_csv(r"C:\Masters\Masters in Data Science\Research\EDA\Data\Revised Data\FinalData\Google_Cloud\2HVSingleTextClassification.txt", sep = "\t", names=["label", "message"])
print(documents)

['Gangs of Wasseypur is a great movie.', 'The success of a movie depends on the performance of the actors.', 'There are no new movies releasing this week.']


In [3]:
def preprocess(document):
    'changes document to lower case and removes stopwords'

    # change sentence to lower case
    document = document.lower()

    # tokenize into words
    words = word_tokenize(document)

    # remove stop words
    words = [word for word in words if word not in stopwords.words("english")]

    # join words to make sentence
    document = " ".join(words)
    
    return document

documents = [preprocess(document) for document in documents]
print(documents)


NameError: name 'word_tokenize' is not defined

#### Creating bag of words model using count vectorizer function

In [4]:
vectorizer = CountVectorizer()
bow_model = vectorizer.fit_transform(documents)
print(bow_model)  # returns the row number and column number of the cells which have 1 as value

  (0, 4)	1
  (0, 3)	1
  (0, 10)	1
  (0, 2)	1
  (1, 0)	1
  (1, 7)	1
  (1, 1)	1
  (1, 9)	1
  (1, 4)	1
  (2, 11)	1
  (2, 8)	1
  (2, 5)	1
  (2, 6)	1


In [5]:
# print the full sparse matrix
print(bow_model.toarray())

[[0 0 1 1 1 0 0 0 0 0 1 0]
 [1 1 0 0 1 0 0 1 0 1 0 0]
 [0 0 0 0 0 1 1 0 1 0 0 1]]


In [6]:
print(bow_model.shape)
print(vectorizer.get_feature_names())

(3, 12)
['actors', 'depends', 'gangs', 'great', 'movie', 'movies', 'new', 'performance', 'releasing', 'success', 'wasseypur', 'week']


### Let's create a bag of words model on the spam dataset.

In [51]:
# load data
discourseData = pd.read_csv(r"C:\Masters\Masters in Data Science\Research\EDA\Data\Revised Data\FinalData\Google_Cloud\2HVSingleTextClassification_v2.txt", sep = "\t", names=["label", "message"])
discourseData.head()

Unnamed: 0,label,message
0,Other,Life is a song - sing it. Life is a game - pla...
1,Love,Life is love - enjoy it.
2,Right_Conduct,"Do not use poisonous words against anyone, for..."
3,Other,The end of education is character
4,Other,"As worldly thoughts diminish, thoughts of God ..."


In [52]:
# extract the messages from the dataframe
messages = discourseData.message
labels = discourseData.label
#print(messages)
print(len(messages))
print(len(labels))

674
674


In [53]:
# convert messages into list
messages = [message for message in messages]
print(len(messages))

674


In [54]:
# preprocess messages using the preprocess function
messages = [preprocess(message) for message in messages]
print(len(messages))

674


In [55]:
# bag of words model
vectorizer = CountVectorizer()
bow_model = vectorizer.fit_transform(messages)
#print(bow_model.toarray())
print(bow_model.shape[0])

674


In [56]:
print(bow_model.shape)
print(vectorizer.get_feature_names())

(674, 4700)
['28', '30', '450', 'aathmashuddhi', 'aberrations', 'abhangs', 'abhayatwam', 'abhimanyu', 'abide', 'abilities', 'ability', 'able', 'abode', 'abomination', 'abounding', 'absence', 'absent', 'absolute', 'absolutely', 'absorbed', 'abundant', 'accept', 'acceptable', 'accepted', 'accidents', 'accomplished', 'accomplishments', 'accord', 'according', 'accordingly', 'account', 'accumulated', 'acharyadevo', 'achieve', 'achieved', 'achievement', 'achievements', 'achieves', 'achieving', 'acknowledge', 'acquire', 'acquired', 'acquires', 'acquiring', 'acquisition', 'across', 'act', 'acted', 'acting', 'action', 'actions', 'active', 'actively', 'activities', 'activity', 'actors', 'acts', 'actually', 'add', 'added', 'addicted', 'adding', 'addition', 'additional', 'address', 'adharma', 'adhere', 'adherence', 'adheres', 'adhering', 'adolescence', 'adopted', 'adopting', 'adoration', 'adore', 'adored', 'adoring', 'adulthood', 'advancement', 'advantage', 'advent', 'adventure', 'advice', 'advisa

## Stemming and lemmatising

In [57]:
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

stemmer = PorterStemmer()
wordnet_lemmatizer = WordNetLemmatizer()

# add stemming and lemmatisation in the preprocess function
def preprocess(document, stem=True):
    'changes document to lower case and removes stopwords'

    # change sentence to lower case
    document = document.lower()

    # tokenize into words
    words = word_tokenize(document)

    # remove stop words
    words = [word for word in words if word not in stopwords.words("english") and re.sub(r'\b\w{1,3}\b', '', word)]
    
    if stem:
        words = [stemmer.stem(word) for word in words]
    else:
        words = [wordnet_lemmatizer.lemmatize(word, pos='v') for word in words]

    # join words to make sentence
    document = " ".join(words)
    
    return document

### Bag of words model on stemmed messages

In [58]:
## initialise the inbuilt Stemmer and the Lemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
stemmer = PorterStemmer()
wordnet_lemmatizer = WordNetLemmatizer()
import re

In [59]:
# stem messages
messages = [preprocess(message, stem=False) for message in discourseData.message]

# bag of words model
vectorizer = CountVectorizer()
count_vect = vectorizer.fit_transform(messages)
print(count_vect.shape[0])

674


In [60]:
# look at the dataframe
print(count_vect.shape[0])
pd.DataFrame(count_vect.toarray(), columns = vectorizer.get_feature_names())

674


Unnamed: 0,30,aathmashuddhi,aberrations,abhangs,abhayatwam,abhimanyu,abide,abilities,ability,able,...,youth,yuge,yukti,zenith,zero,zoroaster,ānanda,ātmanivedanam,ātmanivedanaṃ,īshwara
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [61]:
# token names
print(vectorizer.get_feature_names())

['30', 'aathmashuddhi', 'aberrations', 'abhangs', 'abhayatwam', 'abhimanyu', 'abide', 'abilities', 'ability', 'able', 'abomination', 'abound', 'absence', 'absent', 'absolute', 'absolutely', 'absorb', 'abundant', 'accept', 'acceptable', 'accidents', 'accomplish', 'accomplishments', 'accord', 'accordingly', 'account', 'accumulate', 'acharyadevo', 'achieve', 'achievement', 'achievements', 'acknowledge', 'acquire', 'acquisition', 'across', 'act', 'action', 'active', 'actively', 'activities', 'activity', 'actors', 'actually', 'add', 'addict', 'addition', 'additional', 'address', 'adharma', 'adhere', 'adherence', 'adolescence', 'adopt', 'adoration', 'adore', 'adulthood', 'advancement', 'advantage', 'advent', 'adventure', 'advice', 'advisable', 'advise', 'adviser', 'adwaitaṃ', 'afar', 'affair', 'affairs', 'affect', 'affection', 'affinity', 'affirm', 'afford', 'afraid', 'afterlife', 'afterwards', 'againt', 'age', 'aggravate', 'aggressive', 'agitate', 'agitation', 'ahamkāra', 'ahaṃkāraṃ', 'ahim

### Let's try lemmatizing the messages.

In [62]:
# lemmatise messages
messages = [preprocess(message, stem=False) for message in discourseData.message]



In [63]:
# look at the dataframe
pd.DataFrame(count_vect.toarray(), columns = vectorizer.get_feature_names())
# token names
print(vectorizer.get_feature_names())

['30', 'aathmashuddhi', 'aberrations', 'abhangs', 'abhayatwam', 'abhimanyu', 'abide', 'abilities', 'ability', 'able', 'abomination', 'abound', 'absence', 'absent', 'absolute', 'absolutely', 'absorb', 'abundant', 'accept', 'acceptable', 'accidents', 'accomplish', 'accomplishments', 'accord', 'accordingly', 'account', 'accumulate', 'acharyadevo', 'achieve', 'achievement', 'achievements', 'acknowledge', 'acquire', 'acquisition', 'across', 'act', 'action', 'active', 'actively', 'activities', 'activity', 'actors', 'actually', 'add', 'addict', 'addition', 'additional', 'address', 'adharma', 'adhere', 'adherence', 'adolescence', 'adopt', 'adoration', 'adore', 'adulthood', 'advancement', 'advantage', 'advent', 'adventure', 'advice', 'advisable', 'advise', 'adviser', 'adwaitaṃ', 'afar', 'affair', 'affairs', 'affect', 'affection', 'affinity', 'affirm', 'afford', 'afraid', 'afterlife', 'afterwards', 'againt', 'age', 'aggravate', 'aggressive', 'agitate', 'agitation', 'ahamkāra', 'ahaṃkāraṃ', 'ahim

In [64]:
## - shuffle the pack to create a random and unbiased split of the dataset
import random
from sklearn.utils import shuffle
discourseData = shuffle(discourseData)


In [65]:
## - creating slicing index at 80% threshold
sliceIndex = (int((len(discourseData)*.8)))

In [66]:
trainDF = pd.DataFrame()

In [67]:

trainDF['text'] = discourseData['message']
trainDF['label'] = discourseData['label']

trainDF, testDF = trainDF[:sliceIndex], trainDF[sliceIndex:]

In [68]:


#print(trainDF.head())

# split the dataset into training and validation datasets 
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(trainDF['text'], trainDF['label'],test_size = 0.20)

# label encode the target variable 
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.fit_transform(valid_y)

print(len(train_x))
print(len(valid_y))

431
108


In [69]:
print(valid_x)

526    Calling themselves Sai devotees or devotees of...
563                         Always Be Happy and Peaceful
208                                   Love bestows bliss
608    For want of these five human values, mankind i...
570    Do not bite more than you can chew. This maxim...
631    Buddha taught one great truth to the world. He...
611    Even in drinking water, you should observe res...
542    There should be some regulations with regard t...
308    Love is like a mariner’s compass, which always...
472    The meaning of Ahimsa is that either in though...
219    Another form of forbearance is love. Love is a...
395    Selfishness has grown and selflessness has dec...
427    As is the food, so is the head. As is the head...
269    Another special thing about repetition of the ...
38     Duty without love is deplorable. Duty with lov...
80     What do we lose by another's good fortune? Let...
572    Never leave the mind vacant.Empty mind is the ...
69     On previous occasions wh

In [70]:
# create a count vectorizer object - Bag of Words
vectorizer = CountVectorizer()
count_vect = vectorizer.fit(trainDF['text'])


# transform the training and validation data using count vectorizer object
xtrain_count =  count_vect.transform(train_x)
xvalid_count =  count_vect.transform(valid_x)

In [71]:
# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(trainDF['text'])
xtrain_tfidf =  tfidf_vect.transform(train_x)
xvalid_tfidf =  tfidf_vect.transform(valid_x)

# ngram level tf-idf 
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram.fit(trainDF['text'])
xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_x)
xvalid_tfidf_ngram =  tfidf_vect_ngram.transform(valid_x)

# characters level tf-idf
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram_chars.fit(trainDF['text'])
xtrain_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(train_x) 
xvalid_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(valid_x)

In [23]:
print(xtrain_tfidf_ngram_chars)

  (0, 4831)	0.045709611774955916
  (0, 4830)	0.04503177633901608
  (0, 4817)	0.017469726114326956
  (0, 4813)	0.01641875531506452
  (0, 4772)	0.026102702545710583
  (0, 4771)	0.026102702545710583
  (0, 4760)	0.021714243387528413
  (0, 4738)	0.012578604455520108
  (0, 4701)	0.04220538139719506
  (0, 4700)	0.03962772675335276
  (0, 4671)	0.01983187483604186
  (0, 4665)	0.09320073196417751
  (0, 4661)	0.08156270562027504
  (0, 4657)	0.08710182710181229
  (0, 4655)	0.06441398682502926
  (0, 4620)	0.04722409857640602
  (0, 4619)	0.04722409857640602
  (0, 4578)	0.06282365318237619
  (0, 4567)	0.034547234935911726
  (0, 4562)	0.019103784983731026
  (0, 4554)	0.05259070045499122
  (0, 4553)	0.03244696383830609
  (0, 4551)	0.028226357203785763
  (0, 4550)	0.07048711229787707
  (0, 4548)	0.03616135660348113
  :	:
  (430, 121)	0.02261035205577699
  (430, 120)	0.0154063756758395
  (430, 115)	0.02569993559320507
  (430, 114)	0.030388272156839247
  (430, 111)	0.0330551790038393
  (430, 110)	0.022698

In [72]:
# Utility function to train models

def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False):
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)
    
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)
    
    if is_neural_net:
        predictions = predictions.argmax(axis=-1)
    
    return metrics.accuracy_score(predictions, valid_y)

In [30]:
#Naive Bayes on Count Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_count, train_y, xvalid_count)
print ("NB, Count Vectors: ", accuracy)

# Naive Bayes on Word Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf, train_y, xvalid_tfidf)
print ("NB, WordLevel TF-IDF: ", accuracy)

# Naive Bayes on Ngram Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print ("NB, N-Gram Vectors: ", accuracy)

# Naive Bayes on Character Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars)
print ("NB, CharLevel Vectors: ", accuracy)

NB, Count Vectors:  0.7222222222222222
NB, WordLevel TF-IDF:  0.49074074074074076
NB, N-Gram Vectors:  0.5648148148148148
NB, CharLevel Vectors:  0.4444444444444444


In [31]:
# Linear Classifier on Count Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_count, train_y, xvalid_count)
print ("LR, Count Vectors: ", accuracy)

# Linear Classifier on Word Level TF IDF Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf, train_y, xvalid_tfidf)
print ("LR, WordLevel TF-IDF: ", accuracy)

# Linear Classifier on Ngram Level TF IDF Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print ("LR, N-Gram Vectors: ", accuracy)

# Linear Classifier on Character Level TF IDF Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars)
print ("LR, CharLevel Vectors: ", accuracy)

LR, Count Vectors:  0.8240740740740741
LR, WordLevel TF-IDF:  0.7870370370370371
LR, N-Gram Vectors:  0.6018518518518519
LR, CharLevel Vectors:  0.75




In [32]:
accuracy = train_model(svm.SVC(), xtrain_count, train_y, xvalid_count)
print ("Count Vectors: ", accuracy)

accuracy = train_model(svm.SVC(), xtrain_tfidf, train_y, xvalid_tfidf)
print ("SVM, WordLevel TF-IDF: ", accuracy)

accuracy = train_model(svm.SVC(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print ("SVM, N-Gram Vectors: ", accuracy)

accuracy = train_model(svm.SVC(), xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars)
print ("CharLevel Vectors: ", accuracy)



Count Vectors:  0.4444444444444444
SVM, WordLevel TF-IDF:  0.4166666666666667
SVM, N-Gram Vectors:  0.4166666666666667
CharLevel Vectors:  0.4166666666666667


In [33]:
# RF on Count Vectors
accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_count, train_y, xvalid_count)
print ("RF, Count Vectors: ", accuracy)

# RF on Word Level TF IDF Vectors
accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_tfidf, train_y, xvalid_tfidf)
print ("RF, WordLevel TF-IDF: ", accuracy)

# RF on Word Level TF IDF Vectors - n grams
accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print ("RF, WordLevel TF-IDF n gram : ", accuracy)

# RF on Character Level TF IDF Vectors - n grams
accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars)
print ("RF, CharLevel TF-IDF: ", accuracy)


RF, Count Vectors:  0.7037037037037037
RF, WordLevel TF-IDF:  0.7129629629629629
RF, WordLevel TF-IDF n gram :  0.6296296296296297
RF, CharLevel TF-IDF:  0.7222222222222222




In [34]:
# Extereme Gradient Boosting on Count Vectors
accuracy = train_model(xgboost.XGBClassifier(), xtrain_count.tocsc(), train_y, xvalid_count.tocsc())
print ("Xgb, Count Vectors: ", accuracy)

# Extereme Gradient Boosting on Word Level TF IDF Vectors
accuracy = train_model(xgboost.XGBClassifier(), xtrain_tfidf.tocsc(), train_y, xvalid_tfidf.tocsc())
print ("Xgb, WordLevel TF-IDF: ", accuracy)

# Extereme Gradient Boosting on Word Level TF IDF Vectors n grams
accuracy = train_model(xgboost.XGBClassifier(), xtrain_tfidf_ngram.tocsc(), train_y, xvalid_tfidf_ngram.tocsc())
print ("Xgb, WordLevel n grams TF-IDF: ", accuracy)

# Extereme Gradient Boosting on Character Level TF IDF Vectors
accuracy = train_model(xgboost.XGBClassifier(), xtrain_tfidf_ngram_chars.tocsc(), train_y, xvalid_tfidf_ngram_chars.tocsc())
print ("Xgb, CharLevel Vectors: ", accuracy)

Xgb, Count Vectors:  0.8425925925925926
Xgb, WordLevel TF-IDF:  0.8703703703703703
Xgb, WordLevel n grams TF-IDF:  0.7222222222222222
Xgb, CharLevel Vectors:  0.7870370370370371


In [73]:
# https://towardsdatascience.com/k-nearest-neighbor-python-2fccc47d2a55
#knn = KNeighborsClassifier(n_neighbors=5, metric='euclidean')
#knn.fit(X_train, y_train)
#sns.scatterplot(
#    x='mean area',
#    y='mean compactness',
#    hue='benign',
#    data=X_test.join(y_test, how='outer')
#)
#
#plt.scatter(
#    X_test['mean area'],
#    X_test['mean compactness'],
#    c=y_pred,
#    cmap='coolwarm',
#    alpha=0.7
#)
#confusion_matrix(y_test, y_pred)
#https://www.ritchieng.com/machine-learning-k-nearest-neighbors-knn/

#kNN

#knn = KNeighborsClassifier(n_neighbors=5)
#knn.fit(xtrain_count, train_y)
#y_pred = knn.predict(xvalid_count)
#print(metrics.accuracy_score(xvalid_count, y_pred))

# kNN on Count Vectors
accuracy = train_model(KNeighborsClassifier(n_neighbors=5), xtrain_count.tocsc(), train_y, xvalid_count.tocsc())
print ("kNN, Count Vectors: ", accuracy)

# kNN on Word Level TF IDF Vectors
accuracy = train_model(KNeighborsClassifier(n_neighbors=5), xtrain_tfidf.tocsc(), train_y, xvalid_tfidf.tocsc())
print ("kNN, WordLevel TF-IDF: ", accuracy)

# kNN on Word Level TF IDF Vectors n grams
accuracy = train_model(KNeighborsClassifier(n_neighbors=5), xtrain_tfidf_ngram.tocsc(), train_y, xvalid_tfidf_ngram.tocsc())
print ("kNN, WordLevel n grams TF-IDF: ", accuracy)

# kNN on Character Level TF IDF Vectors
accuracy = train_model(KNeighborsClassifier(n_neighbors=5), xtrain_tfidf_ngram_chars.tocsc(), train_y, xvalid_tfidf_ngram_chars.tocsc())
print ("kNN, CharLevel Vectors: ", accuracy)


kNN, Count Vectors:  0.6203703703703703
kNN, WordLevel TF-IDF:  0.6203703703703703
kNN, WordLevel n grams TF-IDF:  0.5
kNN, CharLevel Vectors:  0.5555555555555556


In [74]:
#Finding N for Count Vectors
#https://www.ritchieng.com/machine-learning-k-nearest-neighbors-knn/
# try K=1 through K=25 and record testing accuracy
k_range = range(1, 26)
# We can create Python dictionary using [] or dict()
scores = []
# We use a loop through the range 1 to 26
# We append the scores in the dictionary
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(xtrain_count, train_y)
    y_pred = knn.predict(xvalid_count)
    scores.append(metrics.accuracy_score(valid_y, y_pred))
print(scores)

[0.6111111111111112, 0.5925925925925926, 0.6018518518518519, 0.6388888888888888, 0.6203703703703703, 0.5740740740740741, 0.5648148148148148, 0.5555555555555556, 0.5648148148148148, 0.5648148148148148, 0.5462962962962963, 0.5648148148148148, 0.5740740740740741, 0.5648148148148148, 0.5462962962962963, 0.5648148148148148, 0.5648148148148148, 0.5462962962962963, 0.5370370370370371, 0.5277777777777778, 0.5185185185185185, 0.5277777777777778, 0.5462962962962963, 0.5277777777777778, 0.5555555555555556]


In [76]:
#Finding N for TF-IDF
#https://www.ritchieng.com/machine-learning-k-nearest-neighbors-knn/
# try K=1 through K=25 and record testing accuracy
k_range = range(1, 26)
# We can create Python dictionary using [] or dict()
scores = []
# We use a loop through the range 1 to 26
# We append the scores in the dictionary
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(xtrain_tfidf, train_y)
    y_pred = knn.predict(xvalid_tfidf)
    scores.append(metrics.accuracy_score(valid_y, y_pred))
print(scores)

[0.5833333333333334, 0.5555555555555556, 0.6296296296296297, 0.6296296296296297, 0.6203703703703703, 0.5925925925925926, 0.6018518518518519, 0.6018518518518519, 0.6111111111111112, 0.6296296296296297, 0.6203703703703703, 0.6018518518518519, 0.6111111111111112, 0.6018518518518519, 0.5833333333333334, 0.6203703703703703, 0.6018518518518519, 0.6018518518518519, 0.5925925925925926, 0.6018518518518519, 0.5833333333333334, 0.5925925925925926, 0.5925925925925926, 0.5833333333333334, 0.5925925925925926]


In [77]:
#Finding N for TF-IDF n-grams
#https://www.ritchieng.com/machine-learning-k-nearest-neighbors-knn/
# try K=1 through K=25 and record testing accuracy
k_range = range(1, 26)
# We can create Python dictionary using [] or dict()
scores = []
# We use a loop through the range 1 to 26
# We append the scores in the dictionary
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(xtrain_tfidf_ngram, train_y)
    y_pred = knn.predict(xvalid_tfidf_ngram)
    scores.append(metrics.accuracy_score(valid_y, y_pred))
print(scores)

[0.42592592592592593, 0.49074074074074076, 0.48148148148148145, 0.37962962962962965, 0.37037037037037035, 0.42592592592592593, 0.4166666666666667, 0.4537037037037037, 0.4722222222222222, 0.6111111111111112, 0.5925925925925926, 0.5925925925925926, 0.6203703703703703, 0.6111111111111112, 0.5925925925925926, 0.6388888888888888, 0.6296296296296297, 0.6203703703703703, 0.6388888888888888, 0.6018518518518519, 0.6018518518518519, 0.6388888888888888, 0.6296296296296297, 0.6388888888888888, 0.6018518518518519]


In [78]:
#Finding N for TF-IDF n-grams Char
#https://www.ritchieng.com/machine-learning-k-nearest-neighbors-knn/
# try K=1 through K=25 and record testing accuracy
k_range = range(1, 26)
# We can create Python dictionary using [] or dict()
scores = []
# We use a loop through the range 1 to 26
# We append the scores in the dictionary
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(xtrain_tfidf_ngram_chars, train_y)
    y_pred = knn.predict(xvalid_tfidf_ngram_chars)
    scores.append(metrics.accuracy_score(valid_y, y_pred))
print(scores)

[0.6018518518518519, 0.5740740740740741, 0.5648148148148148, 0.5833333333333334, 0.5555555555555556, 0.5555555555555556, 0.5462962962962963, 0.5462962962962963, 0.5555555555555556, 0.5277777777777778, 0.5555555555555556, 0.5462962962962963, 0.5462962962962963, 0.5462962962962963, 0.5462962962962963, 0.5370370370370371, 0.5277777777777778, 0.5370370370370371, 0.5370370370370371, 0.5462962962962963, 0.5462962962962963, 0.5462962962962963, 0.5462962962962963, 0.5462962962962963, 0.5833333333333334]


In [1]:
print(messages)

NameError: name 'messages' is not defined

In [83]:
#Refer to this later
#http://nadbordrozd.github.io/blog/2016/05/20/text-classification-with-word2vec/ 
#https://stackabuse.com/implementing-word2vec-with-gensim-library-in-python/
#https://towardsdatascience.com/word2vec-from-scratch-with-numpy-8786ddd49e72

#Use this https://towardsdatascience.com/nlp-performance-of-different-word-embeddings-on-text-classification-de648c6262b
#https://github.com/TomLin/Playground/blob/master/04-Model-Comparison-Word2vec-Doc2vec-TfIdfWeighted.ipynb

