This Notebook Contains the code that analyzes the dataset for most commonly used spam words 

The following loads in the dataset of tweets

In [1]:
import pandas as pd
import numpy as np
from sklearn.externals import joblib
df = pd.read_table("results1.txt",sep=",")
df = df.drop(df.columns[df.columns.str.contains('unnamed',case = False)],axis = 1)
df.head()

Unnamed: 0,0,b'People should not have to worry about being shot while shopping or attending school. This is getting ridiculous. Co https://t.co/83LVF1URjk'
0,0,b'I Need To Go Shopping! I Need New Clothes '
1,0,b'@hawillisdc I am still shopping for a good s...
2,0,"b'Funny, you start charging a $1 deposit for s..."
3,0,b'Entre tapas &amp; tapas &amp; socos &amp; ch...
4,0,b'Google Shopping Via Image Optimization:Secre...


In [3]:
print(len(df))

13058


The bag of words method was implemented using the CountVectorizer method from SciKit Learn

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
count_vector = CountVectorizer(stop_words='english',min_df = 0.001)
df.columns = ['label', 'tweet']

The following sorts every word in tweets labeled spam by frequency. The most frequent words are giveaway(1587), away(1537), click(1428), crypotcurrency(1124), and win(903). 

In [17]:
isSpam = df['label'] == '1'
notSpam = df['label'] == '0'
spam = df[isSpam]
spam = spam['tweet']
spamwords = count_vector.fit_transform(spam)
sum_words = spamwords.sum(axis=0)
words_freq = [(word, sum_words[0, idx]) for word, idx in count_vector.vocabulary_.items()]
print(sorted(words_freq, key = lambda x: x[1], reverse=True))
print(len(sorted(words_freq, key = lambda x: x[1], reverse=True)))


[('https', 8333), ('giveaway', 1587), ('away', 1537), ('click', 1428), ('cryptocurrency', 1124), ('win', 903), ('free', 584), ('gt', 525), ('just', 505), ('enter', 498), ('kardashian', 462), ('bitcoin', 448), ('amp', 430), ('crypto', 354), ('kim', 344), ('news', 341), ('new', 339), ('nhttps', 335), ('join', 299), ('watch', 292), ('magazine', 271), ('celebrity', 257), ('people', 248), ('000', 248), ('celeb', 243), ('video', 235), ('celebs', 233), ('100', 232), ('time', 224), ('like', 223), ('live', 217), ('im', 217), ('blockchain', 217), ('check', 216), ('want', 210), ('gift', 206), ('btc', 199), ('youtube', 191), ('kimkardashian', 186), ('giving', 185), ('2018', 179), ('nclick', 176), ('airdrop', 175), ('entered', 173), ('love', 169), ('don', 165), ('day', 163), ('chance', 160), ('10', 157), ('snapchat', 156), ('nadd', 154), ('card', 153), ('dont', 151), ('ethereum', 151), ('big', 150), ('good', 147), ('come', 138), ('buy', 133), ('going', 132), ('great', 126), ('goal', 125), ('support

The following sorts every word in tweets not labeled spam by frequency. The most frequent words are kardashian(1045 appearances), kim(522), cryptocurrency(242), new(231) and like(204). 

In [18]:
ham = df[notSpam]
ham = ham['tweet']
hamwords = count_vector.fit_transform(ham)

sum_words = hamwords.sum(axis=0)
words_freq = [(word, sum_words[0, idx]) for word, idx in count_vector.vocabulary_.items()]
print(sorted(words_freq, key = lambda x: x[1], reverse=True))
print(len(sorted(words_freq, key = lambda x: x[1], reverse=True)))

[('https', 4510), ('kardashian', 1045), ('kim', 522), ('rt', 420), ('cryptocurrency', 242), ('new', 231), ('like', 204), ('just', 200), ('click', 182), ('nhttps', 176), ('shopping', 156), ('java', 154), ('khloe', 151), ('people', 149), ('kardashians', 143), ('data', 127), ('playing', 125), ('baby', 124), ('ai', 118), ('jenner', 112), ('love', 105), ('trump', 99), ('good', 98), ('amp', 97), ('make', 93), ('im', 92), ('bigdata', 91), ('kourtney', 90), ('time', 88), ('listen', 83), ('west', 81), ('president', 81), ('bitcoin', 81), ('know', 77), ('free', 77), ('look', 76), ('great', 76), ('machinelearning', 75), ('datascience', 75), ('news', 73), ('8rnluayhzd', 72), ('day', 71), ('come', 70), ('think', 69), ('north', 69), ('abdsc', 69), ('dont', 68), ('congress', 68), ('need', 67), ('house', 67), ('learning', 67), ('video', 65), ('today', 63), ('going', 63), ('year', 61), ('really', 61), ('watch', 61), ('kylie', 61), ('big', 59), ('facebook', 59), ('japan', 59), ('pink', 58), ('2018', 58),

In [8]:
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df['tweet'], df['label'], random_state=1)
training_data = count_vector.fit_transform(X_train)
testing_data = count_vector.transform(X_test)

In [6]:
print('Number of rows in the total set: {}'.format(df.shape[0]))
print('Number of rows in the training set: {}'.format(X_train.shape[0]))
print('Number of rows in the test set: {}'.format(X_test.shape[0]))

Number of rows in the total set: 13058
Number of rows in the training set: 9793
Number of rows in the test set: 3265


In [9]:
from sklearn import svm
print("SVM")
clf = svm.SVC(probability=True, C=1000)
clf.fit(training_data,y_train)
svmpredictions = clf.predict(testing_data)
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
print('Accuracy score: ', format(accuracy_score(y_test,svmpredictions )))
print('Precision score: ', format(precision_score(y_test,svmpredictions,average='micro' )))
print('Recall score: ', format(recall_score(y_test,svmpredictions,average='micro' )))
print('F1 score: ', format(f1_score(y_test,svmpredictions,average='micro')))

SVM
Accuracy score:  0.8637059724349158
Precision score:  0.8637059724349158
Recall score:  0.8637059724349158
F1 score:  0.8637059724349158


In [10]:
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(hidden_layer_sizes=(100,100,100), max_iter=500, alpha=0.0001,
                     solver='adam', verbose=10,  random_state=21,tol=0.000000001)
clf.fit(training_data,y_train)
predictions = clf.predict(testing_data)
print("Multi-Layer Perceptron")
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
print('Accuracy score: ', format(accuracy_score(y_test,predictions)))
print('Precision score: ', format(precision_score(y_test,predictions,average='micro')))
print('Recall score: ', format(recall_score(y_test,predictions,average='micro')))
print('F1 score: ', format(f1_score(y_test,predictions,average='micro')))

Iteration 1, loss = 0.60822418
Iteration 2, loss = 0.28317016
Iteration 3, loss = 0.21682388
Iteration 4, loss = 0.17481220
Iteration 5, loss = 0.14128008
Iteration 6, loss = 0.11182816
Iteration 7, loss = 0.09270734
Iteration 8, loss = 0.07989894
Iteration 9, loss = 0.07238263
Iteration 10, loss = 0.06618951
Iteration 11, loss = 0.06252192
Iteration 12, loss = 0.06240368
Iteration 13, loss = 0.05746110
Iteration 14, loss = 0.05629867
Iteration 15, loss = 0.05393966
Iteration 16, loss = 0.05351072
Iteration 17, loss = 0.05041549
Iteration 18, loss = 0.05032314
Iteration 19, loss = 0.04915702
Iteration 20, loss = 0.04875020
Iteration 21, loss = 0.04769428
Iteration 22, loss = 0.04738353
Iteration 23, loss = 0.04846534
Iteration 24, loss = 0.04737059
Iteration 25, loss = 0.04492462
Iteration 26, loss = 0.04526192
Iteration 27, loss = 0.04461115
Iteration 28, loss = 0.04484084
Iteration 29, loss = 0.04412003
Iteration 30, loss = 0.04404542
Iteration 31, loss = 0.04427466
Iteration 32, los

In [11]:
from sklearn.naive_bayes import MultinomialNB
naive_bayes = MultinomialNB()
naive_bayes.fit(training_data,y_train)
predictions = naive_bayes.predict(testing_data)
print("Naive Bayes")
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
print('Accuracy score: ', format(accuracy_score(y_test,predictions)))
print('Precision score: ', format(precision_score(y_test,predictions,average='micro')))
print('Recall score: ', format(recall_score(y_test,predictions,average='micro')))
print('F1 score: ', format(f1_score(y_test,predictions,average='micro')))

Naive Bayes
Accuracy score:  0.845635528330781
Precision score:  0.845635528330781
Recall score:  0.845635528330781
F1 score:  0.845635528330781


In [12]:
print("Logistic Regression")
from sklearn import linear_model
regr = linear_model.LogisticRegression(random_state=0, solver='sag')
regr.fit(training_data,y_train)
predictions = regr.predict(testing_data)
print('Accuracy score: ', format(accuracy_score(y_test,predictions)))
print('Precision score: ', format(precision_score(y_test,predictions,average='micro')))
print('Recall score: ', format(recall_score(y_test,predictions,average='micro')))
print('F1 score: ', format(f1_score(y_test,predictions,average='micro')))

Logistic Regression
Accuracy score:  0.8704441041347626
Precision score:  0.8704441041347626
Recall score:  0.8704441041347626
F1 score:  0.8704441041347626




In [13]:
from sklearn.ensemble import RandomForestClassifier
print("Random Forest")
clf = RandomForestClassifier(n_jobs=-1, random_state=0, criterion='gini')
clf.fit(training_data,y_train)
predictions = clf.predict(testing_data)
print('Accuracy score: ', format(accuracy_score(y_test,predictions)))
print('Precision score: ', format(precision_score(y_test,predictions,average='micro')))
print('Recall score: ', format(recall_score(y_test,predictions,average='micro')))
print('F1 score: ', format(f1_score(y_test,predictions,average='micro')))

Random Forest
Accuracy score:  0.8698315467075038
Precision score:  0.8698315467075038
Recall score:  0.8698315467075038
F1 score:  0.8698315467075038


In [14]:
print("Tree")
from sklearn import tree
clf = tree.DecisionTreeClassifier(criterion='gini', random_state=0)
clf.fit(training_data,y_train)
predictions = clf.predict(testing_data)
print('Accuracy score: ', format(accuracy_score(y_test,predictions)))
print('Precision score: ', format(precision_score(y_test,predictions,average='micro')))
print('Recall score: ', format(recall_score(y_test,predictions,average='micro')))
print('F1 score: ', format(f1_score(y_test,predictions,average='micro')))

Tree
Accuracy score:  0.8563552833078101
Precision score:  0.8563552833078101
Recall score:  0.8563552833078101
F1 score:  0.8563552833078102
