In [1]:
import pandas as pd
import numpy as np
import nltk

from sklearn.metrics import confusion_matrix

# Read and Clean Data

In [2]:
input_df = pd.read_csv('train.csv')

In [3]:
input_df.head()

Unnamed: 0,Tweet,following,followers,actions,is_retweet,location,Type
0,Good Morning Love @LeeBrown_V,0.0,0.0,0.0,0.0,"Pennsylvania, USA",Quality
1,'@realDonaldTrump @USNavy RIP TO HEROES',42096.0,61060.0,5001.0,0.0,"South Padre Island, Texas",Spam
2,Haven't been following the news but I understa...,0.0,0.0,,0.0,Will never be broke ever again,Quality
3,pic.twitter.com/dy9q4ftLhZ What to do with pap...,0.0,0.0,0.0,0.0,Mundo,Quality
4,#DidYouKnow ► Mahatma Gandhi made a brief visi...,17800.0,35100.0,,0.0,"Nottingham, England",Quality


In [4]:
input_df.isnull().sum()

Tweet            0
following      158
followers       17
actions       3437
is_retweet       1
location      2011
Type             0
dtype: int64

In [5]:
input_df.drop(columns=['location', 'actions'], inplace=True)
input_df.dropna()

Unnamed: 0,Tweet,following,followers,is_retweet,Type
0,Good Morning Love @LeeBrown_V,0.0,0.0,0.0,Quality
1,'@realDonaldTrump @USNavy RIP TO HEROES',42096.0,61060.0,0.0,Spam
2,Haven't been following the news but I understa...,0.0,0.0,0.0,Quality
3,pic.twitter.com/dy9q4ftLhZ What to do with pap...,0.0,0.0,0.0,Quality
4,#DidYouKnow ► Mahatma Gandhi made a brief visi...,17800.0,35100.0,0.0,Quality
...,...,...,...,...,...
14894,"#AllWentWrongWhen I told my hair stylist to ""g...",695.0,533.0,1.0,Spam
14895,"They don't have to like you, and you don't hav...",0.0,0.0,0.0,Quality
14896,#Miami Graham Nash Live at Parker Playhouse #...,5647.0,15091.0,0.0,Spam
14897,@bethannhamilton is in the business of one-upp...,0.0,0.0,0.0,Quality


In [6]:
input_df.drop(input_df[(input_df['Type'] != 'Quality') & (input_df['Type'] != 'Spam')].index, inplace=True)
input_df.Type.unique()

array(['Quality', 'Spam'], dtype=object)

In [7]:
from sklearn import preprocessing

le = preprocessing.LabelEncoder()
le.fit(input_df.Type.unique().tolist())
print(list(le.classes_))
input_df['Type'] = le.transform(input_df['Type'])

['Quality', 'Spam']


In [8]:
input_df.head()

Unnamed: 0,Tweet,following,followers,is_retweet,Type
0,Good Morning Love @LeeBrown_V,0.0,0.0,0.0,0
1,'@realDonaldTrump @USNavy RIP TO HEROES',42096.0,61060.0,0.0,1
2,Haven't been following the news but I understa...,0.0,0.0,0.0,0
3,pic.twitter.com/dy9q4ftLhZ What to do with pap...,0.0,0.0,0.0,0
4,#DidYouKnow ► Mahatma Gandhi made a brief visi...,17800.0,35100.0,0.0,0


In [9]:
from nltk.corpus import stopwords

# Drop stopwords in tweets and convert the text to lowercase
filtered_text = []
for i in range(0,len(input_df)):
    word_list = str(input_df.Tweet.iloc[i]).split()
    filtered_words = [word for word in word_list if word not in stopwords.words('english')]
    filtered_text.append(' '.join(filtered_words))
input_df['filtered_text'] = filtered_text
input_df['filtered_text_lower'] = input_df['filtered_text'].str.lower()

In [10]:
input_df.head()

Unnamed: 0,Tweet,following,followers,is_retweet,Type,filtered_text,filtered_text_lower
0,Good Morning Love @LeeBrown_V,0.0,0.0,0.0,0,Good Morning Love @LeeBrown_V,good morning love @leebrown_v
1,'@realDonaldTrump @USNavy RIP TO HEROES',42096.0,61060.0,0.0,1,'@realDonaldTrump @USNavy RIP TO HEROES','@realdonaldtrump @usnavy rip to heroes'
2,Haven't been following the news but I understa...,0.0,0.0,0.0,0,Haven't following news I understand #EFF dumbe...,haven't following news i understand #eff dumbe...
3,pic.twitter.com/dy9q4ftLhZ What to do with pap...,0.0,0.0,0.0,0,pic.twitter.com/dy9q4ftLhZ What paper scissors...,pic.twitter.com/dy9q4ftlhz what paper scissors...
4,#DidYouKnow ► Mahatma Gandhi made a brief visi...,17800.0,35100.0,0.0,0,#DidYouKnow ► Mahatma Gandhi made brief visit ...,#didyouknow ► mahatma gandhi made brief visit ...


In [11]:
ham_df = input_df[input_df.Type == le.classes_.tolist().index('Quality')]
spam_df = input_df[input_df.Type == le.classes_.tolist().index('Spam')]

ham_df = ham_df.sample(len(spam_df))
df = pd.concat([ham_df, spam_df])
df.head()

Unnamed: 0,Tweet,following,followers,is_retweet,Type,filtered_text,filtered_text_lower
13000,Ability to think for yourself. \n#ObsoleteJobS...,0.0,14100.0,0.0,0,Ability think yourself. #ObsoleteJobSkills,ability think yourself. #obsoletejobskills
1240,#Values define what people think is important ...,0.0,0.0,0.0,0,#Values define people think important build be...,#values define people think important build be...
13489,I need to sort shit out,0.0,0.0,0.0,0,I need sort shit,i need sort shit
7350,Great traditonal pubs and restaurants in #Surr...,0.0,0.0,0.0,0,Great traditonal pubs restaurants #Surrey #Sus...,great traditonal pubs restaurants #surrey #sus...
5868,People So 50,0.0,0.0,0.0,0,People So 50,people so 50


In [12]:
print("Number of Spam tweets:",len(spam_df))
print("Number of Ham tweets:",len(ham_df))

Number of Spam tweets: 7443
Number of Ham tweets: 7443


# Random Forest Modeling

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(max_features=50)
tfidf_vector = tfidf_vectorizer.fit_transform(df['filtered_text_lower'])

In [14]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(tfidf_vector.toarray(), df.Type, test_size=0.2, random_state=42)

In [15]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=50, random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [16]:
from sklearn.metrics import f1_score

f1 = f1_score(y_test, y_pred, average='weighted')
print("F-measure for this model: ", f1*100, "%")

F-measure for this model:  85.6831930590714 %


In [17]:
from sklearn.metrics import mean_absolute_error

mae = mean_absolute_error(y_test, y_pred)
print("MAE for this model: ", mae)

MAE for this model:  0.1417058428475487


In [18]:
from sklearn.metrics import confusion_matrix

mat = confusion_matrix(y_test, y_pred)

TP = mat[0][0]
FP = mat[0][1]
FN = mat[1][0]
TN = mat[1][1]

# Sensitivity, hit rate, recall, or true positive rate
recall = TP/(TP+FN)
print("Recall: ",round(recall*100,2),"%")

# Precision or positive predictive value
precision = TP/(TP+FP)
print("Precision: ",round(precision*100,2),"%")

# Accuracy
accuracy = (TP+TN)/(TP+FP+FN+TN)
print("Accuracy: ",round(accuracy*100,2),"%")

Recall:  80.06 %
Precision:  95.66 %
Accuracy:  85.83 %
