In [223]:
# Main Imports
import numpy as np # Arrays
import pandas as pd # Dataframes
import sqlite3 # SQL databases

# Utilities
import joblib

# Helpers
from helpers import tokenise

In [224]:
# Load every row of `tweets` whose `notify` column is non-NULL, i.e. the tweets
# that already carry a label.
conn = sqlite3.connect("tweets.db")
try:
    labelled_tweets = pd.read_sql_query(
        "SELECT * FROM tweets WHERE notify IS NOT NULL", conn
    )
finally:
    # Close the handle even if the query raises (missing table, locked file, ...).
    # Note that `with sqlite3.connect(...)` only manages the transaction and does
    # not close the connection, hence the explicit try/finally.
    conn.close()

In [225]:
# Model inputs (feature columns) and the target column.
feature_columns = ['author', 'content', 'has_link', 'has_video', 'has_image']
X = labelled_tweets[feature_columns]
y = labelled_tweets['notify']

# Class balance summary. Positive labels are matched against the string "True".
n_tweets = len(y)
n_notifying = (y == "True").sum()

print(f"""Total Number of Tweets: {n_tweets}
Number of notifying tweets: {n_notifying}
Notification percentage: {n_notifying / n_tweets * 100:.2f}%""")

Total Number of Tweets: 952
Number of notifying tweets: 104
Notification percentage: 10.92%


In [228]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import ComplementNB
from sklearn.svm import LinearSVC

# Columns that are one-hot encoded.
categorical_features = ['author', 'has_link', 'has_video', 'has_image']

# Deliberately a single column name (a string, not a list): ColumnTransformer then
# passes the text pipeline a 1-D column, which is what CountVectorizer expects.
text_features = 'content'
text_transformer = Pipeline([
    # Custom tokeniser from helpers; unigrams and bigrams.
    ('vect', CountVectorizer(tokenizer=tokenise, ngram_range=(1, 2))),
    ('tfidf', TfidfTransformer()),
])

preprocessor = ColumnTransformer([
    # handle_unknown='ignore': categories unseen at fit time (e.g. an author that
    # only appears in a held-out fold) do not raise at transform time.
    ('categories', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ('text', text_transformer, text_features),
])

clf = Pipeline([
    ('preprocessor', preprocessor),
    # class_weight='balanced' compensates for the minority "True" class.
    # random_state pins the solver's internal shuffling so that scores and the
    # ranking derived from them are reproducible across kernel restarts.
    ('classifier', LinearSVC(class_weight='balanced', random_state=0)),
])

# Single source of truth for the fold count used by both CV calls below.
n_folds = 20

# Default scorer for a classifier is accuracy. NOTE(review): with roughly 11% positive
# labels, always predicting "False" already scores about 0.89, so read this number
# against that baseline.
score = cross_val_score(clf, X, y, cv=n_folds)
print(f"Score: {score.mean():.2f}")

# Out-of-fold decision values: each tweet is scored by a model that never saw it.
predicted_scores = cross_val_predict(clf, X, y, cv=n_folds, method="decision_function")

# Map labels to +1 ("True") / -1 (anything else) and multiply by the decision value.
# Presumably a positive decision value means "True" (holds when the labels are exactly
# the strings "False"/"True", since classes are sorted) - confirm via clf.classes_.
# The product is then positive for a correct prediction and negative for a wrong one.
correctness_scores = ((((y == 'True') * 2) - 1) * predicted_scores)

# Final model trained on all labelled data; the trailing semicolon suppresses the
# estimator repr in the notebook output.
clf.fit(X, y);

Score: 0.95


In [229]:
# Persist the fitted pipeline. The pipeline references the custom `tokenise` function,
# and pickle stores functions by module-qualified name rather than by value, so the
# module that defines `tokenise` must be importable wherever model.joblib is loaded.
joblib.dump(clf, "model.joblib")

['model.joblib']

In [230]:
# correctness_scores is positive where the cross-validated prediction matches the label
# and negative where it does not, so an ascending sort puts the most confidently wrong
# tweets first.
# Convert to a plain ndarray before sorting: given a pandas Series, np.argsort may hand
# back a Series, and integer [] on a Series is label-based rather than positional.
ordered_tweet_indexes = np.argsort(np.asarray(correctness_scores))
i = 0  # cursor into ordered_tweet_indexes; advanced by the inspection cell

In [241]:
# Show the i-th entry of the ranking, then advance the cursor.
# Re-run this cell to step through the tweets one at a time.
# argsort yields row positions, so every lookup below is positional (.iloc / ndarray
# indexing). The previous .loc / y[...] lookups were label-based and only matched
# because the frame has a default 0..n-1 index.
index = int(np.asarray(ordered_tweet_indexes)[i])
tweet = labelled_tweets.iloc[index]
print(f"Id:      {tweet['id']}")
print(f"Content: {tweet['content']}")
print(f"Label:   {y.iloc[index]}")
print(f"Score:   {predicted_scores[index]}")
i += 1

Id:      1193598695872385025
Content: Halloween is over so these are now $5 off! From $15 to $10! 

Get this large 8 1/2 x 12 print now before they’re gone for good! https://t.co/quBh0IyusG
Label:   False
Score:   0.5649532223569064


In [180]:
# One-off call to the project's labelling helper for a single tweet.
# NOTE(review): label_tweet is defined in helpers and is not visible here - presumably
# the first argument is the tweet id and 'cb_True' encodes a "notify" label; confirm
# against helpers before re-running.
from helpers import label_tweet
label_tweet('1193740240688689152', 'cb_True')