In [1]:
# Main Imports
import numpy as np # Arrays
import pandas as pd # Dataframes
import sqlite3 # SQL databases

# Utilities
import joblib

# Helpers
from helpers import tokenize, preprocess

In [2]:
# Load only the tweets that already carry a label (non-NULL `notify`).
# The connection is closed in a `finally` so it is released even when the
# query raises. (Using the sqlite3 connection as a context manager would not
# do this: it only manages the transaction, it does not close the connection.)
conn = sqlite3.connect("tweets.db")
try:
    labelled_tweets = pd.read_sql_query("SELECT * FROM tweets WHERE notify IS NOT NULL", conn)
finally:
    conn.close()

In [3]:
# Split the labelled rows into the model inputs and the target column.
feature_columns = ['author', 'content', 'has_link', 'has_video', 'has_image']
X = labelled_tweets[feature_columns]
y = labelled_tweets['notify']

# Labels are compared against the string "True"; count the matches once and
# reuse the figure for both the absolute number and the percentage.
total_tweets = len(y)
notifying_tweets = (y == "True").sum()
notify_percentage = notifying_tweets / total_tweets * 100

print(f"""Total Number of Tweets: {total_tweets}
Number of notifying tweets: {notifying_tweets}
Notification percentage: {notify_percentage:.2f}%""")

Total Number of Tweets: 1820
Number of notifying tweets: 355
Notification percentage: 19.51%


In [4]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import ComplementNB
from sklearn.svm import LinearSVC

# Feature selection. X also carries 'author', but it is not listed here and the
# ColumnTransformer below drops every column it is not told about
# (remainder='drop' is scikit-learn's default), so 'author' never reaches the
# classifier.
categorical_features = ['has_link', 'has_video', 'has_image']

# A single column name (not a list) so the text pipeline receives the column
# as 1-D text rather than a 2-D frame.
text_features = 'content'

# Uni- and bi-gram counts re-weighted by TF-IDF. `preprocess` and `tokenize`
# are the project's own helpers imported from `helpers`.
text_transformer = Pipeline([
    ('vect', CountVectorizer(preprocessor=preprocess, tokenizer=tokenize, ngram_range=(1, 2))),
    ('tfidf', TfidfTransformer()),
])

preprocessor = ColumnTransformer([
    ('categories', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ('text', text_transformer, text_features),
])

# LinearSVC's solver draws on a random number generator, so random_state is
# pinned to make the cross-validation score and the persisted model repeatable
# from one run to the next.
clf = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LinearSVC(class_weight='balanced', random_state=0)),
])

# 20-fold cross-validation with the estimator's default scorer.
score = cross_val_score(clf, X, y, cv=20)
print(f"Score: {score.mean():.2f}")

# Final fit on every labelled tweet. The trailing semicolon stops the notebook
# from echoing the fitted pipeline's repr.
clf.fit(X, y);

Score: 0.94


In [5]:
# Persist the pipeline fitted above to disk.
model_path = "model.joblib"
joblib.dump(clf, model_path)

['model.joblib']

In [6]:
# Out-of-fold decision-function values from 20-fold cross-validation.
predicted_scores = cross_val_predict(clf, X, y, cv=20, method="decision_function")

# Map labels to +1 ('True') / -1 (anything else) and multiply by the score:
# the product is positive when the prediction is correct and negative when it
# is wrong.
label_signs = ((y == 'True') * 2) - 1
correctness_scores = label_signs * predicted_scores

# Row positions ordered from lowest correctness score (worst mistakes) to
# highest. Sorting the raw NumPy values yields a plain positional array
# instead of a Series that carries y's index along.
ordered_tweet_indexes = np.argsort(np.asarray(correctness_scores))

# Cursor into ordered_tweet_indexes, advanced by the inspection cell.
i = 0

In [108]:
# Show the tweet at rank `i` (rank 0 has the lowest correctness score), then
# advance the cursor so re-running this cell steps to the next tweet.
# np.asarray makes the lookup positional whether ordered_tweet_indexes is an
# ndarray or a Series.
index = np.asarray(ordered_tweet_indexes)[i]

# argsort yields row positions, so rows and labels are fetched with .iloc
# rather than label-based .loc / [] (which only agree on a default RangeIndex).
tweet = labelled_tweets.iloc[index]
print(f"Id:      {tweet['id']}")
print(f"Content: {tweet['content']}")
print(f"Label:   {y.iloc[index]}")
print(f"Score:   {predicted_scores[index]}")
i += 1

Id:      1192939808873730048
Content: Bit of a long shot, but bills got me all kinds of stressed so would anyone be interested in buying this nasty boy? 🤔Would include the full body and two portraits flat colored, w/ and w/o clothes, and all the sketches would be yours to keep as well for $150. https://t.co/EmnP3YU7EB
Label:   True
Score:   -0.04535574392530267


In [100]:
from helpers import label_tweet

# Hand one specific tweet id and a label string to the project helper.
# NOTE(review): presumably this stores the label for that tweet; the meaning
# of the 'cb_False' format is defined in helpers.label_tweet — confirm there.
tweet_id = '1195171698255048704'
new_label = 'cb_False'
label_tweet(tweet_id, new_label)