In [1]:
# Main Imports
import numpy as np # Arrays
import pandas as pd # Dataframes
import sqlite3 # SQL databases

# Utilities
import joblib

# Helpers
from helpers import tokenise

In [2]:
# Load the labelled tweets: every row of `tweets` whose `notify` column is not NULL.
conn = sqlite3.connect("tweets.db")
try:
    labelled_tweets = pd.read_sql_query(
        "SELECT * FROM tweets WHERE notify IS NOT NULL", conn
    )
finally:
    # Always release the database handle, even if the query raises
    # (e.g. missing table), so a failed cell does not leave the file open.
    conn.close()

In [3]:
# Show the labelled tweets as the cell output for a quick visual check.
labelled_tweets

Unnamed: 0,id,author,content,has_link,has_video,has_image,is_reply,is_retweet,is_quote_rt,notify
0,1192437222953431040,483510314,Peaceful Platypus Adopt! 😊❤️👀\n(Includes trans...,False,False,True,False,False,False,True
1,1192437932365246464,504501220,Oh shoot there were peanuts in my dinner; look...,False,False,False,False,False,False,False
2,1192440252406874117,4838841508,do you guys care for procreate timelapse video...,False,False,False,False,False,False,False
3,1192440492560179205,2491425757,"Heya, gonna stream doodling for a short while\...",True,False,True,False,False,False,False
4,1192440878712983552,872570861504651264,art school be like [carries around a bottle of...,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
348,1192952556290359296,2602469600,Some kitty adopts &lt;3\n\nPrice: $15-$20 (Fir...,False,False,True,False,False,False,True
349,1192954500555923457,2830247462,Here is a cutie patoot kitty! She is an offer ...,False,False,True,False,False,False,True
350,1192956395840135168,2545886376,Opening this here for one slot! Comment to cla...,False,False,True,False,False,False,True
351,1192962723274989568,2996302298,My mood absolutely tanked and if I'm being hon...,False,False,False,False,False,False,False


In [4]:
# Split the labelled frame into model inputs (X) and the target column (y).
feature_columns = ['author', 'content', 'has_link', 'has_video', 'has_image']
X = labelled_tweets[feature_columns]
y = labelled_tweets['notify']

# Class balance summary; a tweet counts as notifying when its label equals
# the string "True".
total_tweets = len(y)
notifying_tweets = (y == "True").sum()
notifying_pct = notifying_tweets / total_tweets * 100

print(f"""Total Number of Tweets: {total_tweets}
Number of notifying tweets: {notifying_tweets}
Notification percentage: {notifying_pct:.2f}%""")

Total Number of Tweets: 353
Number of notifying tweets: 28
Notification percentage: 7.93%


In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import ComplementNB
from sklearn.svm import LinearSVC

# Columns that are one-hot encoded. Categories unseen at fit time are ignored
# at predict time (handle_unknown='ignore'), so new authors do not raise.
categorical_features = ['author', 'has_link', 'has_video', 'has_image']

# A bare string (not a list) makes ColumnTransformer pass a 1-D column,
# which is what CountVectorizer expects.
text_features = 'content'

# Text branch: unigram + bigram counts using the project tokeniser, then
# TF-IDF weighting.
text_transformer = Pipeline([
    ('vect', CountVectorizer(tokenizer=tokenise, ngram_range=(1, 2))),
    ('tfidf', TfidfTransformer()),
])

# Combine the categorical and text branches into one feature matrix.
preprocessor = ColumnTransformer([
    ('categories', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ('text', text_transformer, text_features),
])

# Full model: preprocessing followed by a linear SVM.
# random_state is fixed so the cross-validation score and the fitted model are
# repeatable across kernel restarts (LinearSVC's solver is otherwise seeded
# non-deterministically).
clf = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LinearSVC(random_state=0)),
])

# 5-fold cross-validation with the estimator's default scorer.
# NOTE(review): the default scorer for a classifier is accuracy, and only a
# small share of the tweets are labelled as notifying, so a high value here
# can be reached by always predicting the majority class - consider a
# class-aware metric before relying on this number.
score = cross_val_score(clf, X, y, cv=5)

# Refit on all labelled data so `clf` is the model that is used afterwards.
clf.fit(X, y)

# Mean cross-validated score, shown as the cell output.
score.mean()

In [8]:
# Persist the fitted pipeline to disk.
# The pipeline holds a reference to `tokenise`; pickle (used by joblib) stores
# functions by module path rather than by value, so the `helpers` module must
# be importable wherever model.joblib is loaded.
joblib.dump(clf, "model.joblib")

['model.joblib']