In [7]:
import pandas as pd
import pickle
from tensorflow.keras import models
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import unidecode
import contractions
import re

In [8]:
def get_airline_codes():
    airport_code_list = list(pd.read_html('https://en.wikipedia.org/wiki/List_of_airline_codes')[0]['ICAO'].dropna())
    return airport_code_list


def clean(text, airport_code_list):

    for air_code in airport_code_list: # remove airline codes
        text = text.replace(air_code, ' ')

    text = text.lower() # Lower Case

    text = re.sub("@[A-Za-z0-9_]+","", text) # remove mentions
    text = re.sub("#[A-Za-z0-9_]+","", text) # remove hashtags

    text = re.sub(r"http\S+", "", text) # remove links
    text = re.sub(r"www.\S+", "", text) # remove links

    expanded_words = [contractions.fix(word) for word in text.split()] # remove contractions

    text = ' '.join(expanded_words) # join words

    unaccented_string = unidecode.unidecode(text) # remove accents

    tokenized = word_tokenize(unaccented_string) # Tokenize

    stop_words = set(stopwords.words('english')) # Make stopword list

    stop_word_to_keep = ['was', 'are', 'did', 'been', 'have', 'until', 'while', 'about', 'against', 'between', 'during', 'before', 'after', 'again', 'when', 'where', 'why', 'how', 'any', 'not', 'no','very', "aren't", "wasn't", "shouldn't", "should", "won't", "wouldn't"]

    stop_words = [x for x in stopwords.words('english') if x not in stop_word_to_keep]

    without_stopwords = [word for word in tokenized if not word in stop_words] # Remove Stop Words

    lemmatizer = WordNetLemmatizer() # Instantiate lemmatizer

    lemmatized = [lemmatizer.lemmatize(word) for word in without_stopwords] # Lemmatize

    lemmatized = " ".join(lemmatized)

    return lemmatized

In [9]:
df = pd.read_csv('data/raw_data/new tweets.csv')
df['date'] = pd.to_datetime(df['date']).dt.date

In [10]:
df.head()

Unnamed: 0,date,text
0,2023-02-09,Bunker Labs is partnering with @United to laun...
1,2023-02-09,"Most active airlines - Feb 09,2023🌎🌍🌏\n\n1 - @..."
2,2023-02-09,"Congrats to our @RSWAirport NPS winners, Kelli..."
3,2023-02-09,Continent number 7 here I come ✅\n\n#Antarctic...
4,2023-02-09,Congratulations to PAPA Member Michael Choe wh...


In [11]:
X_pred = df['text'].astype("str")

In [12]:
airport_code_list = get_airline_codes()

X_pred = [clean(tweet, airport_code_list) for tweet in X_pred]

In [13]:
X_pred

["bunker lab partnering launch mileageplus ( r ) mile mission campaign . campaign allows people donate mile support travel veteran entrepreneur bunker lab ' training & amp ; workshop . donate mile today !",
 'active airline - feb 09,2023 1 - 2 - 3 - find data :',
 'congrats irport np winner , kelli , ted & amp ; karen ! along peer helped boost performance holiday . helped outperformed industry during last month 2022 .',
 'continent number 7 come bound check penguin whale make sure are good . drake passage better nice .',
 'congratulation p member michael choe recently began training ! michael retired navy after 20-year career flying boeing 737 united . social',
 "great success , winning bronze `` agent demand ''",
 'leave overbook flight . get bodyslammed mf going jail .',
 'weekend , volunteer program brought visited pack meal ! volunteer joined former & amp ; bobbie howard . end visit , volunteer packed 5,000 pound food ! q',
 'p j r -- r rd n 1/30/23 wa flight ua510 & gt ; & amp ; l

In [14]:

# load model
model = models.load_model('models/models.h5')

# load tokenizer
with open('tokenizer/tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

X_test_token = tokenizer.texts_to_sequences(X_pred)

X_test_pad = pad_sequences(X_test_token, dtype='float32', padding='post')

y_pred = model.predict(X_test_pad)

2023-03-11 14:02:22.319889: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2023-03-11 14:02:22.319931: W tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:265] failed call to cuInit: UNKNOWN ERROR (303)
2023-03-11 14:02:22.319993: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (LAPTOP-2PJA6S06): /proc/driver/nvidia/version does not exist
2023-03-11 14:02:22.320301: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.




In [15]:
y_pred

array([[0.29182982],
       [0.57226753],
       [0.09552934],
       ...,
       [0.31137756],
       [0.99067503],
       [0.09553356]], dtype=float32)

In [16]:
df['pred'] = [1 if i[0] > 0.5 else 0 for i in y_pred]

In [17]:
df['clean_text'] = X_pred

In [18]:
df.to_csv('predicted_tweets.csv', index=False)

In [20]:
df = pd.read_csv('predicted_tweets.csv')

In [22]:
df['date'] = pd.to_datetime(df['date'])

In [33]:
df = df.groupby(df.date.dt.day)['pred'].mean()