# Machine Learning Challenge
### Course Machine Learning for Natural Language Understanding
#### Instructors: Prof. Achim Rettinger, M.A. Kai Kugler

In [1]:
import pandas as pd
import re
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import precision_recall_fscore_support
import pickle

2023-01-25 18:04:20.047365: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-01-25 18:04:20.199655: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-01-25 18:04:20.199713: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-01-25 18:04:20.824338: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-

In [2]:
# Load the spaCy English pipeline; tokenize() uses it for lemmatization.
# One-time setup if the model is not installed yet:
#!python -m spacy download en_core_web_sm
nlp = spacy.load('en_core_web_sm')
# code from class exercise notebook 08
import nltk
try:
    # NLTK resources are looked up as '<category>/<name>'. The bare name
    # 'stopwords' is never found, which forced a download attempt on every run.
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### --- Read training data ---

In [3]:
def read_data(file1, file2):
    """Load the two headline files into one labelled DataFrame.

    Each file is read line by line; trailing whitespace (including the
    newline) is stripped from every line and each line becomes one row.

    Parameters
    ----------
    file1 : path of the file with clickbait headlines
        (rows get clickbait="yes", label=1).
    file2 : path of the file with non-clickbait headlines
        (rows get clickbait="no", label=0).

    Returns
    -------
    pandas.DataFrame with columns "headline", "clickbait" and "label";
    rows from file1 come first, followed by rows from file2, on a fresh
    0..n-1 index.
    """
    frames = []
    for path, tag, label in ((file1, "yes", 1), (file2, "no", 0)):
        # Explicit encoding so the result does not depend on the platform's
        # default locale.
        with open(path, encoding="utf-8") as f:
            headlines = [line.rstrip() for line in f]
        frames.append(
            pd.DataFrame({"headline": headlines, "clickbait": tag, "label": label})
        )

    # ignore_index avoids duplicated index labels (each frame starts at 0).
    return pd.concat(frames, ignore_index=True)

### --- Pre-Processing ---

##### Text Cleaning: lowercase, remove tags, remove punctuation, ...

In [4]:
def clean(df):
    """Return a copy of ``df`` with a normalised "headline" column.

    Normalisation steps, applied in order:
      1. lowercase the text,
      2. drop anything that looks like a tag (``<...>``),
      3. drop every character that is not an ASCII letter or a space
         (this removes punctuation and digits; surrounding spaces stay).

    Parameters
    ----------
    df : pandas.DataFrame with a string column "headline".

    Returns
    -------
    A new DataFrame; the input frame is left untouched so the cell can be
    re-run safely and callers keep their raw data.
    """
    headlines = (
        df["headline"]
        .str.lower()
        .str.replace(r'<[^>]*>', '', regex=True)
        .str.replace(r'[^a-zA-Z ]', '', regex=True)
    )
    # assign() builds a new frame instead of writing into the caller's object.
    return df.assign(headline=headlines)


##### Tokenization and lemmatization (stopwords are not removed)

In [5]:
def tokenize(sentence):
    """Split a headline into tokens and return their lemmas.

    The text is first tokenized with NLTK's ``word_tokenize``; the tokens
    are re-joined with single spaces and passed through the spaCy pipeline
    ``nlp``, and the lemma of every resulting spaCy token is returned.
    No stopword filtering takes place here.

    Parameters
    ----------
    sentence : str

    Returns
    -------
    list of str, one lemma per spaCy token.
    """
    words = word_tokenize(sentence)
    doc = nlp(" ".join(words))
    return [token.lemma_ for token in doc]

### ---TF-IDF Vectorizer---

In [6]:
def vectorize(train, test):
    """Turn raw headlines into TF-IDF features.

    The vectorizer is fitted on ``train`` only and then applied to both
    ``train`` and ``test``, so no vocabulary or IDF statistics are derived
    from the test split.

    Side effect: the fitted vectorizer is pickled to "tfidf.pkl" in the
    current working directory (overwriting any existing file).

    Parameters
    ----------
    train, test : iterables of headline strings.

    Returns
    -------
    (train_features, test_features) as produced by the vectorizer's
    ``transform``.
    """
    # token_pattern=None because the custom tokenizer replaces the regex.
    vectorizer = TfidfVectorizer(tokenizer=tokenize, token_pattern=None)
    vectorizer.fit(train)

    train = vectorizer.transform(train)
    test = vectorizer.transform(test)

    # Context manager guarantees the file is flushed and closed.
    with open("tfidf.pkl", "wb") as f:
        pickle.dump(vectorizer, f)
    return train, test

### ---Classifier---

In [7]:
def NB_model(X, y):
    """Fit a multinomial Naive Bayes classifier and persist it.

    Side effect: the fitted model is pickled to "NB_model.pkl" in the
    current working directory (overwriting any existing file).

    Parameters
    ----------
    X : feature matrix passed to ``MultinomialNB.fit``.
    y : target labels aligned with the rows of ``X``.

    Returns
    -------
    The fitted ``MultinomialNB`` instance (smoothing parameter alpha=0.5).
    """
    model = MultinomialNB(alpha=0.5)
    model.fit(X, y)
    # Context manager guarantees the file is flushed and closed.
    with open('NB_model.pkl', 'wb') as f:
        pickle.dump(model, f)
    return model

### ---Evaluation---

In [8]:
def evaluate(y_pred, y_test):
    """Score predictions against the true labels with macro averaging.

    Note the argument order: predictions first, ground truth second; they
    are swapped when handed to scikit-learn, which expects the truth first.

    Returns
    -------
    The result of ``precision_recall_fscore_support`` called with
    ``average='macro'``.
    """
    scores = precision_recall_fscore_support(y_test, y_pred, average='macro')
    return scores

# Build the classifier

In [9]:
def build_classifier(file1, file2, random_state=None):
    """Train the clickbait classifier and report validation scores.

    Pipeline: read_data -> clean -> shuffled 85/15 train/validation split
    -> vectorize -> NB_model -> predict on the validation part -> evaluate.

    Side effects: vectorize() and NB_model() write "tfidf.pkl" and
    "NB_model.pkl" to the current working directory.

    Parameters
    ----------
    file1, file2 :
        NOTE(review): currently ignored. The data is always read from the
        hard-coded paths 'clickbait_yes' and 'clickbait_no'. Wiring these
        parameters through requires every caller to pass the clickbait file
        first and the non-clickbait file second (read_data's order);
        otherwise the labels are silently inverted. Verify call sites
        before changing this.
    random_state : optional seed forwarded to ``train_test_split``.
        The default (None) keeps the previous behaviour of a different
        random split on every run; pass an int for reproducible scores.

    Returns
    -------
    Whatever ``evaluate`` returns for the validation split.
    """
    df = read_data('clickbait_yes', 'clickbait_no')

    df = clean(df)

    y = df['label']  # label 0 = no, 1 = yes clickbait
    X = df["headline"]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.15, shuffle=True, random_state=random_state
    )

    # Vectorizer is fitted on the training part only (see vectorize()).
    X_train, X_test = vectorize(X_train, X_test)

    bayes = NB_model(X_train, y_train)

    y_pred = bayes.predict(X_test)
    validation_evaluation = evaluate(y_pred, y_test)
    return validation_evaluation

### ---Use test data with model---

In [27]:
def classify_test_out_data(file):
    """Predict clickbait labels for an unlabelled hold-out file.

    The file is read as a semicolon-separated CSV that must contain a
    "headline" column. Headlines are cleaned with clean(), vectorized with
    the vectorizer stored in "tfidf.pkl" and classified with the model
    stored in "NB_model.pkl" (both read from the current working
    directory, so the training step must have been run beforehand).

    Parameters
    ----------
    file : path of the CSV file.

    Returns
    -------
    The predictions returned by the model's ``predict``; one entry per row
    of the input file.
    """
    df = pd.read_csv(file, sep=";")
    df = clean(df)
    X = df["headline"]

    # Security note: unpickling can execute arbitrary code. Only load pickle
    # files that were produced by this notebook.

    # Load the fitted vectorizer. It was created with tokenizer=tokenize and
    # pickle stores functions by reference, so tokenize must be defined in
    # this session before loading.
    with open('tfidf.pkl', 'rb') as f:
        vectorizer = pickle.load(f)
    X = vectorizer.transform(X)

    # Load the fitted classifier.
    with open('NB_model.pkl', 'rb') as f:
        model = pickle.load(f)
    predictions = model.predict(X)

    return predictions

## Validation Set Evaluation result (macro-averaged: precision, recall, F1, support)

In [11]:
# Arguments follow read_data's order (clickbait file first, non-clickbait
# second) and name the files that are actually read during training.
build_classifier("clickbait_yes", "clickbait_no")

(0.9691862845840641, 0.9686187544733202, 0.9687335797011452, None)

## Test Set 

In [29]:
# Classify the hold-out headlines and store one predicted label per line.
predictions = classify_test_out_data("clickbait_hold_X.csv")
with open("predictions.txt", "w") as f:
    f.writelines(str(pred) + "\n" for pred in predictions)
    