In [None]:
!pip install tensorflow_text # Restart the colab again after installing

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
import tensorflow as tf
import tensorflow_hub as hub
# `text` is never referenced by name below; presumably the import is needed for its
# side effect of registering the custom ops used by the BERT preprocessing model
# (TODO confirm against the TF Hub model page).
import tensorflow_text as text
# Module-level handles to the TF Hub BERT preprocessing and encoder layers; they are
# fetched from the URLs below when this cell runs and are used by supervised.neural_network().
bert_preprocess = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
bert_encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4")
from sklearn.metrics import classification_report
import nltk
# Downloads the NLTK 'punkt' tokenizer data; presumably required by the local
# Preprocessing / Data_Exploratory modules imported next (verify against those modules).
nltk.download('punkt')
# Project-local helper modules (expected to be importable from the notebook's working directory).
import Preprocessing as pre
import Data_Exploratory as exp

In [None]:
class supervised():
    """BERT-based binary classifier that labels a document as 'trump' (1) or 'biden' (0).

    Intended call order:
        downsample() -> split_data() -> neural_network() -> training()
        -> predict_validation() / testing(filename)

    The class relies on module-level names defined elsewhere in the notebook:
    ``bert_preprocess``, ``bert_encoder``, ``pre``, ``exp``, ``train_test_split``,
    ``classification_report`` and IPython's ``display``.
    """

    def __init__(self, filename):
        """Load ``<filename>.csv`` into ``self.df`` and initialise empty state.

        Args:
            filename: Path of the labeled CSV *without* the ``.csv`` extension.
                The file must provide the ``target`` and ``text`` columns used
                by the other methods.
        """
        self.name = filename
        self.df = pd.read_csv(filename+".csv")
        self.X_train = []
        self.X_val = []
        self.y_train = []
        self.y_val = []
        # None (not a numeric sentinel) until neural_network() builds the model,
        # so premature use can be detected and reported clearly.
        self.model = None
        self.history = None
        self.train_df = pd.DataFrame()
        self.test_df = pd.DataFrame()

    def get_dataframe(self):
        """Return the current working DataFrame."""
        return self.df

    def display(self):
        """Render the first rows of the working DataFrame in the notebook."""
        # Inside a method body `display` resolves to IPython's global display(),
        # not to this method.
        display(self.df.head())

    def downsample(self, random_state=None):
        """Drop unlabeled rows and balance the 'trump' and 'biden' classes.

        Rows whose ``target`` is NaN are removed, then the larger class is
        randomly downsampled to the size of the smaller one. ``self.df`` is
        replaced by the balanced frame (biden rows first, then trump rows).

        Args:
            random_state: Optional seed forwarded to ``DataFrame.sample`` for
                reproducible sampling. ``None`` keeps the sampling unseeded.
        """
        # Rebind self.df to the cleaned frame. Assigning the result to
        # `self.df.drop` would clobber DataFrame.drop and leave NaN rows in place.
        self.df = self.df.dropna(axis=0, subset=['target'])
        print("ORIGINAL SIZE of DATAFRAME IS ",len(self.df))
        trump_df = self.df[self.df['target'] == 'trump']
        biden_df = self.df[self.df['target'] == 'biden']
        # Downsample whichever class is larger; sampling more rows than a class
        # holds (without replacement) raises ValueError.
        n_per_class = min(len(trump_df), len(biden_df))
        biden_balanced = biden_df.sample(n_per_class, random_state=random_state)
        if len(trump_df) > n_per_class:
            trump_balanced = trump_df.sample(n_per_class, random_state=random_state)
        else:
            trump_balanced = trump_df
        self.df = pd.concat([biden_balanced, trump_balanced])
        print("DOWNSAMPLED THE DATAFRAME TO EQUAL SIZES")
        display(self.df['target'].value_counts())

    def split_data(self, test_size=None, random_state=None):
        """Create the binary ``vote`` label and a stratified train/validation split.

        ``vote`` is 1 where ``target == 'trump'`` and 0 otherwise. The split
        results are stored in ``X_train``, ``X_val``, ``y_train`` and ``y_val``.

        Args:
            test_size: Forwarded to ``train_test_split``; ``None`` keeps
                scikit-learn's default split proportion.
            random_state: Optional seed forwarded to ``train_test_split``.
        """
        self.df['vote'] = (self.df['target'] == 'trump').astype(int)
        X = self.df['text']
        y = self.df['vote']
        self.X_train, self.X_val, self.y_train, self.y_val = train_test_split(
            X, y, stratify=y, test_size=test_size, random_state=random_state)
        print("Length of X_train",len(self.X_train))
        print("Length of y_train",len(self.y_train))
        print("Length of X_val",len(self.X_val))
        print("Length of y_val",len(self.y_val))

    def neural_network(self):
        """Build the Keras model: raw text -> BERT preprocess -> BERT encoder -> dropout -> sigmoid."""
        text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
        preprocessed_text = bert_preprocess(text_input)
        outputs = bert_encoder(preprocessed_text)
        l = tf.keras.layers.Dropout(0.1, name="dropout")(outputs['pooled_output'])
        l = tf.keras.layers.Dense(1, activation='sigmoid', name="output")(l)
        self.model = tf.keras.Model(inputs=[text_input], outputs = [l])
        # summary() prints the table itself and returns None, so it must not be
        # wrapped in display() (that would render a stray "None").
        self.model.summary()

    def _require_model(self):
        """Raise a clear error if the model has not been built yet."""
        if self.model is None:
            raise RuntimeError("Model has not been built; call neural_network() first.")

    def _predict_labels(self, texts, threshold=0.5):
        """Return 0/1 labels for ``texts`` by thresholding the model's sigmoid output."""
        self._require_model()
        probabilities = self.model.predict(texts).flatten()
        return np.where(probabilities > threshold, 1, 0)

    def metrics(self):
        """Compile the model with Adam, binary cross-entropy and accuracy/precision/recall metrics.

        Raises:
            RuntimeError: If neural_network() has not been called yet.
        """
        self._require_model()
        METRICS = [
            tf.keras.metrics.BinaryAccuracy(name='accuracy'),
            tf.keras.metrics.Precision(name='precision'),
            tf.keras.metrics.Recall(name='recall')
            ]
        self.model.compile(optimizer='adam',
                           loss='binary_crossentropy',
                           metrics=METRICS)

    def training(self, epochs=10):
        """Compile the model and fit it on the training split.

        The Keras ``History`` object returned by ``fit`` is kept in ``self.history``.

        Args:
            epochs: Number of training epochs (defaults to 10).

        Raises:
            RuntimeError: If neural_network() has not been called yet.
        """
        self.metrics()
        self.history = self.model.fit(self.X_train, self.y_train, epochs=epochs)

    def predict_validation(self, threshold=0.5):
        """Predict on the validation split, print a classification report and save the results.

        Writes ``validation_predicted_df.csv`` with the document text, the true
        label and the predicted label.

        Args:
            threshold: Probability above which a document is labeled 1.

        Raises:
            RuntimeError: If neural_network() has not been called yet.
        """
        y_predicted = self._predict_labels(self.X_val, threshold)
        val_df = pd.DataFrame(data={"Doc":self.X_val,"target":self.y_val,"predict_validation":y_predicted})
        print(classification_report(self.y_val, y_predicted))
        val_df.to_csv("validation_predicted_df.csv")

    def testing(self, filename, threshold=0.5):
        """Preprocess an unlabeled CSV, predict a label per document and save the results.

        Side effects: writes ``preprocess_test_<filename>.csv`` (the preprocessed
        frame) and ``testing_predicted_df.csv`` (text plus predicted label).

        Args:
            filename: Path of the test CSV *without* the ``.csv`` extension.
            threshold: Probability above which a document is labeled 1.

        Raises:
            RuntimeError: If neural_network() has not been called yet.
        """
        # Fail before doing any file work if there is no model to predict with.
        self._require_model()
        self.test_df = pd.read_csv(filename+".csv")
        # Text cleaning is delegated to the project-local Preprocessing module;
        # the exact effect of each step is defined there.
        obj = pre.preprocessing(filename)
        obj.remove_unicode()
        obj.lowercase()
        obj.negation_words()
        obj.filter_rules()
        df = obj.get_dataframe()
        display(df)
        # The exploratory helper re-reads the data from disk by name, so the
        # preprocessed frame is persisted first.
        df.to_csv("preprocess_test_"+filename+".csv")
        obj2 = exp.data_exploratory("preprocess_test_"+filename)
        obj2.pre_election(True)
        obj2.trim_document(20)
        df = obj2.get_dataframe()
        y_predicted = self._predict_labels(df['text'], threshold)
        labeled_test_df = pd.DataFrame(data={"Doc":df['text'],"predict_testing":y_predicted})
        labeled_test_df.to_csv("testing_predicted_df.csv")

In [None]:
# Load labeled_votes.csv, balance the 'trump' / 'biden' classes, then create the
# stratified train/validation split.
obj = supervised("labeled_votes")
# obj.get_dataframe()  # uncomment to inspect the loaded frame
obj.downsample()
obj.split_data()

In [None]:
# Build the BERT-based Keras classifier and print its layer summary.
obj.neural_network()

In [None]:
# Compile the model and fit it on the training split (long-running cell).
obj.training()

In [None]:
# Evaluate on the validation split: prints a classification report and writes
# validation_predicted_df.csv.
obj.predict_validation()

In [None]:
# Preprocess Forecasting_MSNBC.csv, predict a label for each document and write
# testing_predicted_df.csv (also writes preprocess_test_Forecasting_MSNBC.csv).
obj.testing("Forecasting_MSNBC")