# Sentiment analysis

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the train / validation / test splits, one DataFrame per CSV file.
df_train, df_valid, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('data/train.csv', 'data/valid.csv', 'data/test.csv')
)

In [3]:
# Preview the first rows of the training split (plain-text rendering).
print(df_train.head())

                                                text  label
0  I grew up (b. 1965) watching and loving the Th...      0
1  When I put this movie in my DVD player, and sa...      0
2  Why do people who do not know what a particula...      0
3  Even though I have great interest in Biblical ...      0
4  Im a die hard Dads Army fan and nothing will e...      1


In [4]:
# Preview the first rows of the validation split (plain-text rendering).
print(df_valid.head())

                                                text  label
0  It's been about 14 years since Sharon Stone aw...      0
1  someone needed to make a car payment... this i...      0
2  The Guidelines state that a comment must conta...      0
3  This movie is a muddled mish-mash of clichés f...      0
4  Before Stan Laurel became the smaller half of ...      0


In [5]:
# Preview the first rows of the test split (plain-text rendering).
print(df_test.head())

                                                text  label
0  I always wrote this series off as being a comp...      0
1  1st watched 12/7/2002 - 3 out of 10(Dir-Steve ...      0
2  This movie was so poorly written and directed ...      0
3  The most interesting thing about Miryang (Secr...      1
4  when i first read about "berlin am meer" i did...      0


# Preprocessing
* Change text to lower case
* Remove any URLs
* Remove punctuation
* Remove stopwords

In [8]:
# lower case
# Apply the same lower-casing to the `text` column of every split.
for split_df in (df_train, df_valid, df_test):
    split_df['text'] = split_df['text'].str.lower()

In [9]:
# remove URLS
import re

def remove_urls(text):
    """Return ``text`` with every URL-like token deleted.

    A token counts as a URL when it starts with ``http://``, ``https://`` or
    ``www.``; the match runs up to (not including) the next whitespace
    character. Surrounding whitespace is left untouched.
    """
    return re.sub(r'https?://\S+|www\.\S+', r'', text)

# Strip URLs from the `text` column of every split.
for split_df in (df_train, df_valid, df_test):
    split_df['text'] = split_df['text'].apply(remove_urls)

In [10]:
# removing punctuation
def remove_punctuation(text, chars='.,!?"\':;'):
    """Delete punctuation characters from ``text``.

    Parameters
    ----------
    text : str
        The string to clean.
    chars : str, optional
        Every character contained in this string is removed from ``text``.
        The default is the eight marks ``. , ! ? " ' : ;``. Other symbols
        (brackets, hyphens, ``@`` ...) are kept unless passed explicitly.

    Returns
    -------
    str
        ``text`` with all occurrences of the characters in ``chars`` removed.
    """
    # remove @? Might want to strip twitter usernames later
    # ('@' is intentionally not part of the default set; pass it via `chars`
    # if it should be stripped).
    # A single translate pass replaces one .replace() call per character.
    return text.translate(str.maketrans('', '', chars))

# Strip punctuation from the `text` column of every split.
for split_df in (df_train, df_valid, df_test):
    split_df['text'] = split_df['text'].apply(remove_punctuation)

In [11]:
# removing stop words
import nltk
from nltk.corpus import stopwords

import functools


@functools.lru_cache(maxsize=None)
def _stopword_set(language):
    """Load the NLTK stop-word list for ``language`` once and cache it as a frozenset."""
    return frozenset(stopwords.words(language))


def remove_stopwords(text, language='english'):
    """Drop stop words from ``text``.

    Parameters
    ----------
    text : str
        The string to filter. It is split on single spaces, so runs of
        several spaces produce empty tokens that are kept and re-joined,
        leaving the spacing between surviving words unchanged.
    language : str, optional
        Name of the NLTK stop-word list to use (default ``'english'``).

    Returns
    -------
    str
        The tokens of ``text`` that are not in the stop-word list, joined
        with single spaces.
    """
    # The list is fetched through a cached helper instead of being re-loaded
    # for every row, and membership is tested against a set, not a list.
    sw = _stopword_set(language)
    return ' '.join(w for w in text.split(' ') if w not in sw)

# Remove stop words from the `text` column of every split.
for split_df in (df_train, df_valid, df_test):
    split_df['text'] = split_df['text'].apply(remove_stopwords)

# Data quality (DQ) issues - missing data

In [12]:
# Print the number of missing `text` values per split (train, valid, test order).
for split_df in (df_train, df_valid, df_test):
    print(split_df['text'].isna().sum())

0
0
0


# Setup transformer model

In [13]:
# Loading the BERT Classifier and Tokenizer along with Input module
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures

# The tokenizer and the classifier are both loaded from the same checkpoint name.
checkpoint = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(checkpoint)
model = TFBertForSequenceClassification.from_pretrained(checkpoint)