In [1]:
import pandas as pd
import sklearn

In [2]:
# Load the review dataset from a CSV expected in the notebook's working directory.
# The rest of the notebook reads the REVIEW (text) and RATING (numeric) columns.
df = pd.read_csv("DS_group_project.csv")
# Bare expression so the notebook renders the frame as this cell's output.
df

Unnamed: 0,ID,REVIEW,RATING,AUTHOR,TITLE
0,1,I don't get all the terrible reviews for this ...,5.0,margarida-44311,Not Bad\n
1,2,I cannot believe anyone could give this film l...,8.0,joemay-2,What are all the bad reviews about is it a wo...
2,3,Great White is not the worst way to spend 90 m...,4.0,nebk,Great White=Jaws Lite\n
3,4,Great White is as basic of a killer shark film...,4.0,kuarinofu,Bare-bones killer shark film\n
4,5,"Terrible story, dialogue and CGI. The film has...",4.0,Horror_Flick_Fanatic,"Terrible story, dialogue, and CGI\n"
...,...,...,...,...,...
5445,5446,"It's master piece by Zack please part 2,3,4 al...",10.0,suryajijvania,More Parts\n
5446,5447,No words to describe. It's awesome. One of the...,10.0,shishirkmr-82243,It's a fantastic movie\n
5447,5448,Far better than previous one and better editin...,10.0,moizsyed-07601,Awesome out standing!\n
5448,5449,Why did the studio say no to this masterpiece?...,10.0,samun_shrestha,EPIC\n


In [None]:
import re
import string
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
from nltk.corpus import stopwords
from collections import defaultdict

def clean_text(text):
    """Return ``text`` with ASCII punctuation removed and all letters lower-cased.

    Args:
        text: the string to normalise.

    Returns:
        A new string; every character listed in ``string.punctuation`` is
        deleted (not replaced by a space) before lower-casing.
    """
    punctuation_class = '[%s]' % re.escape(string.punctuation)
    return re.sub(punctuation_class, '', text).lower()


def remove_emojis(text):
    """Delete every character that falls in one of four emoji code-point ranges.

    Args:
        text: the string to filter.

    Returns:
        ``text`` with all matching characters removed; everything else is
        left exactly as it was.
    """
    emoji_ranges = (
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    )
    matcher = re.compile("[" + emoji_ranges + "]+", flags=re.UNICODE)
    return matcher.sub("", text)

# --- Bag-of-words features --------------------------------------------------
# Every preprocessing stage is kept in its own variable and df["REVIEW"] is
# left untouched. A later cell fits a TfidfVectorizer on df["REVIEW"], which
# needs the raw review strings; overwriting the column with token lists and
# count dicts made that cell fail when the notebook is run top to bottom.
reviews_no_emoji = df["REVIEW"].apply(remove_emojis)
reviews_clean = reviews_no_emoji.apply(clean_text)
reviews_tokenized = reviews_clean.apply(word_tokenize)

stop_words = set(stopwords.words("english"))

def remove_stop_words(tokens):
    """Return the entries of ``tokens`` that are not in the global ``stop_words`` set."""
    return [word for word in tokens if word not in stop_words]

reviews_filtered = reviews_tokenized.apply(remove_stop_words)

def build_vocabulary(reviews):
    """Count how often each word occurs across all reviews.

    Args:
        reviews: iterable of token lists.

    Returns:
        ``defaultdict(int)`` mapping each word to its total number of occurrences.
    """
    vocabulary = defaultdict(int)
    for review in reviews:
        for word in review:
            vocabulary[word] += 1
    return vocabulary

vocabulary = build_vocabulary(reviews_filtered)

def review_to_bow(review, vocabulary):
    """Turn one tokenised review into a word -> count mapping.

    Args:
        review: list of tokens for a single review.
        vocabulary: container of known words; tokens not in it are skipped.

    Returns:
        ``defaultdict(int)`` with the per-review count of every known word.
    """
    bow = defaultdict(int)
    for word in review:
        if word in vocabulary:
            bow[word] += 1
    return bow

review_bows = reviews_filtered.apply(lambda tokens: review_to_bow(tokens, vocabulary))

import numpy as np

# replace NaN values with 0
df.fillna(value=0, inplace=True)

# replace infinite values with a large finite value
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.fillna(value=1e10, inplace=True)

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# convert bag of words to a numerical representation using DictVectorizer
vectorizer = DictVectorizer()
X = vectorizer.fit_transform(review_bows)
y = df["RATING"]

# split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# train the logistic regression model
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

# train a naive bayes model on the training data
naive_bayes = MultinomialNB()
naive_bayes.fit(X_train , y_train)

# evaluate both models on the test set
accuracy = clf.score(X_test, y_test)
naivebayesaccuracy = naive_bayes.score(X_test, y_test)
print("Logistic Regression Accuracy: ", accuracy,"\nNaive Bayes Accuracy: ",naivebayesaccuracy)

In [3]:
# rosie's models: linear regression, single perceptron and multilayer perceptron net

from sklearn import metrics
from sklearn.linear_model import Perceptron
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import time

# making a binary sentiment column so that the Perceptron can process the input

def sentiment_processor_binary(sentiment_list, midpoint):
    """Binarise numeric sentiment scores around a threshold.

    Args:
        sentiment_list: iterable of numeric scores.
        midpoint: threshold; a score counts as positive when ``score >= midpoint``.

    Returns:
        A list with 1 for every score at or above ``midpoint`` and 0 otherwise,
        in input order.
    """
    return [1 if score >= midpoint else 0 for score in sentiment_list]

# Ratings of 5 and above are labelled 1, everything below is labelled 0.
df["BINARY_SENTIMENT"] = sentiment_processor_binary(df["RATING"], 5)

# splitting the dataset: 10% of the rows are held out for testing; the fixed
# random_state keeps the split reproducible.

train_features, test_features, train_targets, test_targets = train_test_split(df["REVIEW"], df["BINARY_SENTIMENT"], 
                                                                              test_size = 0.1, random_state=156)

# turning the reviews into a tf-idf array

# vectorise and remove stopwords (built-in English list, lower-cased input,
# L1-normalised rows)

vectoriser = TfidfVectorizer(stop_words="english", lowercase=True, norm="l1")

# run on training and testing reviews: the vocabulary is fitted on the training
# split only and then reused, unchanged, for the test split.
# From here on train_features / test_features hold the tf-idf output rather
# than the raw review text.

train_features = vectoriser.fit_transform(train_features)
test_features = vectoriser.transform(test_features)

# building a basic Perceptron and testing it, printing the accuracy.

reviews_perceptron = Perceptron()
reviews_perceptron.fit(train_features, train_targets)
# score() is evaluated on the held-out test split.
accuracy = reviews_perceptron.score(test_features, test_targets)

print(f"The accuracy for a regular Perceptron is {accuracy}.")

# building a multilayer neural net and testing it, printing accuracy.

# several different models are needed, so model creation is wrapped in the
# functions defined below.

print("The next results are relevant to the multilayer Perceptron neural net.")

def create_neural_net(train_features, train_targets, test_features, test_targets, talking = False,
                      neurons = 2, iterations = 200):
    """Train an SGD-based MLPClassifier, print its timing and test accuracy, and return it.

    Args:
        train_features: feature matrix used for fitting.
        train_targets: labels matching ``train_features``.
        test_features: feature matrix used for the accuracy check.
        test_targets: labels matching ``test_features``.
        talking: forwarded as ``verbose``; True prints the loss per iteration.
        neurons: forwarded as ``hidden_layer_sizes``.
        iterations: forwarded as ``max_iter``.

    Returns:
        The fitted MLPClassifier, so callers can reuse it for further
        predictions (same contract as ``create_adam``).
    """
    tick = time.perf_counter()
    neural_net = MLPClassifier(hidden_layer_sizes = neurons, max_iter = iterations, activation = "relu",
                               solver = "sgd", random_state = 800, learning_rate = "adaptive", verbose = talking)
    neural_net.fit(train_features, train_targets)
    tock = time.perf_counter()
    print(f"Training finished in {tock - tick:0.4f} seconds")

    # Accuracy is measured on the held-out data, never on the training data.
    neural_net_predictions = neural_net.predict(test_features)
    neural_net_accuracy = metrics.accuracy_score(test_targets, neural_net_predictions)

    print(f"The accuracy for {neurons} neurons per hidden layer and {iterations} max iterations is" +
          f" {neural_net_accuracy}.")

    return neural_net
    
# testing out another neural net as the "adam" solver is good for large datasets with thousands of entries.

def create_adam(train_features, train_targets, test_features, test_targets, talking = False,
                neurons = 2, iterations = 200):
    """Fit an MLPClassifier that uses the "adam" solver and report how it did.

    Prints the wall-clock training time and the accuracy on the test data.

    Args:
        train_features: feature matrix used for fitting.
        train_targets: labels matching ``train_features``.
        test_features: feature matrix used for the accuracy check.
        test_targets: labels matching ``test_features``.
        talking: forwarded as ``verbose``; True prints the loss per iteration.
        neurons: forwarded as ``hidden_layer_sizes``.
        iterations: forwarded as ``max_iter``.

    Returns:
        The fitted MLPClassifier.
    """
    started = time.perf_counter()
    model = MLPClassifier(
        solver="adam",
        activation="relu",
        hidden_layer_sizes=neurons,
        max_iter=iterations,
        random_state=800,
        verbose=talking,
    )
    model.fit(train_features, train_targets)
    finished = time.perf_counter()
    print(f"Training finished in {finished - started:0.4f} seconds")

    predicted = model.predict(test_features)
    test_accuracy = metrics.accuracy_score(test_targets, predicted)
    print(
        f"The accuracy for {neurons} neurons per hidden layer and {iterations} max iterations is"
        f" {test_accuracy}."
    )

    return model
    
# creating a simple neural net with default values

# Baseline run: relies on the function defaults (talking=False, neurons=2,
# iterations=200).
create_neural_net(train_features, train_targets, test_features, test_targets)

# creating a more complex neural net

# Positional extras: talking=False, neurons=30. As the last expression of the
# cell, the returned model is displayed as the cell output.
create_adam(train_features, train_targets, test_features, test_targets, False, 30)

The accuracy for a regular Perceptron is 0.8.
The next results are relevant to the multilayer Perceptron neural net.
Training finished in 3.6559 seconds
The accuracy for 2 neurons per hidden layer and 200 max iterations is 0.5302752293577981.
Training finished in 67.5765 seconds
The accuracy for 30 neurons per hidden layer and 200 max iterations is 0.8330275229357799.


MLPClassifier(hidden_layer_sizes=30, random_state=800)

In [4]:
# The CSV is read without a header row, so column names are attached afterwards.
# set_axis() is used in its non-inplace form and assigned back: the `inplace`
# argument was deprecated in pandas 1.5 and removed in pandas 2.0, where
# passing it raises a TypeError.
twitter_data = pd.read_csv("training.1600000.processed.noemoticon.csv", header=None)
twitter_data = twitter_data.set_axis(
    ["SENTIMENT", "ID", "DATE", "QUERY", "USERNAME", "TEXT"], axis=1
)

# SENTIMENT values of 2 and above are labelled 1, lower values are labelled 0.
twitter_data["BINARY_SENTIMENT"] = sentiment_processor_binary(twitter_data["SENTIMENT"], 2)

twitter_data

Unnamed: 0,SENTIMENT,ID,DATE,QUERY,USERNAME,TEXT,BINARY_SENTIMENT
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",0
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,0
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,0
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,0
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",0
...,...,...,...,...,...,...,...
1599995,4,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke up. Having no school is the best fee...,1
1599996,4,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDB.com - Very cool to hear old Walt interv...,1
1599997,4,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover? Ask me f...,1
1599998,4,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! ...,1


In [5]:
import re
import string

def clean_tweets(twitter_data):
    """Strip tweet-specific noise from every string in ``twitter_data``.

    Applied in this order to each tweet:
      1. @mentions            -> single space
      2. http/https links     -> single space
      3. #hashtags            -> single space
      4. apostrophe suffixes  -> removed (e.g. the "'s" in "it's")
      5. ASCII punctuation    -> single space
      6. words with a digit   -> removed
      7. runs of 2+ whitespace characters -> single space

    Leading/trailing single spaces are not trimmed. Progress and timing
    messages are printed before and after the loop.

    Args:
        twitter_data: iterable of tweet strings (for example a DataFrame column).

    Returns:
        list of cleaned tweet strings, in input order.
    """
    print("debug: tweet cleaning begins! huge dataset, will take a while")

    # Patterns are raw strings (the plain-string forms contained invalid escape
    # sequences that newer Python versions warn about) and are compiled once
    # here instead of being rebuilt for every tweet.
    substitutions = (
        (re.compile(r"@\S+"), " "),
        (re.compile(r"https*\S+"), " "),
        (re.compile(r"#\S+"), " "),
        (re.compile(r"'\w+"), ""),
        (re.compile("[%s]" % re.escape(string.punctuation)), " "),
        (re.compile(r"\w*\d+\w*"), ""),
        (re.compile(r"\s{2,}"), " "),
    )

    cleaned_twitter_data = []
    tick = time.perf_counter()
    for tweet in twitter_data:
        # Order matters: mentions/links/hashtags must go before punctuation is
        # stripped, otherwise their marker characters would already be gone.
        for pattern, replacement in substitutions:
            tweet = pattern.sub(replacement, tweet)
        cleaned_twitter_data.append(tweet)
    tock = time.perf_counter()
    print(f"Tweet cleaning finished in {tock - tick:0.4f} seconds")
    return cleaned_twitter_data

# Clean every tweet, then map it onto the vocabulary that `vectoriser` was
# fitted with on the review training split (transform only, no refit), so the
# result has the same feature columns as the review matrices.
twitter_test = vectoriser.transform(clean_tweets(twitter_data["TEXT"]))

debug: tweet cleaning begins! huge dataset, will take a while
Tweet cleaning finished in 45.1251 seconds


In [9]:
print("The next statements refer to use of the neural net on the twitter dataset.")

print("Fresh neural net trained and tested on iMDB dataset.")

# Train a new adam-based net on the review features; positional extras are
# talking=True (prints the loss for every iteration) and neurons=30.
neural_adam = create_adam(train_features, train_targets, test_features, test_targets, True, 30)

# Apply the review-trained model to the vectorised tweets and compare its
# predictions with the binary tweet labels.
neural_adam_predictions = neural_adam.predict(twitter_test)
neural_adam_accuracy = metrics.accuracy_score(twitter_data["BINARY_SENTIMENT"], neural_adam_predictions)

print(f"The accuracy of the model when applied to the twitter dataset is {neural_adam_accuracy}")

The next statements refer to use of the neural net on the twitter dataset.
Fresh neural net trained and tested on iMDB dataset.
Iteration 1, loss = 0.71162570
Iteration 2, loss = 0.69229549
Iteration 3, loss = 0.67128990
Iteration 4, loss = 0.64964658
Iteration 5, loss = 0.62615624
Iteration 6, loss = 0.60142724
Iteration 7, loss = 0.57472329
Iteration 8, loss = 0.54697630
Iteration 9, loss = 0.51875070
Iteration 10, loss = 0.49071799
Iteration 11, loss = 0.46343046
Iteration 12, loss = 0.43731391
Iteration 13, loss = 0.41247794
Iteration 14, loss = 0.38907122
Iteration 15, loss = 0.36745111
Iteration 16, loss = 0.34738398
Iteration 17, loss = 0.32874921
Iteration 18, loss = 0.31167624
Iteration 19, loss = 0.29574439
Iteration 20, loss = 0.28116705
Iteration 21, loss = 0.26768997
Iteration 22, loss = 0.25518054
Iteration 23, loss = 0.24347841
Iteration 24, loss = 0.23272307
Iteration 25, loss = 0.22263733
Iteration 26, loss = 0.21317015
Iteration 27, loss = 0.20438766
Iteration 28, los