In [None]:
import pandas as pd
import sklearn

In [None]:
# Load the raw reviews; later cells read the "REVIEW" and "RATING" columns.
df = pd.read_csv("DS_group_project.csv")
# Preview only the first rows instead of rendering the whole frame.
df.head()

In [None]:
import re
import string
import nltk
# Tokenizer data for word_tokenize. Recent NLTK releases look up the
# "punkt_tab" resource instead of the pickled "punkt" models, so fetch both
# to keep the notebook runnable on old and new NLTK versions.
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize
# Stop-word lists used by stopwords.words("english") further down.
nltk.download('stopwords')
from nltk.corpus import stopwords
from collections import defaultdict

def clean_text(text):
    """Strip ASCII punctuation from `text` and lower-case what is left.

    Every character listed in ``string.punctuation`` is deleted outright
    (it is not replaced by a space), so "well-made" becomes "wellmade".

    Parameters:
        text: the string to normalise.

    Returns:
        The lower-cased string without punctuation characters.
    """
    # Character class matching any single punctuation character.
    punctuation_class = '[%s]' % re.escape(string.punctuation)
    without_punctuation = re.sub(punctuation_class, '', text)
    return without_punctuation.lower()


def remove_emojis(text):
    """Return `text` with emoji code points deleted.

    Besides the original four ranges, this also strips the supplemental and
    extended pictograph blocks, miscellaneous symbols, dingbats, and the two
    invisible characters (variation selector-16, zero-width joiner) that
    accompany many emoji sequences; otherwise those characters would be left
    behind in the text.

    Parameters:
        text: the string to clean.

    Returns:
        The string with every run of matching characters removed. Surrounding
        whitespace is left untouched.
    """
    emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
        u"\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
        u"\u2600-\u26FF"          # miscellaneous symbols
        u"\u2700-\u27BF"          # dingbats
        u"\uFE0F"                 # variation selector-16 (emoji presentation)
        u"\u200D"                 # zero-width joiner used in emoji sequences
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

# Normalise every review in three passes: drop emojis, drop punctuation and
# lower-case, then split the text into word tokens. After this statement each
# REVIEW cell holds a list of tokens instead of a string.
# NOTE(review): punctuation (including apostrophes) is stripped before the
# stop-word filter runs, so contractions such as "don't" arrive as "dont" --
# confirm the stop-word list still matches them as intended.
df["REVIEW"] = (
    df["REVIEW"]
    .apply(remove_emojis)
    .apply(clean_text)
    .apply(word_tokenize)
)

# English stop words, held in a set for fast membership checks.
stop_words = set(stopwords.words("english"))

def remove_stop_words(tokens):
    """Return the tokens that are not in the global `stop_words`, in their original order."""
    kept = []
    for token in tokens:
        if token in stop_words:
            continue
        kept.append(token)
    return kept

# Drop stop words from every tokenised review.
df["REVIEW"] = df["REVIEW"].apply(remove_stop_words)

def build_vocabulary(reviews):
    """Count how often each token occurs across all reviews.

    Parameters:
        reviews: an iterable whose items are iterables of tokens.

    Returns:
        A ``defaultdict(int)`` mapping each token to its total number of
        occurrences over every review.
    """
    token_counts = defaultdict(int)
    # Flatten the reviews into a single stream of tokens.
    all_tokens = (token for tokens in reviews for token in tokens)
    for token in all_tokens:
        token_counts[token] += 1
    return token_counts

# Corpus-wide token counts over the tokenised, stop-word-filtered reviews.
vocabulary = build_vocabulary(reviews=df["REVIEW"])

def review_to_bow(review, vocabulary):
    """Build the bag-of-words counts for a single review.

    Parameters:
        review: an iterable of tokens.
        vocabulary: a container supporting ``in``; tokens that are not in it
            are ignored.

    Returns:
        A ``defaultdict(int)`` mapping each in-vocabulary token to the number
        of times it appears in `review`.
    """
    counts = defaultdict(int)
    known_words = (word for word in review if word in vocabulary)
    for word in known_words:
        counts[word] += 1
    return counts

# Replace each token list with its token -> count mapping. From here on the
# REVIEW column holds mappings, not text or token lists.
df["REVIEW"] = df["REVIEW"].apply(review_to_bow, vocabulary=vocabulary)

import numpy as np

# Fill every missing cell in the frame with 0.
# NOTE(review): this applies to all columns, including RATING, which is used
# as the classification target below -- a missing rating would become the
# label 0. Confirm that is intended.
df.fillna(0, inplace=True)

# Turn +/-inf into NaN first so that the fillna below maps them to 1e10.
df.replace(to_replace=[np.inf, -np.inf], value=np.nan, inplace=True)
df.fillna(1e10, inplace=True)

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Turn the per-review token counts into a numeric feature matrix; the
# RATING column is the prediction target.
vectorizer = DictVectorizer()
X = vectorizer.fit_transform(df["REVIEW"])
y = df["RATING"]

# Hold out 20% of the rows for evaluation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Fit both classifiers on the same training split.
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)
naive_bayes = MultinomialNB().fit(X_train, y_train)

# Score both models on the held-out split and report the results.
accuracy = clf.score(X_test, y_test)
naivebayesaccuracy = naive_bayes.score(X_test, y_test)
print("Logistic Regression Accuracy: ", accuracy,"\nNaive Bayes Accuracy: ",naivebayesaccuracy)

In [None]:
# rosie's models: linear regression, single perceptron and multilayer perceptron net

from sklearn import metrics
from sklearn.linear_model import Perceptron
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# making a binary sentiment column so that the Perceptron can process the input

def sentiment_processor_binary(sentiment_list):
    """Convert ratings into binary labels.

    Parameters:
        sentiment_list: an iterable of numeric ratings.

    Returns:
        A list with 1 for every rating that is >= 5 and 0 for every other
        rating, in the same order as the input.
    """
    return [1 if item >= 5 else 0 for item in sentiment_list]

# Binary target column: 1 where RATING >= 5, otherwise 0.
df["BINARY_SENTIMENT"] = sentiment_processor_binary(sentiment_list=df["RATING"])

# Hold out 10% of the reviews for testing, with a fixed seed.

(train_features, test_features,
 train_targets, test_targets) = train_test_split(
    df["REVIEW"],
    df["BINARY_SENTIMENT"],
    test_size=0.1,
    random_state=156,
)

# turning the reviews into a tf-idf array

def _review_to_text(review):
    """Return `review` as a single whitespace-separated string.

    The preprocessing cell replaces each REVIEW value with a token list and
    then with a token -> count mapping, but TfidfVectorizer lower-cases its
    documents as strings and fails on those objects. Plain strings are
    returned unchanged.
    """
    if isinstance(review, str):
        return review
    if isinstance(review, dict):
        # Repeat each token `count` times so the term frequencies survive.
        return " ".join(
            str(token) for token, count in review.items() for _ in range(int(count))
        )
    if isinstance(review, (list, tuple)):
        return " ".join(str(token) for token in review)
    return str(review)

# vectorise and remove stopwords

vectoriser = TfidfVectorizer(stop_words="english", lowercase=True, norm="l1")

# fit on the training reviews only, then reuse the fitted vectoriser on the testing reviews

train_features = vectoriser.fit_transform([_review_to_text(review) for review in train_features])
test_features = vectoriser.transform([_review_to_text(review) for review in test_features])

# Baseline: a single Perceptron with default settings, scored on the test split.

reviews_perceptron = Perceptron().fit(train_features, train_targets)
accuracy = reviews_perceptron.score(test_features, test_targets)

print(f"The accuracy for a regular Perceptron is {accuracy}.")

# Next: multilayer neural nets. Several configurations are tried, so the
# fit-and-report steps are wrapped in functions below.

print("The next results are relevant to the multilayer Perceptron neural net.")

def create_neural_net(train_features, train_targets, test_features, test_targets, talking=False,
                      neurons=2, iterations=200):
    """Fit an SGD-trained MLPClassifier and print its accuracy on the test data.

    Parameters:
        train_features, train_targets: data the network is fitted on.
        test_features, test_targets: data the accuracy is computed on.
        talking: forwarded as the classifier's ``verbose`` flag.
        neurons: forwarded as ``hidden_layer_sizes``.
        iterations: forwarded as ``max_iter``.

    Returns:
        None; the accuracy is only printed.
    """
    classifier = MLPClassifier(
        hidden_layer_sizes=neurons,
        max_iter=iterations,
        activation="relu",
        solver="sgd",
        random_state=800,
        learning_rate="adaptive",
        verbose=talking,
    )
    classifier.fit(train_features, train_targets)
    test_accuracy = metrics.accuracy_score(test_targets, classifier.predict(test_features))

    print(f"The accuracy for {neurons} neurons per hidden layer and {iterations} max iterations is"
          f" {test_accuracy}.")
    
# testing out another neural net as the "adam" solver is good for large datasets with thousands of entries.

def create_adam(train_features, train_targets, test_features, test_targets, talking=False,
                neurons=2, iterations=200):
    """Fit an MLPClassifier with the "adam" solver and print its accuracy on the test data.

    Parameters:
        train_features, train_targets: data the network is fitted on.
        test_features, test_targets: data the accuracy is computed on.
        talking: forwarded as the classifier's ``verbose`` flag.
        neurons: forwarded as ``hidden_layer_sizes``.
        iterations: forwarded as ``max_iter``.

    Returns:
        None; the accuracy is only printed.
    """
    classifier = MLPClassifier(
        hidden_layer_sizes=neurons,
        max_iter=iterations,
        activation="relu",
        solver="adam",
        random_state=800,
        verbose=talking,
    )
    classifier.fit(train_features, train_targets)
    test_accuracy = metrics.accuracy_score(test_targets, classifier.predict(test_features))

    print(f"The accuracy for {neurons} neurons per hidden layer and {iterations} max iterations is"
          f" {test_accuracy}.")
    
# creating a simple neural net with default values

create_neural_net(
    train_features=train_features,
    train_targets=train_targets,
    test_features=test_features,
    test_targets=test_targets,
)

# A larger "adam" network: 30 neurons, with training progress printed.

create_adam(
    train_features=train_features,
    train_targets=train_targets,
    test_features=test_features,
    test_targets=test_targets,
    talking=True,
    neurons=30,
)
