In [1]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
import re
import gensim
import time
import pickle
#Absolute path to a local checkout of the project on the author's machine. None of the cells
#shown in this notebook reference it - they open their files by bare relative filename - so the
#notebook has to be run from the directory holding the data and model files.
cd = 'C:\\Users\\Giada\\Documents\\GitHub\\Tweet-Sentiment-Predictor\\'

In [2]:
#Scores a word against every category except the last and returns the index of the best match
def category_chooser(word,categories,w2vmodel):
    """Pick the index of the category that ``word`` is most similar to.

    The final entry of ``categories`` is never scored: its index is the
    fallback that is returned when no other category is similar enough.

    Parameters
    ----------
    word : the token to classify.
    categories : sequence of category names; the last one is the fallback.
    w2vmodel : object exposing ``wv.similarity(a, b)``.

    Returns
    -------
    int
        Index into ``categories`` of the winning category, or
        ``len(categories) - 1`` when no similarity is strictly above 0.6.
    """
    chosen_index = len(categories) - 1
    #A candidate has to beat this score; it starts at the minimum acceptable similarity
    score_to_beat = 0.6
    for index, category in enumerate(categories[:-1]):
        score = w2vmodel.wv.similarity(word, category)
        if score > score_to_beat:
            score_to_beat, chosen_index = score, index
    return chosen_index

In [3]:
def model_predictions(dataset,pos_model,neg_model,positive_target,negative_target):
    """Score ``dataset`` with the positive and the negative model.

    Each model's ``predict_proba`` is called on ``dataset`` and column 1 of
    the result is kept. ``positive_target`` and ``negative_target`` are
    accepted but not used by this function.

    Returns
    -------
    tuple
        ``(pos_probs, neg_probs)`` - column 1 of each model's
        ``predict_proba`` output, positive model first.
    """
    #Positive model is scored first, then the negative one
    positive_scores, negative_scores = (
        fitted_model.predict_proba(dataset)[:, 1]
        for fitted_model in (pos_model, neg_model)
    )
    return positive_scores, negative_scores

#### Transforming the data into a format that can be scored

In [4]:
#Pulling the stop words from nltk. They are kept under their own name (and as a set, for fast
#membership tests) so the imported `stopwords` corpus module is not overwritten - rebinding it
#made this cell fail when it was run a second time.
stop_words = set(stopwords.words('english'))
#Matches every non-alphabetical character so it can be stripped from each tweet word
replacement_values = re.compile('[^a-zA-Z]')

#Reading a long list of categories from the dataset category file, one per line
#(the with block closes the file, so no explicit close is needed)
with open('Dataset_Categories.txt','r') as file:
    categories = file.read().splitlines()

#Pulling the dataset to score, indexed by textID
test_data = pd.read_csv("kaggle_train.csv",index_col="textID")
test_target = list(test_data["sentiment"])

#Unpickling the models
#NOTE(review): pickle.load can execute arbitrary code - only load model files from a trusted source
with open('pos_model.sav', 'rb') as pickle_file:
    pos_model = pickle.load(pickle_file)
with open('neg_model.sav', 'rb') as pickle_file:
    neg_model = pickle.load(pickle_file)

w2vmodel = gensim.models.KeyedVectors.load('w2vmodel')
#Looked up once instead of once per word
w2v_vocabulary = w2vmodel.wv.vocab


def _clean_tweet(text):
    """Split a tweet into lowercase, alphabetical, non-stop-word tokens.

    Non-string input (for example a missing value read as NaN) yields an empty list.
    """
    if not isinstance(text, str):
        return []
    kept_words = []
    for token in text.split():
        #Dropping words that start with an @ or # as these are usernames and hashtags
        #(hashtags are often several words run together)
        if token[:1] in ('@', '#'):
            continue
        #Removing non-alphabetical characters
        token = replacement_values.sub('', token)
        #Dropping weblinks, which still start with http once the punctuation is stripped
        if token[:4] == 'http':
            continue
        #Making all words lowercase
        token = token.lower()
        #Dropping stop words and the empty strings left by purely non-alphabetical tokens.
        #NOTE(review): the nltk stop words are compared as-is, so any entry containing an
        #apostrophe cannot match a token that has had its punctuation stripped - left unchanged
        #here; confirm against the pipeline the models were trained with.
        if token and token not in stop_words:
            kept_words.append(token)
    return kept_words


#One row per tweet and one column per category, starting from all zeros
category_counts = np.zeros((len(test_data), len(categories)), dtype=int)
#The chosen category only depends on the word, so it is remembered for repeated words
category_cache = {}

#Iterating over the text values directly: the frame is indexed by textID, so looking a row up
#with an integer key relied on pandas' deprecated positional fallback
for row_number, text in enumerate(test_data["text"]):
    for word in _clean_tweet(text):
        #Only words in the model vocab can be compared with the categories
        if word not in w2v_vocabulary:
            continue
        if word not in category_cache:
            category_cache[word] = category_chooser(word=word,categories=categories,w2vmodel=w2vmodel)
        category_counts[row_number, category_cache[word]] += 1

dataset_final = pd.DataFrame(category_counts,columns=categories,index=range(len(test_data)))

#The frame was saved (and is scored further down) under the name modelling_dset, which was never
#defined; it is bound to the finished feature frame here.
modelling_dset = dataset_final
modelling_dset.to_csv('modelling_kaggle_dset_train.csv',index=True)

#### Scoring the dataset

In [5]:
test_target = list(test_data["sentiment"])

#Positive target list: 1 where the labelled sentiment is "positive", otherwise 0
test_target_pos = [1 if label == "positive" else 0 for label in test_target]

#Negative target list: 1 where the labelled sentiment is "negative", otherwise 0
test_target_neg = [1 if label == "negative" else 0 for label in test_target]

#Scoring the category-count features held in dataset_final. The call used to pass
#modelling_dset, a name that is never assigned, which raised a NameError on a fresh kernel.
#The two target lists are part of model_predictions' signature but it does not read them.
pos_probs,neg_probs = model_predictions(dataset=dataset_final
                        ,pos_model=pos_model
                        ,neg_model=neg_model
                        ,positive_target=test_target_pos
                        ,negative_target=test_target_neg)

#Keeping the probabilities next to each tweet for the cut-off search
test_data["pos_probs"] = pos_probs
test_data["neg_probs"] = neg_probs

#### Cycling through negative and positive cut-offs to find the pair of cut-offs which produces the most accurate predictions

In [6]:
#Grid search over the positive and negative cut-offs (0.30 to 0.64 in steps of 0.01 each)
#to find the pair which produces the most accurate predictions
pos_scores = test_data["pos_probs"].to_numpy()
neg_scores = test_data["neg_probs"].to_numpy()
true_labels = test_data["sentiment"].to_numpy(dtype=object)

#When both models clear their cut-off the larger probability wins. An exact tie is resolved as
#positive so that every row always receives one of the three real labels.
pos_beats_neg = pos_scores >= neg_scores

best_accuracy = 0
#Initialised so the prints below cannot hit an unbound name when no pair scores above 0
P_CO = None
N_CO = None
best_outcome = None
for pos_cutoff_x in range(35):
    pos_cutoff = 0.3 + 0.01*pos_cutoff_x
    #A probability exactly on the cut-off counts as not clearing it
    pos_flagged = pos_scores > pos_cutoff
    for neg_cutoff_x in range(35):
        neg_cutoff = 0.3 + 0.01*neg_cutoff_x
        neg_flagged = neg_scores > neg_cutoff

        #The labels are rebuilt from scratch for every pair, so nothing can carry over from the
        #previous pair: positive if only the positive model fires or it wins the head-to-head,
        #negative if the negative model fires otherwise, neutral if neither fires.
        outcome = np.where(pos_flagged & (~neg_flagged | pos_beats_neg), "positive",
                           np.where(neg_flagged, "negative", "neutral")).astype(object)

        #Calculating the accuracy of the prediction
        total_correct_predictions = (outcome == true_labels).sum()
        accuracy = total_correct_predictions/len(test_data)
        if  best_accuracy < accuracy:
            best_accuracy = accuracy
            P_CO = pos_cutoff
            N_CO = neg_cutoff
            best_outcome = outcome

#Storing the predictions made with the winning cut-offs, so the OUTCOME and ACCURACY columns
#describe the pair reported below rather than whichever pair happened to be tried last
if best_outcome is not None:
    test_data["OUTCOME"] = best_outcome
    test_data["ACCURACY"] = (best_outcome == true_labels).astype(float)

print("Positive cutoff " + str(P_CO))
print("Negative cutoff " + str(N_CO))
print("Best accuracy " + str(best_accuracy))

Positive cutoff 0.39
Negative cutoff 0.5700000000000001
Best accuracy 0.544708545557442
