In [1]:
import pandas as pd
import numpy as np
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
# ML Libraries
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Building the model with pre-labelled data

### Training data was downloaded from here: https://www.dropbox.com/s/du1z2m910a68ehk/training.csv?dl=0

In [3]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\smd_l\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [4]:
stop_words = set(stopwords.words('english'))

In [5]:
def load_dataset(filename, cols):
    dataset = pd.read_csv(filename, encoding = 'latin-1')
    dataset.columns = cols
    return dataset

In [6]:
def remove_unwanted_cols(dataset, cols):
    for col in cols:
        del dataset[col]
    return dataset

In [7]:
def preprocess_tweet_text(tweet):
    tweet.lower()
    tweet = re.sub(r"http\S+|www\S+|https\S+", '', tweet, flags=re.MULTILINE)
    tweet = re.sub(r'\@\w+|\#', '', tweet)
    tweet = tweet.translate(str.maketrans('', '', string.punctuation))
    tweet_tokens = word_tokenize(tweet)
    filtered_words = [w for w in tweet_tokens if not w in stop_words]
    
    return " ".join(filtered_words)

In [13]:
def get_feature_vector(train_fit):
    vector = TfidfVectorizer(sublinear_tf=True)
    vector.fit(train_fit)
    return vector

In [9]:
def int_to_string(sentiment):
    if sentiment == 0:
        return 'negative'
    elif sentiment == 2:
        return 'Neutral'
    else:
        return 'Positive'

In [11]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\smd_l\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [None]:
dataset = load_dataset('training.csv', ['target', 't_id', 'created_at', 'query', 'user', 'text'])

n_dataset = remove_unwanted_cols(dataset, ['t_id', 'created_at', 'query', 'user'])

dataset.text = dataset['text'].apply(preprocess_tweet_text)

tf_vector = get_feature_vector(np.array(dataset.iloc[:, 1]).ravel())
x = tf_vector.transform(np.array(dataset.iloc[:, 1]).ravel())
y = np.array(dataset.iloc[:, 0]).ravel()
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=30)

NB_model = MultinomialNB()
NB_model.fit(x_train, y_train)
y_predict_nb = NB_model.predict(x_test)
print(accuracy_score(y_test, y_predict_nb))

LR_model = LogisticRegression(solver='lbfgs')
LR_model.fit(x_train, y_train)
y_predict_lr = LR_model.predict(x_test)
print(accuracy_score(y_test, y_predict_lr))

# Use the model on downloaded twits 
## Use get_twits notebook to download a particular topic by changing the query. 

In [40]:
test_file_name = 'palantir_ipo.csv'
test_ds = load_dataset(test_file_name, ['created_at', 'text'])
test_ds = remove_unwanted_cols(test_ds, ['created_at'])

test_ds.text = test_ds['text'].apply(preprocess_tweet_text)
test_feature = tf_vector.transform(np.array(test_ds.iloc[:, 0]).ravel())

test_prediction_lr = LR_model.predict(test_feature)

In [41]:
unique, counts = np.unique(test_prediction_lr, return_counts=True)
dict(zip(unique, counts))

{0: 2152, 4: 7505}

In [49]:
test_prediction_lr.size

9657

### What has been showed from this result is that there are a lot of positive sentiment about the company within the last 10 days or so. The company is expecting to have an IPO this month, should these sentiment translate to a big swing upside once the stock becomes available for trading? 