In [1]:
import pandas as pd
import numpy as np
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
# ML Libraries
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Building the model with pre-labelled data

In [2]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\smd_l\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
stop_words = set(stopwords.words('english'))

## Prepared all function to be applied to data

### Importing data (csv files contain twits) and remove all columns except text column

In [4]:
def load_dataset(filename, cols):
    dataset = pd.read_csv(filename, encoding = 'latin-1')
    dataset.columns = cols
    return dataset

In [5]:
def remove_unwanted_cols(dataset, cols):
    for col in cols:
        del dataset[col]
    return dataset

### Pre-process text columns
  #### Lowercase all texts
  #### Remove noises: HTML, special characters, punctuation, white spaces
  #### Tokenizing: turn twits into tokens, tokens are words separated by space
  #### Remove stopwords in English: the, is, that, etc.

In [6]:
def preprocess_tweet_text(tweet):
    tweet.lower()
    tweet = re.sub(r"http\S+|www\S+|https\S+", '', tweet, flags=re.MULTILINE)
    tweet = re.sub(r'\@\w+|\#', '', tweet)
    tweet = tweet.translate(str.maketrans('', '', string.punctuation))
    tweet_tokens = word_tokenize(tweet)
    filtered_words = [w for w in tweet_tokens if not w in stop_words]
    
    return " ".join(filtered_words)

### Vectorizing: converting tokens into number by using TfidfVectorizer
#### Compute word counts
#### Compute IDF values for each word
#### Compute Tf-idf score

In [7]:
def get_feature_vector(train_fit):
    vector = TfidfVectorizer(sublinear_tf=True)
    vector.fit(train_fit)
    return vector

### Train twits already labelled 0: negative, 2: neutral and 4: positive. So we will convert these number to human readable

In [8]:
def int_to_string(sentiment):
    if sentiment == 0:
        return 'negative'
    elif sentiment == 2:
        return 'Neutral'
    else:
        return 'Positive'

In [9]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\smd_l\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Loading training data and applying text pre-processing
 #### We will split the training data into Train and Test datasets with Test data at 20% total data
 #### We will train the model with Naive Bayes and Logistics Regression and pick the model with higher accuracy for actual twits

In [10]:
dataset = load_dataset('training.csv', ['target', 't_id', 'created_at', 'query', 'user', 'text'])

n_dataset = remove_unwanted_cols(dataset, ['t_id', 'created_at', 'query', 'user'])

dataset.text = dataset['text'].apply(preprocess_tweet_text)

tf_vector = get_feature_vector(np.array(dataset.iloc[:, 1]).ravel())
x = tf_vector.transform(np.array(dataset.iloc[:, 1]).ravel())
y = np.array(dataset.iloc[:, 0]).ravel()
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=30)

NB_model = MultinomialNB()
NB_model.fit(x_train, y_train)
y_predict_nb = NB_model.predict(x_test)
print(accuracy_score(y_test, y_predict_nb))

LR_model = LogisticRegression(solver='lbfgs')
LR_model.fit(x_train, y_train)
y_predict_lr = LR_model.predict(x_test)
print(accuracy_score(y_test, y_predict_lr))

0.768521875
0.7877875




# Applying the LR model to actual twits about Palantir

In [13]:
test_file_name = 'palantir_ipo.csv'
test_ds = load_dataset(test_file_name, ['created_at', 'text'])
test_ds = remove_unwanted_cols(test_ds, ['created_at'])

test_ds.text = test_ds['text'].apply(preprocess_tweet_text)
test_feature = tf_vector.transform(np.array(test_ds.iloc[:, 0]).ravel())

test_prediction_lr = LR_model.predict(test_feature)

In [14]:
unique, counts = np.unique(test_prediction_lr, return_counts=True)
dict(zip(unique, counts))

{0: 2157, 4: 7532}

In [16]:
test_prediction_lr.size

9689