In [2]:
import nltk

In [5]:
# split string into list of words

example_text = "Last night I had a dream about the girl who takes the train, she was all alone, the other ones had gone"
words = example_text.split()
words

['Last',
 'night',
 'I',
 'had',
 'a',
 'dream',
 'about',
 'the',
 'girl',
 'who',
 'takes',
 'the',
 'train,',
 'she',
 'was',
 'all',
 'alone,',
 'the',
 'other',
 'ones',
 'had',
 'gone']

In [6]:
# set all lower cases

lower_words = [w.lower() for w in words]
lower_words[0:10]

['last', 'night', 'i', 'had', 'a', 'dream', 'about', 'the', 'girl', 'who']

In [11]:
# remove stopwords
from nltk.corpus import stopwords as nltk_stopwords

stopwords = nltk_stopwords.words('english')


In [13]:
print(len(set(lower_words)))
useful_words = [word for word in lower_words if word not in stopwords]
print(len(set(useful_words)))
print("Before")
print(lower_words)
print("After")
print(useful_words)

19
9
Before
['last', 'night', 'i', 'had', 'a', 'dream', 'about', 'the', 'girl', 'who', 'takes', 'the', 'train,', 'she', 'was', 'all', 'alone,', 'the', 'other', 'ones', 'had', 'gone']
After
['last', 'night', 'dream', 'girl', 'takes', 'train,', 'alone,', 'ones', 'gone']


In [14]:
from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer('english')
stemmer.stem("having")

'have'

In [15]:
# map words to their stems

stemmed_words = [stemmer.stem(word) for word in useful_words]
print(useful_words[:10])
print(stemmed_words[:10])

['last', 'night', 'dream', 'girl', 'takes', 'train,', 'alone,', 'ones', 'gone']
['last', 'night', 'dream', 'girl', 'take', 'train,', 'alone,', 'one', 'gone']


In [22]:
from nltk import WordNetLemmatizer
nltk.download('wordnet')
lem = WordNetLemmatizer()
lem.lemmatize("having")

[nltk_data] Downloading package wordnet to /Users/paolo/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


'having'

In [24]:
lemmatised_words = [lem.lemmatize(word, 'v') for word in useful_words]

print("useful", useful_words[:10])
print("stemmed", stemmed_words[:10])
print("lemmatised", lemmatised_words[:10])

useful ['last', 'night', 'dream', 'girl', 'takes', 'train,', 'alone,', 'ones', 'gone']
stemmed ['last', 'night', 'dream', 'girl', 'take', 'train,', 'alone,', 'one', 'gone']
lemmatised ['last', 'night', 'dream', 'girl', 'take', 'train,', 'alone,', 'ones', 'go']


In [25]:
from sklearn.feature_extraction.text import CountVectorizer

vec = CountVectorizer(binary=True,
                      stop_words='english',
                      lowercase=True # default
                     )

In [26]:
X_train_text = vec.fit_transform(["I sit on a table I bought"])
X_test_text = vec.transform(["I sat on a table I will buy"])


In [28]:
print(X_train_text)

  (0, 0)	1
  (0, 2)	1
  (0, 1)	1


In [29]:
print(X_test_text)

  (0, 2)	1


In [41]:
import pandas as pd

fin_tweets = pd.read_csv("/Users/paolo/Downloads/financial-tweets/stockerbot-export.csv", error_bad_lines=False)

fin_tweets_cleaned = fin_tweets.dropna().copy()
print(fin_tweets.shape, fin_tweets_cleaned.shape)

(28264, 8) (21895, 8)


b'Skipping line 731: expected 8 fields, saw 13\nSkipping line 2836: expected 8 fields, saw 15\nSkipping line 3058: expected 8 fields, saw 12\nSkipping line 3113: expected 8 fields, saw 12\nSkipping line 3194: expected 8 fields, saw 17\nSkipping line 3205: expected 8 fields, saw 17\nSkipping line 3255: expected 8 fields, saw 17\nSkipping line 3520: expected 8 fields, saw 17\nSkipping line 4078: expected 8 fields, saw 17\nSkipping line 4087: expected 8 fields, saw 17\nSkipping line 4088: expected 8 fields, saw 17\nSkipping line 4499: expected 8 fields, saw 12\n'


In [42]:
fin_tweets_cleaned.columns

Index(['id', 'text', 'timestamp', 'source', 'symbols', 'company_names', 'url',
       'verified'],
      dtype='object')

In [43]:
tweets = fin_tweets_cleaned["text"]
lower_tweets = [w.lower() for w in tweets]
useful_tweets = [word for word in lower_tweets if word not in stopwords]
stemmed_tweets = [stemmer.stem(word) for word in useful_tweets]


In [46]:
from sklearn.model_selection import train_test_split

X = stemmed_tweets
y = fin_tweets_cleaned["company_names"]

# stratify keeps the proportions
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
y.value_counts()

Discovery                             96
The Gap                               95
Twenty-First Century Fox              91
Dominion Energy                       89
HCP                                   88
Omisego                               88
WPX Energy                            87
Motorola Solutions                    86
United Parcel Service                 85
Essex Property Trust                  85
Ensco plc                             83
Loews Corporation                     82
Hilton Worldwide Holdings Inc.        82
BlackRock                             81
ONEOK                                 80
Mohawk Industries                     80
State Street Corporation              80
Consolidated Edison                   78
Thermo Fisher Scientific Inc.         77
The Hershey Company                   77
Hormel Foods Corporation              77
DTE Energy Company                    76
Microchip Technology Incorporated     76
Valero Energy Corporation             76
Southwestern Ene

In [48]:
from sklearn.feature_extraction.text import CountVectorizer

vec = CountVectorizer(binary=True,
                      stop_words='english',
                      lowercase=True # default
                     )

# starting from our 2860 documents we took for training set, we translate them into bag of words, 
# i.e. dictionaries of word count
X_train_text = vec.fit_transform(X_train)
X_test_text = vec.transform(X_test)



In [51]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score


knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train_text, y_train)
y_pred = knn.predict(X_test_text)
accuracy_score(y_test, y_pred)


0.49992388491399