In [None]:
### The following is a model that aims to predict the direction of the movement of the price of a dogecoin based on Elon Musk's tweets ###

In [1]:
# Loading libraries, getting resources:
import pandas as pd
import nltk
import random
import csv
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.probability import FreqDist

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kenez\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Data initialization:
# We expect the user of the model to know their dataset well, the code itself does not have data preparative features
# In our case the raw data was processed using mostly pandas and excel features, we felt like the code needn't be 
# included here as it is not remotely reproducible anyways
data = pd.read_csv('data_19_22.csv', delimiter = ',')

In [3]:
# Creating the functions that classify the data

# We create the function that makes a list of lists of words of the tweets 

def into_words(tweetcolumn):
    wordlist = []
    for tweet_sentence in tweetcolumn:
            tweet_sentence_string_lower = str(tweet_sentence).lower()
            tweet_words = tweet_sentence_string_lower.split()
            wordlist.append(tweet_words)
    return(wordlist)

# We create the function that removes stopwords from the list, thus creating a stopword-free list of words

def filterstopwords(wordlist):
    filtered_wordlist = []
    stop_words = list(stopwords.words('english'))
    for lists in wordlist:
        for words in lists:
            if  words.lower() not in stop_words:
                
                filtered_wordlist.append(words.lower())
    return(filtered_wordlist)

# We create the function that makes a list of the top 100 words used by Elon Musk in his tweetings

def top5kwordizer(filtered_wordlist):
    all_words = nltk.FreqDist(filtered_wordlist)
    all_features = list(all_words)[:2500]
    return all_features

# We create the function that makes the list of lists that contain dictionaries with the features

def list_of_dicts(tweet_sentence, all_features): 
    tweet_words = set(tweet_sentence)
    features = {}
    for word in all_features:
        if word in tweet_words:
            features['contains({})'.format(word)] = True
        else:
            features['contains({})'.format(word)] = False
    return features

In [5]:
### Using the functions to go from our dataset to a classified set of data

# Running the functions one by one to classify the data
wordized_list = into_words(data['tweet']) # we have the list of lists of words
filteredwords = filterstopwords(wordized_list) # we have the list of all words filtered free of stopwords
top2500words = top5kwordizer(filteredwords) # we have the top100 words from the list above
dictslist = list() # we get the list of list of dictionaries containing the features and if theyre in the tweet or not
for tweet in wordized_list: 
        dictslist.append(list_of_dicts(tweet, top2500words))
        
# Using pandas's very nice dataframe feature to build our classified dataset

workingdataframe = pd.DataFrame()
workingdataframe['features'] = (pd.Series(list(dictslist)))
workingdataframe['direction'] = data['direction']

# Creating the list of lists that the model will train on

finaldata = workingdataframe.values.tolist()

In [6]:
# Training the model

train_set = finaldata[:8000]
test_set = finaldata[8000:]
classifier = nltk.NaiveBayesClassifier.train(train_set)
acc = nltk.classify.accuracy(classifier, test_set)
print("The accuracy of the model is : ", acc)
classifier.show_most_informative_features(5)

The accuracy of the model is :  0.47941787941787944
Most Informative Features
     contains(@joebiden) = True                1 : 0      =     12.7 : 1.0
     contains(@f9block5) = True                0 : 1      =      8.7 : 1.0
    contains(@sirineati) = True                0 : 1      =      8.1 : 1.0
     contains(@sjvtesla) = True                1 : 0      =      7.9 : 1.0
  contains(@martiandays) = True                0 : 1      =      7.4 : 1.0


In [None]:
### Hope you enjoyed using the model, good luck in life!