In [17]:
import pandas as pd
import re 
from collections import defaultdict

# NLTK
from nltk.corpus import stopwords

# Fasttext utils: https://fasttext.cc/docs/en/crawl-vectors.html
import fasttext.util
fasttext.util.download_model('en', if_exists='ignore')  # English


'cc.en.300.bin'

In [35]:
class TrainTestStockPriceDatasetClass:
    def __init__(self):
        self.path = './data/tesla_3_months_news_stock_price_data_3_days_1_delta.csv'
        self.raw_data = None
        self.ft = fasttext.load_model('cc.en.300.bin')
        self.min_word_count_required = 1
        self.max_word_count_acceptable = 4
        
    def loadRawData(self):
        self.raw_data = pd.read_csv(self.path)
        return self.raw_data
    
    
    def cleanRawData(self):
        for index, row in self.raw_data.iterrows():
            self.raw_data['description'][index], word_count_list = self.cleanTextUsingNLTK(row['description'])
            print("\n\n see this -> ", self.raw_data['description'][index], word_count_list)
            print("\n The result -> ", self.raw_data['price_delta_of_stock'][index])
            
    def cleanTextUsingNLTK(self, text):
        text = [word for word in re.split(r'\W+',text) if word.isalpha()]
        clean_text = [word.lower() for word in text if word not in set(stopwords.words('english'))]
        word_count_list = self.findWordCount(clean_text)
        clean_text, word_count_list = self.filterListWithWordCount(clean_text, word_count_list)
        return clean_text, word_count_list
    
    def findWordCount(self, sentence_in_list_type):
        unique_word_list_dict = defaultdict(int)
        for word in sentence_in_list_type:
            unique_word_list_dict[word] += 1
        return unique_word_list_dict
    
    def filterListWithWordCount(self, sentence_in_list_type, word_count_list):
        for word, word_prescence_count in word_count_list.items():
            if word_prescence_count <= self.min_word_count_required or word_prescence_count >= self.max_word_count_acceptable:
                sentence_in_list_type = [word_from_sentence for word_from_sentence in sentence_in_list_type if word_from_sentence != word]
        return sentence_in_list_type, {word: word_count_list[word] for word in word_count_list if word_count_list[word] > self.min_word_count_required and word_count_list[word] < self.max_word_count_acceptable}
        
    

In [36]:
train_test_stock_price_dataset_class = TrainTestStockPriceDatasetClass()
data = train_test_stock_price_dataset_class.loadRawData()
train_test_stock_price_dataset_class.cleanRawData()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.raw_data['description'][index], word_count_list = self.cleanTextUsingNLTK(row['description'])




 see this ->  ['shares', 'inc', 'friday', 'general', 'motors', 'for', 'gm', 'gm', 'inc', 'forecast', 'com', 'forecast', 'forecast', 'warren', 'buffett', 'invested', 'billion', 'com', 'warren', 'buffett', 'invested', 'billion', 'since', 'action', 'shares', 'friday', 'benzinga', 'action', 'the', 'stocks', 'zoom', 'stocks', 'zoom', 'shares', 'the', 'general', 'motors', 'warren', 'general', 'motors', 'earnings', 'the', 'earnings', 'action', 'for', 'investing', 'investing', 'since', 'benzinga'] {'shares': 3, 'inc': 2, 'friday': 2, 'general': 3, 'motors': 3, 'for': 2, 'gm': 2, 'forecast': 3, 'com': 2, 'warren': 3, 'buffett': 2, 'invested': 2, 'billion': 2, 'since': 2, 'action': 3, 'benzinga': 2, 'the': 3, 'stocks': 2, 'zoom': 2, 'earnings': 2, 'investing': 2}

 The result ->  28.090026999999967


 see this ->  ['the', 'growth', 'gain', 'follows', 'analyst', 'move', 'upgrade', 'month', 'target', 'a', 'chart', 'showing', 'moving', 'inc', 'rose', 'tsla', 'inc', 'forecast', 'com', 'forecast', 



 see this ->  ['hours', 'ago', 'find', 'latest', 'quote', 'history', 'vital', 'information', 'help', 'trading', 'investing', 'hours', 'ago', 'find', 'latest', 'quote', 'history', 'vital', 'information', 'help', 'trading', 'investing', 'forecast', 'com', 'quote', 'forecast', 'forecast', 'falling', 'much', 'com', 'falling', 'much', 'high', 'latest', 'price', 'trading', 'general', 'motors', 'means', 'today', 'business', 'market', 'elon', 'musk', 'said', 'price', 'falling', 'high', 'general', 'motors', 'today', 'electric', 'pickup', 'electric', 'pickup', 'data', 'data', 'business', 'market', 'data', 'means', 'elon', 'musk', 'market', 'said'] {'hours': 2, 'ago': 2, 'find': 2, 'latest': 3, 'quote': 3, 'history': 2, 'vital': 2, 'information': 2, 'help': 2, 'trading': 3, 'investing': 2, 'forecast': 3, 'com': 2, 'falling': 3, 'much': 2, 'high': 2, 'price': 2, 'general': 2, 'motors': 2, 'means': 2, 'today': 2, 'business': 2, 'market': 3, 'elon': 2, 'musk': 2, 'said': 2, 'electric': 2, 'pickup'



 see this ->  ['inc', 'fell', 'monday', 'tsla', 'tsla', 'monday', 'weeks', 'almost', 'almost', 'investors', 'stocks', 'recent', 'weeks', 'market', 'warren', 'fell', 'monday', 'inc', 'market', 'stocks', 'recent', 'inc', 'tsla', 'warren', 'stocks', 'investors'] {'inc': 3, 'fell': 2, 'monday': 3, 'tsla': 3, 'weeks': 2, 'almost': 2, 'investors': 2, 'stocks': 3, 'recent': 2, 'market': 2, 'warren': 2}

 The result ->  59.880005000000104


 see this ->  ['inc', 'monday', 'tsla', 'it', 'jump', 'jumped', 'jump', 's', 'tech', 'jumped', 'almost', 'nasdaq', 'tsla', 'sharply', 'recent', 'nasdaq', 'tsla', 'sharply', 'monday', 'weeks', 'it', 'almost', 'tech', 'surge', 'wall', 'surge', 'tech', 'almost', 's', 'investors', 'recent', 'weeks', 'investors', 'inc', 'wall'] {'inc': 2, 'monday': 2, 'tsla': 3, 'it': 2, 'jump': 2, 'jumped': 2, 's': 2, 'tech': 3, 'almost': 3, 'nasdaq': 2, 'sharply': 2, 'recent': 2, 'weeks': 2, 'surge': 2, 'wall': 2, 'investors': 2}

 The result ->  -0.7000120000000152


 see t



 see this ->  ['nasdaq', 'tsla', 'took', 'hit', 'at', 'worst', 'point', 'trading', 'time', 'latest', 'electric', 'vehicles', 'latest', 'the', 'nasdaq', 'tsla', 'took', 'hit', 'at', 'worst', 'motors', 'new', 'point', 'time', 'trading', 'new', 'motors', 'electric', 'vehicles', 'the'] {'nasdaq': 2, 'tsla': 2, 'took': 2, 'hit': 2, 'at': 2, 'worst': 2, 'point': 2, 'trading': 2, 'time': 2, 'latest': 2, 'electric': 2, 'vehicles': 2, 'the': 2, 'motors': 2, 'new': 2}

 The result ->  19.97997999999995


 see this ->  ['price', 'target', 'ford', 'motors', 'target', 'price', 'ford', 'year', 'ev', 'latest', 'electric', 'vehicles', 'latest', 'boeing', 'the', 'market', 'quotes', 'business', 'financial', 'trading', 'shares', 'trading', 'monday', 'electric', 'vehicles', 'market', 'ev', 'the', 'monday', 'boeing', 'ibd', 'ibd', 'motors', 'electric', 'the', 'shares', 'year', 'price', 'target', 'business', 'financial', 'quotes', 'market'] {'price': 3, 'target': 3, 'ford': 2, 'motors': 2, 'year': 2, 'ev'



 see this ->  ['shares', 'tsla', 'trading', 'reports', 'weekend', 'fatal', 'electric', 'driving', 'a', 'said', 'driving', 'car', 'said', 'the', 'tsla', 'trading', 'market', 'reports', 'weekend', 'fatal', 'driving', 'electric', 'price', 'a', 'price', 'car', 'weekend', 'time', 'a', 'said', 'time', 'market', 'stocks', 'motors', 'the', 'electric', 'the', 'level', 'tsla', 'level', 'shares', 'stocks', 'corp', 'motors', 'corp'] {'shares': 2, 'tsla': 3, 'trading': 2, 'reports': 2, 'weekend': 3, 'fatal': 2, 'electric': 3, 'driving': 3, 'a': 3, 'said': 3, 'car': 2, 'the': 3, 'market': 2, 'price': 2, 'time': 2, 'stocks': 2, 'motors': 2, 'level': 2, 'corp': 2}

 The result ->  14.91998199999989


 see this ->  ['stocks', 'general', 'motors', 'gm', 'shares', 'week', 'three', 'week', 'three', 'investing', 'news', 'electric', 'vehicle', 'price', 'price', 'falling', 'car', 'monday', 'investing', 'news', 'falling', 'shares', 'general', 'motors', 'gm', 'electric', 'car', 'monday', 'the', 'electric', '